diff --git a/.idea/ai.iml b/.idea/ai.iml
index 1394582..0c65e39 100644
--- a/.idea/ai.iml
+++ b/.idea/ai.iml
@@ -8,6 +8,7 @@
+
diff --git a/AGENT.md b/AGENT.md
new file mode 100644
index 0000000..6061588
--- /dev/null
+++ b/AGENT.md
@@ -0,0 +1,15 @@
+We're building an AI-first trading platform by integrating user-facing TradingView charts and chat with an AI assistant that helps do research, develop indicators (signals), and write strategies, using the Dexorder trading framework we provide.
+
+This monorepo has:
+bin/ scripts, mostly build and deploy
+deploy/ kubernetes deployment and configuration
+doc/ documentation
+flink/ Apache Flink application mode processes data from Kafka
+iceberg/ Apache Iceberg for historical OHLC etc
+ingestor/ Data sources publish to Kafka
+kafka/ Apache Kafka
+protobuf/ Messaging entities
+relay/ Rust+ZeroMQ stateless router
+web/ Vue 3 / Pinia / PrimeVue / TradingView
+
+See doc/protocol.md for messaging architecture
diff --git a/bin/build-all b/bin/build-all
index 4ff59e0..8b6fb8c 100755
--- a/bin/build-all
+++ b/bin/build-all
@@ -4,6 +4,7 @@
set -e
DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT_DIR="$(cd "$DIR/.." && pwd)"
echo "Building all container images..."
echo
@@ -13,5 +14,31 @@ echo
"$DIR/build" ingestor "$@"
"$DIR/build" web "$@"
+# Build lifecycle-sidecar (Go binary, no protobuf sync needed)
+echo "Building lifecycle-sidecar..."
+cd "$ROOT_DIR/lifecycle-sidecar"
+
+# Determine tag
+if [ "$1" == "dev" ]; then
+ TAG="dev$(date +%Y%m%d%H%M%S)"
+else
+ # Check for uncommitted changes
+  DIRTY="$(git status --porcelain)"
+ if [ "$DIRTY" != "" ]; then
+ echo "lifecycle-sidecar has uncommitted changes."
+ echo "Use '$0 dev' to build a development-tagged version instead."
+ exit 1
+ fi
+  TAG="$(git rev-parse --short HEAD)"
+fi
+
+REMOTE=${REMOTE:-ghcr.io/dexorder}
+
+docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$TAG .
+docker tag lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:$TAG
+docker tag $REMOTE/lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:latest
+
+echo "$(date) built $REMOTE/lifecycle-sidecar:$TAG"
+
echo
echo "All images built successfully!"
diff --git a/bin/dev b/bin/dev
index 5d84c91..4ec7afe 100755
--- a/bin/dev
+++ b/bin/dev
@@ -19,7 +19,7 @@ usage() {
echo "Commands:"
echo " start Start minikube and deploy all services"
echo " stop Stop minikube"
- echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink)"
+ echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink|sidecar)"
echo " rebuild [svc] Rebuild all custom images, or just one"
echo " deploy [svc] Deploy/update all services, or just one"
echo " status Show status of all services"
@@ -127,12 +127,23 @@ rebuild_images() {
docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG"
fi
- # Save the tags for deployment (all three, preserving any we didn't rebuild)
+ # Build lifecycle-sidecar (Go binary)
+ if [ "$service" == "all" ] || [ "$service" == "lifecycle-sidecar" ] || [ "$service" == "sidecar" ]; then
+ echo -e "${GREEN}→${NC} Building lifecycle-sidecar..."
+ cd "$ROOT_DIR/lifecycle-sidecar"
+ SIDECAR_TAG="dev$(date +%Y%m%d%H%M%S)"
+ docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$SIDECAR_TAG . || exit 1
+ echo -e "${GREEN}✓ Built lifecycle-sidecar:$SIDECAR_TAG${NC}"
+ cd "$ROOT_DIR"
+ fi
+
+ # Save the tags for deployment (all services, preserving any we didn't rebuild)
echo "RELAY_TAG=$RELAY_TAG" > "$ROOT_DIR/.dev-image-tag"
echo "INGEST_TAG=$INGEST_TAG" >> "$ROOT_DIR/.dev-image-tag"
echo "FLINK_TAG=$FLINK_TAG" >> "$ROOT_DIR/.dev-image-tag"
+ echo "SIDECAR_TAG=$SIDECAR_TAG" >> "$ROOT_DIR/.dev-image-tag"
- echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG${NC}"
+ echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG, sidecar=$SIDECAR_TAG${NC}"
}
deploy_services() {
diff --git a/client-py/dexorder/lifecycle_manager.py b/client-py/dexorder/lifecycle_manager.py
new file mode 100644
index 0000000..882d865
--- /dev/null
+++ b/client-py/dexorder/lifecycle_manager.py
@@ -0,0 +1,230 @@
+"""
+Container lifecycle manager for agent containers.
+
+Tracks activity and triggers to determine when the container should shut down.
+Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
+"""
+
+import asyncio
+import logging
+import os
+import signal
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Optional, Set
+
+logger = logging.getLogger(__name__)
+
+# Exit code to signal clean idle shutdown to sidecar
+EXIT_CODE_IDLE_SHUTDOWN = 42
+
+# File to write exit code for sidecar to read
+EXIT_CODE_FILE = Path("/var/run/agent/exit_code")
+
+
+class LifecycleManager:
+ """
+ Manages container lifecycle based on activity and triggers.
+
+ The container shuts itself down when:
+ 1. No active triggers (data subscriptions, CEP patterns, etc.)
+ 2. No recent user activity (MCP calls)
+ 3. Idle timeout has elapsed
+ """
+
+ def __init__(
+ self,
+ idle_timeout_minutes: int = 15,
+ check_interval_seconds: int = 60,
+ enable_shutdown: bool = True,
+ ):
+ """
+ Initialize lifecycle manager.
+
+ Args:
+ idle_timeout_minutes: Minutes of inactivity before shutdown
+ check_interval_seconds: Interval between idle checks
+ enable_shutdown: If False, only log idle state without exiting (for testing)
+ """
+ self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
+ self.check_interval = check_interval_seconds
+ self.enable_shutdown = enable_shutdown
+
+ self.last_activity: datetime = datetime.now()
+ self.active_triggers: Set[str] = set()
+ self._running = False
+ self._check_task: Optional[asyncio.Task] = None
+
+ logger.info(
+ "Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
+ idle_timeout_minutes,
+ check_interval_seconds,
+ enable_shutdown,
+ )
+
+ def record_activity(self) -> None:
+ """
+ Record user activity (called on MCP tool/resource/prompt invocations).
+ Resets the idle timer.
+ """
+ self.last_activity = datetime.now()
+ logger.debug("Activity recorded, idle timer reset")
+
+ def update_triggers(self, triggers: Set[str]) -> None:
+ """
+ Update the set of active triggers.
+
+ Args:
+ triggers: Set of active trigger IDs (data subscriptions, CEP patterns, etc.)
+ """
+ if triggers != self.active_triggers:
+ added = triggers - self.active_triggers
+ removed = self.active_triggers - triggers
+
+ if added:
+ logger.info("Triggers added: %s", added)
+ if removed:
+ logger.info("Triggers removed: %s", removed)
+
+ self.active_triggers = triggers
+ logger.info("Active triggers: %d", len(self.active_triggers))
+
+ def add_trigger(self, trigger_id: str) -> None:
+ """Add a single trigger."""
+ if trigger_id not in self.active_triggers:
+ self.active_triggers.add(trigger_id)
+ logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))
+
+ def remove_trigger(self, trigger_id: str) -> None:
+ """Remove a single trigger."""
+ if trigger_id in self.active_triggers:
+ self.active_triggers.remove(trigger_id)
+ logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))
+
+ def is_idle(self) -> bool:
+ """
+ Check if container is idle and should shut down.
+
+ Returns:
+ True if no triggers and idle timeout exceeded
+ """
+ has_triggers = len(self.active_triggers) > 0
+ idle_time = datetime.now() - self.last_activity
+ is_past_timeout = idle_time > self.idle_timeout
+
+ if has_triggers:
+ logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
+ return False
+
+ if not is_past_timeout:
+ logger.debug(
+ "Not idle: last activity %s ago (timeout: %s)",
+ idle_time,
+ self.idle_timeout,
+ )
+ return False
+
+ logger.info(
+ "Container is idle: no triggers and %s since last activity", idle_time
+ )
+ return True
+
+ async def start(self) -> None:
+ """Start the lifecycle manager background task."""
+ if self._running:
+ logger.warning("Lifecycle manager already running")
+ return
+
+ self._running = True
+ self._check_task = asyncio.create_task(self._check_loop())
+ logger.info("Lifecycle manager started")
+
+ async def stop(self) -> None:
+ """Stop the lifecycle manager."""
+ self._running = False
+ if self._check_task:
+ self._check_task.cancel()
+ try:
+ await self._check_task
+ except asyncio.CancelledError:
+ pass
+ logger.info("Lifecycle manager stopped")
+
+ async def _check_loop(self) -> None:
+ """Background task that periodically checks if container should shut down."""
+ while self._running:
+ try:
+ await asyncio.sleep(self.check_interval)
+
+ if self.is_idle():
+ if self.enable_shutdown:
+ logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
+ self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)
+
+ # Give sidecar a moment to see the exit code file
+ await asyncio.sleep(1)
+
+ # Exit with special code
+ os._exit(EXIT_CODE_IDLE_SHUTDOWN)
+ else:
+ logger.info(
+ "Container is idle but shutdown is disabled (testing mode)"
+ )
+
+ except asyncio.CancelledError:
+ logger.info("Check loop cancelled")
+ raise
+ except Exception as e:
+ logger.error("Error in lifecycle check loop: %s", e, exc_info=True)
+
+ def _write_exit_code(self, code: int) -> None:
+ """Write exit code to shared file for sidecar to read."""
+ try:
+ EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
+ EXIT_CODE_FILE.write_text(str(code))
+ logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
+ except Exception as e:
+ logger.warning("Failed to write exit code file: %s", e)
+
+ def setup_signal_handlers(self) -> None:
+ """
+ Setup signal handlers for graceful shutdown.
+ On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.
+ """
+
+ def signal_handler(signum, frame):
+ logger.info("Received signal %d, exiting normally", signum)
+ sys.exit(0)
+
+ signal.signal(signal.SIGTERM, signal_handler)
+ signal.signal(signal.SIGINT, signal_handler)
+
+
+# Singleton instance for easy access across the application
+_lifecycle_manager: Optional[LifecycleManager] = None
+
+
+def get_lifecycle_manager() -> LifecycleManager:
+ """Get or create the global lifecycle manager instance."""
+ global _lifecycle_manager
+ if _lifecycle_manager is None:
+ # Load configuration from environment
+ idle_timeout = int(os.environ.get("IDLE_TIMEOUT_MINUTES", "15"))
+ check_interval = int(os.environ.get("IDLE_CHECK_INTERVAL_SECONDS", "60"))
+ enable_shutdown = os.environ.get("ENABLE_IDLE_SHUTDOWN", "true").lower() == "true"
+
+ _lifecycle_manager = LifecycleManager(
+ idle_timeout_minutes=idle_timeout,
+ check_interval_seconds=check_interval,
+ enable_shutdown=enable_shutdown,
+ )
+ return _lifecycle_manager
+
+
+async def start_lifecycle_manager() -> LifecycleManager:
+ """Initialize and start the lifecycle manager."""
+ manager = get_lifecycle_manager()
+ manager.setup_signal_handlers()
+ await manager.start()
+ return manager
diff --git a/client-py/dexorder/mcp_auth_middleware.py b/client-py/dexorder/mcp_auth_middleware.py
new file mode 100644
index 0000000..1e4018e
--- /dev/null
+++ b/client-py/dexorder/mcp_auth_middleware.py
@@ -0,0 +1,43 @@
+# NOTE(review): header said "openclaw/auth.py" but this lives at client-py/dexorder/mcp_auth_middleware.py; it references AuthConfig, AuthContext, AuthError, extract_bearer_token, verify_token_hash, JWKSClient, and jwt without importing them, and the match has no `case _:` fallback (unknown modes return None) — fix before use.
+
+class MCPAuthMiddleware:
+ """Authenticates incoming MCP connections based on configured mode."""
+
+ def __init__(self, config: AuthConfig):
+ self.config = config
+ self._jwks_client = None # lazy-loaded for platform mode
+
+ async def authenticate(self, request) -> AuthContext:
+ match self.config.mode:
+ case "local":
+ # stdio transport or localhost-only binding
+ # No auth needed — if you can exec into the container,
+ # you're the user
+ return AuthContext(user_id=self.config.local_user_id,
+ source="local")
+
+ case "token":
+ # User-generated API key (standalone remote access)
+ token = extract_bearer_token(request)
+ if not verify_token_hash(token, self.config.tokens):
+ raise AuthError("Invalid API token")
+ return AuthContext(user_id=self.config.local_user_id,
+ source="api_key")
+
+ case "platform":
+ # JWT signed by the OpenClaw platform
+ token = extract_bearer_token(request)
+ claims = await self._verify_platform_jwt(token)
+ if claims["sub"] != self.config.expected_user_id:
+ raise AuthError("User ID mismatch")
+ return AuthContext(user_id=claims["sub"],
+ source="platform",
+ scopes=claims.get("scopes", []))
+
+ async def _verify_platform_jwt(self, token: str) -> dict:
+ if not self._jwks_client:
+ self._jwks_client = JWKSClient(self.config.platform_jwks_url)
+ signing_key = await self._jwks_client.get_signing_key_from_jwt(token)
+ return jwt.decode(token, signing_key.key,
+ algorithms=["RS256"],
+ audience="openclaw-mcp")
diff --git a/deploy/k8s/base/admission-policy.yaml b/deploy/k8s/base/admission-policy.yaml
new file mode 100644
index 0000000..5cca3f4
--- /dev/null
+++ b/deploy/k8s/base/admission-policy.yaml
@@ -0,0 +1,110 @@
+# ValidatingAdmissionPolicy to restrict images in dexorder-agents namespace
+# Requires Kubernetes 1.30+ (or 1.28+ with feature gate)
+# This is the critical security control that prevents arbitrary image execution
+# even if the gateway is compromised.
+---
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicy
+metadata:
+ name: dexorder-agent-image-policy
+spec:
+ failurePolicy: Fail
+ matchConstraints:
+ namespaceSelector:
+ matchLabels:
+ dexorder.io/type: agents
+ resourceRules:
+ - apiGroups: ["apps"]
+ apiVersions: ["v1"]
+ resources: ["deployments"]
+ operations: ["CREATE", "UPDATE"]
+ validations:
+ # Only allow images from our approved registry with agent prefix
+      - expression: |
+          object.spec.template.spec.containers.all(c, c.image.startsWith('ghcr.io/dexorder/agent:') || c.image.startsWith('ghcr.io/dexorder/agent-')) &&
+          (!has(object.spec.template.spec.initContainers) ||
+           object.spec.template.spec.initContainers.all(c, c.image.startsWith('ghcr.io/dexorder/agent:') || c.image.startsWith('ghcr.io/dexorder/agent-')))
+ message: "Only approved dexorder agent images are allowed in the agents namespace"
+ reason: Forbidden
+
+ # No privileged containers
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ !has(c.securityContext) ||
+ !has(c.securityContext.privileged) ||
+ c.securityContext.privileged == false)
+ message: "Privileged containers are not allowed"
+ reason: Forbidden
+
+ # No hostPath volumes
+ - expression: |
+ !has(object.spec.template.spec.volumes) ||
+ object.spec.template.spec.volumes.all(v,
+ !has(v.hostPath))
+ message: "hostPath volumes are not allowed"
+ reason: Forbidden
+
+ # No hostNetwork
+ - expression: |
+ !has(object.spec.template.spec.hostNetwork) ||
+ object.spec.template.spec.hostNetwork == false
+ message: "hostNetwork is not allowed"
+ reason: Forbidden
+
+ # No hostPID
+ - expression: |
+ !has(object.spec.template.spec.hostPID) ||
+ object.spec.template.spec.hostPID == false
+ message: "hostPID is not allowed"
+ reason: Forbidden
+
+ # Containers must run as non-root
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ has(c.securityContext) &&
+ has(c.securityContext.runAsNonRoot) &&
+ c.securityContext.runAsNonRoot == true)
+ message: "Containers must run as non-root"
+ reason: Forbidden
+
+ # Must drop all capabilities
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ has(c.securityContext) &&
+ has(c.securityContext.capabilities) &&
+ has(c.securityContext.capabilities.drop) &&
+ c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
+ message: "Containers must drop all capabilities"
+ reason: Forbidden
+
+ # Read-only root filesystem
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ has(c.securityContext) &&
+ has(c.securityContext.readOnlyRootFilesystem) &&
+ c.securityContext.readOnlyRootFilesystem == true)
+ message: "Containers must have read-only root filesystem"
+ reason: Forbidden
+
+ # Resource limits must be set
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ has(c.resources) &&
+ has(c.resources.limits) &&
+ has(c.resources.limits.memory) &&
+ has(c.resources.limits.cpu))
+ message: "Containers must have resource limits set"
+ reason: Forbidden
+---
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicyBinding
+metadata:
+ name: dexorder-agent-image-policy-binding
+spec:
+ policyName: dexorder-agent-image-policy
+ validationActions:
+ - Deny
+ matchResources:
+ namespaceSelector:
+ matchLabels:
+ dexorder.io/type: agents
diff --git a/deploy/k8s/base/agent-deployment-example.yaml b/deploy/k8s/base/agent-deployment-example.yaml
new file mode 100644
index 0000000..a46cda8
--- /dev/null
+++ b/deploy/k8s/base/agent-deployment-example.yaml
@@ -0,0 +1,221 @@
+# Example agent deployment with lifecycle sidecar
+# This would be created by the gateway for each user
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: agent-user-abc123
+ namespace: dexorder-agents
+ labels:
+ app.kubernetes.io/name: agent
+ app.kubernetes.io/component: user-agent
+ dexorder.io/component: agent
+ dexorder.io/user-id: user-abc123
+ dexorder.io/deployment: agent-user-abc123
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ dexorder.io/user-id: user-abc123
+ template:
+ metadata:
+ labels:
+ dexorder.io/component: agent
+ dexorder.io/user-id: user-abc123
+ dexorder.io/deployment: agent-user-abc123
+ spec:
+ serviceAccountName: agent-lifecycle
+
+ # Share PID namespace so sidecar can monitor main container
+ shareProcessNamespace: true
+
+ # Security context
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 1000
+ fsGroup: 1000
+ seccompProfile:
+ type: RuntimeDefault
+
+ containers:
+ # Main agent container
+ - name: agent
+ image: ghcr.io/dexorder/agent:latest
+ imagePullPolicy: Always
+
+ # Security context (required by admission policy)
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ readOnlyRootFilesystem: true
+ capabilities:
+ drop:
+ - ALL
+
+ # Resource limits (required by admission policy)
+ resources:
+ requests:
+ memory: "256Mi"
+ cpu: "100m"
+ limits:
+ memory: "1Gi"
+ cpu: "1000m"
+
+ # Environment variables
+ env:
+ - name: USER_ID
+ value: "user-abc123"
+ - name: IDLE_TIMEOUT_MINUTES
+ value: "15"
+ - name: IDLE_CHECK_INTERVAL_SECONDS
+ value: "60"
+ - name: ENABLE_IDLE_SHUTDOWN
+ value: "true"
+ - name: MCP_SERVER_PORT
+ value: "3000"
+ - name: ZMQ_CONTROL_PORT
+ value: "5555"
+
+ # Ports
+ ports:
+ - name: mcp
+ containerPort: 3000
+ protocol: TCP
+ - name: zmq-control
+ containerPort: 5555
+ protocol: TCP
+
+ # Volume mounts
+ volumeMounts:
+ - name: agent-data
+ mountPath: /app/data
+ - name: tmp
+ mountPath: /tmp
+ - name: shared-run
+ mountPath: /var/run/agent
+
+ # Liveness probe (agent's MCP server)
+ livenessProbe:
+ httpGet:
+ path: /health
+ port: mcp
+ initialDelaySeconds: 10
+ periodSeconds: 30
+ timeoutSeconds: 5
+
+ # Readiness probe
+ readinessProbe:
+ httpGet:
+ path: /ready
+ port: mcp
+ initialDelaySeconds: 5
+ periodSeconds: 10
+
+ # Lifecycle sidecar
+ - name: lifecycle-sidecar
+ image: ghcr.io/dexorder/lifecycle-sidecar:latest
+ imagePullPolicy: Always
+
+ # Security context
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ readOnlyRootFilesystem: true
+ capabilities:
+ drop:
+ - ALL
+
+ # Resource limits
+ resources:
+ requests:
+ memory: "32Mi"
+ cpu: "10m"
+ limits:
+ memory: "64Mi"
+ cpu: "50m"
+
+ # Environment variables (injected via downward API)
+ env:
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ - name: DEPLOYMENT_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['dexorder.io/deployment']
+ - name: USER_TYPE
+ value: "free" # Gateway sets this based on license
+ - name: MAIN_CONTAINER_PID
+          value: "1"  # NOTE(review): with shareProcessNamespace, PID 1 is the pod's pause container, not the agent — confirm how the sidecar locates the main process
+
+ # Volume mounts
+ volumeMounts:
+ - name: shared-run
+ mountPath: /var/run/agent
+ readOnly: true
+
+ # Volumes
+ volumes:
+ # Persistent data (user files, state)
+ - name: agent-data
+ persistentVolumeClaim:
+ claimName: agent-user-abc123-data
+
+ # Temporary writable filesystem (read-only rootfs)
+ - name: tmp
+ emptyDir:
+ medium: Memory
+ sizeLimit: 128Mi
+
+ # Shared between main container and sidecar
+ - name: shared-run
+ emptyDir:
+ medium: Memory
+ sizeLimit: 1Mi
+
+ # Restart policy
+ restartPolicy: Always
+
+ # Termination grace period
+ terminationGracePeriodSeconds: 30
+---
+# PVC for agent persistent data
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+ name: agent-user-abc123-data
+ namespace: dexorder-agents
+ labels:
+ dexorder.io/user-id: user-abc123
+spec:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 1Gi
+ storageClassName: standard # Or your preferred storage class
+---
+# Service to expose agent MCP endpoint
+apiVersion: v1
+kind: Service
+metadata:
+ name: agent-user-abc123
+ namespace: dexorder-agents
+ labels:
+ dexorder.io/user-id: user-abc123
+spec:
+ type: ClusterIP
+ selector:
+ dexorder.io/user-id: user-abc123
+ ports:
+ - name: mcp
+ port: 3000
+ targetPort: mcp
+ protocol: TCP
+ - name: zmq-control
+ port: 5555
+ targetPort: zmq-control
+ protocol: TCP
diff --git a/deploy/k8s/base/agent-quotas.yaml b/deploy/k8s/base/agent-quotas.yaml
new file mode 100644
index 0000000..660d8db
--- /dev/null
+++ b/deploy/k8s/base/agent-quotas.yaml
@@ -0,0 +1,53 @@
+# Resource constraints for the dexorder-agents namespace
+# These limits apply regardless of what the gateway requests
+---
+# LimitRange: per-container defaults and maximums
+apiVersion: v1
+kind: LimitRange
+metadata:
+ name: agent-limits
+ namespace: dexorder-agents
+spec:
+ limits:
+ # Default limits applied if deployment doesn't specify
+ - type: Container
+ default:
+ memory: "512Mi"
+ cpu: "500m"
+ defaultRequest:
+ memory: "256Mi"
+ cpu: "100m"
+ # Maximum any single container can request
+ max:
+ memory: "2Gi"
+ cpu: "2000m"
+ min:
+ memory: "64Mi"
+ cpu: "50m"
+ # PVC size limits
+ - type: PersistentVolumeClaim
+ max:
+ storage: "10Gi"
+ min:
+ storage: "100Mi"
+---
+# ResourceQuota: total namespace limits
+# Prevents a compromised gateway from exhausting cluster resources
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+ name: agent-quota
+ namespace: dexorder-agents
+spec:
+ hard:
+ # Total compute limits for all agents combined
+ requests.cpu: "20"
+ requests.memory: "40Gi"
+ limits.cpu: "40"
+ limits.memory: "80Gi"
+ # Object count limits
+ pods: "100"
+ persistentvolumeclaims: "100"
+ services: "100"
+ # Storage limits
+ requests.storage: "500Gi"
diff --git a/deploy/k8s/base/gateway-rbac.yaml b/deploy/k8s/base/gateway-rbac.yaml
new file mode 100644
index 0000000..53929fd
--- /dev/null
+++ b/deploy/k8s/base/gateway-rbac.yaml
@@ -0,0 +1,65 @@
+# RBAC for gateway to CREATE agent deployments only
+# Principle of least privilege: gateway can ONLY create deployments/services/PVCs
+# in the dexorder-agents namespace. Deletion is handled by the lifecycle sidecar.
+# No pods, secrets, exec, or cross-namespace access.
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: gateway
+ namespace: dexorder-system
+---
+# Role scoped to dexorder-agents namespace only
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+ name: agent-creator
+ namespace: dexorder-agents
+rules:
+ # Deployments: create and read only (deletion handled by sidecar)
+ - apiGroups: ["apps"]
+ resources: ["deployments"]
+ verbs: ["create", "get", "list", "watch", "patch", "update"]
+
+ # PVCs: create and read (deletion handled by sidecar)
+ - apiGroups: [""]
+ resources: ["persistentvolumeclaims"]
+ verbs: ["create", "get", "list", "watch"]
+
+ # Services: create and manage agent MCP endpoints
+ - apiGroups: [""]
+ resources: ["services"]
+ verbs: ["create", "get", "list", "watch", "patch", "update"]
+
+ # Read-only pod access for status checks (no exec!)
+ - apiGroups: [""]
+ resources: ["pods"]
+ verbs: ["get", "list", "watch"]
+
+ # Pod logs for debugging (read-only)
+ - apiGroups: [""]
+ resources: ["pods/log"]
+ verbs: ["get"]
+
+ # Explicitly NOT included:
+ # - deployments/delete - handled by lifecycle sidecar
+ # - pvc/delete - handled by lifecycle sidecar
+ # - services/delete - handled by lifecycle sidecar
+ # - pods (create/delete) - must go through deployments
+ # - pods/exec, pods/attach - no shell access
+ # - secrets, configmaps - no credential access
+ # - any resources in other namespaces
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: gateway-agent-creator
+ namespace: dexorder-agents
+subjects:
+ - kind: ServiceAccount
+ name: gateway
+ namespace: dexorder-system
+roleRef:
+ kind: Role
+ name: agent-creator
+ apiGroup: rbac.authorization.k8s.io
diff --git a/deploy/k8s/base/init.yaml b/deploy/k8s/base/init.yaml
index 54d5370..31c74d6 100644
--- a/deploy/k8s/base/init.yaml
+++ b/deploy/k8s/base/init.yaml
@@ -1,3 +1,6 @@
+# Runtime and security initialization for dexorder AI platform
+# Apply this first: kubectl apply -f init.yaml
+---
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
diff --git a/deploy/k8s/base/kustomization.yaml b/deploy/k8s/base/kustomization.yaml
index bae21bc..66e3b92 100644
--- a/deploy/k8s/base/kustomization.yaml
+++ b/deploy/k8s/base/kustomization.yaml
@@ -1,5 +1,26 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
-resources: []
- # ingress.yaml - removed until we have services to expose
+resources:
+ # Core initialization (runtime classes)
+ - init.yaml
+ # Namespace definitions with PodSecurity labels
+ - namespaces.yaml
+ # RBAC for gateway to create agents (creation only)
+ - gateway-rbac.yaml
+ # RBAC for lifecycle sidecar (self-deletion)
+ - lifecycle-sidecar-rbac.yaml
+ # Admission policies (image restriction, security requirements)
+ - admission-policy.yaml
+ # Resource quotas and limits for agents namespace
+ - agent-quotas.yaml
+ # Network isolation policies
+ - network-policies.yaml
+ # Gateway service (uncomment when ready)
+ # - gateway.yaml
+ # Example agent deployment (for reference, not applied by default)
+ # - agent-deployment-example.yaml
+ # Services (uncomment as needed)
+ # - backend.yaml
+ # - web.yaml
+ # - ingress.yaml
diff --git a/deploy/k8s/base/lifecycle-sidecar-rbac.yaml b/deploy/k8s/base/lifecycle-sidecar-rbac.yaml
new file mode 100644
index 0000000..b3b2bd3
--- /dev/null
+++ b/deploy/k8s/base/lifecycle-sidecar-rbac.yaml
@@ -0,0 +1,53 @@
+# RBAC for lifecycle sidecar - allows self-deletion only
+# Each agent pod gets this ServiceAccount and can only delete its own deployment
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: agent-lifecycle
+ namespace: dexorder-agents
+---
+# Role allowing deletion of deployments and PVCs
+# This is scoped to the dexorder-agents namespace
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+ name: agent-self-delete
+ namespace: dexorder-agents
+rules:
+ # Allow getting and deleting deployments
+ - apiGroups: ["apps"]
+ resources: ["deployments"]
+ verbs: ["get", "delete"]
+
+  # Allow getting and deleting PVCs (for anonymous users) and the agent's
+  # Service (gateway-rbac delegates services deletion to this sidecar)
+  - apiGroups: [""]
+    resources: ["persistentvolumeclaims", "services"]
+    verbs: ["get", "delete"]
+
+ # Read-only access to pods (for status checking)
+ - apiGroups: [""]
+ resources: ["pods"]
+ verbs: ["get", "list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: agent-self-delete
+ namespace: dexorder-agents
+subjects:
+ - kind: ServiceAccount
+ name: agent-lifecycle
+ namespace: dexorder-agents
+roleRef:
+ kind: Role
+ name: agent-self-delete
+ apiGroup: rbac.authorization.k8s.io
+---
+# Additional security: ValidatingWebhookConfiguration to restrict deletion
+# This ensures sidecars can only delete their own deployment
+# Requires a validating webhook server (can be added later)
+# For now, we rely on:
+# 1. Sidecar only knowing its own deployment name (from env)
+# 2. RBAC limiting to dexorder-agents namespace
+# 3. Admission policy restricting deployment creation (already defined)
diff --git a/deploy/k8s/base/namespaces.yaml b/deploy/k8s/base/namespaces.yaml
new file mode 100644
index 0000000..54afbe0
--- /dev/null
+++ b/deploy/k8s/base/namespaces.yaml
@@ -0,0 +1,24 @@
+# Namespace definitions for dexorder AI platform
+# - dexorder-system: gateway, flink, kafka, and other infrastructure
+# - dexorder-agents: user agent containers (isolated, restricted)
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: dexorder-system
+ labels:
+ app.kubernetes.io/part-of: dexorder
+ dexorder.io/type: system
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: dexorder-agents
+ labels:
+ app.kubernetes.io/part-of: dexorder
+ dexorder.io/type: agents
+ # Enforce restricted pod security standards
+ pod-security.kubernetes.io/enforce: restricted
+ pod-security.kubernetes.io/enforce-version: latest
+ pod-security.kubernetes.io/audit: restricted
+ pod-security.kubernetes.io/warn: restricted
diff --git a/deploy/k8s/base/network-policies.yaml b/deploy/k8s/base/network-policies.yaml
new file mode 100644
index 0000000..8e4558a
--- /dev/null
+++ b/deploy/k8s/base/network-policies.yaml
@@ -0,0 +1,121 @@
+# Network policies for agent isolation
+# Agents can only communicate with specific services, not with each other
+# or with the Kubernetes API
+---
+# Default deny all ingress and egress in agents namespace
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+ name: default-deny-all
+ namespace: dexorder-agents
+spec:
+ podSelector: {}
+ policyTypes:
+ - Ingress
+ - Egress
+---
+# Allow agents to receive connections from gateway (MCP)
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+ name: allow-gateway-ingress
+ namespace: dexorder-agents
+spec:
+ podSelector:
+ matchLabels:
+ dexorder.io/component: agent
+ policyTypes:
+ - Ingress
+ ingress:
+ - from:
+ - namespaceSelector:
+ matchLabels:
+ dexorder.io/type: system
+ podSelector:
+ matchLabels:
+ app: gateway
+ ports:
+ - protocol: TCP
+ port: 3000 # MCP server port
+ - protocol: TCP
+ port: 5555 # ZeroMQ control channel
+---
+# Allow agents to connect to required services
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+ name: allow-agent-egress
+ namespace: dexorder-agents
+spec:
+ podSelector:
+ matchLabels:
+ dexorder.io/component: agent
+ policyTypes:
+ - Egress
+ egress:
+ # DNS resolution (required)
+ - to:
+ - namespaceSelector: {}
+ podSelector:
+ matchLabels:
+ k8s-app: kube-dns
+ ports:
+ - protocol: UDP
+ port: 53
+ - protocol: TCP
+ port: 53
+ # Gateway in system namespace (for callbacks)
+ - to:
+ - namespaceSelector:
+ matchLabels:
+ dexorder.io/type: system
+ podSelector:
+ matchLabels:
+ app: gateway
+ ports:
+ - protocol: TCP
+ port: 8080
+ # Kafka/Redpanda for data subscriptions
+ - to:
+ - namespaceSelector:
+ matchLabels:
+ dexorder.io/type: system
+ podSelector:
+ matchLabels:
+ app: redpanda
+ ports:
+ - protocol: TCP
+ port: 9092
+ # External HTTPS (for exchange APIs, LLM APIs)
+ - to:
+ - ipBlock:
+ cidr: 0.0.0.0/0
+ except:
+ # Block access to k8s API server (common ranges)
+ - 10.0.0.0/8
+ - 172.16.0.0/12
+ - 192.168.0.0/16
+ ports:
+ - protocol: TCP
+ port: 443
+---
+# System namespace: allow ingress from agents
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+ name: allow-agent-callbacks
+ namespace: dexorder-system
+spec:
+ podSelector:
+ matchLabels:
+ app: gateway
+ policyTypes:
+ - Ingress
+ ingress:
+ - from:
+ - namespaceSelector:
+ matchLabels:
+ dexorder.io/type: agents
+ ports:
+ - protocol: TCP
+ port: 8080
diff --git a/deploy/k8s/dev/admission-policy-patch.yaml b/deploy/k8s/dev/admission-policy-patch.yaml
new file mode 100644
index 0000000..7a6728a
--- /dev/null
+++ b/deploy/k8s/dev/admission-policy-patch.yaml
@@ -0,0 +1,97 @@
+# Dev admission policy: allow local registry images
+# In dev, we also allow images from localhost/minikube registry
+---
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicy
+metadata:
+ name: dexorder-agent-image-policy
+spec:
+ failurePolicy: Fail
+ matchConstraints:
+ namespaceSelector:
+ matchLabels:
+ dexorder.io/type: agents
+ resourceRules:
+ - apiGroups: ["apps"]
+ apiVersions: ["v1"]
+ resources: ["deployments"]
+ operations: ["CREATE", "UPDATE"]
+ validations:
+ # Allow local dev images in addition to production registry
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ c.image.startsWith('ghcr.io/dexorder/agent:') ||
+ c.image.startsWith('ghcr.io/dexorder/agent-') ||
+ c.image.startsWith('localhost:5000/dexorder/agent') ||
+ c.image.startsWith('dexorder/agent'))
+ message: "Only approved dexorder agent images are allowed"
+ reason: Forbidden
+
+ # No privileged containers
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ !has(c.securityContext) ||
+ !has(c.securityContext.privileged) ||
+ c.securityContext.privileged == false)
+ message: "Privileged containers are not allowed"
+ reason: Forbidden
+
+ # No hostPath volumes
+ - expression: |
+ !has(object.spec.template.spec.volumes) ||
+ object.spec.template.spec.volumes.all(v,
+ !has(v.hostPath))
+ message: "hostPath volumes are not allowed"
+ reason: Forbidden
+
+ # No hostNetwork
+ - expression: |
+ !has(object.spec.template.spec.hostNetwork) ||
+ object.spec.template.spec.hostNetwork == false
+ message: "hostNetwork is not allowed"
+ reason: Forbidden
+
+ # No hostPID
+ - expression: |
+ !has(object.spec.template.spec.hostPID) ||
+ object.spec.template.spec.hostPID == false
+ message: "hostPID is not allowed"
+ reason: Forbidden
+
+ # Containers must run as non-root
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ has(c.securityContext) &&
+ has(c.securityContext.runAsNonRoot) &&
+ c.securityContext.runAsNonRoot == true)
+ message: "Containers must run as non-root"
+ reason: Forbidden
+
+ # Must drop all capabilities
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ has(c.securityContext) &&
+ has(c.securityContext.capabilities) &&
+ has(c.securityContext.capabilities.drop) &&
+ c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
+ message: "Containers must drop all capabilities"
+ reason: Forbidden
+
+ # Read-only root filesystem
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ has(c.securityContext) &&
+ has(c.securityContext.readOnlyRootFilesystem) &&
+ c.securityContext.readOnlyRootFilesystem == true)
+ message: "Containers must have read-only root filesystem"
+ reason: Forbidden
+
+ # Resource limits must be set
+ - expression: |
+ object.spec.template.spec.containers.all(c,
+ has(c.resources) &&
+ has(c.resources.limits) &&
+ has(c.resources.limits.memory) &&
+ has(c.resources.limits.cpu))
+ message: "Containers must have resource limits set"
+ reason: Forbidden
diff --git a/deploy/k8s/dev/agent-quotas-patch.yaml b/deploy/k8s/dev/agent-quotas-patch.yaml
new file mode 100644
index 0000000..34a3a57
--- /dev/null
+++ b/deploy/k8s/dev/agent-quotas-patch.yaml
@@ -0,0 +1,19 @@
+# Dev/minikube resource quota overrides
+# Smaller limits appropriate for local development
+---
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+ name: agent-quota
+ namespace: dexorder-agents
+spec:
+ hard:
+ # Reduced for minikube
+ requests.cpu: "4"
+ requests.memory: "8Gi"
+ limits.cpu: "8"
+ limits.memory: "16Gi"
+ pods: "20"
+ persistentvolumeclaims: "20"
+ services: "20"
+ requests.storage: "50Gi"
diff --git a/deploy/k8s/dev/kustomization.yaml b/deploy/k8s/dev/kustomization.yaml
index 14168e5..c5bef4a 100644
--- a/deploy/k8s/dev/kustomization.yaml
+++ b/deploy/k8s/dev/kustomization.yaml
@@ -1,16 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
-namespace: default
+# Note: namespaces are defined in base; workloads go to dexorder-system
+namespace: dexorder-system
-# Base resources
+# Base resources (includes security policies)
resources:
- ../base
- infrastructure.yaml
-# No patches needed currently
-patches: []
- # ingress-dev.yaml - removed until we have services to expose
+# Dev-specific patches
+patches:
+ # Reduced resource quotas for minikube
+ - path: agent-quotas-patch.yaml
+ # Allow local registry images
+ - path: admission-policy-patch.yaml
# ConfigMaps for service configs
configMapGenerator:
diff --git a/deploy/k8s/prod/kustomization.yaml b/deploy/k8s/prod/kustomization.yaml
index 6bd96fb..b8a746b 100644
--- a/deploy/k8s/prod/kustomization.yaml
+++ b/deploy/k8s/prod/kustomization.yaml
@@ -1,9 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
-namespace: default
+# Note: namespaces are defined in base; workloads go to dexorder-system
+namespace: dexorder-system
-# Base resources (backend, web, ingress, init/gVisor)
+# Base resources (includes all security policies)
resources:
- ../base
@@ -38,3 +39,10 @@ images:
newTag: latest
- name: dexorder/ai-web
newTag: latest
+ - name: ghcr.io/dexorder/gateway
+ newTag: latest
+ - name: lifecycle-sidecar
+ newName: ghcr.io/dexorder/lifecycle-sidecar
+ newTag: latest
+ - name: ghcr.io/dexorder/agent
+ newTag: latest
diff --git a/doc/agent_harness_flow.md b/doc/agent_harness_flow.md
new file mode 100644
index 0000000..24a5c72
--- /dev/null
+++ b/doc/agent_harness_flow.md
@@ -0,0 +1,21 @@
+┌─────────────────────────────────────────────────┐
+│ Agent Harness (your servers) │
+│ │
+│ on_message(user_id, message): │
+│ 1. Look up user's MCP endpoint from Postgres │
+│ 2. mcp.call("get_context_summary") │
+│ 3. mcp.call("get_conversation_history", 20) │
+│ 4. Build prompt: │
+│ system = BASE_PROMPT │
+│ + context_summary │
+│ + user_agent_prompt (from MCP) │
+│ messages = history + new message │
+│ 5. LLM call (your API key) │
+│ 6. While LLM wants tool calls: │
+│ - Platform tools → handle locally │
+│ - User tools → proxy to MCP │
+│ - LLM call again with results │
+│ 7. mcp.call("save_message", ...) │
+│ 8. Return response to user │
+│ │
+└─────────────────────────────────────────────────┘
diff --git a/doc/agent_redesign.md b/doc/agent_redesign.md
index 159244d..d318e72 100644
--- a/doc/agent_redesign.md
+++ b/doc/agent_redesign.md
@@ -1,9 +1,11 @@
Generally use skills instead of subagents, except for the analysis subagent.
-## User-specific files
+## User-specific files and tools
* Indicators
* Strategies
* Watchlists
* Preferences
* Trading style
* Charting / colors
+* Executors (really just sub-strategies)
+ * tactical-level order generators e.g. TWAP, iceberg, etc.
diff --git a/doc/config.md b/doc/config.md
deleted file mode 100644
index b520ac7..0000000
--- a/doc/config.md
+++ /dev/null
@@ -1,18 +0,0 @@
-This file describes all the configuration options used by all components. All configuration is divided into regular config and secrets, and k8s will mount either or both as a yaml file accessible to the process.
-
-# Configuration
-
-* `flink_hostname`
-* ... various zmq ports for flink ...
-* `iceberg_catalog_hostname`
-* `iceberg_catalog_port`
-* `iceberg_catalog_database`
-* etc
-
-
-# Secrets
-
-* `iceberg_catalog_username`
-* `iceberg_catalog_password`
-* etc.
-
diff --git a/doc/container_lifecycle_management.md b/doc/container_lifecycle_management.md
new file mode 100644
index 0000000..bf2ed3d
--- /dev/null
+++ b/doc/container_lifecycle_management.md
@@ -0,0 +1,313 @@
+# Container Lifecycle Management
+
+## Overview
+
+User agent containers self-manage their lifecycle to optimize resource usage. Containers automatically shut down when idle (no triggers + no recent activity) and clean themselves up using a lifecycle sidecar.
+
+## Architecture
+
+```
+┌──────────────────────────────────────────────────────────┐
+│ Agent Pod │
+│ ┌───────────────────┐ ┌──────────────────────┐ │
+│ │ Agent Container │ │ Lifecycle Sidecar │ │
+│ │ ─────────────── │ │ ────────────────── │ │
+│ │ │ │ │ │
+│ │ Lifecycle Manager │ │ Watches exit code │ │
+│ │ - Track activity │ │ - Detects exit 42 │ │
+│ │ - Track triggers │ │ - Calls k8s API │ │
+│ │ - Exit 42 if idle │ │ - Deletes deployment │ │
+│ └───────────────────┘ └──────────────────────┘ │
+│ │ │ │
+│ │ writes exit_code │ │
+│ └────►/var/run/agent/exit_code │
+│ │ │
+└───────────────────────────────────────┼──────────────────┘
+ │
+ ▼ k8s API (RBAC)
+ ┌─────────────────────┐
+ │ Delete Deployment │
+ │ Delete PVC (if anon)│
+ └─────────────────────┘
+```
+
+## Components
+
+### 1. Lifecycle Manager (Python)
+
+**Location**: `client-py/dexorder/lifecycle_manager.py`
+
+Runs inside the agent container and tracks:
+- **Activity**: MCP tool/resource/prompt calls reset the idle timer
+- **Triggers**: Data subscriptions, CEP patterns, etc.
+- **Idle state**: No triggers + idle timeout exceeded
+
+**Configuration** (via environment variables):
+- `IDLE_TIMEOUT_MINUTES`: Minutes before shutdown (default: 15)
+- `IDLE_CHECK_INTERVAL_SECONDS`: Check frequency (default: 60)
+- `ENABLE_IDLE_SHUTDOWN`: Enable/disable shutdown (default: true)
+
+**Usage in agent code**:
+```python
+from dexorder.lifecycle_manager import get_lifecycle_manager
+
+# On startup
+manager = get_lifecycle_manager()
+await manager.start()
+
+# On MCP calls (tool/resource/prompt)
+manager.record_activity()
+
+# When triggers change
+manager.add_trigger("data_sub_BTC_USDT")
+manager.remove_trigger("data_sub_BTC_USDT")
+
+# Or batch update
+manager.update_triggers({"trigger_1", "trigger_2"})
+```
+
+**Exit behavior**:
+- Idle shutdown: Exit with code `42`
+- Signal (SIGTERM/SIGINT): Exit with code `0` (allows restart)
+- Errors/crashes: Exit with error code (allows restart)
+
+### 2. Lifecycle Sidecar (Go)
+
+**Location**: `lifecycle-sidecar/`
+
+Runs alongside the agent container with shared PID namespace. Monitors the main container process and:
+- On exit code `42`: Deletes deployment (and PVC if anonymous user)
+- On any other exit code: Exits with same code (k8s restarts pod)
+
+**Configuration** (via environment, injected by downward API):
+- `NAMESPACE`: Pod's namespace
+- `DEPLOYMENT_NAME`: Deployment name (from pod label)
+- `USER_TYPE`: License tier (`anonymous`, `free`, `paid`, `enterprise`)
+- `MAIN_CONTAINER_PID`: PID of main container (default: 1)
+
+**RBAC**: Has permission to delete deployments and PVCs **only in dexorder-agents namespace**. Cannot delete other deployments due to:
+1. Only knows its own deployment name (from env)
+2. RBAC scoped to namespace
+3. No cross-pod communication
+
+### 3. Gateway (TypeScript)
+
+**Location**: `gateway/src/harness/agent-harness.ts`
+
+Creates agent deployments when users connect. Has permissions to:
+- ✅ Create deployments, services, PVCs
+- ✅ Read pod status and logs
+- ✅ Update deployments (e.g., resource limits)
+- ❌ Delete deployments (handled by sidecar)
+- ❌ Exec into pods
+- ❌ Access secrets
+
+## Lifecycle States
+
+```
+┌─────────────┐
+│ CREATED │ ← Gateway creates deployment
+└──────┬──────┘
+ │
+ ▼
+┌─────────────┐
+│ RUNNING │ ← User interacts, has triggers
+└──────┬──────┘
+ │
+ ▼
+┌─────────────┐
+│ IDLE │ ← No triggers + timeout exceeded
+└──────┬──────┘
+ │
+ ▼
+┌─────────────┐
+│ SHUTDOWN │ ← Exit code 42
+└──────┬──────┘
+ │
+ ▼
+┌─────────────┐
+│ DELETED │ ← Sidecar deletes deployment
+└─────────────┘
+```
+
+## Idle Detection Logic
+
+Container is **IDLE** when:
+1. `active_triggers.isEmpty()` AND
+2. `(now - last_activity) > idle_timeout`
+
+Container is **ACTIVE** when:
+1. Has any active triggers (data subscriptions, CEP patterns, etc.) OR
+2. Recent user activity (MCP calls within timeout)
+
+## Cleanup Policies by License Tier
+
+| User Type | Idle Timeout | PVC Policy | Notes |
+|--------------|--------------|------------|-------|
+| Anonymous | 15 minutes | Delete | Ephemeral, no data retention |
+| Free | 15 minutes | Retain | Can resume session |
+| Paid | 60 minutes | Retain | Longer grace period |
+| Enterprise | No shutdown | Retain | Always-on containers |
+
+Configured via `USER_TYPE` env var in deployment.
+
+## Security
+
+### Principle of Least Privilege
+
+**Gateway**:
+- Can create agent resources
+- Cannot delete agent resources
+- Cannot access other namespaces
+- Cannot exec into pods
+
+**Lifecycle Sidecar**:
+- Can delete its own deployment only
+- Cannot delete other deployments
+- Scoped to dexorder-agents namespace
+- No exec, no secrets access
+
+### Admission Control
+
+All deployments in `dexorder-agents` namespace are subject to:
+- Image allowlist (only approved images)
+- Security context enforcement (non-root, drop caps, read-only rootfs)
+- Resource limits required
+- PodSecurity standards (restricted profile)
+
+See `deploy/k8s/base/admission-policy.yaml`
+
+### Network Isolation
+
+Agents are network-isolated via NetworkPolicy:
+- Can connect to gateway (MCP)
+- Can connect to Redpanda (data streams)
+- Can make outbound HTTPS (exchanges, LLM APIs)
+- Cannot access k8s API
+- Cannot access the rest of the system namespace (only the gateway and Redpanda are reachable, per the egress policy)
+- Cannot access other agent pods
+
+See `deploy/k8s/base/network-policies.yaml`
+
+## Deployment
+
+### 1. Apply Security Policies
+
+```bash
+kubectl apply -k deploy/k8s/dev # or prod
+```
+
+This creates:
+- Namespaces (`dexorder-system`, `dexorder-agents`)
+- RBAC (gateway, lifecycle sidecar)
+- Admission policies
+- Network policies
+- Resource quotas
+
+### 2. Build and Push Lifecycle Sidecar
+
+```bash
+cd lifecycle-sidecar
+docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
+docker push ghcr.io/dexorder/lifecycle-sidecar:latest
+```
+
+### 3. Gateway Creates Agent Deployments
+
+When a user connects, the gateway creates:
+- Deployment with agent + sidecar
+- PVC for persistent data
+- Service for MCP endpoint
+
+See `deploy/k8s/base/agent-deployment-example.yaml` for template.
+
+## Testing
+
+### Test Lifecycle Manager Locally
+
+```python
+from dexorder.lifecycle_manager import LifecycleManager
+
+# Disable actual shutdown for testing
+manager = LifecycleManager(
+ idle_timeout_minutes=1,
+ check_interval_seconds=10,
+ enable_shutdown=False # Only log, don't exit
+)
+
+await manager.start()
+
+# Simulate activity
+manager.record_activity()
+
+# Simulate triggers
+manager.add_trigger("test_trigger")
+await asyncio.sleep(70) # Wait past timeout
+manager.remove_trigger("test_trigger")
+await asyncio.sleep(70) # Should detect idle
+
+await manager.stop()
+```
+
+### Test Sidecar Locally
+
+```bash
+# Build
+cd lifecycle-sidecar
+go build -o lifecycle-sidecar main.go
+
+# Run (requires k8s config)
+export NAMESPACE=dexorder-agents
+export DEPLOYMENT_NAME=agent-test
+export USER_TYPE=free
+./lifecycle-sidecar
+```
+
+### Integration Test
+
+1. Deploy test agent with sidecar
+2. Verify agent starts and is healthy
+3. Stop sending MCP calls and remove all triggers
+4. Wait for idle timeout + check interval
+5. Verify deployment is deleted
+
+## Troubleshooting
+
+### Container not shutting down when idle
+
+Check logs:
+```bash
+kubectl logs -n dexorder-agents agent-user-abc123 -c agent
+```
+
+Verify:
+- `ENABLE_IDLE_SHUTDOWN=true`
+- No active triggers: `manager.active_triggers` should be empty
+- Idle timeout exceeded
+
+### Sidecar not deleting deployment
+
+Check sidecar logs:
+```bash
+kubectl logs -n dexorder-agents agent-user-abc123 -c lifecycle-sidecar
+```
+
+Verify:
+- Exit code file exists: `/var/run/agent/exit_code` contains `42`
+- RBAC permissions: `kubectl auth can-i delete deployments --as=system:serviceaccount:dexorder-agents:agent-lifecycle -n dexorder-agents`
+- Deployment name matches: Check `DEPLOYMENT_NAME` env var
+
+### Gateway can't create deployments
+
+Check gateway logs and verify:
+- ServiceAccount exists: `kubectl get sa gateway -n dexorder-system`
+- RoleBinding exists: `kubectl get rolebinding gateway-agent-creator -n dexorder-agents`
+- Admission policy allows image: Check image name matches allowlist in `admission-policy.yaml`
+
+## Future Enhancements
+
+1. **Graceful shutdown notifications**: Warn users before shutdown via websocket
+2. **Predictive scaling**: Keep frequently-used containers warm
+3. **Tiered storage**: Move old PVCs to cheaper storage class
+4. **Metrics**: Expose lifecycle metrics (idle rate, shutdown count, etc.)
+5. **Cost allocation**: Track resource usage per user/license tier
diff --git a/doc/gateway_container_creation.md b/doc/gateway_container_creation.md
new file mode 100644
index 0000000..dc938a9
--- /dev/null
+++ b/doc/gateway_container_creation.md
@@ -0,0 +1,286 @@
+# Gateway Container Creation
+
+## Overview
+
+The gateway automatically provisions user agent containers when users authenticate. This ensures each user has their own isolated environment running their MCP server with persistent storage.
+
+## Authentication Flow with Container Creation
+
+```
+User connects (WebSocket/Telegram)
+ ↓
+ Send "Authenticating..." status
+ ↓
+ Verify token/channel link
+ ↓
+ Lookup user license from DB
+ ↓
+ Send "Starting workspace..." status
+ ↓
+┌────────────────────────────────────┐
+│ ContainerManager.ensureRunning() │
+│ ┌──────────────────────────────┐ │
+│ │ Check if deployment exists │ │
+│ └──────────────────────────────┘ │
+│ ↓ │
+│ Does it exist? │
+│ ↙ ↘ │
+│ Yes No │
+│ │ │ │
+│ │ ┌──────────────────┐ │
+│ │ │ Create deployment│ │
+│ │ │ Create PVC │ │
+│ │ │ Create service │ │
+│ │ └──────────────────┘ │
+│ │ │ │
+│ └────────────┘ │
+│ ↓ │
+│ Wait for deployment ready │
+│ (polls every 2s, timeout 2min) │
+│ ↓ │
+│ Compute MCP endpoint URL │
+│ (internal k8s service DNS) │
+└────────────────────────────────────┘
+ ↓
+ Update license.mcpServerUrl
+ ↓
+ Send "Connected" status
+ ↓
+ Initialize AgentHarness
+ ↓
+ Connect to user's MCP server
+ ↓
+ Ready for messages
+```
+
+## Container Naming Convention
+
+All resources follow a consistent naming pattern based on `userId`:
+
+```typescript
+userId: "user-abc123"
+ ↓
+deploymentName: "agent-user-abc123"
+serviceName: "agent-user-abc123"
+pvcName: "agent-user-abc123-data"
+mcpEndpoint: "http://agent-user-abc123.dexorder-agents.svc.cluster.local:3000"
+```
+
+User IDs are sanitized to be Kubernetes-compliant (lowercase alphanumeric + hyphens).
+
+## Templates by License Tier
+
+Templates are located in `gateway/src/k8s/templates/`:
+- `free-tier.yaml`
+- `pro-tier.yaml`
+- `enterprise-tier.yaml`
+
+NOTE(review): the "pro" tier here appears to correspond to `USER_TYPE=paid` in `doc/container_lifecycle_management.md` — confirm and unify the tier naming across docs.
+
+### Variable Substitution
+
+Templates use simple string replacement:
+- `{{userId}}` - User ID
+- `{{deploymentName}}` - Computed deployment name
+- `{{serviceName}}` - Computed service name
+- `{{pvcName}}` - Computed PVC name
+- `{{agentImage}}` - Agent container image (from env)
+- `{{sidecarImage}}` - Lifecycle sidecar image (from env)
+- `{{storageClass}}` - Kubernetes storage class (from env)
+
+### Resource Limits
+
+| Tier | Memory Request | Memory Limit | CPU Request | CPU Limit | Storage | Idle Timeout |
+|------|----------------|--------------|-------------|-----------|---------|--------------|
+| **Free** | 256Mi | 512Mi | 100m | 500m | 1Gi | 15min |
+| **Pro** | 512Mi | 2Gi | 250m | 2000m | 10Gi | 60min |
+| **Enterprise** | 1Gi | 4Gi | 500m | 4000m | 50Gi | Never (shutdown disabled) |
+
+## Components
+
+### KubernetesClient (`gateway/src/k8s/client.ts`)
+
+Low-level k8s API wrapper:
+- `deploymentExists(name)` - Check if deployment exists
+- `createAgentDeployment(spec)` - Create deployment/service/PVC from template
+- `waitForDeploymentReady(name, timeout)` - Poll until ready
+- `getServiceEndpoint(name)` - Get service URL
+- `deleteAgentDeployment(userId)` - Cleanup (testing only; note the production gateway role cannot delete deployments — see Security below — so this requires elevated RBAC)
+
+Static helpers:
+- `getDeploymentName(userId)` - Generate deployment name
+- `getServiceName(userId)` - Generate service name
+- `getPvcName(userId)` - Generate PVC name
+- `getMcpEndpoint(userId, namespace)` - Compute internal service URL
+
+### ContainerManager (`gateway/src/k8s/container-manager.ts`)
+
+High-level orchestration:
+- `ensureContainerRunning(userId, license)` - Main entry point
+ - Returns: `{ mcpEndpoint, wasCreated }`
+ - Creates deployment if missing
+ - Waits for ready state
+ - Returns endpoint URL
+- `getContainerStatus(userId)` - Check status without creating
+- `deleteContainer(userId)` - Manual cleanup
+
+### Authenticator (`gateway/src/auth/authenticator.ts`)
+
+Updated to call container manager:
+- `authenticateWebSocket()` - Calls `ensureContainerRunning()` before returning `AuthContext`
+- `authenticateTelegram()` - Same for Telegram webhooks
+
+### WebSocketHandler (`gateway/src/channels/websocket-handler.ts`)
+
+Multi-phase connection protocol:
+1. Send `{type: 'status', status: 'authenticating'}`
+2. Authenticate (may take 30-120s if creating container)
+3. Send `{type: 'status', status: 'initializing'}`
+4. Initialize agent harness
+5. Send `{type: 'connected', ...}`
+
+This gives the client visibility into the startup process.
+
+## Configuration
+
+Environment variables:
+
+```bash
+# Kubernetes
+KUBERNETES_NAMESPACE=dexorder-agents
+KUBERNETES_IN_CLUSTER=true # false for local dev
+KUBERNETES_CONTEXT=minikube # for local dev only
+
+# Container images
+AGENT_IMAGE=ghcr.io/dexorder/agent:latest
+SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
+
+# Storage
+AGENT_STORAGE_CLASS=standard
+```
+
+## Security
+
+The gateway uses a restricted ServiceAccount with RBAC:
+
+**Can do:**
+- ✅ Create deployments in `dexorder-agents` namespace
+- ✅ Create services in `dexorder-agents` namespace
+- ✅ Create PVCs in `dexorder-agents` namespace
+- ✅ Read pod status and logs (debugging)
+- ✅ Update deployments (future: resource scaling)
+
+**Cannot do:**
+- ❌ Delete deployments (handled by lifecycle sidecar)
+- ❌ Delete PVCs (handled by lifecycle sidecar)
+- ❌ Exec into pods
+- ❌ Access secrets or configmaps
+- ❌ Create resources in other namespaces
+- ❌ (Additionally, the agent containers it creates cannot reach the Kubernetes API — blocked by NetworkPolicy)
+
+See `deploy/k8s/base/gateway-rbac.yaml` for full configuration.
+
+## Lifecycle
+
+### Container Creation (Gateway)
+- User authenticates
+- Gateway checks if deployment exists
+- If missing, creates from template
+- Waits for ready (2min timeout)
+- Returns MCP endpoint
+
+### Container Deletion (Lifecycle Sidecar)
+- Container tracks activity and triggers
+- When idle (no triggers + timeout), exits with code 42
+- Sidecar detects exit code 42
+- Sidecar deletes deployment + optional PVC via k8s API
+- Gateway creates fresh container on next authentication
+
+See `doc/container_lifecycle_management.md` for full lifecycle details.
+
+## Error Handling
+
+| Error | Gateway Action | User Experience |
+|-------|----------------|-----------------|
+| Deployment creation fails | Log error, return auth failure | "Authentication failed" |
+| Wait timeout (image pull, etc.) | Log warning, return 503 | "Service unavailable, retry" |
+| Service not found | Retry with backoff | Transparent retry |
+| MCP connection fails | Return error | "Failed to connect to workspace" |
+| Existing deployment not ready | Wait 30s, continue if still not ready | May connect to partially-ready container |
+
+## Local Development
+
+For local development (outside k8s):
+
+1. Start minikube:
+```bash
+minikube start
+minikube addons enable storage-provisioner
+```
+
+2. Apply security policies:
+```bash
+kubectl apply -k deploy/k8s/dev
+```
+
+3. Configure gateway for local k8s:
+```bash
+# .env
+KUBERNETES_IN_CLUSTER=false
+KUBERNETES_CONTEXT=minikube
+KUBERNETES_NAMESPACE=dexorder-agents
+```
+
+4. Run gateway:
+```bash
+cd gateway
+npm run dev
+```
+
+5. Connect via WebSocket:
+```bash
+wscat -c "ws://localhost:3000/ws/chat" -H "Authorization: Bearer your-jwt"
+```
+
+The gateway will create deployments in minikube. View with:
+```bash
+kubectl get deployments -n dexorder-agents
+kubectl get pods -n dexorder-agents
+kubectl logs -n dexorder-agents agent-user-abc123 -c agent
+```
+
+## Production Deployment
+
+1. Build and push gateway image:
+```bash
+cd gateway
+docker build -t ghcr.io/dexorder/gateway:latest .
+docker push ghcr.io/dexorder/gateway:latest
+```
+
+2. Deploy to k8s:
+```bash
+kubectl apply -k deploy/k8s/prod
+```
+
+3. Gateway runs in `dexorder-system` namespace
+4. Creates agent containers in `dexorder-agents` namespace
+5. Admission policies enforce image allowlist and security constraints
+
+## Monitoring
+
+Useful metrics to track:
+- Container creation latency (time from auth to ready)
+- Container creation failure rate
+- Active containers by license tier
+- Resource usage per tier
+- Idle shutdown rate
+
+These can be exported via Prometheus or logged to monitoring service.
+
+## Future Enhancements
+
+1. **Pre-warming**: Create containers for active users before they connect
+2. **Image updates**: Handle agent image version migrations with user consent
+3. **Multi-region**: Geo-distributed container placement
+4. **Cost tracking**: Per-user resource usage and billing
+5. **Auto-scaling**: Scale down to 0 replicas instead of deletion (faster restart)
+6. **Container pools**: Shared warm containers for anonymous users
diff --git a/doc/m_c_p_client_authentication_modes.md b/doc/m_c_p_client_authentication_modes.md
new file mode 100644
index 0000000..a3c56c8
--- /dev/null
+++ b/doc/m_c_p_client_authentication_modes.md
@@ -0,0 +1,80 @@
+Mode A: Platform Harness → Hosted Container (internal)
+ Auth: mTLS + platform-signed user claim
+ Network: k8s internal, never hits the internet
+
+Mode B: Platform Harness → External User Container (remote)
+ Auth: OAuth2 token issued by your platform
+ Network: public internet, TLS required
+
+Mode C: Third-party MCP Client → External User Container (standalone)
+ Auth: User-managed API key or local-only (no network)
+ Network: localhost or user's own network
+
+┌──────────────────────────────────────────────────────────┐
+│ Platform (Postgres) │
+│ │
+│ users │
+│ ├── id, email, password_hash, plan_tier │
+│ │ │
+│ containers │
+│ ├── user_id │
+│ ├── type: "hosted" | "external" │
+│   ├── mcp_endpoint: "internal-svc:3000" | "https://..."   │
+│ ├── auth_method: "mtls" | "platform_token" | "api_key" │
+│ └── public_key_fingerprint (for pinning external certs) │
+│ │
+│ api_tokens │
+│ ├── user_id │
+│ ├── token_hash │
+│ ├── scopes: ["mcp:tools", "mcp:resources", "data:read"] │
+│ ├── expires_at │
+│ └── issued_for: "platform_harness" | "user_direct" │
+│ │
+└──────────────────────────────────────────────────────────┘
+
+## Mode A
+
+Harness ──mTLS──▶ k8s Service ──▶ User Container MCP
+Validates: source is platform namespace
+Extracts: user_id from forwarded header
+
+## Mode B
+
+Registration flow (one-time):
+1. User provides their MCP endpoint URL in platform settings
+2. Platform generates a scoped token (JWT, short-lived, auto-refreshed)
+3. User configures their MCP server to accept tokens signed by your platform
+4. Platform stores the endpoint + auth method
+
+Runtime:
+┌──────────┐ HTTPS + Bearer token ┌────────────────────┐
+│ Harness │ ─────────────────────────▶ │ External MCP Server│
+│ │ Authorization: │ │
+│ │ Bearer │ Validates: │
+│ │ │ - JWT signature │
+│ │ │ (your public │
+│ │ │ key, JWKS) │
+│ │ │ - user_id claim │
+│ │ │ matches self │
+│ │ │ - not expired │
+└──────────┘ └────────────────────┘
+
+## Mode C
+
+```yaml
+# openclaw/config.yaml
+auth:
+ # For local-only use (Claude Desktop, Cursor, etc via stdio)
+ mode: "local" # no network auth needed
+
+ # OR for remote access
+ mode: "token"
+ tokens:
+ - name: "my-laptop"
+ hash: "sha256:..." # generated by `openclaw token create`
+
+ # OR for platform integration
+ mode: "platform"
+ platform_jwks_url: "https://api.openclaw.io/.well-known/jwks.json"
+ expected_user_id: "user_abc123"
+```
diff --git a/doc/m_c_p_tools_architecture.md b/doc/m_c_p_tools_architecture.md
new file mode 100644
index 0000000..8c56f34
--- /dev/null
+++ b/doc/m_c_p_tools_architecture.md
@@ -0,0 +1,29 @@
+MCP Tools (User Container)
+├── Memory
+│ ├── get_conversation_history(limit)
+│ ├── save_message(role, content)
+│ ├── search_memory(query) ← semantic search over past conversations
+│ └── get_context_summary() ← "who is this user, what do they care about"
+│
+├── Strategies & Indicators
+│ ├── list_strategies()
+│ ├── read_strategy(name)
+│ ├── write_strategy(name, code)
+│ ├── list_indicators()
+│ ├── read_indicator(name)
+│ ├── write_indicator(name, code)
+│ └── run_backtest(strategy, params)
+│
+├── Preferences
+│ ├── get_preferences()
+│ ├── set_preference(key, value)
+│ └── get_agent_prompt() ← user's custom system prompt additions
+│
+├── Trading
+│ ├── get_watchlist()
+│ ├── execute_trade(params)
+│ ├── get_positions()
+│ └── get_trade_history()
+│
+└── Sandbox
+ └── run_python(code) ← datascience toolset, matplotlib, etc.
diff --git a/protobuf/protocol.md b/doc/protocol.md
similarity index 100%
rename from protobuf/protocol.md
rename to doc/protocol.md
diff --git a/doc/user_mcp_resources.md b/doc/user_mcp_resources.md
new file mode 100644
index 0000000..9decf89
--- /dev/null
+++ b/doc/user_mcp_resources.md
@@ -0,0 +1,472 @@
+# User MCP Server - Resource Architecture
+
+The user's MCP server container owns **all** conversation history, RAG, and contextual data. The platform gateway is a thin, stateless orchestrator that only holds the Anthropic API key.
+
+## Architecture Principle
+
+**User Container = Fat Context**
+- Conversation history (PostgreSQL/SQLite)
+- RAG system (embeddings, vector search)
+- User preferences and custom prompts
+- Trading context (positions, watchlists, alerts)
+- All user-specific data
+
+**Platform Gateway = Thin Orchestrator**
+- Anthropic API key (platform pays for LLM)
+- Session management (WebSocket/Telegram connections)
+- MCP client connection pooling
+- Tool routing (platform vs user tools)
+- **Zero conversation state stored**
+
+## MCP Resources for Context Injection
+
+Resources are **read-only** data sources that provide context to the LLM. They're fetched before each Claude API call and embedded in the conversation.
+
+### Standard Context Resources
+
+#### 1. `context://user-profile`
+**Purpose:** User's trading background and preferences
+
+**MIME Type:** `text/plain`
+
+**Example Content:**
+```
+User Profile:
+- Trading experience: Intermediate
+- Preferred timeframes: 1h, 4h, 1d
+- Risk tolerance: Medium
+- Focus: Swing trading with technical indicators
+- Favorite indicators: RSI, MACD, Bollinger Bands
+- Active pairs: BTC/USDT, ETH/USDT, SOL/USDT
+```
+
+**Implementation Notes:**
+- Stored in user's database `user_preferences` table
+- Updated via preference management tools
+- Includes inferred data from usage patterns
+
+---
+
+#### 2. `context://conversation-summary`
+**Purpose:** Semantic summary of recent conversation with RAG-enhanced context
+
+**MIME Type:** `text/plain`
+
+**Example Content:**
+```
+Recent Conversation Summary:
+
+Last 10 messages (summarized):
+- User asked about moving average crossover strategies
+- Discussed backtesting parameters for BTC/USDT
+- Reviewed risk management with 2% position sizing
+- Explored adding RSI filter to reduce false signals
+
+Relevant past discussions (RAG search):
+- 2 weeks ago: Similar strategy development on ETH/USDT
+- 1 month ago: User prefers simple strategies over complex ones
+- Past preference: Avoid strategies with >5 indicators
+
+Current focus: Optimizing MA crossover with momentum filter
+```
+
+**Implementation Notes:**
+- Last N messages stored in `conversation_history` table
+- RAG search against embeddings of past conversations
+- Semantic search using user's current message as query
+- ChromaDB/pgvector for embedding storage
+- Summary generated on-demand (can be cached for 1-5 minutes)
+
+**RAG Integration:**
+```python
+async def get_conversation_summary() -> str:
+ # Get recent messages
+ recent = await db.get_recent_messages(limit=50)
+
+ # Semantic search for relevant context
+ relevant = await rag.search_conversation_history(
+ query=recent[-1].content, # Last user message
+ limit=5,
+ min_score=0.7
+ )
+
+ # Build summary
+ return build_summary(recent[-10:], relevant)
+```
+
+---
+
+#### 3. `context://workspace-state`
+**Purpose:** Current trading workspace (chart, positions, watchlist)
+
+**MIME Type:** `application/json`
+
+**Example Content:**
+```json
+{
+ "currentChart": {
+ "ticker": "BINANCE:BTC/USDT",
+ "timeframe": "1h",
+ "indicators": ["SMA(20)", "RSI(14)", "MACD(12,26,9)"]
+ },
+ "watchlist": ["BTC/USDT", "ETH/USDT", "SOL/USDT"],
+ "openPositions": [
+ {
+ "ticker": "BTC/USDT",
+ "side": "long",
+ "size": 0.1,
+ "entryPrice": 45000,
+ "currentPrice": 46500,
+ "unrealizedPnL": 150
+ }
+ ],
+ "recentAlerts": [
+ {
+ "type": "price_alert",
+ "message": "BTC/USDT crossed above $46,000",
+ "timestamp": "2025-01-15T10:30:00Z"
+ }
+ ]
+}
+```
+
+**Implementation Notes:**
+- Synced from web client chart state
+- Updated via WebSocket sync protocol
+- Includes active indicators on current chart
+- Position data from trading system
+
+---
+
+#### 4. `context://system-prompt`
+**Purpose:** User's custom instructions and preferences for AI behavior
+
+**MIME Type:** `text/plain`
+
+**Example Content:**
+```
+Custom Instructions:
+- Be concise and data-driven
+- Always show risk/reward ratios
+- Prefer simple strategies over complex ones
+- When suggesting trades, include stop-loss and take-profit levels
+- Explain your reasoning in trading decisions
+```
+
+**Implementation Notes:**
+- User-editable in preferences UI
+- Appended **last** to system prompt (highest priority)
+- Can override platform defaults
+- Stored in `user_preferences.custom_prompt` field
+
+---
+
+## MCP Tools for Actions
+
+Tools are for **actions** that have side effects. These are **not** used for context fetching.
+
+### Conversation Management
+- `save_message(role, content, timestamp)` - Save message to history
+- `search_conversation(query, limit)` - Explicit semantic search (for user queries like "what did we discuss about BTC?")
+
+### Strategy & Indicators
+- `list_strategies()` - List user's strategies
+- `read_strategy(name)` - Get strategy code
+- `write_strategy(name, code)` - Save strategy
+- `run_backtest(strategy, params)` - Execute backtest
+
+### Trading
+- `get_watchlist()` - Get watchlist (action that may trigger sync)
+- `execute_trade(params)` - Execute trade order
+- `get_positions()` - Fetch current positions from exchange
+
+### Sandbox
+- `run_python(code)` - Execute Python code with data science libraries
+
+---
+
+## Gateway Harness Flow
+
+```typescript
+// gateway/src/harness/agent-harness.ts
+
+async handleMessage(message: InboundMessage): Promise<Message> {
+ // 1. Fetch context resources from user's MCP
+ const contextResources = await fetchContextResources([
+ 'context://user-profile',
+ 'context://conversation-summary', // <-- RAG happens here
+ 'context://workspace-state',
+ 'context://system-prompt',
+ ]);
+
+ // 2. Build system prompt from resources
+ const systemPrompt = buildSystemPrompt(contextResources);
+
+ // 3. Build messages with embedded conversation context
+ const messages = buildMessages(message, contextResources);
+
+ // 4. Get tools from MCP
+ const tools = await mcpClient.listTools();
+
+ // 5. Call Claude with embedded context
+ const response = await anthropic.messages.create({
+ model: 'claude-3-5-sonnet-20241022',
+ system: systemPrompt, // <-- User profile + workspace + custom prompt
+ messages, // <-- Conversation summary from RAG
+ tools,
+ });
+
+ // 6. Save to user's MCP (tool call)
+ await mcpClient.callTool('save_message', { role: 'user', content: message.content });
+ await mcpClient.callTool('save_message', { role: 'assistant', content: response });
+
+ return response;
+}
+```
+
+---
+
+## User MCP Server Implementation (Python)
+
+### Resource Handler
+
+```python
+# user-mcp/src/resources.py
+
+from mcp.server import Server
+from mcp.types import Resource, ResourceTemplate
+import asyncpg
+
+server = Server("dexorder-user")
+
+@server.list_resources()
+async def list_resources() -> list[Resource]:
+ return [
+ Resource(
+ uri="context://user-profile",
+ name="User Profile",
+ description="Trading style, preferences, and background",
+ mimeType="text/plain",
+ ),
+ Resource(
+ uri="context://conversation-summary",
+ name="Conversation Summary",
+ description="Recent conversation with RAG-enhanced context",
+ mimeType="text/plain",
+ ),
+ Resource(
+ uri="context://workspace-state",
+ name="Workspace State",
+ description="Current chart, watchlist, positions",
+ mimeType="application/json",
+ ),
+ Resource(
+ uri="context://system-prompt",
+ name="Custom System Prompt",
+ description="User's custom AI instructions",
+ mimeType="text/plain",
+ ),
+ ]
+
+@server.read_resource()
+async def read_resource(uri: str) -> str:
+ if uri == "context://user-profile":
+ return await build_user_profile()
+ elif uri == "context://conversation-summary":
+ return await build_conversation_summary()
+ elif uri == "context://workspace-state":
+ return await build_workspace_state()
+ elif uri == "context://system-prompt":
+ return await get_custom_prompt()
+ else:
+ raise ValueError(f"Unknown resource: {uri}")
+```
+
+### RAG Integration
+
+```python
+# user-mcp/src/rag.py
+
+import chromadb
+from sentence_transformers import SentenceTransformer
+
+class ConversationRAG:
+ def __init__(self, db_path: str):
+ self.chroma = chromadb.PersistentClient(path=db_path)
+ self.collection = self.chroma.get_or_create_collection("conversations")
+ self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+ async def search_conversation_history(
+ self,
+ query: str,
+ limit: int = 5,
+ min_score: float = 0.7
+ ) -> list[dict]:
+ """Semantic search over conversation history"""
+ # Embed query
+ query_embedding = self.embedder.encode(query).tolist()
+
+ # Search
+ results = self.collection.query(
+ query_embeddings=[query_embedding],
+ n_results=limit,
+ )
+
+        # Chroma returns distances (lower = closer); convert to similarity
+        relevant = []
+        for i, distance in enumerate(results['distances'][0]):
+            if 1.0 - distance >= min_score:
+                relevant.append({
+                    'content': results['documents'][0][i],
+                    'metadata': results['metadatas'][0][i],
+                    'score': 1.0 - distance,
+                })
+
+ return relevant
+
+ async def add_message(self, message_id: str, role: str, content: str, metadata: dict):
+ """Add message to RAG index"""
+ embedding = self.embedder.encode(content).tolist()
+
+ self.collection.add(
+ ids=[message_id],
+ embeddings=[embedding],
+ documents=[content],
+ metadatas=[{
+ 'role': role,
+ 'timestamp': metadata.get('timestamp'),
+ **metadata
+ }]
+ )
+```
+
+### Conversation Summary Builder
+
+```python
+# user-mcp/src/context.py
+
+async def build_conversation_summary(user_id: str) -> str:
+ """Build conversation summary with RAG"""
+ # 1. Get recent messages
+ recent_messages = await db.get_messages(
+ user_id=user_id,
+ limit=50,
+ order='desc'
+ )
+
+ # 2. Get current focus (last user message)
+ last_user_msg = next(
+ (m for m in recent_messages if m.role == 'user'),
+ None
+ )
+
+ if not last_user_msg:
+ return "No recent conversation history."
+
+ # 3. RAG search for relevant context
+ rag = ConversationRAG(f"/data/users/{user_id}/rag")
+ relevant_context = await rag.search_conversation_history(
+ query=last_user_msg.content,
+ limit=5,
+ min_score=0.7
+ )
+
+ # 4. Build summary
+    summary = "Recent Conversation Summary:\n\n"
+
+ # Recent messages (last 10)
+ summary += "Last 10 messages:\n"
+    for msg in reversed(recent_messages[:10]):
+ summary += f"- {msg.role}: {msg.content[:100]}...\n"
+
+ # Relevant past context
+ if relevant_context:
+ summary += "\nRelevant past discussions (RAG):\n"
+ for ctx in relevant_context:
+ timestamp = ctx['metadata'].get('timestamp', 'unknown')
+ summary += f"- [{timestamp}] {ctx['content'][:150]}...\n"
+
+ # Inferred focus
+ summary += f"\nCurrent focus: {infer_topic(last_user_msg.content)}\n"
+
+ return summary
+
+def infer_topic(message: str) -> str:
+ """Simple topic extraction"""
+ keywords = {
+ 'strategy': ['strategy', 'backtest', 'trading system'],
+ 'indicator': ['indicator', 'rsi', 'macd', 'moving average'],
+ 'analysis': ['analyze', 'chart', 'price action'],
+ 'risk': ['risk', 'position size', 'stop loss'],
+ }
+
+ message_lower = message.lower()
+ for topic, words in keywords.items():
+ if any(word in message_lower for word in words):
+ return topic
+
+ return 'general trading discussion'
+```
+
+---
+
+## Benefits of This Architecture
+
+1. **Privacy**: Conversation history never leaves user's container
+2. **Customization**: Each user controls their RAG, embeddings, prompt engineering
+3. **Scalability**: Platform harness is stateless - horizontally scalable
+4. **Cost Control**: Platform pays for Claude, users pay for their compute/storage
+5. **Portability**: Users can export/migrate their entire context
+6. **Development**: Users can test prompts/context locally without platform involvement
+
+---
+
+## Future Enhancements
+
+### Dynamic Resource URIs
+
+Support parameterized resources:
+```
+context://conversation/{session_id}
+context://strategy/{strategy_name}
+context://backtest/{backtest_id}/results
+```
+
+### Resource Templates
+
+MCP supports resource templates for dynamic discovery:
+```python
+@server.list_resource_templates()
+async def list_templates() -> list[ResourceTemplate]:
+ return [
+ ResourceTemplate(
+ uriTemplate="context://strategy/{name}",
+ name="Strategy Context",
+ description="Context for specific strategy",
+ )
+ ]
+```
+
+### Streaming Resources
+
+For large context (e.g., full backtest results), support streaming:
+```python
+@server.read_resource()
+async def read_resource(uri: str) -> AsyncIterator[str]:
+ if uri.startswith("context://backtest/"):
+ async for chunk in stream_backtest_results(uri):
+ yield chunk
+```
+
+---
+
+## Migration Path
+
+For users with existing conversation history in platform DB:
+
+1. **Export script**: Migrate platform history → user container DB
+2. **RAG indexing**: Embed all historical messages into ChromaDB
+3. **Preference migration**: Copy user preferences to container
+4. **Cutover**: Switch to resource-based context fetching
+
+Platform can keep read-only archive for compliance, but active context lives in user container.
diff --git a/gateway/.dockerignore b/gateway/.dockerignore
new file mode 100644
index 0000000..fdf9854
--- /dev/null
+++ b/gateway/.dockerignore
@@ -0,0 +1,9 @@
+node_modules
+dist
+.env
+.env.*
+!.env.example
+*.log
+.git
+.gitignore
+README.md
diff --git a/gateway/.env.example b/gateway/.env.example
new file mode 100644
index 0000000..7db60eb
--- /dev/null
+++ b/gateway/.env.example
@@ -0,0 +1,39 @@
+# Server configuration
+PORT=3000
+HOST=0.0.0.0
+LOG_LEVEL=info
+CORS_ORIGIN=*
+
+# Database
+DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder
+
+# LLM Provider API Keys (configure at least one)
+# Anthropic Claude
+ANTHROPIC_API_KEY=sk-ant-xxxxx
+
+# OpenAI GPT
+OPENAI_API_KEY=sk-xxxxx
+
+# Google Gemini
+GOOGLE_API_KEY=xxxxx
+
+# OpenRouter (access to 300+ models with one key)
+OPENROUTER_API_KEY=sk-or-xxxxx
+
+# Default model (if user has no preference)
+DEFAULT_MODEL_PROVIDER=anthropic
+DEFAULT_MODEL=claude-3-5-sonnet-20241022
+
+# Telegram (optional)
+TELEGRAM_BOT_TOKEN=
+
+# Kubernetes configuration
+KUBERNETES_NAMESPACE=dexorder-agents
+KUBERNETES_IN_CLUSTER=false
+KUBERNETES_CONTEXT=minikube
+AGENT_IMAGE=ghcr.io/dexorder/agent:latest
+SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
+AGENT_STORAGE_CLASS=standard
+
+# Redis (for session management - future)
+# REDIS_URL=redis://localhost:6379
diff --git a/gateway/.gitignore b/gateway/.gitignore
new file mode 100644
index 0000000..3a5c1d0
--- /dev/null
+++ b/gateway/.gitignore
@@ -0,0 +1,6 @@
+node_modules
+dist
+.env
+.env.local
+*.log
+.DS_Store
diff --git a/gateway/ARCHITECTURE.md b/gateway/ARCHITECTURE.md
new file mode 100644
index 0000000..1a7a77b
--- /dev/null
+++ b/gateway/ARCHITECTURE.md
@@ -0,0 +1,313 @@
+# Gateway Architecture: LangChain.js + LangGraph
+
+## Why LangChain.js (Not Vercel AI SDK or Direct Anthropic SDK)?
+
+### The Decision
+
+After evaluating Vercel AI SDK and LangChain.js, we chose **LangChain.js + LangGraph** for these reasons:
+
+1. **Multi-model support**: 300+ models via OpenRouter, plus direct integrations
+2. **Complex workflows**: LangGraph for stateful trading analysis pipelines
+3. **No vendor lock-in**: Switch between Anthropic, OpenAI, Google with one line
+4. **Streaming**: Same as Vercel AI SDK (`.stream()` method)
+5. **Tool calling**: Unified across all providers
+6. **Trading-specific**: State management, conditional branching, human-in-the-loop
+
+**We don't need Vercel AI SDK because:**
+- ❌ We use Vue (not React) - don't need React hooks
+- ❌ We have Node.js servers (not edge) - don't need edge runtime
+- ✅ **DO need** complex workflows (strategy analysis, backtesting, approvals)
+- ✅ **DO need** stateful execution (resume from failures)
+
+---
+
+## Architecture Layers
+
+### Layer 1: Model Abstraction (`src/llm/`)
+
+**Provider Factory** (`provider.ts`)
+```typescript
+const factory = new LLMProviderFactory(config, logger);
+
+// Create any model
+const claude = factory.createModel({
+ provider: 'anthropic',
+ model: 'claude-3-5-sonnet-20241022',
+});
+
+const gpt4 = factory.createModel({
+ provider: 'openai',
+ model: 'gpt-4o',
+});
+```
+
+**Model Router** (`router.ts`)
+```typescript
+const router = new ModelRouter(factory, logger);
+
+// Intelligently route based on:
+// - User license (free → Gemini Flash, pro → GPT-4, enterprise → Claude)
+// - Query complexity (simple → cheap, complex → smart)
+// - User preference (if set in license.preferredModel)
+// - Cost optimization (always use cheapest)
+
+const model = await router.route(
+ message.content,
+ userLicense,
+ RoutingStrategy.COMPLEXITY
+);
+```
+
+---
+
+### Layer 2: Agent Harness (`src/harness/`)
+
+**Stateless Orchestrator**
+
+The harness has **ZERO conversation state**. Everything lives in user's MCP container.
+
+**Flow:**
+```typescript
+async handleMessage(message: InboundMessage) {
+ // 1. Fetch context from user's MCP (resources, not tools)
+ const resources = await mcpClient.listResources();
+ const context = await Promise.all([
+ mcpClient.readResource('context://user-profile'), // Trading style
+ mcpClient.readResource('context://conversation-summary'), // RAG summary
+ mcpClient.readResource('context://workspace-state'), // Current chart
+ mcpClient.readResource('context://system-prompt'), // Custom instructions
+ ]);
+
+ // 2. Route to appropriate model
+ const model = await modelRouter.route(message, license);
+
+ // 3. Build messages with embedded context
+ const messages = buildLangChainMessages(systemPrompt, context);
+
+ // 4. Call LLM
+ const response = await model.invoke(messages);
+
+ // 5. Save to user's MCP (tool call)
+ await mcpClient.callTool('save_message', { role: 'user', content: message });
+ await mcpClient.callTool('save_message', { role: 'assistant', content: response });
+
+ return response;
+}
+```
+
+**Streaming variant:**
+```typescript
+async *streamMessage(message: InboundMessage) {
+ const model = await modelRouter.route(message, license);
+ const messages = buildMessages(context, message);
+
+ const stream = await model.stream(messages);
+
+ let fullResponse = '';
+ for await (const chunk of stream) {
+ fullResponse += chunk.content;
+ yield chunk.content; // Stream to WebSocket/Telegram
+ }
+
+ // Save after streaming completes
+ await mcpClient.callTool('save_message', { /* ... */ });
+}
+```
+
+---
+
+### Layer 3: Workflows (`src/workflows/`)
+
+**LangGraph for Complex Trading Analysis**
+
+```typescript
+// Example: Strategy Analysis Pipeline
+const workflow = new StateGraph(StrategyAnalysisState)
+ .addNode('code_review', async (state) => {
+ const model = new ChatAnthropic({ model: 'claude-3-opus' });
+ const review = await model.invoke(`Review: ${state.strategyCode}`);
+ return { codeReview: review.content };
+ })
+ .addNode('backtest', async (state) => {
+ // Call user's MCP backtest tool
+ const results = await mcpClient.callTool('run_backtest', {
+ strategy: state.strategyCode,
+ ticker: state.ticker,
+ });
+ return { backtestResults: results };
+ })
+ .addNode('risk_assessment', async (state) => {
+ const model = new ChatAnthropic({ model: 'claude-3-5-sonnet' });
+ const assessment = await model.invoke(
+ `Analyze risk: ${JSON.stringify(state.backtestResults)}`
+ );
+ return { riskAssessment: assessment.content };
+ })
+ .addNode('human_approval', async (state) => {
+ // Pause for user review (human-in-the-loop)
+ return { humanApproved: await waitForUserApproval(state) };
+ })
+ .addConditionalEdges('human_approval', (state) => {
+ return state.humanApproved ? 'deploy' : 'reject';
+ })
+ .compile();
+
+// Execute
+const result = await workflow.invoke({
+ strategyCode: userCode,
+ ticker: 'BTC/USDT',
+ timeframe: '1h',
+});
+```
+
+**Benefits:**
+- **Stateful**: Resume if server crashes mid-analysis
+- **Conditional**: Route based on results (if Sharpe > 2 → deploy, else → reject)
+- **Human-in-the-loop**: Pause for user approval
+- **Multi-step**: Each node can use different models
+
+---
+
+## User Context Architecture
+
+### MCP Resources (Not Tools)
+
+**User's MCP server exposes resources** (read-only context):
+
+```
+context://user-profile → Trading style, preferences
+context://conversation-summary → RAG-generated summary
+context://workspace-state → Current chart, positions
+context://system-prompt → User's custom AI instructions
+```
+
+**Gateway fetches and embeds in LLM call:**
+```typescript
+const userProfile = await mcpClient.readResource('context://user-profile');
+const conversationSummary = await mcpClient.readResource('context://conversation-summary');
+
+// User's MCP server runs RAG search and returns summary
+// Gateway embeds this in Claude/GPT prompt
+```
+
+**Why resources, not tools?**
+- Resources = context injection (read-only)
+- Tools = actions (write operations)
+- Context should be fetched **before** LLM call, not during
+
+---
+
+## Model Routing Strategies
+
+### 1. User Preference
+```typescript
+// User's license has preferred model
+{
+ "preferredModel": {
+ "provider": "anthropic",
+ "model": "claude-3-5-sonnet-20241022"
+ }
+}
+
+// Router uses this if set
+```
+
+### 2. Complexity-Based
+```typescript
+const isComplex = message.includes('backtest') || message.length > 200;
+
+if (isComplex) {
+ return { provider: 'anthropic', model: 'claude-3-opus' }; // Smart
+} else {
+ return { provider: 'openai', model: 'gpt-4o-mini' }; // Fast
+}
+```
+
+### 3. License Tier
+```typescript
+switch (license.licenseType) {
+ case 'free':
+ return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Cheap
+ case 'pro':
+ return { provider: 'openai', model: 'gpt-4o' }; // Balanced
+ case 'enterprise':
+ return { provider: 'anthropic', model: 'claude-3-5-sonnet' }; // Premium
+}
+```
+
+### 4. Cost-Optimized
+```typescript
+return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Always cheapest
+```
+
+---
+
+## When to Use What
+
+### Simple Chat → Agent Harness
+```typescript
+// User: "What's the RSI on BTC?"
+// → Fast streaming response via harness.streamMessage()
+```
+
+### Complex Analysis → LangGraph Workflow
+```typescript
+// User: "Analyze this strategy and backtest it"
+// → Multi-step workflow: code review → backtest → risk → approval
+```
+
+### Direct Tool Call → MCP Client
+```typescript
+// User: "Get my watchlist"
+// → Direct MCP tool call, no LLM needed
+```
+
+---
+
+## Data Flow
+
+```
+User Message ("Analyze my strategy")
+ ↓
+Gateway → Route to workflow (not harness)
+ ↓
+LangGraph Workflow:
+ ├─ Node 1: Code Review (Claude Opus)
+ │ └─ Analyzes strategy code
+ ├─ Node 2: Backtest (MCP tool call)
+ │ └─ User's container runs backtest
+ ├─ Node 3: Risk Assessment (Claude Sonnet)
+ │ └─ Evaluates results
+ ├─ Node 4: Human Approval (pause)
+ │ └─ User reviews in UI
+ └─ Node 5: Recommendation (GPT-4o-mini)
+ └─ Final decision
+
+Result → Return to user
+```
+
+---
+
+## Benefits Summary
+
+| Feature | LangChain.js | Vercel AI SDK | Direct Anthropic SDK |
+|---------|--------------|---------------|----------------------|
+| Multi-model | ✅ 300+ models | ✅ 100+ models | ❌ Anthropic only |
+| Streaming | ✅ `.stream()` | ✅ `streamText()` | ✅ `.stream()` |
+| Tool calling | ✅ Unified | ✅ Unified | ✅ Anthropic format |
+| Complex workflows | ✅ LangGraph | ❌ Limited | ❌ DIY |
+| Stateful agents | ✅ LangGraph | ❌ No | ❌ No |
+| Human-in-the-loop | ✅ LangGraph | ❌ No | ❌ No |
+| React hooks | ❌ N/A | ✅ `useChat()` | ❌ N/A |
+| Bundle size | Large (101kb) | Small (30kb) | Medium (60kb) |
+| **Dexorder needs** | **✅ Perfect fit** | **❌ Missing workflows** | **❌ Vendor lock-in** |
+
+---
+
+## Next Steps
+
+1. **Implement tool calling** in agent harness (bind MCP tools to LangChain)
+2. **Add state persistence** for LangGraph (PostgreSQL checkpointer)
+3. **Build more workflows**: market scanner, portfolio optimizer
+4. **Add monitoring**: Track model usage, costs, latency
+5. **User container**: Implement Python MCP server with resources
diff --git a/gateway/Dockerfile b/gateway/Dockerfile
new file mode 100644
index 0000000..4b7a63f
--- /dev/null
+++ b/gateway/Dockerfile
@@ -0,0 +1,40 @@
+FROM node:22-alpine AS builder
+
+WORKDIR /app
+
+# Copy package files
+COPY package*.json ./
+COPY tsconfig.json ./
+
+# Install dependencies
+RUN npm ci
+
+# Copy source
+COPY src ./src
+
+# Build
+RUN npm run build
+
+# Production image
+FROM node:22-alpine
+
+WORKDIR /app
+
+# Copy package files
+COPY package*.json ./
+
+# Install production dependencies only
+RUN npm ci --omit=dev
+
+# Copy built application
+COPY --from=builder /app/dist ./dist
+
+# Create non-root user
+RUN addgroup -g 1001 -S nodejs && \
+ adduser -S nodejs -u 1001
+
+USER nodejs
+
+EXPOSE 3000
+
+CMD ["node", "dist/main.js"]
diff --git a/gateway/README.md b/gateway/README.md
new file mode 100644
index 0000000..15c20c6
--- /dev/null
+++ b/gateway/README.md
@@ -0,0 +1,212 @@
+# Dexorder Gateway
+
+Multi-channel gateway with agent harness for the Dexorder AI platform.
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────────┐
+│ Platform Gateway │
+│ (Node.js/Fastify) │
+│ │
+│ ┌────────────────────────────────────────────────┐ │
+│ │ Channels │ │
+│ │ - WebSocket (/ws/chat) │ │
+│ │ - Telegram Webhook (/webhook/telegram) │ │
+│ └────────────────────────────────────────────────┘ │
+│ ↕ │
+│ ┌────────────────────────────────────────────────┐ │
+│ │ Authenticator │ │
+│ │ - JWT verification (WebSocket) │ │
+│ │ - Channel linking (Telegram) │ │
+│ │ - User license lookup (PostgreSQL) │ │
+│ └────────────────────────────────────────────────┘ │
+│ ↕ │
+│ ┌────────────────────────────────────────────────┐ │
+│ │ Agent Harness (per-session) │ │
+│ │ - Claude API integration │ │
+│ │ - MCP client connector │ │
+│ │ - Conversation state │ │
+│ └────────────────────────────────────────────────┘ │
+│ ↕ │
+│ ┌────────────────────────────────────────────────┐ │
+│ │ MCP Client │ │
+│ │ - User container connection │ │
+│ │ - Tool routing │ │
+│ └────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────┘
+ ↕
+          ┌───────────────────────────────┐
+          │  User MCP Server (Python)     │
+          │  - Strategies, indicators     │
+          │  - Memory, preferences       │
+          │  - Backtest sandbox           │
+          └───────────────────────────────┘
+```
+
+## Features
+
+- **Automatic container provisioning**: Creates user agent containers on-demand via Kubernetes
+- **Multi-channel support**: WebSocket and Telegram webhooks
+- **Per-channel authentication**: JWT for web, channel linking for chat apps
+- **User license management**: Feature flags and resource limits from PostgreSQL
+- **Container lifecycle management**: Auto-shutdown on idle (handled by container sidecar)
+- **License-based resources**: Different memory/CPU/storage limits per tier
+- **Multi-model LLM support**: Anthropic Claude, OpenAI GPT, Google Gemini, OpenRouter (300+ models)
+- **Zero vendor lock-in**: Switch models with one line, powered by LangChain.js
+- **Intelligent routing**: Auto-select models based on complexity, license tier, or user preference
+- **Streaming responses**: Real-time chat with WebSocket and Telegram
+- **Complex workflows**: LangGraph for stateful trading analysis (backtest → risk → approval)
+- **Agent harness**: Stateless orchestrator (all context lives in user's MCP container)
+- **MCP resource integration**: User's RAG, conversation history, and preferences
+
+## Container Management
+
+When a user authenticates, the gateway:
+
+1. **Checks for existing container**: Queries Kubernetes for deployment
+2. **Creates if missing**: Renders YAML template based on license tier
+3. **Waits for ready**: Polls deployment status until healthy
+4. **Returns MCP endpoint**: Computed from service name
+5. **Connects to MCP server**: Proceeds with normal authentication flow
+
+Container templates by license tier:
+
+| Tier | Memory | CPU | Storage | Idle Timeout |
+|------|--------|-----|---------|--------------|
+| Free | 512Mi | 500m | 1Gi | 15min |
+| Pro | 2Gi | 2000m | 10Gi | 60min |
+| Enterprise | 4Gi | 4000m | 50Gi | Never |
+
+Containers self-manage their lifecycle using the lifecycle sidecar (see `../lifecycle-sidecar/`)
+
+## Setup
+
+### Prerequisites
+
+- Node.js >= 22.0.0
+- PostgreSQL database
+- At least one LLM provider API key:
+ - Anthropic Claude
+ - OpenAI GPT
+ - Google Gemini
+ - OpenRouter (one key for 300+ models)
+
+### Development
+
+1. Install dependencies:
+```bash
+npm install
+```
+
+2. Copy environment template:
+```bash
+cp .env.example .env
+```
+
+3. Configure `.env` (see `.env.example`):
+```bash
+DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder
+
+# Configure at least one provider
+ANTHROPIC_API_KEY=sk-ant-xxxxx
+# OPENAI_API_KEY=sk-xxxxx
+# GOOGLE_API_KEY=xxxxx
+# OPENROUTER_API_KEY=sk-or-xxxxx
+
+# Optional: Set default model
+DEFAULT_MODEL_PROVIDER=anthropic
+DEFAULT_MODEL=claude-3-5-sonnet-20241022
+```
+
+4. Run development server:
+```bash
+npm run dev
+```
+
+### Production Build
+
+```bash
+npm run build
+npm start
+```
+
+### Docker
+
+```bash
+docker build -t dexorder/gateway:latest .
+docker run -p 3000:3000 --env-file .env dexorder/gateway:latest
+```
+
+## Database Schema
+
+Required PostgreSQL tables (full definitions in `schema.sql`):
+
+### `user_licenses`
+- `user_id` (text, primary key)
+- `email` (text)
+- `license_type` (text: 'free', 'pro', 'enterprise')
+- `features` (jsonb)
+- `resource_limits` (jsonb)
+- `mcp_server_url` (text)
+- `expires_at` (timestamp, nullable)
+- `created_at` (timestamp)
+- `updated_at` (timestamp)
+
+### `user_channel_links`
+- `id` (serial, primary key)
+- `user_id` (text, foreign key)
+- `channel_type` (text: 'telegram', 'slack', 'discord')
+- `channel_user_id` (text)
+- `created_at` (timestamp)
+
+## API Endpoints
+
+### WebSocket
+
+**`GET /ws/chat`**
+- WebSocket connection for web client
+- Auth: Bearer token in headers
+- Protocol: JSON messages
+
+Example:
+```javascript
+const ws = new WebSocket('ws://localhost:3000/ws/chat', {
+ headers: {
+ 'Authorization': 'Bearer your-jwt-token'
+ }
+});
+
+ws.on('message', (data) => {
+ const msg = JSON.parse(data);
+ console.log(msg);
+});
+
+ws.send(JSON.stringify({
+ type: 'message',
+ content: 'Hello, AI!'
+}));
+```
+
+### Telegram Webhook
+
+**`POST /webhook/telegram`**
+- Telegram bot webhook endpoint
+- Auth: Telegram user linked to platform user
+- Automatically processes incoming messages
+
+### Health Check
+
+**`GET /health`**
+- Returns server health status
+
+## TODO
+
+- [ ] Implement JWT verification with JWKS
+- [ ] Implement MCP HTTP/SSE transport
+- [ ] Add Redis for session persistence
+- [ ] Add rate limiting per user license
+- [ ] Add message usage tracking
+- [ ] Add streaming responses for WebSocket
+- [ ] Add Slack and Discord channel handlers
+- [ ] Add session cleanup/timeout logic
diff --git a/gateway/package.json b/gateway/package.json
new file mode 100644
index 0000000..5655783
--- /dev/null
+++ b/gateway/package.json
@@ -0,0 +1,42 @@
+{
+ "name": "@dexorder/gateway",
+ "version": "0.1.0",
+ "type": "module",
+ "private": true,
+ "description": "Multi-channel gateway with agent harness for Dexorder AI platform",
+ "scripts": {
+ "dev": "tsx watch src/main.ts",
+ "build": "tsc",
+ "start": "node dist/main.js",
+ "typecheck": "tsc --noEmit"
+ },
+ "dependencies": {
+ "@fastify/cors": "^10.0.1",
+ "@fastify/websocket": "^11.0.1",
+ "@kubernetes/client-node": "^0.21.0",
+ "@langchain/anthropic": "^0.3.8",
+ "@langchain/core": "^0.3.24",
+ "@langchain/google-genai": "^0.1.6",
+ "@langchain/langgraph": "^0.2.26",
+ "@langchain/openai": "^0.3.21",
+ "@langchain/openrouter": "^0.1.2",
+ "@modelcontextprotocol/sdk": "^1.0.4",
+ "fastify": "^5.2.0",
+ "ioredis": "^5.4.2",
+ "js-yaml": "^4.1.0",
+ "pg": "^8.13.1",
+ "pino": "^9.6.0",
+ "pino-pretty": "^13.0.0",
+ "zod": "^3.24.1"
+ },
+ "devDependencies": {
+ "@types/js-yaml": "^4.0.9",
+ "@types/node": "^22.10.2",
+ "@types/pg": "^8.11.10",
+ "tsx": "^4.19.2",
+ "typescript": "^5.7.2"
+ },
+ "engines": {
+ "node": ">=22.0.0"
+ }
+}
diff --git a/gateway/schema.sql b/gateway/schema.sql
new file mode 100644
index 0000000..d1ae4aa
--- /dev/null
+++ b/gateway/schema.sql
@@ -0,0 +1,79 @@
+-- User license and authorization schema
+
+CREATE TABLE IF NOT EXISTS user_licenses (
+ user_id TEXT PRIMARY KEY,
+ email TEXT,
+ license_type TEXT NOT NULL CHECK (license_type IN ('free', 'pro', 'enterprise')),
+ features JSONB NOT NULL DEFAULT '{
+ "maxIndicators": 5,
+ "maxStrategies": 3,
+ "maxBacktestDays": 30,
+ "realtimeData": false,
+ "customExecutors": false,
+ "apiAccess": false
+ }',
+ resource_limits JSONB NOT NULL DEFAULT '{
+ "maxConcurrentSessions": 1,
+ "maxMessagesPerDay": 100,
+ "maxTokensPerMessage": 4096,
+ "rateLimitPerMinute": 10
+ }',
+ mcp_server_url TEXT NOT NULL,
+ preferred_model JSONB DEFAULT NULL,
+ expires_at TIMESTAMP WITH TIME ZONE,
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+ updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
+);
+
+COMMENT ON COLUMN user_licenses.preferred_model IS 'Optional model preference: {"provider": "anthropic", "model": "claude-3-5-sonnet-20241022", "temperature": 0.7}';
+
+CREATE INDEX idx_user_licenses_expires_at ON user_licenses(expires_at)
+ WHERE expires_at IS NOT NULL;
+
+-- Channel linking for multi-channel support
+CREATE TABLE IF NOT EXISTS user_channel_links (
+ id SERIAL PRIMARY KEY,
+ user_id TEXT NOT NULL REFERENCES user_licenses(user_id) ON DELETE CASCADE,
+ channel_type TEXT NOT NULL CHECK (channel_type IN ('telegram', 'slack', 'discord', 'websocket')),
+ channel_user_id TEXT NOT NULL,
+ metadata JSONB,
+ created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
+ UNIQUE(channel_type, channel_user_id)
+);
+
+CREATE INDEX idx_user_channel_links_user_id ON user_channel_links(user_id);
+CREATE INDEX idx_user_channel_links_channel ON user_channel_links(channel_type, channel_user_id);
+
+-- Example data for development
+INSERT INTO user_licenses (user_id, email, license_type, mcp_server_url, features, resource_limits, preferred_model)
+VALUES (
+ 'dev-user-001',
+ 'dev@example.com',
+ 'pro',
+ 'http://localhost:8080/mcp',
+ '{
+ "maxIndicators": 50,
+ "maxStrategies": 20,
+ "maxBacktestDays": 365,
+ "realtimeData": true,
+ "customExecutors": true,
+ "apiAccess": true
+ }',
+ '{
+ "maxConcurrentSessions": 5,
+ "maxMessagesPerDay": 1000,
+ "maxTokensPerMessage": 8192,
+ "rateLimitPerMinute": 60
+ }',
+ '{
+ "provider": "anthropic",
+ "model": "claude-3-5-sonnet-20241022",
+ "temperature": 0.7
+ }'
+)
+ON CONFLICT (user_id) DO NOTHING;
+
+-- Example Telegram link
+INSERT INTO user_channel_links (user_id, channel_type, channel_user_id)
+VALUES ('dev-user-001', 'telegram', '123456789')
+ON CONFLICT (channel_type, channel_user_id) DO NOTHING;
diff --git a/gateway/src/auth/authenticator.ts b/gateway/src/auth/authenticator.ts
new file mode 100644
index 0000000..3a7b23d
--- /dev/null
+++ b/gateway/src/auth/authenticator.ts
@@ -0,0 +1,146 @@
+import type { FastifyRequest, FastifyBaseLogger } from 'fastify';
+import { UserService } from '../db/user-service.js';
+import { ChannelType, type AuthContext } from '../types/user.js';
+import type { ContainerManager } from '../k8s/container-manager.js';
+
+export interface AuthenticatorConfig {
+ userService: UserService;
+ containerManager: ContainerManager;
+ logger: FastifyBaseLogger;
+}
+
+/**
+ * Multi-channel authenticator
+ * Handles authentication for WebSocket, Telegram, and other channels
+ */
+export class Authenticator {
+ private config: AuthenticatorConfig;
+
+ constructor(config: AuthenticatorConfig) {
+ this.config = config;
+ }
+
+ /**
+ * Authenticate WebSocket connection via JWT token
+ * Also ensures the user's container is running
+ */
+ async authenticateWebSocket(
+ request: FastifyRequest
+ ): Promise {
+ try {
+ const token = this.extractBearerToken(request);
+ if (!token) {
+ this.config.logger.warn('No bearer token in WebSocket connection');
+ return null;
+ }
+
+ const userId = await this.config.userService.verifyWebToken(token);
+ if (!userId) {
+ this.config.logger.warn('Invalid JWT token');
+ return null;
+ }
+
+ const license = await this.config.userService.getUserLicense(userId);
+ if (!license) {
+ this.config.logger.warn({ userId }, 'User license not found');
+ return null;
+ }
+
+ // Ensure container is running (may take time if creating new container)
+ this.config.logger.info({ userId }, 'Ensuring user container is running');
+ const { mcpEndpoint, wasCreated } = await this.config.containerManager.ensureContainerRunning(
+ userId,
+ license
+ );
+
+ this.config.logger.info(
+ { userId, mcpEndpoint, wasCreated },
+ 'Container is ready'
+ );
+
+ // Update license with actual MCP endpoint
+ license.mcpServerUrl = mcpEndpoint;
+
+ const sessionId = `ws_${userId}_${Date.now()}`;
+
+ return {
+ userId,
+ channelType: ChannelType.WEBSOCKET,
+ channelUserId: userId, // For WebSocket, same as userId
+ sessionId,
+ license,
+ authenticatedAt: new Date(),
+ };
+ } catch (error) {
+ this.config.logger.error({ error }, 'WebSocket authentication error');
+ return null;
+ }
+ }
+
+ /**
+ * Authenticate Telegram webhook
+ * Also ensures the user's container is running
+ */
+ async authenticateTelegram(telegramUserId: string): Promise {
+ try {
+ const userId = await this.config.userService.getUserIdFromChannel(
+ 'telegram',
+ telegramUserId
+ );
+
+ if (!userId) {
+ this.config.logger.warn(
+ { telegramUserId },
+ 'Telegram user not linked to platform user'
+ );
+ return null;
+ }
+
+ const license = await this.config.userService.getUserLicense(userId);
+ if (!license) {
+ this.config.logger.warn({ userId }, 'User license not found');
+ return null;
+ }
+
+ // Ensure container is running
+ this.config.logger.info({ userId }, 'Ensuring user container is running');
+ const { mcpEndpoint, wasCreated } = await this.config.containerManager.ensureContainerRunning(
+ userId,
+ license
+ );
+
+ this.config.logger.info(
+ { userId, mcpEndpoint, wasCreated },
+ 'Container is ready'
+ );
+
+ // Update license with actual MCP endpoint
+ license.mcpServerUrl = mcpEndpoint;
+
+ const sessionId = `tg_${telegramUserId}_${Date.now()}`;
+
+ return {
+ userId,
+ channelType: ChannelType.TELEGRAM,
+ channelUserId: telegramUserId,
+ sessionId,
+ license,
+ authenticatedAt: new Date(),
+ };
+ } catch (error) {
+ this.config.logger.error({ error }, 'Telegram authentication error');
+ return null;
+ }
+ }
+
+ /**
+ * Extract bearer token from request headers
+ */
+ private extractBearerToken(request: FastifyRequest): string | null {
+ const auth = request.headers.authorization;
+ if (!auth || !auth.startsWith('Bearer ')) {
+ return null;
+ }
+ return auth.substring(7);
+ }
+}
diff --git a/gateway/src/channels/telegram-handler.ts b/gateway/src/channels/telegram-handler.ts
new file mode 100644
index 0000000..8e00dd0
--- /dev/null
+++ b/gateway/src/channels/telegram-handler.ts
@@ -0,0 +1,163 @@
+import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
+import type { Authenticator } from '../auth/authenticator.js';
+import { AgentHarness } from '../harness/agent-harness.js';
+import type { InboundMessage } from '../types/messages.js';
+import { randomUUID } from 'crypto';
+
+import type { ProviderConfig } from '../llm/provider.js';
+
+export interface TelegramHandlerConfig {
+ authenticator: Authenticator;
+ providerConfig: ProviderConfig;
+ telegramBotToken: string;
+}
+
+interface TelegramUpdate {
+ update_id: number;
+ message?: {
+ message_id: number;
+ from: {
+ id: number;
+ first_name: string;
+ username?: string;
+ };
+ chat: {
+ id: number;
+ type: string;
+ };
+ text?: string;
+ photo?: Array<{
+ file_id: string;
+ file_size: number;
+ }>;
+ };
+}
+
+/**
+ * Telegram webhook handler
+ */
+export class TelegramHandler {
+ private config: TelegramHandlerConfig;
+ private sessions = new Map();
+
+ constructor(config: TelegramHandlerConfig) {
+ this.config = config;
+ }
+
+ /**
+ * Register Telegram webhook routes
+ */
+ register(app: FastifyInstance): void {
+ app.post('/webhook/telegram', async (request: FastifyRequest, reply: FastifyReply) => {
+ await this.handleWebhook(request, reply, app);
+ });
+ }
+
+ /**
+ * Handle Telegram webhook
+ */
+ private async handleWebhook(
+ request: FastifyRequest,
+ reply: FastifyReply,
+ app: FastifyInstance
+ ): Promise {
+ const logger = app.log;
+
+ try {
+ const update = request.body as TelegramUpdate;
+
+ if (!update.message?.text) {
+ // Ignore non-text messages for now
+ reply.code(200).send({ ok: true });
+ return;
+ }
+
+ const telegramUserId = update.message.from.id.toString();
+ const chatId = update.message.chat.id;
+ const text = update.message.text;
+
+ logger.info({ telegramUserId, chatId, text }, 'Received Telegram message');
+
+ // Authenticate
+ const authContext = await this.config.authenticator.authenticateTelegram(telegramUserId);
+ if (!authContext) {
+ logger.warn({ telegramUserId }, 'Telegram user not authenticated');
+ await this.sendTelegramMessage(
+ chatId,
+ 'Please link your Telegram account to Dexorder first.'
+ );
+ reply.code(200).send({ ok: true });
+ return;
+ }
+
+ // Get or create harness
+ let harness = this.sessions.get(authContext.sessionId);
+ if (!harness) {
+ harness = new AgentHarness({
+ userId: authContext.userId,
+ sessionId: authContext.sessionId,
+ license: authContext.license,
+ providerConfig: this.config.providerConfig,
+ logger,
+ });
+ await harness.initialize();
+ this.sessions.set(authContext.sessionId, harness);
+ }
+
+ // Process message
+ const inboundMessage: InboundMessage = {
+ messageId: randomUUID(),
+ userId: authContext.userId,
+ sessionId: authContext.sessionId,
+ content: text,
+ timestamp: new Date(),
+ };
+
+ const response = await harness.handleMessage(inboundMessage);
+
+ // Send response back to Telegram
+ await this.sendTelegramMessage(chatId, response.content);
+
+ reply.code(200).send({ ok: true });
+ } catch (error) {
+ logger.error({ error }, 'Error handling Telegram webhook');
+ reply.code(500).send({ ok: false, error: 'Internal server error' });
+ }
+ }
+
+ /**
+ * Send message to Telegram chat
+ */
+ private async sendTelegramMessage(chatId: number, text: string): Promise {
+ const url = `https://api.telegram.org/bot${this.config.telegramBotToken}/sendMessage`;
+
+ try {
+ const response = await fetch(url, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ chat_id: chatId,
+ text,
+ parse_mode: 'Markdown',
+ }),
+ });
+
+ if (!response.ok) {
+ throw new Error(`Telegram API error: ${response.statusText}`);
+ }
+ } catch (error) {
+ console.error('Failed to send Telegram message:', error);
+ throw error;
+ }
+ }
+
+ /**
+ * Cleanup old sessions (call periodically)
+ */
+ async cleanupSessions(maxAgeMs = 30 * 60 * 1000): Promise {
+ // TODO: Track session last activity and cleanup
+ // For now, sessions persist until server restart
+ }
+}
diff --git a/gateway/src/channels/websocket-handler.ts b/gateway/src/channels/websocket-handler.ts
new file mode 100644
index 0000000..3cbcd3e
--- /dev/null
+++ b/gateway/src/channels/websocket-handler.ts
@@ -0,0 +1,161 @@
+import type { FastifyInstance, FastifyRequest } from 'fastify';
+import type { WebSocket } from '@fastify/websocket';
+import type { Authenticator } from '../auth/authenticator.js';
+import { AgentHarness } from '../harness/agent-harness.js';
+import type { InboundMessage } from '../types/messages.js';
+import { randomUUID } from 'crypto';
+
+import type { ProviderConfig } from '../llm/provider.js';
+
+export interface WebSocketHandlerConfig {
+ authenticator: Authenticator;
+ providerConfig: ProviderConfig;
+}
+
+/**
+ * WebSocket channel handler
+ */
+export class WebSocketHandler {
+ private config: WebSocketHandlerConfig;
+ private sessions = new Map();
+
+ constructor(config: WebSocketHandlerConfig) {
+ this.config = config;
+ }
+
+ /**
+ * Register WebSocket routes
+ */
+ register(app: FastifyInstance): void {
+ app.get(
+ '/ws/chat',
+ { websocket: true },
+ async (socket: WebSocket, request: FastifyRequest) => {
+ await this.handleConnection(socket, request, app);
+ }
+ );
+ }
+
+ /**
+ * Handle WebSocket connection
+ */
+ private async handleConnection(
+ socket: WebSocket,
+ request: FastifyRequest,
+ app: FastifyInstance
+ ): Promise {
+ const logger = app.log;
+
+ // Send initial connecting message
+ socket.send(
+ JSON.stringify({
+ type: 'status',
+ status: 'authenticating',
+ message: 'Authenticating...',
+ })
+ );
+
+ // Authenticate (this may take time if creating container)
+ const authContext = await this.config.authenticator.authenticateWebSocket(request);
+ if (!authContext) {
+ logger.warn('WebSocket authentication failed');
+ socket.send(
+ JSON.stringify({
+ type: 'error',
+ message: 'Authentication failed',
+ })
+ );
+ socket.close(1008, 'Authentication failed');
+ return;
+ }
+
+ logger.info(
+ { userId: authContext.userId, sessionId: authContext.sessionId },
+ 'WebSocket connection authenticated'
+ );
+
+ // Send workspace starting message
+ socket.send(
+ JSON.stringify({
+ type: 'status',
+ status: 'initializing',
+ message: 'Starting your workspace...',
+ })
+ );
+
+ // Create agent harness
+ const harness = new AgentHarness({
+ userId: authContext.userId,
+ sessionId: authContext.sessionId,
+ license: authContext.license,
+ providerConfig: this.config.providerConfig,
+ logger,
+ });
+
+ try {
+ await harness.initialize();
+ this.sessions.set(authContext.sessionId, harness);
+
+ // Send connected message
+ socket.send(
+ JSON.stringify({
+ type: 'connected',
+ sessionId: authContext.sessionId,
+ userId: authContext.userId,
+ licenseType: authContext.license.licenseType,
+ message: 'Connected to Dexorder AI',
+ })
+ );
+
+ // Handle messages
+ socket.on('message', async (data: Buffer) => {
+ try {
+ const payload = JSON.parse(data.toString());
+
+ if (payload.type === 'message') {
+ const inboundMessage: InboundMessage = {
+ messageId: randomUUID(),
+ userId: authContext.userId,
+ sessionId: authContext.sessionId,
+ content: payload.content,
+ attachments: payload.attachments,
+ timestamp: new Date(),
+ };
+
+ const response = await harness.handleMessage(inboundMessage);
+
+ socket.send(
+ JSON.stringify({
+ type: 'message',
+ ...response,
+ })
+ );
+ }
+ } catch (error) {
+ logger.error({ error }, 'Error handling WebSocket message');
+ socket.send(
+ JSON.stringify({
+ type: 'error',
+ message: 'Failed to process message',
+ })
+ );
+ }
+ });
+
+ // Handle disconnection
+ socket.on('close', async () => {
+ logger.info({ sessionId: authContext.sessionId }, 'WebSocket disconnected');
+ await harness.cleanup();
+ this.sessions.delete(authContext.sessionId);
+ });
+
+ socket.on('error', (error) => {
+ logger.error({ error, sessionId: authContext.sessionId }, 'WebSocket error');
+ });
+ } catch (error) {
+ logger.error({ error }, 'Failed to initialize agent harness');
+ socket.close(1011, 'Internal server error');
+ await harness.cleanup();
+ }
+ }
+}
diff --git a/gateway/src/db/user-service.ts b/gateway/src/db/user-service.ts
new file mode 100644
index 0000000..3a1805e
--- /dev/null
+++ b/gateway/src/db/user-service.ts
@@ -0,0 +1,107 @@
+import { Pool, PoolClient } from 'pg';
+import type { UserLicense } from '../types/user.js';
+import { UserLicenseSchema } from '../types/user.js';
+
+export class UserService {
+ private pool: Pool;
+
+ constructor(connectionString: string) {
+ this.pool = new Pool({
+ connectionString,
+ max: 20,
+ idleTimeoutMillis: 30000,
+ connectionTimeoutMillis: 2000,
+ });
+ }
+
+ /**
+ * Get user license by user ID
+ */
+ async getUserLicense(userId: string): Promise {
+ const client = await this.pool.connect();
+ try {
+ const result = await client.query(
+ `SELECT
+ user_id as "userId",
+ email,
+ license_type as "licenseType",
+ features,
+ resource_limits as "resourceLimits",
+ mcp_server_url as "mcpServerUrl",
+ preferred_model as "preferredModel",
+ expires_at as "expiresAt",
+ created_at as "createdAt",
+ updated_at as "updatedAt"
+ FROM user_licenses
+ WHERE user_id = $1
+ AND (expires_at IS NULL OR expires_at > NOW())`,
+ [userId]
+ );
+
+ if (result.rows.length === 0) {
+ return null;
+ }
+
+ const row = result.rows[0];
+
+ // Parse and validate
+ return UserLicenseSchema.parse({
+ userId: row.userId,
+ email: row.email,
+ licenseType: row.licenseType,
+ features: row.features,
+ resourceLimits: row.resourceLimits,
+ mcpServerUrl: row.mcpServerUrl,
+ preferredModel: row.preferredModel,
+ expiresAt: row.expiresAt,
+ createdAt: row.createdAt,
+ updatedAt: row.updatedAt,
+ });
+ } finally {
+ client.release();
+ }
+ }
+
+ /**
+ * Get user ID from channel-specific identifier
+ */
+ async getUserIdFromChannel(channelType: string, channelUserId: string): Promise {
+ const client = await this.pool.connect();
+ try {
+ const result = await client.query(
+ `SELECT user_id
+ FROM user_channel_links
+ WHERE channel_type = $1 AND channel_user_id = $2`,
+ [channelType, channelUserId]
+ );
+
+ return result.rows.length > 0 ? result.rows[0].user_id : null;
+ } finally {
+ client.release();
+ }
+ }
+
+ /**
+ * Verify JWT token from web client
+ * TODO: Implement JWT verification with JWKS
+ */
+ async verifyWebToken(token: string): Promise {
+ // Placeholder - implement JWT verification
+ // For now, decode without verification (INSECURE - FOR DEV ONLY)
+ try {
+ const payload = JSON.parse(
+ Buffer.from(token.split('.')[1], 'base64').toString()
+ );
+ return payload.sub || null;
+ } catch {
+ return null;
+ }
+ }
+
+ /**
+ * Close database pool
+ */
+ async close(): Promise {
+ await this.pool.end();
+ }
+}
diff --git a/gateway/src/harness/agent-harness.ts b/gateway/src/harness/agent-harness.ts
new file mode 100644
index 0000000..9721c2a
--- /dev/null
+++ b/gateway/src/harness/agent-harness.ts
@@ -0,0 +1,306 @@
+import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
+import type { BaseMessage } from '@langchain/core/messages';
+import { HumanMessage, AIMessage, SystemMessage } from '@langchain/core/messages';
+import type { FastifyBaseLogger } from 'fastify';
+import type { UserLicense } from '../types/user.js';
+import type { InboundMessage, OutboundMessage } from '../types/messages.js';
+import { MCPClientConnector } from './mcp-client.js';
+import { CONTEXT_URIS, type ResourceContent } from '../types/resources.js';
+import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js';
+import { ModelRouter, RoutingStrategy } from '../llm/router.js';
+
+export interface AgentHarnessConfig {
+ userId: string;
+ sessionId: string;
+ license: UserLicense;
+ providerConfig: ProviderConfig;
+ logger: FastifyBaseLogger;
+}
+
+/**
+ * Agent harness orchestrates between LLM and user's MCP server.
+ *
+ * This is a STATELESS orchestrator - all conversation history, RAG, and context
+ * lives in the user's MCP server container. The harness only:
+ * 1. Fetches context from user's MCP resources
+ * 2. Routes to appropriate LLM model
+ * 3. Calls LLM with embedded context
+ * 4. Routes tool calls to user's MCP or platform tools
+ * 5. Saves messages back to user's MCP
+ */
+export class AgentHarness {
+ private config: AgentHarnessConfig;
+ private modelFactory: LLMProviderFactory;
+ private modelRouter: ModelRouter;
+ private mcpClient: MCPClientConnector;
+
+ constructor(config: AgentHarnessConfig) {
+ this.config = config;
+
+ this.modelFactory = new LLMProviderFactory(config.providerConfig, config.logger);
+ this.modelRouter = new ModelRouter(this.modelFactory, config.logger);
+
+ this.mcpClient = new MCPClientConnector({
+ userId: config.userId,
+ mcpServerUrl: config.license.mcpServerUrl,
+ logger: config.logger,
+ });
+ }
+
+ /**
+ * Initialize harness and connect to user's MCP server
+ */
+ async initialize(): Promise {
+ this.config.logger.info(
+ { userId: this.config.userId, sessionId: this.config.sessionId },
+ 'Initializing agent harness'
+ );
+
+ try {
+ await this.mcpClient.connect();
+ this.config.logger.info('Agent harness initialized');
+ } catch (error) {
+ this.config.logger.error({ error }, 'Failed to initialize agent harness');
+ throw error;
+ }
+ }
+
+ /**
+ * Handle incoming message from user
+ */
+ async handleMessage(message: InboundMessage): Promise {
+ this.config.logger.info(
+ { messageId: message.messageId, userId: message.userId },
+ 'Processing user message'
+ );
+
+ try {
+ // 1. Fetch context resources from user's MCP server
+ this.config.logger.debug('Fetching context resources from MCP');
+ const contextResources = await this.fetchContextResources();
+
+ // 2. Build system prompt from resources
+ const systemPrompt = this.buildSystemPrompt(contextResources);
+
+ // 3. Build messages with conversation context from MCP
+ const messages = this.buildMessages(message, contextResources);
+
+ // 4. Route to appropriate model
+ const model = await this.modelRouter.route(
+ message.content,
+ this.config.license,
+ RoutingStrategy.COMPLEXITY
+ );
+
+ // 5. Build LangChain messages
+ const langchainMessages = this.buildLangChainMessages(systemPrompt, messages);
+
+ // 6. Call LLM with streaming
+ this.config.logger.debug('Invoking LLM');
+ const response = await model.invoke(langchainMessages);
+
+ // 7. Extract text response (tool handling TODO)
+ const assistantMessage = response.content as string;
+
+ // 8. Save messages to user's MCP server
+ this.config.logger.debug('Saving messages to MCP');
+ await this.mcpClient.callTool('save_message', {
+ role: 'user',
+ content: message.content,
+ timestamp: message.timestamp.toISOString(),
+ });
+ await this.mcpClient.callTool('save_message', {
+ role: 'assistant',
+ content: assistantMessage,
+ timestamp: new Date().toISOString(),
+ });
+
+ return {
+ messageId: `msg_${Date.now()}`,
+ sessionId: message.sessionId,
+ content: assistantMessage,
+ timestamp: new Date(),
+ };
+ } catch (error) {
+ this.config.logger.error({ error }, 'Error processing message');
+ throw error;
+ }
+ }
+
+ /**
+ * Stream response from LLM
+ */
+ async *streamMessage(message: InboundMessage): AsyncGenerator {
+ try {
+ // Fetch context
+ const contextResources = await this.fetchContextResources();
+ const systemPrompt = this.buildSystemPrompt(contextResources);
+ const messages = this.buildMessages(message, contextResources);
+
+ // Route to model
+ const model = await this.modelRouter.route(
+ message.content,
+ this.config.license,
+ RoutingStrategy.COMPLEXITY
+ );
+
+ // Build messages
+ const langchainMessages = this.buildLangChainMessages(systemPrompt, messages);
+
+ // Stream response
+ const stream = await model.stream(langchainMessages);
+
+ let fullResponse = '';
+ for await (const chunk of stream) {
+ const content = chunk.content as string;
+ fullResponse += content;
+ yield content;
+ }
+
+ // Save after streaming completes
+ await this.mcpClient.callTool('save_message', {
+ role: 'user',
+ content: message.content,
+ timestamp: message.timestamp.toISOString(),
+ });
+ await this.mcpClient.callTool('save_message', {
+ role: 'assistant',
+ content: fullResponse,
+ timestamp: new Date().toISOString(),
+ });
+ } catch (error) {
+ this.config.logger.error({ error }, 'Error streaming message');
+ throw error;
+ }
+ }
+
+ /**
+ * Fetch context resources from user's MCP server
+ */
+ private async fetchContextResources(): Promise {
+ const contextUris = [
+ CONTEXT_URIS.USER_PROFILE,
+ CONTEXT_URIS.CONVERSATION_SUMMARY,
+ CONTEXT_URIS.WORKSPACE_STATE,
+ CONTEXT_URIS.SYSTEM_PROMPT,
+ ];
+
+ const resources = await Promise.all(
+ contextUris.map(async (uri) => {
+ try {
+ return await this.mcpClient.readResource(uri);
+ } catch (error) {
+ this.config.logger.warn({ error, uri }, 'Failed to fetch resource, using empty');
+ return { uri, text: '' };
+ }
+ })
+ );
+
+ return resources;
+ }
+
+ /**
+ * Build messages array with context from resources
+ */
+ private buildMessages(
+ currentMessage: InboundMessage,
+ contextResources: ResourceContent[]
+ ): Array<{ role: string; content: string }> {
+ const conversationSummary = contextResources.find(
+ (r) => r.uri === CONTEXT_URIS.CONVERSATION_SUMMARY
+ );
+
+ const messages: Array<{ role: string; content: string }> = [];
+
+ // Add conversation context as a system-like user message
+ if (conversationSummary?.text) {
+ messages.push({
+ role: 'user',
+ content: `[Previous Conversation Context]\n${conversationSummary.text}`,
+ });
+ messages.push({
+ role: 'assistant',
+ content: 'I understand the context from our previous conversations.',
+ });
+ }
+
+ // Add current user message
+ messages.push({
+ role: 'user',
+ content: currentMessage.content,
+ });
+
+ return messages;
+ }
+
+ /**
+ * Convert to LangChain message format
+ */
+ private buildLangChainMessages(
+ systemPrompt: string,
+ messages: Array<{ role: string; content: string }>
+ ): BaseMessage[] {
+ const langchainMessages: BaseMessage[] = [new SystemMessage(systemPrompt)];
+
+ for (const msg of messages) {
+ if (msg.role === 'user') {
+ langchainMessages.push(new HumanMessage(msg.content));
+ } else if (msg.role === 'assistant') {
+ langchainMessages.push(new AIMessage(msg.content));
+ }
+ }
+
+ return langchainMessages;
+ }
+
+ /**
+ * Build system prompt from platform base + user resources
+ */
+ private buildSystemPrompt(contextResources: ResourceContent[]): string {
+ const userProfile = contextResources.find((r) => r.uri === CONTEXT_URIS.USER_PROFILE);
+ const customPrompt = contextResources.find((r) => r.uri === CONTEXT_URIS.SYSTEM_PROMPT);
+ const workspaceState = contextResources.find((r) => r.uri === CONTEXT_URIS.WORKSPACE_STATE);
+
+ // Base platform prompt
+ let prompt = `You are a helpful AI assistant for Dexorder, an AI-first trading platform.
+You help users research markets, develop indicators and strategies, and analyze trading data.
+
+User license: ${this.config.license.licenseType}
+Available features: ${JSON.stringify(this.config.license.features, null, 2)}`;
+
+ // Add user profile context
+ if (userProfile?.text) {
+ prompt += `\n\n# User Profile\n${userProfile.text}`;
+ }
+
+ // Add workspace context
+ if (workspaceState?.text) {
+ prompt += `\n\n# Current Workspace\n${workspaceState.text}`;
+ }
+
+ // Add user's custom instructions (highest priority)
+ if (customPrompt?.text) {
+ prompt += `\n\n# User Instructions\n${customPrompt.text}`;
+ }
+
+ return prompt;
+ }
+
+ /**
+ * Get platform tools (non-user-specific tools)
+ */
+ private getPlatformTools(): Array<{ name: string; description?: string }> {
+ // Platform tools that don't need user's MCP
+ return [
+ // TODO: Add platform tools like market data queries, chart rendering, etc.
+ ];
+ }
+
+ /**
+ * Cleanup resources
+ */
+ async cleanup(): Promise {
+ this.config.logger.info('Cleaning up agent harness');
+ await this.mcpClient.disconnect();
+ }
+}
diff --git a/gateway/src/harness/mcp-client.ts b/gateway/src/harness/mcp-client.ts
new file mode 100644
index 0000000..9980feb
--- /dev/null
+++ b/gateway/src/harness/mcp-client.ts
@@ -0,0 +1,259 @@
+import { Client } from '@modelcontextprotocol/sdk/client/index.js';
+import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
+import type { FastifyBaseLogger } from 'fastify';
+
+export interface MCPClientConfig {
+ userId: string;
+ mcpServerUrl: string;
+ platformJWT?: string;
+ logger: FastifyBaseLogger;
+}
+
+/**
+ * MCP client connector for user's container
+ * Manages connection to user-specific MCP server
+ */
+export class MCPClientConnector {
+ private client: Client | null = null;
+ private connected = false;
+ private config: MCPClientConfig;
+
+ constructor(config: MCPClientConfig) {
+ this.config = config;
+ }
+
+ /**
+ * Connect to user's MCP server
+ * TODO: Implement HTTP/SSE transport instead of stdio for container communication
+ */
+ async connect(): Promise {
+ if (this.connected) {
+ return;
+ }
+
+ try {
+ this.config.logger.info(
+ { userId: this.config.userId, url: this.config.mcpServerUrl },
+ 'Connecting to user MCP server'
+ );
+
+ this.client = new Client(
+ {
+ name: 'dexorder-gateway',
+ version: '0.1.0',
+ },
+ {
+ capabilities: {
+ tools: {},
+ resources: {},
+ },
+ }
+ );
+
+ // TODO: Replace with HTTP transport when user containers are ready
+ // For now, this is a placeholder structure
+ // const transport = new HTTPTransport(this.config.mcpServerUrl, {
+ // headers: {
+ // 'Authorization': `Bearer ${this.config.platformJWT}`
+ // }
+ // });
+
+ // Placeholder: will be replaced with actual container transport
+ this.config.logger.warn(
+ 'MCP transport not yet implemented - using placeholder'
+ );
+
+ this.connected = true;
+ this.config.logger.info('Connected to user MCP server');
+ } catch (error) {
+ this.config.logger.error(
+ { error, userId: this.config.userId },
+ 'Failed to connect to user MCP server'
+ );
+ throw error;
+ }
+ }
+
+ /**
+ * Call a tool on the user's MCP server
+ */
+ async callTool(name: string, args: Record): Promise {
+ if (!this.client || !this.connected) {
+ throw new Error('MCP client not connected');
+ }
+
+ try {
+ this.config.logger.debug({ tool: name, args }, 'Calling MCP tool');
+
+ // TODO: Implement when MCP client is connected
+ // const result = await this.client.callTool({ name, arguments: args });
+ // return result;
+
+ // Placeholder response
+ return { success: true, message: 'MCP tool call placeholder' };
+ } catch (error) {
+ this.config.logger.error({ error, tool: name }, 'MCP tool call failed');
+ throw error;
+ }
+ }
+
+ /**
+ * List available tools from user's MCP server
+ */
+ async listTools(): Promise> {
+ if (!this.client || !this.connected) {
+ throw new Error('MCP client not connected');
+ }
+
+ try {
+ // TODO: Implement when MCP client is connected
+ // const tools = await this.client.listTools();
+ // return tools;
+
+ // Placeholder tools (actions only, not context)
+ return [
+ { name: 'save_message', description: 'Save message to conversation history' },
+ { name: 'list_strategies', description: 'List user strategies' },
+ { name: 'read_strategy', description: 'Read strategy code' },
+ { name: 'write_strategy', description: 'Write strategy code' },
+ { name: 'run_backtest', description: 'Run backtest on strategy' },
+ { name: 'get_watchlist', description: 'Get user watchlist' },
+ { name: 'execute_trade', description: 'Execute trade' },
+ ];
+ } catch (error) {
+ this.config.logger.error({ error }, 'Failed to list MCP tools');
+ throw error;
+ }
+ }
+
+ /**
+ * List available resources from user's MCP server
+ */
+ async listResources(): Promise> {
+ if (!this.client || !this.connected) {
+ throw new Error('MCP client not connected');
+ }
+
+ try {
+ // TODO: Implement when MCP client is connected
+ // const resources = await this.client.listResources();
+ // return resources;
+
+ // Placeholder resources for user context
+ return [
+ {
+ uri: 'context://user-profile',
+ name: 'User Profile',
+ description: 'User trading style, preferences, and background',
+ mimeType: 'text/plain',
+ },
+ {
+ uri: 'context://conversation-summary',
+ name: 'Conversation Summary',
+ description: 'Semantic summary of recent conversation history with RAG',
+ mimeType: 'text/plain',
+ },
+ {
+ uri: 'context://workspace-state',
+ name: 'Workspace State',
+ description: 'Current chart, watchlist, and open positions',
+ mimeType: 'application/json',
+ },
+ {
+ uri: 'context://system-prompt',
+ name: 'Custom System Prompt',
+ description: 'User custom instructions for the assistant',
+ mimeType: 'text/plain',
+ },
+ ];
+ } catch (error) {
+ this.config.logger.error({ error }, 'Failed to list MCP resources');
+ throw error;
+ }
+ }
+
+ /**
+ * Read a resource from user's MCP server
+ */
+ async readResource(uri: string): Promise<{ uri: string; mimeType?: string; text?: string; blob?: string }> {
+ if (!this.client || !this.connected) {
+ throw new Error('MCP client not connected');
+ }
+
+ try {
+ this.config.logger.debug({ uri }, 'Reading MCP resource');
+
+ // TODO: Implement when MCP client is connected
+ // const resource = await this.client.readResource({ uri });
+ // return resource;
+
+ // Placeholder resource content
+ if (uri === 'context://user-profile') {
+ return {
+ uri,
+ mimeType: 'text/plain',
+ text: `User Profile:
+- Trading experience: Intermediate
+- Preferred timeframes: 1h, 4h, 1d
+- Risk tolerance: Medium
+- Focus: Swing trading with technical indicators`,
+ };
+ } else if (uri === 'context://conversation-summary') {
+ return {
+ uri,
+ mimeType: 'text/plain',
+ text: `Recent Conversation Summary:
+[RAG-generated summary would go here]
+
+User recently discussed:
+- Moving average crossover strategies
+- Backtesting on BTC/USDT
+- Risk management techniques`,
+ };
+ } else if (uri === 'context://workspace-state') {
+ return {
+ uri,
+ mimeType: 'application/json',
+ text: JSON.stringify({
+ currentChart: { ticker: 'BINANCE:BTC/USDT', timeframe: '1h' },
+ watchlist: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'],
+ openPositions: [],
+ }, null, 2),
+ };
+ } else if (uri === 'context://system-prompt') {
+ return {
+ uri,
+ mimeType: 'text/plain',
+ text: `Custom Instructions:
+- Be concise and data-driven
+- Always show risk/reward ratios
+- Prefer simple strategies over complex ones`,
+ };
+ }
+
+ return { uri, text: '' };
+ } catch (error) {
+ this.config.logger.error({ error, uri }, 'MCP resource read failed');
+ throw error;
+ }
+ }
+
+ /**
+ * Disconnect from MCP server
+ */
+ async disconnect(): Promise {
+ if (this.client && this.connected) {
+ try {
+ await this.client.close();
+ this.connected = false;
+ this.config.logger.info('Disconnected from user MCP server');
+ } catch (error) {
+ this.config.logger.error({ error }, 'Error disconnecting from MCP server');
+ }
+ }
+ }
+
+ isConnected(): boolean {
+ return this.connected;
+ }
+}
diff --git a/gateway/src/k8s/client.ts b/gateway/src/k8s/client.ts
new file mode 100644
index 0000000..767b727
--- /dev/null
+++ b/gateway/src/k8s/client.ts
@@ -0,0 +1,327 @@
+import * as k8s from '@kubernetes/client-node';
+import type { FastifyBaseLogger } from 'fastify';
+import * as yaml from 'js-yaml';
+import * as fs from 'fs/promises';
+import * as path from 'path';
+import { fileURLToPath } from 'url';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+export interface K8sClientConfig {
+ namespace: string;
+ inCluster: boolean;
+ context?: string; // For local dev
+ logger: FastifyBaseLogger;
+}
+
+export interface DeploymentSpec {
+ userId: string;
+ licenseType: 'free' | 'pro' | 'enterprise';
+ agentImage: string;
+ sidecarImage: string;
+ storageClass: string;
+}
+
+/**
+ * Kubernetes client wrapper for managing agent deployments
+ */
+export class KubernetesClient {
+ private config: K8sClientConfig;
+ private k8sConfig: k8s.KubeConfig;
+ private appsApi: k8s.AppsV1Api;
+ private coreApi: k8s.CoreV1Api;
+
+ constructor(config: K8sClientConfig) {
+ this.config = config;
+ this.k8sConfig = new k8s.KubeConfig();
+
+ if (config.inCluster) {
+ this.k8sConfig.loadFromCluster();
+ this.config.logger.info('Loaded in-cluster Kubernetes config');
+ } else {
+ this.k8sConfig.loadFromDefault();
+ if (config.context) {
+ this.k8sConfig.setCurrentContext(config.context);
+ this.config.logger.info({ context: config.context }, 'Set Kubernetes context');
+ }
+ this.config.logger.info('Loaded Kubernetes config from default location');
+ }
+
+ this.appsApi = this.k8sConfig.makeApiClient(k8s.AppsV1Api);
+ this.coreApi = this.k8sConfig.makeApiClient(k8s.CoreV1Api);
+ }
+
+ /**
+ * Generate deployment name from user ID
+ */
+ static getDeploymentName(userId: string): string {
+ // Sanitize userId to be k8s-compliant (lowercase alphanumeric + hyphens)
+ const sanitized = userId.toLowerCase().replace(/[^a-z0-9-]/g, '-');
+ return `agent-${sanitized}`;
+ }
+
+ /**
+ * Generate service name (same as deployment)
+ */
+ static getServiceName(userId: string): string {
+ return this.getDeploymentName(userId);
+ }
+
+ /**
+ * Generate PVC name
+ */
+ static getPvcName(userId: string): string {
+ return `${this.getDeploymentName(userId)}-data`;
+ }
+
+ /**
+ * Compute MCP endpoint URL from service name
+ */
+ static getMcpEndpoint(userId: string, namespace: string): string {
+ const serviceName = this.getServiceName(userId);
+ return `http://${serviceName}.${namespace}.svc.cluster.local:3000`;
+ }
+
+ /**
+ * Check if deployment exists
+ */
+  async deploymentExists(deploymentName: string): Promise<boolean> {
+ try {
+ await this.appsApi.readNamespacedDeployment(deploymentName, this.config.namespace);
+ return true;
+ } catch (error: any) {
+ if (error.response?.statusCode === 404) {
+ return false;
+ }
+ throw error;
+ }
+ }
+
+ /**
+ * Create agent deployment from template
+ */
+  async createAgentDeployment(spec: DeploymentSpec): Promise<void> {
+ const deploymentName = KubernetesClient.getDeploymentName(spec.userId);
+ const serviceName = KubernetesClient.getServiceName(spec.userId);
+ const pvcName = KubernetesClient.getPvcName(spec.userId);
+
+ this.config.logger.info(
+ { userId: spec.userId, licenseType: spec.licenseType, deploymentName },
+ 'Creating agent deployment'
+ );
+
+ // Load template based on license type
+ const templatePath = path.join(
+ __dirname,
+ 'templates',
+ `${spec.licenseType}-tier.yaml`
+ );
+
+ const templateContent = await fs.readFile(templatePath, 'utf-8');
+
+ // Substitute variables
+ const rendered = templateContent
+ .replace(/\{\{userId\}\}/g, spec.userId)
+ .replace(/\{\{deploymentName\}\}/g, deploymentName)
+ .replace(/\{\{serviceName\}\}/g, serviceName)
+ .replace(/\{\{pvcName\}\}/g, pvcName)
+ .replace(/\{\{agentImage\}\}/g, spec.agentImage)
+ .replace(/\{\{sidecarImage\}\}/g, spec.sidecarImage)
+ .replace(/\{\{storageClass\}\}/g, spec.storageClass);
+
+ // Parse YAML documents (deployment, pvc, service)
+ const documents = yaml.loadAll(rendered) as any[];
+
+ // Apply each resource
+ for (const doc of documents) {
+ if (!doc || !doc.kind) continue;
+
+ try {
+ switch (doc.kind) {
+ case 'Deployment':
+ await this.appsApi.createNamespacedDeployment(this.config.namespace, doc);
+ this.config.logger.info({ deploymentName }, 'Created deployment');
+ break;
+
+ case 'PersistentVolumeClaim':
+ await this.coreApi.createNamespacedPersistentVolumeClaim(
+ this.config.namespace,
+ doc
+ );
+ this.config.logger.info({ pvcName }, 'Created PVC');
+ break;
+
+ case 'Service':
+ await this.coreApi.createNamespacedService(this.config.namespace, doc);
+ this.config.logger.info({ serviceName }, 'Created service');
+ break;
+
+ default:
+ this.config.logger.warn({ kind: doc.kind }, 'Unknown resource kind in template');
+ }
+ } catch (error: any) {
+ // If resource already exists, log warning but continue
+ if (error.response?.statusCode === 409) {
+ this.config.logger.warn(
+ { kind: doc.kind, name: doc.metadata?.name },
+ 'Resource already exists, skipping'
+ );
+ } else {
+ throw error;
+ }
+ }
+ }
+
+ this.config.logger.info({ deploymentName }, 'Agent deployment created successfully');
+ }
+
+ /**
+ * Wait for deployment to be ready
+ */
+ async waitForDeploymentReady(
+ deploymentName: string,
+ timeoutMs: number = 120000
+  ): Promise<boolean> {
+ const startTime = Date.now();
+ const pollInterval = 2000; // 2 seconds
+
+ this.config.logger.info(
+ { deploymentName, timeoutMs },
+ 'Waiting for deployment to be ready'
+ );
+
+ while (Date.now() - startTime < timeoutMs) {
+ try {
+ const response = await this.appsApi.readNamespacedDeployment(
+ deploymentName,
+ this.config.namespace
+ );
+
+ const deployment = response.body;
+ const status = deployment.status;
+
+ // Check if deployment is ready
+ if (
+ status?.availableReplicas &&
+ status.availableReplicas > 0 &&
+ status.readyReplicas &&
+ status.readyReplicas > 0
+ ) {
+ this.config.logger.info({ deploymentName }, 'Deployment is ready');
+ return true;
+ }
+
+ // Check for failure conditions
+ if (status?.conditions) {
+ const failedCondition = status.conditions.find(
+ (c) => c.type === 'Progressing' && c.status === 'False'
+ );
+ if (failedCondition) {
+ this.config.logger.error(
+ { deploymentName, reason: failedCondition.reason, message: failedCondition.message },
+ 'Deployment failed to progress'
+ );
+ return false;
+ }
+ }
+
+ this.config.logger.debug(
+ {
+ deploymentName,
+ replicas: status?.replicas,
+ ready: status?.readyReplicas,
+ available: status?.availableReplicas,
+ },
+ 'Deployment not ready yet, waiting...'
+ );
+
+ await new Promise((resolve) => setTimeout(resolve, pollInterval));
+ } catch (error: any) {
+ if (error.response?.statusCode === 404) {
+ this.config.logger.warn({ deploymentName }, 'Deployment not found');
+ return false;
+ }
+ throw error;
+ }
+ }
+
+ this.config.logger.warn({ deploymentName, timeoutMs }, 'Deployment readiness timeout');
+ return false;
+ }
+
+ /**
+ * Get service endpoint URL
+ */
+  async getServiceEndpoint(serviceName: string): Promise<string | null> {
+ try {
+ const response = await this.coreApi.readNamespacedService(
+ serviceName,
+ this.config.namespace
+ );
+
+ const service = response.body;
+
+ // For ClusterIP services, return internal DNS name
+ if (service.spec?.type === 'ClusterIP') {
+ const port = service.spec.ports?.find((p) => p.name === 'mcp')?.port || 3000;
+ return `http://${serviceName}.${this.config.namespace}.svc.cluster.local:${port}`;
+ }
+
+ // For other service types (NodePort, LoadBalancer), would need different logic
+ this.config.logger.warn(
+ { serviceName, type: service.spec?.type },
+ 'Unexpected service type'
+ );
+ return null;
+ } catch (error: any) {
+ if (error.response?.statusCode === 404) {
+ this.config.logger.warn({ serviceName }, 'Service not found');
+ return null;
+ }
+ throw error;
+ }
+ }
+
+ /**
+ * Delete deployment and associated resources
+ * (Used for cleanup/testing - normally handled by lifecycle sidecar)
+ */
+  async deleteAgentDeployment(userId: string): Promise<void> {
+ const deploymentName = KubernetesClient.getDeploymentName(userId);
+ const serviceName = KubernetesClient.getServiceName(userId);
+ const pvcName = KubernetesClient.getPvcName(userId);
+
+ this.config.logger.info({ userId, deploymentName }, 'Deleting agent deployment');
+
+ // Delete deployment
+ try {
+ await this.appsApi.deleteNamespacedDeployment(deploymentName, this.config.namespace);
+ this.config.logger.info({ deploymentName }, 'Deleted deployment');
+ } catch (error: any) {
+ if (error.response?.statusCode !== 404) {
+ this.config.logger.warn({ deploymentName, error }, 'Failed to delete deployment');
+ }
+ }
+
+ // Delete service
+ try {
+ await this.coreApi.deleteNamespacedService(serviceName, this.config.namespace);
+ this.config.logger.info({ serviceName }, 'Deleted service');
+ } catch (error: any) {
+ if (error.response?.statusCode !== 404) {
+ this.config.logger.warn({ serviceName, error }, 'Failed to delete service');
+ }
+ }
+
+ // Delete PVC
+ try {
+ await this.coreApi.deleteNamespacedPersistentVolumeClaim(pvcName, this.config.namespace);
+ this.config.logger.info({ pvcName }, 'Deleted PVC');
+ } catch (error: any) {
+ if (error.response?.statusCode !== 404) {
+ this.config.logger.warn({ pvcName, error }, 'Failed to delete PVC');
+ }
+ }
+ }
+}
diff --git a/gateway/src/k8s/container-manager.ts b/gateway/src/k8s/container-manager.ts
new file mode 100644
index 0000000..67d5e2f
--- /dev/null
+++ b/gateway/src/k8s/container-manager.ts
@@ -0,0 +1,118 @@
+import type { FastifyBaseLogger } from 'fastify';
+import { KubernetesClient, type DeploymentSpec } from './client.js';
+import type { UserLicense } from '../types/user.js';
+
+export interface ContainerManagerConfig {
+ k8sClient: KubernetesClient;
+ agentImage: string;
+ sidecarImage: string;
+ storageClass: string;
+ namespace: string;
+ logger: FastifyBaseLogger;
+}
+
+export interface ContainerStatus {
+ exists: boolean;
+ ready: boolean;
+ mcpEndpoint: string;
+}
+
+/**
+ * Container manager orchestrates agent container lifecycle
+ */
+export class ContainerManager {
+ private config: ContainerManagerConfig;
+
+ constructor(config: ContainerManagerConfig) {
+ this.config = config;
+ }
+
+ /**
+ * Ensure user's container is running and ready
+ * Returns the MCP endpoint URL
+ */
+ async ensureContainerRunning(
+ userId: string,
+ license: UserLicense
+ ): Promise<{ mcpEndpoint: string; wasCreated: boolean }> {
+ const deploymentName = KubernetesClient.getDeploymentName(userId);
+ const mcpEndpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);
+
+ this.config.logger.info(
+ { userId, licenseType: license.licenseType, deploymentName },
+ 'Ensuring container is running'
+ );
+
+ // Check if deployment already exists
+ const exists = await this.config.k8sClient.deploymentExists(deploymentName);
+
+ if (exists) {
+ this.config.logger.info({ userId, deploymentName }, 'Container deployment already exists');
+
+ // Wait for it to be ready (in case it's starting up)
+ const ready = await this.config.k8sClient.waitForDeploymentReady(deploymentName, 30000);
+
+ if (!ready) {
+ this.config.logger.warn(
+ { userId, deploymentName },
+ 'Existing deployment not ready within timeout'
+ );
+ // Continue anyway - might be an image pull or other transient issue
+ }
+
+ return { mcpEndpoint, wasCreated: false };
+ }
+
+ // Create new deployment
+ this.config.logger.info({ userId, licenseType: license.licenseType }, 'Creating new container');
+
+ const spec: DeploymentSpec = {
+ userId,
+ licenseType: license.licenseType,
+ agentImage: this.config.agentImage,
+ sidecarImage: this.config.sidecarImage,
+ storageClass: this.config.storageClass,
+ };
+
+ await this.config.k8sClient.createAgentDeployment(spec);
+
+ // Wait for deployment to be ready
+ const ready = await this.config.k8sClient.waitForDeploymentReady(deploymentName, 120000);
+
+ if (!ready) {
+ throw new Error(
+ `Container deployment failed to become ready within timeout: ${deploymentName}`
+ );
+ }
+
+ this.config.logger.info({ userId, mcpEndpoint }, 'Container is ready');
+
+ return { mcpEndpoint, wasCreated: true };
+ }
+
+ /**
+ * Check container status without creating it
+ */
+  async getContainerStatus(userId: string): Promise<ContainerStatus> {
+ const deploymentName = KubernetesClient.getDeploymentName(userId);
+ const mcpEndpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);
+
+ const exists = await this.config.k8sClient.deploymentExists(deploymentName);
+
+ if (!exists) {
+ return { exists: false, ready: false, mcpEndpoint };
+ }
+
+ // Check if ready (with short timeout)
+ const ready = await this.config.k8sClient.waitForDeploymentReady(deploymentName, 5000);
+
+ return { exists: true, ready, mcpEndpoint };
+ }
+
+ /**
+ * Delete container (for cleanup/testing)
+ */
+  async deleteContainer(userId: string): Promise<void> {
+ await this.config.k8sClient.deleteAgentDeployment(userId);
+ }
+}
diff --git a/gateway/src/k8s/templates/enterprise-tier.yaml b/gateway/src/k8s/templates/enterprise-tier.yaml
new file mode 100644
index 0000000..04db77a
--- /dev/null
+++ b/gateway/src/k8s/templates/enterprise-tier.yaml
@@ -0,0 +1,199 @@
+# Enterprise tier agent deployment template
+# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}, {{agentImage}}, {{sidecarImage}}, {{storageClass}}
+# Enterprise: No idle shutdown, larger resources
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{deploymentName}}
+ namespace: dexorder-agents
+ labels:
+ app.kubernetes.io/name: agent
+ app.kubernetes.io/component: user-agent
+ dexorder.io/component: agent
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/deployment: {{deploymentName}}
+ dexorder.io/license-tier: enterprise
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ dexorder.io/user-id: {{userId}}
+ template:
+ metadata:
+ labels:
+ dexorder.io/component: agent
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/deployment: {{deploymentName}}
+ dexorder.io/license-tier: enterprise
+ spec:
+ serviceAccountName: agent-lifecycle
+ shareProcessNamespace: true
+
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 1000
+ fsGroup: 1000
+ seccompProfile:
+ type: RuntimeDefault
+
+ containers:
+ - name: agent
+ image: {{agentImage}}
+ imagePullPolicy: Always
+
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ readOnlyRootFilesystem: true
+ capabilities:
+ drop:
+ - ALL
+
+ resources:
+ requests:
+ memory: "1Gi"
+ cpu: "500m"
+ limits:
+ memory: "4Gi"
+ cpu: "4000m"
+
+ env:
+ - name: USER_ID
+          value: "{{userId}}"
+ - name: IDLE_TIMEOUT_MINUTES
+ value: "0"
+ - name: IDLE_CHECK_INTERVAL_SECONDS
+ value: "60"
+ - name: ENABLE_IDLE_SHUTDOWN
+ value: "false"
+ - name: MCP_SERVER_PORT
+ value: "3000"
+ - name: ZMQ_CONTROL_PORT
+ value: "5555"
+
+ ports:
+ - name: mcp
+ containerPort: 3000
+ protocol: TCP
+ - name: zmq-control
+ containerPort: 5555
+ protocol: TCP
+
+ volumeMounts:
+ - name: agent-data
+ mountPath: /app/data
+ - name: tmp
+ mountPath: /tmp
+ - name: shared-run
+ mountPath: /var/run/agent
+
+ livenessProbe:
+ httpGet:
+ path: /health
+ port: mcp
+ initialDelaySeconds: 10
+ periodSeconds: 30
+ timeoutSeconds: 5
+
+ readinessProbe:
+ httpGet:
+ path: /ready
+ port: mcp
+ initialDelaySeconds: 5
+ periodSeconds: 10
+
+ - name: lifecycle-sidecar
+ image: {{sidecarImage}}
+ imagePullPolicy: Always
+
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ readOnlyRootFilesystem: true
+ capabilities:
+ drop:
+ - ALL
+
+ resources:
+ requests:
+ memory: "32Mi"
+ cpu: "10m"
+ limits:
+ memory: "64Mi"
+ cpu: "50m"
+
+ env:
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ - name: DEPLOYMENT_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['dexorder.io/deployment']
+ - name: USER_TYPE
+ value: "enterprise"
+ - name: MAIN_CONTAINER_PID
+ value: "1"
+
+ volumeMounts:
+ - name: shared-run
+ mountPath: /var/run/agent
+ readOnly: true
+
+ volumes:
+ - name: agent-data
+ persistentVolumeClaim:
+ claimName: {{pvcName}}
+ - name: tmp
+ emptyDir:
+ medium: Memory
+ sizeLimit: 512Mi
+ - name: shared-run
+ emptyDir:
+ medium: Memory
+ sizeLimit: 1Mi
+
+ restartPolicy: Always
+ terminationGracePeriodSeconds: 30
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+ name: {{pvcName}}
+ namespace: dexorder-agents
+ labels:
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/license-tier: enterprise
+spec:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 50Gi
+ storageClassName: {{storageClass}}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{serviceName}}
+ namespace: dexorder-agents
+ labels:
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/license-tier: enterprise
+spec:
+ type: ClusterIP
+ selector:
+ dexorder.io/user-id: {{userId}}
+ ports:
+ - name: mcp
+ port: 3000
+ targetPort: mcp
+ protocol: TCP
+ - name: zmq-control
+ port: 5555
+ targetPort: zmq-control
+ protocol: TCP
diff --git a/gateway/src/k8s/templates/free-tier.yaml b/gateway/src/k8s/templates/free-tier.yaml
new file mode 100644
index 0000000..3ea0415
--- /dev/null
+++ b/gateway/src/k8s/templates/free-tier.yaml
@@ -0,0 +1,198 @@
+# Free tier agent deployment template
+# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}, {{agentImage}}, {{sidecarImage}}, {{storageClass}}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{deploymentName}}
+ namespace: dexorder-agents
+ labels:
+ app.kubernetes.io/name: agent
+ app.kubernetes.io/component: user-agent
+ dexorder.io/component: agent
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/deployment: {{deploymentName}}
+ dexorder.io/license-tier: free
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ dexorder.io/user-id: {{userId}}
+ template:
+ metadata:
+ labels:
+ dexorder.io/component: agent
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/deployment: {{deploymentName}}
+ dexorder.io/license-tier: free
+ spec:
+ serviceAccountName: agent-lifecycle
+ shareProcessNamespace: true
+
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 1000
+ fsGroup: 1000
+ seccompProfile:
+ type: RuntimeDefault
+
+ containers:
+ - name: agent
+ image: {{agentImage}}
+ imagePullPolicy: Always
+
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ readOnlyRootFilesystem: true
+ capabilities:
+ drop:
+ - ALL
+
+ resources:
+ requests:
+ memory: "256Mi"
+ cpu: "100m"
+ limits:
+ memory: "512Mi"
+ cpu: "500m"
+
+ env:
+ - name: USER_ID
+          value: "{{userId}}"
+ - name: IDLE_TIMEOUT_MINUTES
+ value: "15"
+ - name: IDLE_CHECK_INTERVAL_SECONDS
+ value: "60"
+ - name: ENABLE_IDLE_SHUTDOWN
+ value: "true"
+ - name: MCP_SERVER_PORT
+ value: "3000"
+ - name: ZMQ_CONTROL_PORT
+ value: "5555"
+
+ ports:
+ - name: mcp
+ containerPort: 3000
+ protocol: TCP
+ - name: zmq-control
+ containerPort: 5555
+ protocol: TCP
+
+ volumeMounts:
+ - name: agent-data
+ mountPath: /app/data
+ - name: tmp
+ mountPath: /tmp
+ - name: shared-run
+ mountPath: /var/run/agent
+
+ livenessProbe:
+ httpGet:
+ path: /health
+ port: mcp
+ initialDelaySeconds: 10
+ periodSeconds: 30
+ timeoutSeconds: 5
+
+ readinessProbe:
+ httpGet:
+ path: /ready
+ port: mcp
+ initialDelaySeconds: 5
+ periodSeconds: 10
+
+ - name: lifecycle-sidecar
+ image: {{sidecarImage}}
+ imagePullPolicy: Always
+
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ readOnlyRootFilesystem: true
+ capabilities:
+ drop:
+ - ALL
+
+ resources:
+ requests:
+ memory: "32Mi"
+ cpu: "10m"
+ limits:
+ memory: "64Mi"
+ cpu: "50m"
+
+ env:
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ - name: DEPLOYMENT_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['dexorder.io/deployment']
+ - name: USER_TYPE
+ value: "free"
+ - name: MAIN_CONTAINER_PID
+ value: "1"
+
+ volumeMounts:
+ - name: shared-run
+ mountPath: /var/run/agent
+ readOnly: true
+
+ volumes:
+ - name: agent-data
+ persistentVolumeClaim:
+ claimName: {{pvcName}}
+ - name: tmp
+ emptyDir:
+ medium: Memory
+ sizeLimit: 128Mi
+ - name: shared-run
+ emptyDir:
+ medium: Memory
+ sizeLimit: 1Mi
+
+ restartPolicy: Always
+ terminationGracePeriodSeconds: 30
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+ name: {{pvcName}}
+ namespace: dexorder-agents
+ labels:
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/license-tier: free
+spec:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 1Gi
+ storageClassName: {{storageClass}}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{serviceName}}
+ namespace: dexorder-agents
+ labels:
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/license-tier: free
+spec:
+ type: ClusterIP
+ selector:
+ dexorder.io/user-id: {{userId}}
+ ports:
+ - name: mcp
+ port: 3000
+ targetPort: mcp
+ protocol: TCP
+ - name: zmq-control
+ port: 5555
+ targetPort: zmq-control
+ protocol: TCP
diff --git a/gateway/src/k8s/templates/pro-tier.yaml b/gateway/src/k8s/templates/pro-tier.yaml
new file mode 100644
index 0000000..a99abac
--- /dev/null
+++ b/gateway/src/k8s/templates/pro-tier.yaml
@@ -0,0 +1,198 @@
+# Pro tier agent deployment template
+# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}, {{agentImage}}, {{sidecarImage}}, {{storageClass}}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{deploymentName}}
+ namespace: dexorder-agents
+ labels:
+ app.kubernetes.io/name: agent
+ app.kubernetes.io/component: user-agent
+ dexorder.io/component: agent
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/deployment: {{deploymentName}}
+ dexorder.io/license-tier: pro
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ dexorder.io/user-id: {{userId}}
+ template:
+ metadata:
+ labels:
+ dexorder.io/component: agent
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/deployment: {{deploymentName}}
+ dexorder.io/license-tier: pro
+ spec:
+ serviceAccountName: agent-lifecycle
+ shareProcessNamespace: true
+
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 1000
+ fsGroup: 1000
+ seccompProfile:
+ type: RuntimeDefault
+
+ containers:
+ - name: agent
+ image: {{agentImage}}
+ imagePullPolicy: Always
+
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ readOnlyRootFilesystem: true
+ capabilities:
+ drop:
+ - ALL
+
+ resources:
+ requests:
+ memory: "512Mi"
+ cpu: "250m"
+ limits:
+ memory: "2Gi"
+ cpu: "2000m"
+
+ env:
+ - name: USER_ID
+          value: "{{userId}}"
+ - name: IDLE_TIMEOUT_MINUTES
+ value: "60"
+ - name: IDLE_CHECK_INTERVAL_SECONDS
+ value: "60"
+ - name: ENABLE_IDLE_SHUTDOWN
+ value: "true"
+ - name: MCP_SERVER_PORT
+ value: "3000"
+ - name: ZMQ_CONTROL_PORT
+ value: "5555"
+
+ ports:
+ - name: mcp
+ containerPort: 3000
+ protocol: TCP
+ - name: zmq-control
+ containerPort: 5555
+ protocol: TCP
+
+ volumeMounts:
+ - name: agent-data
+ mountPath: /app/data
+ - name: tmp
+ mountPath: /tmp
+ - name: shared-run
+ mountPath: /var/run/agent
+
+ livenessProbe:
+ httpGet:
+ path: /health
+ port: mcp
+ initialDelaySeconds: 10
+ periodSeconds: 30
+ timeoutSeconds: 5
+
+ readinessProbe:
+ httpGet:
+ path: /ready
+ port: mcp
+ initialDelaySeconds: 5
+ periodSeconds: 10
+
+ - name: lifecycle-sidecar
+ image: {{sidecarImage}}
+ imagePullPolicy: Always
+
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ readOnlyRootFilesystem: true
+ capabilities:
+ drop:
+ - ALL
+
+ resources:
+ requests:
+ memory: "32Mi"
+ cpu: "10m"
+ limits:
+ memory: "64Mi"
+ cpu: "50m"
+
+ env:
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ - name: DEPLOYMENT_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['dexorder.io/deployment']
+ - name: USER_TYPE
+ value: "pro"
+ - name: MAIN_CONTAINER_PID
+ value: "1"
+
+ volumeMounts:
+ - name: shared-run
+ mountPath: /var/run/agent
+ readOnly: true
+
+ volumes:
+ - name: agent-data
+ persistentVolumeClaim:
+ claimName: {{pvcName}}
+ - name: tmp
+ emptyDir:
+ medium: Memory
+ sizeLimit: 256Mi
+ - name: shared-run
+ emptyDir:
+ medium: Memory
+ sizeLimit: 1Mi
+
+ restartPolicy: Always
+ terminationGracePeriodSeconds: 30
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+ name: {{pvcName}}
+ namespace: dexorder-agents
+ labels:
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/license-tier: pro
+spec:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 10Gi
+ storageClassName: {{storageClass}}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{serviceName}}
+ namespace: dexorder-agents
+ labels:
+ dexorder.io/user-id: {{userId}}
+ dexorder.io/license-tier: pro
+spec:
+ type: ClusterIP
+ selector:
+ dexorder.io/user-id: {{userId}}
+ ports:
+ - name: mcp
+ port: 3000
+ targetPort: mcp
+ protocol: TCP
+ - name: zmq-control
+ port: 5555
+ targetPort: zmq-control
+ protocol: TCP
diff --git a/gateway/src/llm/provider.ts b/gateway/src/llm/provider.ts
new file mode 100644
index 0000000..efdb9df
--- /dev/null
+++ b/gateway/src/llm/provider.ts
@@ -0,0 +1,216 @@
+import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
+import { ChatAnthropic } from '@langchain/anthropic';
+import { ChatOpenAI } from '@langchain/openai';
+import { ChatGoogleGenerativeAI } from '@langchain/google-genai';
+import { ChatOpenAI as ChatOpenRouter } from '@langchain/openai'; // no '@langchain/openrouter' package exists; OpenRouter is OpenAI-API-compatible
+import type { FastifyBaseLogger } from 'fastify';
+
+/**
+ * Supported LLM providers
+ */
+export enum LLMProvider {
+ ANTHROPIC = 'anthropic',
+ OPENAI = 'openai',
+ GOOGLE = 'google',
+ OPENROUTER = 'openrouter',
+}
+
+/**
+ * Model configuration
+ */
+export interface ModelConfig {
+ provider: LLMProvider;
+ model: string;
+ temperature?: number;
+ maxTokens?: number;
+}
+
+/**
+ * Provider configuration with API keys
+ */
+export interface ProviderConfig {
+ anthropicApiKey?: string;
+ openaiApiKey?: string;
+ googleApiKey?: string;
+ openrouterApiKey?: string;
+}
+
+/**
+ * LLM Provider factory
+ * Creates model instances with unified interface across providers
+ */
+export class LLMProviderFactory {
+ private config: ProviderConfig;
+ private logger: FastifyBaseLogger;
+
+ constructor(config: ProviderConfig, logger: FastifyBaseLogger) {
+ this.config = config;
+ this.logger = logger;
+ }
+
+ /**
+ * Create a chat model instance
+ */
+ createModel(modelConfig: ModelConfig): BaseChatModel {
+ this.logger.debug(
+ { provider: modelConfig.provider, model: modelConfig.model },
+ 'Creating LLM model'
+ );
+
+ switch (modelConfig.provider) {
+ case LLMProvider.ANTHROPIC:
+ return this.createAnthropicModel(modelConfig);
+
+ case LLMProvider.OPENAI:
+ return this.createOpenAIModel(modelConfig);
+
+ case LLMProvider.GOOGLE:
+ return this.createGoogleModel(modelConfig);
+
+ case LLMProvider.OPENROUTER:
+ return this.createOpenRouterModel(modelConfig);
+
+ default:
+ throw new Error(`Unsupported provider: ${modelConfig.provider}`);
+ }
+ }
+
+ /**
+ * Create Anthropic Claude model
+ */
+ private createAnthropicModel(config: ModelConfig): ChatAnthropic {
+ if (!this.config.anthropicApiKey) {
+ throw new Error('Anthropic API key not configured');
+ }
+
+ return new ChatAnthropic({
+ model: config.model,
+ temperature: config.temperature ?? 0.7,
+ maxTokens: config.maxTokens ?? 4096,
+ anthropicApiKey: this.config.anthropicApiKey,
+ });
+ }
+
+ /**
+ * Create OpenAI GPT model
+ */
+ private createOpenAIModel(config: ModelConfig): ChatOpenAI {
+ if (!this.config.openaiApiKey) {
+ throw new Error('OpenAI API key not configured');
+ }
+
+ return new ChatOpenAI({
+ model: config.model,
+ temperature: config.temperature ?? 0.7,
+ maxTokens: config.maxTokens ?? 4096,
+ openAIApiKey: this.config.openaiApiKey,
+ });
+ }
+
+ /**
+ * Create Google Gemini model
+ */
+ private createGoogleModel(config: ModelConfig): ChatGoogleGenerativeAI {
+ if (!this.config.googleApiKey) {
+ throw new Error('Google API key not configured');
+ }
+
+ return new ChatGoogleGenerativeAI({
+ model: config.model,
+ temperature: config.temperature ?? 0.7,
+ maxOutputTokens: config.maxTokens ?? 4096,
+ apiKey: this.config.googleApiKey,
+ });
+ }
+
+ /**
+ * Create OpenRouter model (access to 300+ models)
+ */
+  private createOpenRouterModel(config: ModelConfig): ChatOpenAI {
+    if (!this.config.openrouterApiKey) {
+      throw new Error('OpenRouter API key not configured');
+    }
+    // OpenRouter exposes an OpenAI-compatible API; use ChatOpenAI pointed at its baseURL.
+    return new ChatOpenAI({
+      model: config.model,
+      temperature: config.temperature ?? 0.7,
+      maxTokens: config.maxTokens ?? 4096,
+      apiKey: this.config.openrouterApiKey, configuration: { baseURL: 'https://openrouter.ai/api/v1' },
+    });
+  }
+
+ /**
+ * Get default model based on environment
+ */
+ getDefaultModel(): ModelConfig {
+ // Check which API keys are available
+ if (this.config.anthropicApiKey) {
+ return {
+ provider: LLMProvider.ANTHROPIC,
+ model: 'claude-3-5-sonnet-20241022',
+ };
+ }
+
+ if (this.config.openaiApiKey) {
+ return {
+ provider: LLMProvider.OPENAI,
+ model: 'gpt-4o',
+ };
+ }
+
+ if (this.config.googleApiKey) {
+ return {
+ provider: LLMProvider.GOOGLE,
+ model: 'gemini-2.0-flash-exp',
+ };
+ }
+
+ if (this.config.openrouterApiKey) {
+ return {
+ provider: LLMProvider.OPENROUTER,
+ model: 'anthropic/claude-3.5-sonnet',
+ };
+ }
+
+ throw new Error('No LLM API keys configured');
+ }
+}
+
+/**
+ * Predefined model configurations
+ */
+export const MODELS = {
+ // Anthropic
+ CLAUDE_SONNET: {
+ provider: LLMProvider.ANTHROPIC,
+ model: 'claude-3-5-sonnet-20241022',
+ },
+ CLAUDE_HAIKU: {
+ provider: LLMProvider.ANTHROPIC,
+ model: 'claude-3-5-haiku-20241022',
+ },
+ CLAUDE_OPUS: {
+ provider: LLMProvider.ANTHROPIC,
+ model: 'claude-3-opus-20240229',
+ },
+
+ // OpenAI
+ GPT4O: {
+ provider: LLMProvider.OPENAI,
+ model: 'gpt-4o',
+ },
+ GPT4O_MINI: {
+ provider: LLMProvider.OPENAI,
+ model: 'gpt-4o-mini',
+ },
+
+ // Google
+ GEMINI_2_FLASH: {
+ provider: LLMProvider.GOOGLE,
+ model: 'gemini-2.0-flash-exp',
+ },
+ GEMINI_PRO: {
+ provider: LLMProvider.GOOGLE,
+ model: 'gemini-1.5-pro',
+ },
+} as const satisfies Record<string, ModelConfig>;
diff --git a/gateway/src/llm/router.ts b/gateway/src/llm/router.ts
new file mode 100644
index 0000000..ef529bb
--- /dev/null
+++ b/gateway/src/llm/router.ts
@@ -0,0 +1,202 @@
+import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
+import type { FastifyBaseLogger } from 'fastify';
+import { LLMProviderFactory, type ModelConfig, LLMProvider } from './provider.js';
+import type { UserLicense } from '../types/user.js';
+
+/**
+ * Model routing strategies
+ */
+export enum RoutingStrategy {
+ /** Use user's preferred model from license */
+ USER_PREFERENCE = 'user_preference',
+ /** Route based on query complexity */
+ COMPLEXITY = 'complexity',
+ /** Route based on license tier */
+ LICENSE_TIER = 'license_tier',
+ /** Use cheapest available model */
+ COST_OPTIMIZED = 'cost_optimized',
+}
+
+/**
+ * Model router
+ * Intelligently selects which model to use based on various factors
+ */
+export class ModelRouter {
+ private factory: LLMProviderFactory;
+ private logger: FastifyBaseLogger;
+ private defaultModel: ModelConfig;
+
+ constructor(factory: LLMProviderFactory, logger: FastifyBaseLogger) {
+ this.factory = factory;
+ this.logger = logger;
+ this.defaultModel = factory.getDefaultModel();
+ }
+
+ /**
+ * Route to appropriate model based on context
+ */
+ async route(
+ message: string,
+ license: UserLicense,
+ strategy: RoutingStrategy = RoutingStrategy.USER_PREFERENCE
+  ): Promise<BaseChatModel> {
+ let modelConfig: ModelConfig;
+
+ switch (strategy) {
+ case RoutingStrategy.USER_PREFERENCE:
+ modelConfig = this.routeByUserPreference(license);
+ break;
+
+ case RoutingStrategy.COMPLEXITY:
+ modelConfig = this.routeByComplexity(message, license);
+ break;
+
+ case RoutingStrategy.LICENSE_TIER:
+ modelConfig = this.routeByLicenseTier(license);
+ break;
+
+ case RoutingStrategy.COST_OPTIMIZED:
+ modelConfig = this.routeByCost(license);
+ break;
+
+ default:
+ modelConfig = this.defaultModel;
+ }
+
+ this.logger.info(
+ {
+ userId: license.userId,
+ strategy,
+ provider: modelConfig.provider,
+ model: modelConfig.model,
+ },
+ 'Routing to model'
+ );
+
+ return this.factory.createModel(modelConfig);
+ }
+
+ /**
+ * Route based on user's preferred model (if set in license)
+ */
+ private routeByUserPreference(license: UserLicense): ModelConfig {
+ // Check if user has custom model preference
+ const preferredModel = (license as any).preferredModel as ModelConfig | undefined;
+
+ if (preferredModel && this.isModelAllowed(preferredModel, license)) {
+ return preferredModel;
+ }
+
+ // Fall back to license tier default
+ return this.routeByLicenseTier(license);
+ }
+
+ /**
+ * Route based on query complexity
+ */
+ private routeByComplexity(message: string, license: UserLicense): ModelConfig {
+ const isComplex = this.isComplexQuery(message);
+
+ if (license.licenseType === 'enterprise') {
+ // Enterprise users get best models for complex queries
+ return isComplex
+ ? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-opus-20240229' }
+ : { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
+ }
+
+ if (license.licenseType === 'pro') {
+ // Pro users get good models
+ return isComplex
+ ? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' }
+ : { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
+ }
+
+ // Free users get efficient models
+ return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
+ }
+
+ /**
+ * Route based on license tier
+ */
+ private routeByLicenseTier(license: UserLicense): ModelConfig {
+ switch (license.licenseType) {
+ case 'enterprise':
+ return { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
+
+ case 'pro':
+ return { provider: LLMProvider.OPENAI, model: 'gpt-4o' };
+
+ case 'free':
+ return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
+
+ default:
+ return this.defaultModel;
+ }
+ }
+
+ /**
+ * Route to cheapest available model
+ */
+ private routeByCost(license: UserLicense): ModelConfig {
+ // Free tier: use cheapest
+ if (license.licenseType === 'free') {
+ return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
+ }
+
+ // Paid tiers: use GPT-4o-mini for cost efficiency
+ return { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
+ }
+
+ /**
+ * Check if model is allowed for user's license
+ */
+ private isModelAllowed(model: ModelConfig, license: UserLicense): boolean {
+ // Free tier: only cheap models
+ if (license.licenseType === 'free') {
+ const allowedModels = ['gemini-2.0-flash-exp', 'gpt-4o-mini', 'claude-3-5-haiku-20241022'];
+ return allowedModels.includes(model.model);
+ }
+
+ // Pro: all except Opus
+ if (license.licenseType === 'pro') {
+ const blockedModels = ['claude-3-opus-20240229'];
+ return !blockedModels.includes(model.model);
+ }
+
+ // Enterprise: all models allowed
+ return true;
+ }
+
+ /**
+ * Determine if query is complex
+ */
+ private isComplexQuery(message: string): boolean {
+ const complexityIndicators = [
+ // Multi-step analysis
+ 'backtest',
+ 'analyze',
+ 'compare',
+ 'optimize',
+
+ // Code generation
+ 'write',
+ 'create',
+ 'implement',
+ 'build',
+
+ // Deep reasoning
+ 'explain why',
+ 'what if',
+ 'how would',
+
+ // Long messages (> 200 chars likely complex)
+ message.length > 200,
+ ];
+
+ const messageLower = message.toLowerCase();
+
+ return complexityIndicators.some((indicator) =>
+ typeof indicator === 'string' ? messageLower.includes(indicator) : indicator
+ );
+ }
+}
diff --git a/gateway/src/main.ts b/gateway/src/main.ts
new file mode 100644
index 0000000..057478b
--- /dev/null
+++ b/gateway/src/main.ts
@@ -0,0 +1,154 @@
+import Fastify from 'fastify';
+import websocket from '@fastify/websocket';
+import cors from '@fastify/cors';
+import { UserService } from './db/user-service.js';
+import { Authenticator } from './auth/authenticator.js';
+import { WebSocketHandler } from './channels/websocket-handler.js';
+import { TelegramHandler } from './channels/telegram-handler.js';
+import { KubernetesClient } from './k8s/client.js';
+import { ContainerManager } from './k8s/container-manager.js';
+
+const app = Fastify({
+ logger: {
+ level: process.env.LOG_LEVEL || 'info',
+ transport: {
+ target: 'pino-pretty',
+ options: {
+ colorize: true,
+ translateTime: 'HH:MM:ss Z',
+ ignore: 'pid,hostname',
+ },
+ },
+ },
+});
+
+// Configuration from environment
+const config = {
+ port: parseInt(process.env.PORT || '3000'),
+ host: process.env.HOST || '0.0.0.0',
+ databaseUrl: process.env.DATABASE_URL || 'postgresql://localhost/dexorder',
+
+ // LLM provider API keys
+ providerConfig: {
+ anthropicApiKey: process.env.ANTHROPIC_API_KEY,
+ openaiApiKey: process.env.OPENAI_API_KEY,
+ googleApiKey: process.env.GOOGLE_API_KEY,
+ openrouterApiKey: process.env.OPENROUTER_API_KEY,
+ },
+
+ telegramBotToken: process.env.TELEGRAM_BOT_TOKEN || '',
+
+ // Kubernetes configuration
+ kubernetes: {
+ namespace: process.env.KUBERNETES_NAMESPACE || 'dexorder-agents',
+ inCluster: process.env.KUBERNETES_IN_CLUSTER === 'true',
+ context: process.env.KUBERNETES_CONTEXT,
+ agentImage: process.env.AGENT_IMAGE || 'ghcr.io/dexorder/agent:latest',
+ sidecarImage: process.env.SIDECAR_IMAGE || 'ghcr.io/dexorder/lifecycle-sidecar:latest',
+ storageClass: process.env.AGENT_STORAGE_CLASS || 'standard',
+ },
+};
+
+// Validate at least one LLM provider is configured
+const hasAnyProvider = Object.values(config.providerConfig).some(key => !!key);
+if (!hasAnyProvider) {
+ app.log.error('At least one LLM provider API key is required (ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY, or OPENROUTER_API_KEY)');
+ process.exit(1);
+}
+
+// Register plugins
+await app.register(cors, {
+ origin: process.env.CORS_ORIGIN || '*',
+});
+
+await app.register(websocket, {
+ options: {
+ maxPayload: 1024 * 1024, // 1MB
+ },
+});
+
+// Initialize services
+const userService = new UserService(config.databaseUrl);
+
+// Initialize Kubernetes client and container manager
+const k8sClient = new KubernetesClient({
+ namespace: config.kubernetes.namespace,
+ inCluster: config.kubernetes.inCluster,
+ context: config.kubernetes.context,
+ logger: app.log,
+});
+
+const containerManager = new ContainerManager({
+ k8sClient,
+ agentImage: config.kubernetes.agentImage,
+ sidecarImage: config.kubernetes.sidecarImage,
+ storageClass: config.kubernetes.storageClass,
+ namespace: config.kubernetes.namespace,
+ logger: app.log,
+});
+
+const authenticator = new Authenticator({
+ userService,
+ containerManager,
+ logger: app.log,
+});
+
+// Initialize channel handlers
+const websocketHandler = new WebSocketHandler({
+ authenticator,
+ providerConfig: config.providerConfig,
+});
+
+const telegramHandler = new TelegramHandler({
+ authenticator,
+ providerConfig: config.providerConfig,
+ telegramBotToken: config.telegramBotToken,
+});
+
+// Register routes
+websocketHandler.register(app);
+telegramHandler.register(app);
+
+// Health check
+app.get('/health', async () => {
+ return {
+ status: 'ok',
+ timestamp: new Date().toISOString(),
+ };
+});
+
+// Graceful shutdown
+const shutdown = async () => {
+ app.log.info('Shutting down gracefully...');
+ try {
+ await userService.close();
+ await app.close();
+ app.log.info('Shutdown complete');
+ process.exit(0);
+ } catch (error) {
+ app.log.error({ error }, 'Error during shutdown');
+ process.exit(1);
+ }
+};
+
+process.on('SIGTERM', shutdown);
+process.on('SIGINT', shutdown);
+
+// Start server
+try {
+ await app.listen({
+ port: config.port,
+ host: config.host,
+ });
+
+ app.log.info(
+ {
+ port: config.port,
+ host: config.host,
+ },
+ 'Gateway server started'
+ );
+} catch (error) {
+ app.log.error({ error }, 'Failed to start server');
+ process.exit(1);
+}
diff --git a/gateway/src/types/messages.ts b/gateway/src/types/messages.ts
new file mode 100644
index 0000000..97642fe
--- /dev/null
+++ b/gateway/src/types/messages.ts
@@ -0,0 +1,37 @@
+import { z } from 'zod';
+
+/**
+ * Inbound user message from any channel
+ */
+export const InboundMessageSchema = z.object({
+ messageId: z.string(),
+ userId: z.string(),
+ sessionId: z.string(),
+ content: z.string(),
+ attachments: z.array(z.object({
+ type: z.enum(['image', 'file', 'url']),
+ url: z.string(),
+ mimeType: z.string().optional(),
+ })).optional(),
+ timestamp: z.date(),
+});
+
+export type InboundMessage = z.infer<typeof InboundMessageSchema>;
+
+/**
+ * Outbound response to channel
+ */
+export const OutboundMessageSchema = z.object({
+ messageId: z.string(),
+ sessionId: z.string(),
+ content: z.string(),
+ attachments: z.array(z.object({
+ type: z.enum(['image', 'chart', 'file']),
+ url: z.string(),
+ caption: z.string().optional(),
+ })).optional(),
+ metadata: z.record(z.unknown()).optional(),
+ timestamp: z.date(),
+});
+
+export type OutboundMessage = z.infer<typeof OutboundMessageSchema>;
diff --git a/gateway/src/types/resources.ts b/gateway/src/types/resources.ts
new file mode 100644
index 0000000..516a7de
--- /dev/null
+++ b/gateway/src/types/resources.ts
@@ -0,0 +1,101 @@
+import { z } from 'zod';
+
+/**
+ * MCP Resource types for user context
+ */
+
+/**
+ * Base resource structure from MCP server
+ */
+export const MCPResourceSchema = z.object({
+ uri: z.string(),
+ mimeType: z.string().optional(),
+ text: z.string().optional(),
+ blob: z.string().optional(), // base64 encoded
+});
+
+export type MCPResource = z.infer<typeof MCPResourceSchema>;
+
+/**
+ * User profile context
+ */
+export const UserProfileContextSchema = z.object({
+ tradingExperience: z.enum(['beginner', 'intermediate', 'advanced', 'professional']),
+ preferredTimeframes: z.array(z.string()),
+ riskTolerance: z.enum(['low', 'medium', 'high']),
+ tradingStyle: z.string(),
+ favoriteIndicators: z.array(z.string()).optional(),
+ activeTradingPairs: z.array(z.string()).optional(),
+ notes: z.string().optional(),
+});
+
+export type UserProfileContext = z.infer<typeof UserProfileContextSchema>;
+
+/**
+ * Workspace state (current chart, positions, etc.)
+ */
+export const WorkspaceStateSchema = z.object({
+ currentChart: z.object({
+ ticker: z.string(),
+ timeframe: z.string(),
+ indicators: z.array(z.string()).optional(),
+ }).optional(),
+ watchlist: z.array(z.string()),
+ openPositions: z.array(z.object({
+ ticker: z.string(),
+ side: z.enum(['long', 'short']),
+ size: z.number(),
+ entryPrice: z.number(),
+ currentPrice: z.number().optional(),
+ unrealizedPnL: z.number().optional(),
+ })),
+ recentAlerts: z.array(z.object({
+ type: z.string(),
+ message: z.string(),
+ timestamp: z.string(),
+ })).optional(),
+});
+
+export type WorkspaceState = z.infer<typeof WorkspaceStateSchema>;
+
+/**
+ * Standard context resource URIs
+ */
+export const CONTEXT_URIS = {
+ USER_PROFILE: 'context://user-profile',
+ CONVERSATION_SUMMARY: 'context://conversation-summary',
+ WORKSPACE_STATE: 'context://workspace-state',
+ SYSTEM_PROMPT: 'context://system-prompt',
+} as const;
+
+/**
+ * Resource content interface
+ */
+export interface ResourceContent {
+ uri: string;
+ mimeType?: string;
+ text?: string;
+ blob?: string;
+}
+
+/**
+ * Helper to parse resource content
+ */
+export function parseResource<T>(resource: ResourceContent, schema: z.ZodSchema<T>): T | null {
+ if (!resource.text) {
+ return null;
+ }
+
+ try {
+ // Try JSON parsing if mime type is JSON
+ if (resource.mimeType?.includes('json')) {
+ const data = JSON.parse(resource.text);
+ return schema.parse(data);
+ }
+
+ // Otherwise return as-is for text resources
+ return resource.text as T;
+ } catch {
+ return null;
+ }
+}
diff --git a/gateway/src/types/user.ts b/gateway/src/types/user.ts
new file mode 100644
index 0000000..13e0764
--- /dev/null
+++ b/gateway/src/types/user.ts
@@ -0,0 +1,66 @@
+import { z } from 'zod';
+
+/**
+ * Model preference configuration
+ */
+export const ModelPreferenceSchema = z.object({
+ provider: z.enum(['anthropic', 'openai', 'google', 'openrouter']),
+ model: z.string(),
+ temperature: z.number().optional(),
+});
+
+export type ModelPreference = z.infer<typeof ModelPreferenceSchema>;
+
+/**
+ * User license and feature authorization
+ */
+export const UserLicenseSchema = z.object({
+ userId: z.string(),
+ email: z.string().email().optional(),
+ licenseType: z.enum(['free', 'pro', 'enterprise']),
+ features: z.object({
+ maxIndicators: z.number(),
+ maxStrategies: z.number(),
+ maxBacktestDays: z.number(),
+ realtimeData: z.boolean(),
+ customExecutors: z.boolean(),
+ apiAccess: z.boolean(),
+ }),
+ resourceLimits: z.object({
+ maxConcurrentSessions: z.number(),
+ maxMessagesPerDay: z.number(),
+ maxTokensPerMessage: z.number(),
+ rateLimitPerMinute: z.number(),
+ }),
+ mcpServerUrl: z.string().url(),
+ preferredModel: ModelPreferenceSchema.optional(),
+ expiresAt: z.date().optional(),
+ createdAt: z.date(),
+ updatedAt: z.date(),
+});
+
+export type UserLicense = z.infer<typeof UserLicenseSchema>;
+
+/**
+ * Channel types for multi-channel support
+ */
+export enum ChannelType {
+ WEBSOCKET = 'websocket',
+ TELEGRAM = 'telegram',
+ SLACK = 'slack',
+ DISCORD = 'discord',
+}
+
+/**
+ * Authentication context per channel
+ */
+export const AuthContextSchema = z.object({
+ userId: z.string(),
+ channelType: z.nativeEnum(ChannelType),
+ channelUserId: z.string(), // Platform-specific ID (telegram_id, discord_id, etc)
+ sessionId: z.string(),
+ license: UserLicenseSchema,
+ authenticatedAt: z.date(),
+});
+
+export type AuthContext = z.infer<typeof AuthContextSchema>;
diff --git a/gateway/src/workflows/README.md b/gateway/src/workflows/README.md
new file mode 100644
index 0000000..d40484a
--- /dev/null
+++ b/gateway/src/workflows/README.md
@@ -0,0 +1,253 @@
+# LangGraph Workflows for Trading
+
+Complex, stateful workflows built with LangGraph for trading-specific tasks.
+
+## Overview
+
+LangGraph provides:
+- **Stateful execution**: Workflow state persists across failures
+- **Conditional branching**: Route based on market conditions, backtest results, etc.
+- **Human-in-the-loop**: Pause for user approval before executing trades
+- **Loops & retries**: Backtest with different parameters, retry failed operations
+- **Multi-agent**: Different LLMs for different tasks (analysis, risk, execution)
+
+## Workflows
+
+### Strategy Analysis (`strategy-analysis.ts`)
+
+Multi-step pipeline for analyzing trading strategies:
+
+```typescript
+import { buildStrategyAnalysisWorkflow } from './workflows/strategy-analysis.js';
+
+const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn);
+
+const result = await workflow.invoke({
+ strategyCode: userStrategy,
+ ticker: 'BTC/USDT',
+ timeframe: '1h',
+});
+
+console.log(result.recommendation); // Go/no-go decision
+```
+
+**Steps:**
+1. **Code Review** - LLM analyzes strategy code for bugs, logic errors
+2. **Backtest** - Runs backtest via user's MCP server
+3. **Risk Assessment** - LLM evaluates results (drawdown, Sharpe, etc.)
+4. **Human Approval** - Pauses for user review
+5. **Recommendation** - Final go/no-go decision
+
+**Benefits:**
+- Stateful: Can resume if server restarts
+- Human-in-the-loop: User must approve before deployment
+- Multi-step reasoning: Each step builds on previous
+
+---
+
+## Future Workflows
+
+### Market Scanner
+
+Scan multiple tickers for trading opportunities:
+
+```typescript
+const scanner = buildMarketScannerWorkflow(model, logger);
+
+const result = await scanner.invoke({
+ tickers: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'],
+ strategies: ['momentum', 'mean_reversion'],
+ timeframe: '1h',
+});
+
+// Returns ranked opportunities
+```
+
+**Steps:**
+1. **Fetch Data** - Get OHLC for all tickers
+2. **Apply Strategies** - Run each strategy on each ticker (parallel)
+3. **Rank Signals** - Score by confidence, risk/reward
+4. **Filter** - Apply user's risk limits
+5. **Return Top N** - Best opportunities
+
+---
+
+### Portfolio Optimization
+
+Optimize position sizing across multiple strategies:
+
+```typescript
+const optimizer = buildPortfolioOptimizerWorkflow(model, logger);
+
+const result = await optimizer.invoke({
+ strategies: [strategy1, strategy2, strategy3],
+ totalCapital: 100000,
+ maxRiskPerTrade: 0.02,
+});
+
+// Returns optimal allocation
+```
+
+**Steps:**
+1. **Backtest All** - Run backtests for each strategy
+2. **Correlation Analysis** - Check strategy correlation
+3. **Monte Carlo** - Simulate portfolio performance
+4. **Optimize** - Find optimal weights (Sharpe maximization)
+5. **Risk Check** - Validate against user limits
+
+---
+
+### Trade Execution Monitor
+
+Monitor trade execution and adapt to market conditions:
+
+```typescript
+const monitor = buildTradeExecutionWorkflow(model, logger, exchange);
+
+const result = await monitor.invoke({
+ tradeId: 'xyz',
+ targetPrice: 45000,
+ maxSlippage: 0.001,
+ timeLimit: 60, // seconds
+});
+```
+
+**Steps:**
+1. **Place Order** - Submit order to exchange
+2. **Monitor Fill** - Check fill status every second
+3. **Adapt** - If not filling, adjust price (within slippage)
+4. **Retry Logic** - If rejected, retry with backoff
+5. **Timeout** - Cancel if time limit exceeded
+6. **Report** - Final execution report
+
+---
+
+## Using Workflows in Gateway
+
+### Simple Chat vs Complex Workflow
+
+```typescript
+// gateway/src/orchestrator.ts
+
+export class MessageOrchestrator {
+ async handleMessage(msg: InboundMessage) {
+ // Route based on complexity
+ if (this.isSimpleQuery(msg)) {
+ // Use agent harness for streaming chat
+ return this.harness.streamMessage(msg);
+ }
+
+ if (this.isWorkflowRequest(msg)) {
+ // Use LangGraph for complex analysis
+ return this.executeWorkflow(msg);
+ }
+ }
+
+ async executeWorkflow(msg: InboundMessage) {
+ const { type, params } = this.parseWorkflowRequest(msg);
+
+ switch (type) {
+ case 'analyze_strategy':
+ const workflow = buildStrategyAnalysisWorkflow(...);
+ return await workflow.invoke(params);
+
+ case 'scan_market':
+ const scanner = buildMarketScannerWorkflow(...);
+ return await scanner.invoke(params);
+
+ // ... more workflows
+ }
+ }
+}
+```
+
+---
+
+## Benefits for Trading
+
+### vs Simple LLM Calls
+
+| Scenario | Simple LLM | LangGraph Workflow |
+|----------|-----------|-------------------|
+| "What's the RSI?" | ✅ Fast, streaming | ❌ Overkill |
+| "Analyze this strategy" | ❌ Limited context | ✅ Multi-step analysis |
+| "Backtest 10 param combos" | ❌ No loops | ✅ Conditional loops |
+| "Execute if approved" | ❌ No state | ✅ Human-in-the-loop |
+| Server crashes mid-analysis | ❌ Lost progress | ✅ Resume from checkpoint |
+
+### When to Use Workflows
+
+**Use LangGraph when:**
+- Multi-step analysis (backtest → risk → approval)
+- Conditional logic (if bullish → momentum, else → mean-reversion)
+- Human approval required (pause workflow)
+- Loops needed (try different parameters)
+- Long-running (can survive restarts)
+
+**Use Agent Harness when:**
+- Simple Q&A ("What is RSI?")
+- Fast response needed (streaming chat)
+- Single tool call ("Get my watchlist")
+- Real-time interaction (Telegram, WebSocket)
+
+---
+
+## Implementation Notes
+
+### State Persistence
+
+LangGraph can persist state to database:
+
+```typescript
+import { MemorySaver } from '@langchain/langgraph';
+
+const checkpointer = new MemorySaver();
+
+const workflow = graph.compile({ checkpointer });
+
+// Resume from checkpoint
+const result = await workflow.invoke(input, {
+ configurable: { thread_id: 'user-123-strategy-analysis' }
+});
+```
+
+### Human-in-the-Loop
+
+Pause workflow for user input:
+
+```typescript
+const workflow = graph
+ .addNode('human_approval', humanApprovalNode)
+ .interrupt('human_approval'); // Pauses here
+
+// User reviews in UI
+const approved = await getUserApproval(workflowId);
+
+// Resume workflow
+await workflow.resume(state, { approved });
+```
+
+### Multi-Agent
+
+Use different models for different tasks:
+
+```typescript
+const analysisModel = new ChatAnthropic({ model: 'claude-3-opus' }); // Smart
+const codeModel = new ChatOpenAI({ model: 'gpt-4o' }); // Good at code
+const cheapModel = new ChatOpenAI({ model: 'gpt-4o-mini' }); // Fast
+
+const workflow = graph
+ .addNode('analyze', (state) => analysisModel.invoke(...))
+ .addNode('code_review', (state) => codeModel.invoke(...))
+ .addNode('summarize', (state) => cheapModel.invoke(...));
+```
+
+---
+
+## Next Steps
+
+1. Implement remaining workflows (scanner, optimizer, execution)
+2. Add state persistence (PostgreSQL checkpointer)
+3. Integrate human-in-the-loop with WebSocket
+4. Add workflow monitoring dashboard
+5. Performance optimization (parallel execution)
diff --git a/gateway/src/workflows/strategy-analysis.ts b/gateway/src/workflows/strategy-analysis.ts
new file mode 100644
index 0000000..b4925f0
--- /dev/null
+++ b/gateway/src/workflows/strategy-analysis.ts
@@ -0,0 +1,162 @@
+import { StateGraph, Annotation } from '@langchain/langgraph';
+import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
+import { HumanMessage, SystemMessage } from '@langchain/core/messages';
+import type { FastifyBaseLogger } from 'fastify';
+
+/**
+ * State for strategy analysis workflow
+ */
+const StrategyAnalysisState = Annotation.Root({
+ strategyCode: Annotation<string>(),
+ ticker: Annotation<string>(),
+ timeframe: Annotation<string>(),
+
+ // Analysis steps
+ codeReview: Annotation<string | null>({
+ default: () => null,
+ }),
+ backtestResults: Annotation<Record<string, unknown> | null>({
+ default: () => null,
+ }),
+ riskAssessment: Annotation<string | null>({
+ default: () => null,
+ }),
+ humanApproved: Annotation<boolean>({
+ default: () => false,
+ }),
+
+ // Final output
+ recommendation: Annotation<string | null>({
+ default: () => null,
+ }),
+});
+
+type StrategyAnalysisStateType = typeof StrategyAnalysisState.State;
+
+/**
+ * Build strategy analysis workflow using LangGraph
+ *
+ * Workflow steps:
+ * 1. Code review (LLM analyzes strategy code)
+ * 2. Backtest (calls user's MCP backtest tool)
+ * 3. Risk assessment (LLM evaluates results)
+ * 4. Human approval (pause for user review)
+ * 5. Final recommendation
+ */
+export function buildStrategyAnalysisWorkflow(
+ model: BaseChatModel,
+ logger: FastifyBaseLogger,
+ mcpBacktestFn: (strategy: string, ticker: string, timeframe: string) => Promise<Record<string, unknown>>
+) {
+ // Node: Code Review
+ const codeReviewNode = async (state: StrategyAnalysisStateType) => {
+ logger.info('Strategy workflow: Code review');
+
+ const systemPrompt = `You are an expert trading strategy analyst.
+Review the following strategy code for potential issues, bugs, or improvements.
+Focus on: logic errors, edge cases, performance, and trading best practices.`;
+
+ const response = await model.invoke([
+ new SystemMessage(systemPrompt),
+ new HumanMessage(`Review this strategy:\n\n${state.strategyCode}`),
+ ]);
+
+ return {
+ codeReview: response.content as string,
+ };
+ };
+
+ // Node: Backtest
+ const backtestNode = async (state: StrategyAnalysisStateType) => {
+ logger.info('Strategy workflow: Running backtest');
+
+ const results = await mcpBacktestFn(state.strategyCode, state.ticker, state.timeframe);
+
+ return {
+ backtestResults: results,
+ };
+ };
+
+ // Node: Risk Assessment
+ const riskAssessmentNode = async (state: StrategyAnalysisStateType) => {
+ logger.info('Strategy workflow: Risk assessment');
+
+ const systemPrompt = `You are a risk management expert for trading strategies.
+Analyze the backtest results and provide a risk assessment.
+Focus on: drawdown, win rate, Sharpe ratio, position sizing, and risk of ruin.`;
+
+ const response = await model.invoke([
+ new SystemMessage(systemPrompt),
+ new HumanMessage(
+ `Code review: ${state.codeReview}\n\nBacktest results: ${JSON.stringify(state.backtestResults, null, 2)}\n\nProvide risk assessment:`
+ ),
+ ]);
+
+ return {
+ riskAssessment: response.content as string,
+ };
+ };
+
+ // Node: Human Approval (placeholder - would integrate with UI)
+ const humanApprovalNode = async (state: StrategyAnalysisStateType) => {
+ logger.info('Strategy workflow: Awaiting human approval');
+
+ // In real implementation, this would pause and wait for user input
+ // For now, auto-approve
+ return {
+ humanApproved: true,
+ };
+ };
+
+ // Node: Final Recommendation
+ const recommendationNode = async (state: StrategyAnalysisStateType) => {
+ logger.info('Strategy workflow: Generating recommendation');
+
+ const systemPrompt = `Provide a final recommendation on whether to deploy this trading strategy.
+Summarize the code review, backtest results, and risk assessment.
+Give clear go/no-go decision with reasoning.`;
+
+ const response = await model.invoke([
+ new SystemMessage(systemPrompt),
+ new HumanMessage(
+ `Code review: ${state.codeReview}\n\nBacktest: ${JSON.stringify(state.backtestResults)}\n\nRisk: ${state.riskAssessment}\n\nApproved: ${state.humanApproved}\n\nYour recommendation:`
+ ),
+ ]);
+
+ return {
+ recommendation: response.content as string,
+ };
+ };
+
+ // Build graph
+ const workflow = new StateGraph(StrategyAnalysisState)
+ .addNode('code_review', codeReviewNode)
+ .addNode('backtest', backtestNode)
+ .addNode('risk_assessment', riskAssessmentNode)
+ .addNode('human_approval', humanApprovalNode)
+ .addNode('recommendation', recommendationNode)
+ .addEdge('__start__', 'code_review')
+ .addEdge('code_review', 'backtest')
+ .addEdge('backtest', 'risk_assessment')
+ .addEdge('risk_assessment', 'human_approval')
+ .addConditionalEdges('human_approval', (state) => {
+ return state.humanApproved ? 'recommendation' : '__end__';
+ })
+ .addEdge('recommendation', '__end__');
+
+ return workflow.compile();
+}
+
+/**
+ * Example usage:
+ *
+ * const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn);
+ *
+ * const result = await workflow.invoke({
+ * strategyCode: "strategy code here",
+ * ticker: "BTC/USDT",
+ * timeframe: "1h",
+ * });
+ *
+ * console.log(result.recommendation);
+ */
diff --git a/gateway/tsconfig.json b/gateway/tsconfig.json
new file mode 100644
index 0000000..4f09770
--- /dev/null
+++ b/gateway/tsconfig.json
@@ -0,0 +1,26 @@
+{
+ "compilerOptions": {
+ "target": "ES2022",
+ "module": "ESNext",
+ "lib": ["ES2022"],
+ "moduleResolution": "bundler",
+ "resolveJsonModule": true,
+ "allowJs": false,
+ "outDir": "./dist",
+ "rootDir": "./src",
+ "strict": true,
+ "esModuleInterop": true,
+ "skipLibCheck": true,
+ "forceConsistentCasingInFileNames": true,
+ "declaration": true,
+ "declarationMap": true,
+ "sourceMap": true,
+ "noUnusedLocals": true,
+ "noUnusedParameters": true,
+ "noImplicitReturns": true,
+ "noFallthroughCasesInSwitch": true,
+ "allowSyntheticDefaultImports": true
+ },
+ "include": ["src/**/*"],
+ "exclude": ["node_modules", "dist"]
+}
diff --git a/lifecycle-sidecar/.gitignore b/lifecycle-sidecar/.gitignore
new file mode 100644
index 0000000..a3d200d
--- /dev/null
+++ b/lifecycle-sidecar/.gitignore
@@ -0,0 +1,15 @@
+# Binaries
+lifecycle-sidecar
+*.exe
+*.dll
+*.so
+*.dylib
+
+# Test binary
+*.test
+
+# Go workspace file
+go.work
+
+# Build output
+dist/
diff --git a/lifecycle-sidecar/Dockerfile b/lifecycle-sidecar/Dockerfile
new file mode 100644
index 0000000..5dfa04f
--- /dev/null
+++ b/lifecycle-sidecar/Dockerfile
@@ -0,0 +1,40 @@
+# Build stage
+FROM golang:1.22-alpine AS builder
+
+WORKDIR /app
+
+# Install build dependencies
+RUN apk add --no-cache git ca-certificates
+
+# Copy go mod files
+COPY go.mod go.sum ./
+RUN go mod download
+
+# Copy source
+COPY main.go ./
+
+# Build static binary
+RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
+ -ldflags="-w -s" \
+ -o lifecycle-sidecar \
+ main.go
+
+# Runtime stage
+FROM alpine:3.19
+
+# Install procps for process monitoring (pgrep, kill)
+RUN apk add --no-cache procps ca-certificates
+
+# Create non-root user
+RUN addgroup -g 1000 sidecar && \
+ adduser -D -u 1000 -G sidecar sidecar
+
+WORKDIR /app
+
+# Copy binary from builder
+COPY --from=builder /app/lifecycle-sidecar /app/lifecycle-sidecar
+
+# Run as non-root
+USER sidecar
+
+ENTRYPOINT ["/app/lifecycle-sidecar"]
diff --git a/lifecycle-sidecar/README.md b/lifecycle-sidecar/README.md
new file mode 100644
index 0000000..bbb2097
--- /dev/null
+++ b/lifecycle-sidecar/README.md
@@ -0,0 +1,94 @@
+# Lifecycle Sidecar
+
+A lightweight Kubernetes sidecar that monitors the main agent container and handles cleanup when the container exits with a specific exit code indicating idle shutdown.
+
+## Purpose
+
+User agent containers self-manage their lifecycle by:
+1. Tracking their own activity (MCP calls, trigger status)
+2. Exiting with code `42` when idle (no triggers + no recent activity)
+3. Delegating deployment cleanup to this sidecar
+
+The sidecar watches the main container and:
+- On exit code `42`: Deletes the deployment (and optionally PVC)
+- On any other exit code: Allows Kubernetes restart policy to handle it
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────┐
+│ Pod │
+│ ┌────────────────┐ ┌──────────────────┐ │
+│ │ Agent Container│ │ Lifecycle Sidecar│ │
+│ │ │ │ │ │
+│ │ - Track activity │ - Monitor agent │ │
+│ │ - Track triggers │ - Watch exit code│ │
+│ │ - Exit 42 if idle │ - Delete if 42 │ │
+│ └────────────────┘ └──────────────────┘ │
+│ │ │ │
+│ │ writes exit_code │ │
+│ └─────────►/var/run/agent/exit_code │
+│ │ │
+└───────────────────────────────────┼─────────────┘
+ │
+ ▼ k8s API
+ ┌──────────────────────┐
+ │ Delete Deployment │
+ │ (+ PVC if anonymous)│
+ └──────────────────────┘
+```
+
+## Environment Variables
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `NAMESPACE` | Yes | Kubernetes namespace (injected via downward API) |
+| `DEPLOYMENT_NAME` | Yes | Name of the deployment to delete (from pod label) |
+| `USER_TYPE` | No | User license tier: `anonymous`, `free`, `paid`, `enterprise` |
+| `MAIN_CONTAINER_PID` | No | PID of main container (for precise monitoring) |
+
+## Exit Code Contract
+
+The agent container uses exit codes to signal intent:
+
+| Exit Code | Meaning | Sidecar Action |
+|-----------|---------|----------------|
+| `42` | Clean idle shutdown | Delete deployment + optional PVC |
+| Any other | Error or normal restart | Allow Kubernetes to restart |
+
+## RBAC Requirements
+
+The sidecar requires a ServiceAccount with permission to delete its own deployment:
+
+```yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+rules:
+ - apiGroups: ["apps"]
+ resources: ["deployments"]
+ verbs: ["get", "delete"]
+ - apiGroups: [""]
+ resources: ["persistentvolumeclaims"]
+ verbs: ["get", "delete"]
+```
+
+See `deploy/k8s/base/lifecycle-sidecar-rbac.yaml` for the full RBAC configuration.
+
+## Building
+
+```bash
+docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
+docker push ghcr.io/dexorder/lifecycle-sidecar:latest
+```
+
+## Example Usage
+
+See `deploy/k8s/base/agent-deployment-example.yaml` for a complete example of how to configure an agent deployment with the lifecycle sidecar.
+
+## Security Considerations
+
+1. **Self-delete only**: The sidecar can only delete the deployment it's part of (enforced by label matching in admission policy)
+2. **Non-privileged**: Runs as non-root user (UID 1000)
+3. **Minimal permissions**: Only has `get` and `delete` on deployments/PVCs in the agents namespace
+4. **No cross-namespace access**: Scoped to `dexorder-agents` namespace only
+5. **Crash-safe**: Only triggers cleanup on exit code 42, never on crashes
diff --git a/lifecycle-sidecar/go.mod b/lifecycle-sidecar/go.mod
new file mode 100644
index 0000000..bc4a623
--- /dev/null
+++ b/lifecycle-sidecar/go.mod
@@ -0,0 +1,16 @@
+module github.com/dexorder/lifecycle-sidecar
+
+go 1.22
+
+require (
+ github.com/rs/zerolog v1.32.0
+ k8s.io/api v0.29.2
+ k8s.io/apimachinery v0.29.2
+ k8s.io/client-go v0.29.2
+)
+
+require (
+ github.com/mattn/go-colorable v0.1.13 // indirect
+ github.com/mattn/go-isatty v0.0.19 // indirect
+ golang.org/x/sys v0.17.0 // indirect
+)
diff --git a/lifecycle-sidecar/main.go b/lifecycle-sidecar/main.go
new file mode 100644
index 0000000..ea538ce
--- /dev/null
+++ b/lifecycle-sidecar/main.go
@@ -0,0 +1,234 @@
+package main
+
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"os/signal"
	"syscall"
	"time"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)
+
const (
	// ExitCodeIdleShutdown is the agent's contract exit code (42) signaling a
	// clean idle shutdown; only this code triggers deployment/PVC cleanup
	// (any other code is left to the Kubernetes restart policy).
	ExitCodeIdleShutdown = 42

	// PollInterval is how often the sidecar checks whether the main
	// container's process is still alive.
	PollInterval = 5 * time.Second
)
+
+func main() {
+ // Setup logging
+ zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
+ log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
+
+ log.Info().Msg("Lifecycle sidecar starting")
+
+ // Get environment configuration
+ namespace := os.Getenv("NAMESPACE")
+ deploymentName := os.Getenv("DEPLOYMENT_NAME")
+ userType := os.Getenv("USER_TYPE")
+ mainContainerPID := os.Getenv("MAIN_CONTAINER_PID")
+
+ if namespace == "" || deploymentName == "" {
+ log.Fatal().Msg("NAMESPACE and DEPLOYMENT_NAME environment variables are required")
+ }
+
+ log.Info().
+ Str("namespace", namespace).
+ Str("deployment", deploymentName).
+ Str("userType", userType).
+ Str("mainPID", mainContainerPID).
+ Msg("Configuration loaded")
+
+ // Create Kubernetes client
+ config, err := rest.InClusterConfig()
+ if err != nil {
+ log.Fatal().Err(err).Msg("Failed to get in-cluster config")
+ }
+
+ clientset, err := kubernetes.NewForConfig(config)
+ if err != nil {
+ log.Fatal().Err(err).Msg("Failed to create Kubernetes client")
+ }
+
+ // Wait for main container to exit
+ exitCode := waitForMainContainer()
+
+ log.Info().Int("exitCode", exitCode).Msg("Main container exited")
+
+ // Handle exit code
+ if exitCode == ExitCodeIdleShutdown {
+ log.Info().Msg("Detected idle shutdown (exit code 42) - cleaning up deployment")
+
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ defer cancel()
+
+ // Delete PVC if anonymous user
+ deletePVC := userType == "anonymous" || userType == "temporary"
+
+ if err := cleanupDeployment(ctx, clientset, namespace, deploymentName, deletePVC); err != nil {
+ log.Error().Err(err).Msg("Failed to cleanup deployment")
+ os.Exit(1)
+ }
+
+ log.Info().Msg("Cleanup complete - sidecar exiting")
+ os.Exit(0)
+ } else {
+ // Any other exit code - let Kubernetes restart policy handle it
+ log.Info().
+ Int("exitCode", exitCode).
+ Msg("Non-idle exit code - allowing Kubernetes to handle restart")
+ os.Exit(exitCode)
+ }
+}
+
+// waitForMainContainer monitors the main container process and returns its exit code
+func waitForMainContainer() int {
+ // Try multiple methods to detect main container exit
+ // Method 1: Poll for process via shared PID namespace
+ mainPID := os.Getenv("MAIN_CONTAINER_PID")
+ if mainPID != "" {
+ return pollProcessExit(mainPID)
+ }
+
+ // Method 2: Poll for agent process by name (fallback)
+ log.Info().Msg("MAIN_CONTAINER_PID not set, polling for 'agent' process")
+ return pollProcessByName("agent")
+}
+
+// pollProcessExit polls for process exit by PID
+func pollProcessExit(pidStr string) int {
+ log.Info().Str("pid", pidStr).Msg("Monitoring main container process")
+
+ for {
+ // Check if process exists
+ cmd := exec.Command("kill", "-0", pidStr)
+ err := cmd.Run()
+
+ if err != nil {
+ // Process no longer exists - get exit code from /proc if available
+ log.Info().Msg("Main container process exited")
+
+ // Try to get actual exit code (this is a best-effort)
+ // In Kubernetes, we might not have access to the actual exit code
+ // So we check if the container restarted via container status
+ return getContainerExitCode()
+ }
+
+ time.Sleep(PollInterval)
+ }
+}
+
+// pollProcessByName polls for process exit by name
+func pollProcessByName(name string) int {
+ log.Info().Str("name", name).Msg("Monitoring main container by name")
+
+ for {
+ cmd := exec.Command("pgrep", "-x", name)
+ err := cmd.Run()
+
+ if err != nil {
+ log.Info().Msg("Main container process exited")
+ return getContainerExitCode()
+ }
+
+ time.Sleep(PollInterval)
+ }
+}
+
+// getContainerExitCode attempts to retrieve the exit code of the main container
+// This is challenging in Kubernetes without direct access to container runtime
+// We use a fallback approach: check a shared file or default to 0
+func getContainerExitCode() int {
+ // Check if main container wrote exit code to shared volume
+ exitCodeFile := "/var/run/agent/exit_code"
+ data, err := os.ReadFile(exitCodeFile)
+ if err == nil {
+ var exitCode int
+ _, err := fmt.Sscanf(string(data), "%d", &exitCode)
+ if err == nil {
+ log.Info().Int("exitCode", exitCode).Msg("Read exit code from shared file")
+ return exitCode
+ }
+ }
+
+ // Default to 0 if we can't determine exit code
+ // This is safe because non-42 codes allow restart
+ log.Warn().Msg("Could not determine exit code, defaulting to 0")
+ return 0
+}
+
+// cleanupDeployment deletes the deployment and optionally the PVC
+func cleanupDeployment(ctx context.Context, clientset *kubernetes.Clientset, namespace, deploymentName string, deletePVC bool) error {
+ log.Info().
+ Str("namespace", namespace).
+ Str("deployment", deploymentName).
+ Bool("deletePVC", deletePVC).
+ Msg("Cleaning up deployment")
+
+ // Get deployment to find PVC name if needed
+ var pvcName string
+ if deletePVC {
+ deployment, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
+ if err != nil {
+ log.Warn().Err(err).Msg("Could not get deployment for PVC lookup")
+ } else {
+ // Find PVC from volume claim templates or volumes
+ if len(deployment.Spec.Template.Spec.Volumes) > 0 {
+ for _, vol := range deployment.Spec.Template.Spec.Volumes {
+ if vol.PersistentVolumeClaim != nil {
+ pvcName = vol.PersistentVolumeClaim.ClaimName
+ break
+ }
+ }
+ }
+ }
+ }
+
+ // Delete deployment
+ deletePolicy := metav1.DeletePropagationForeground
+ deleteOptions := metav1.DeleteOptions{
+ PropagationPolicy: &deletePolicy,
+ }
+
+ log.Info().Str("deployment", deploymentName).Msg("Deleting deployment")
+ err := clientset.AppsV1().Deployments(namespace).Delete(ctx, deploymentName, deleteOptions)
+ if err != nil {
+ return fmt.Errorf("failed to delete deployment: %w", err)
+ }
+
+ log.Info().Msg("Deployment deleted successfully")
+
+ // Delete PVC if requested and found
+ if deletePVC && pvcName != "" {
+ log.Info().Str("pvc", pvcName).Msg("Deleting PVC")
+ err := clientset.CoreV1().PersistentVolumeClaims(namespace).Delete(ctx, pvcName, metav1.DeleteOptions{})
+ if err != nil {
+ log.Warn().Err(err).Str("pvc", pvcName).Msg("Failed to delete PVC (non-fatal)")
+ } else {
+ log.Info().Msg("PVC deleted successfully")
+ }
+ }
+
+ return nil
+}
+
+func init() {
+ // Register signal handler for graceful shutdown
+ // If sidecar receives SIGTERM, just exit cleanly
+ // Don't trigger deployment deletion on sidecar termination
+ go func() {
+ sigChan := make(chan os.Signal, 1)
+ syscall.Signal(syscall.SIGTERM)
+ <-sigChan
+ log.Info().Msg("Received SIGTERM - sidecar exiting without cleanup")
+ os.Exit(0)
+ }()
+}