container lifecycle management

This commit is contained in:
2026-03-12 15:13:38 -04:00
parent e99ef5d2dd
commit b9cc397e05
61 changed files with 6880 additions and 31 deletions

1
.idea/ai.iml generated
View File

@@ -8,6 +8,7 @@
<excludeFolder url="file://$MODULE_DIR$/.venv" /> <excludeFolder url="file://$MODULE_DIR$/.venv" />
<excludeFolder url="file://$MODULE_DIR$/backend.old/data" /> <excludeFolder url="file://$MODULE_DIR$/backend.old/data" />
<excludeFolder url="file://$MODULE_DIR$/doc.old" /> <excludeFolder url="file://$MODULE_DIR$/doc.old" />
<excludeFolder url="file://$MODULE_DIR$/backend.old" />
</content> </content>
<orderEntry type="jdk" jdkName="Python 3.12 (ai)" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="Python 3.12 (ai)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />

15
AGENT.md Normal file
View File

@@ -0,0 +1,15 @@
We're building an AI-first trading platform by integrating user-facing TradingView charts and chat with an AI assistant that helps do research, develop indicators (signals), and write strategies, using the Dexorder trading framework we provide.
This monorepo has:
bin/ scripts, mostly build and deploy
deploy/ kubernetes deployment and configuration
doc/ documentation
flink/ Apache Flink application mode processes data from Kafka
iceberg/ Apache Iceberg for historical OHLC etc
ingestor/ Data sources publish to Kafka
kafka/ Apache Kafka
protobuf/ Messaging entities
relay/ Rust+ZeroMQ stateless router
web/ Vue 3 / Pinia / PrimeVue / TradingView
See doc/protocol.md for messaging architecture

View File

@@ -4,6 +4,7 @@
set -e set -e
DIR="$(cd "$(dirname "$0")" && pwd)" DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT_DIR="$(cd "$DIR/.." && pwd)"
echo "Building all container images..." echo "Building all container images..."
echo echo
@@ -13,5 +14,31 @@ echo
"$DIR/build" ingestor "$@" "$DIR/build" ingestor "$@"
"$DIR/build" web "$@" "$DIR/build" web "$@"
# Build lifecycle-sidecar (Go binary, no protobuf sync needed)
echo "Building lifecycle-sidecar..."
cd "$ROOT_DIR/lifecycle-sidecar"
# Determine tag
if [ "$1" == "dev" ]; then
TAG="dev$(date +%Y%m%d%H%M%S)"
else
# Check for uncommitted changes
DIRTY="$(git status | grep 'Changes ' || true)"
if [ "$DIRTY" != "" ]; then
echo "lifecycle-sidecar has uncommitted changes."
echo "Use '$0 dev' to build a development-tagged version instead."
exit 1
fi
TAG="$(git log --oneline | head -1 | cut -d ' ' -f 1)"
fi
REMOTE=${REMOTE:-ghcr.io/dexorder}
docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$TAG .
docker tag lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:$TAG
docker tag $REMOTE/lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:latest
echo "$(date)" built $REMOTE/lifecycle-sidecar:$TAG
echo echo
echo "All images built successfully!" echo "All images built successfully!"

17
bin/dev
View File

@@ -19,7 +19,7 @@ usage() {
echo "Commands:" echo "Commands:"
echo " start Start minikube and deploy all services" echo " start Start minikube and deploy all services"
echo " stop Stop minikube" echo " stop Stop minikube"
echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink)" echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink|sidecar)"
echo " rebuild [svc] Rebuild all custom images, or just one" echo " rebuild [svc] Rebuild all custom images, or just one"
echo " deploy [svc] Deploy/update all services, or just one" echo " deploy [svc] Deploy/update all services, or just one"
echo " status Show status of all services" echo " status Show status of all services"
@@ -127,12 +127,23 @@ rebuild_images() {
docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG" docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG"
fi fi
# Save the tags for deployment (all three, preserving any we didn't rebuild) # Build lifecycle-sidecar (Go binary)
if [ "$service" == "all" ] || [ "$service" == "lifecycle-sidecar" ] || [ "$service" == "sidecar" ]; then
echo -e "${GREEN}→${NC} Building lifecycle-sidecar..."
cd "$ROOT_DIR/lifecycle-sidecar"
SIDECAR_TAG="dev$(date +%Y%m%d%H%M%S)"
docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$SIDECAR_TAG . || exit 1
echo -e "${GREEN}✓ Built lifecycle-sidecar:$SIDECAR_TAG${NC}"
cd "$ROOT_DIR"
fi
# Save the tags for deployment (all services, preserving any we didn't rebuild)
echo "RELAY_TAG=$RELAY_TAG" > "$ROOT_DIR/.dev-image-tag" echo "RELAY_TAG=$RELAY_TAG" > "$ROOT_DIR/.dev-image-tag"
echo "INGEST_TAG=$INGEST_TAG" >> "$ROOT_DIR/.dev-image-tag" echo "INGEST_TAG=$INGEST_TAG" >> "$ROOT_DIR/.dev-image-tag"
echo "FLINK_TAG=$FLINK_TAG" >> "$ROOT_DIR/.dev-image-tag" echo "FLINK_TAG=$FLINK_TAG" >> "$ROOT_DIR/.dev-image-tag"
echo "SIDECAR_TAG=$SIDECAR_TAG" >> "$ROOT_DIR/.dev-image-tag"
echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG${NC}" echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG, sidecar=$SIDECAR_TAG${NC}"
} }
deploy_services() { deploy_services() {

View File

@@ -0,0 +1,230 @@
"""
Container lifecycle manager for agent containers.
Tracks activity and triggers to determine when the container should shut down.
Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
"""
import asyncio
import logging
import os
import signal
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Set
logger = logging.getLogger(__name__)

# Exit code to signal clean idle shutdown to the lifecycle sidecar.
EXIT_CODE_IDLE_SHUTDOWN = 42

# Shared file where the exit code is written for the sidecar to read.
EXIT_CODE_FILE = Path("/var/run/agent/exit_code")


class LifecycleManager:
    """
    Manages container lifecycle based on activity and triggers.

    The container shuts itself down when:
      1. There are no active triggers (data subscriptions, CEP patterns, etc.)
      2. There has been no recent user activity (MCP calls)
      3. The idle timeout has elapsed

    Shutdown is signaled by writing EXIT_CODE_IDLE_SHUTDOWN to
    EXIT_CODE_FILE and exiting the process with that code.
    """

    def __init__(
        self,
        idle_timeout_minutes: int = 15,
        check_interval_seconds: int = 60,
        enable_shutdown: bool = True,
    ):
        """
        Initialize lifecycle manager.

        Args:
            idle_timeout_minutes: Minutes of inactivity before shutdown.
            check_interval_seconds: Interval between idle checks.
            enable_shutdown: If False, only log idle state without exiting
                (for testing).
        """
        self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
        self.check_interval = check_interval_seconds
        self.enable_shutdown = enable_shutdown
        # NOTE: idle tracking is wall-clock based; large system clock
        # adjustments will shift the idle window accordingly.
        self.last_activity: datetime = datetime.now()
        self.active_triggers: Set[str] = set()
        self._running = False
        self._check_task: Optional[asyncio.Task] = None
        logger.info(
            "Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
            idle_timeout_minutes,
            check_interval_seconds,
            enable_shutdown,
        )

    def record_activity(self) -> None:
        """
        Record user activity (called on MCP tool/resource/prompt invocations).
        Resets the idle timer.
        """
        self.last_activity = datetime.now()
        logger.debug("Activity recorded, idle timer reset")

    def update_triggers(self, triggers: Set[str]) -> None:
        """
        Update the set of active triggers.

        Args:
            triggers: Set of active trigger IDs (data subscriptions,
                CEP patterns, etc.)
        """
        if triggers != self.active_triggers:
            added = triggers - self.active_triggers
            removed = self.active_triggers - triggers
            if added:
                logger.info("Triggers added: %s", added)
            if removed:
                logger.info("Triggers removed: %s", removed)
            # Defensive copy: do NOT alias the caller's set, otherwise
            # later mutation of the argument would silently change our
            # internal state (and could keep an idle container alive).
            self.active_triggers = set(triggers)
            logger.info("Active triggers: %d", len(self.active_triggers))

    def add_trigger(self, trigger_id: str) -> None:
        """Add a single trigger."""
        if trigger_id not in self.active_triggers:
            self.active_triggers.add(trigger_id)
            logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))

    def remove_trigger(self, trigger_id: str) -> None:
        """Remove a single trigger."""
        if trigger_id in self.active_triggers:
            self.active_triggers.remove(trigger_id)
            logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))

    def is_idle(self) -> bool:
        """
        Check if container is idle and should shut down.

        Returns:
            True if there are no triggers and the idle timeout is exceeded.
        """
        has_triggers = len(self.active_triggers) > 0
        idle_time = datetime.now() - self.last_activity
        is_past_timeout = idle_time > self.idle_timeout
        if has_triggers:
            logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
            return False
        if not is_past_timeout:
            logger.debug(
                "Not idle: last activity %s ago (timeout: %s)",
                idle_time,
                self.idle_timeout,
            )
            return False
        logger.info(
            "Container is idle: no triggers and %s since last activity", idle_time
        )
        return True

    async def start(self) -> None:
        """Start the lifecycle manager background task."""
        if self._running:
            logger.warning("Lifecycle manager already running")
            return
        self._running = True
        self._check_task = asyncio.create_task(self._check_loop())
        logger.info("Lifecycle manager started")

    async def stop(self) -> None:
        """Stop the lifecycle manager."""
        self._running = False
        if self._check_task:
            self._check_task.cancel()
            try:
                await self._check_task
            except asyncio.CancelledError:
                pass
            # Drop the reference so a subsequent start() gets a clean slate
            # and the finished task can be garbage-collected.
            self._check_task = None
        logger.info("Lifecycle manager stopped")

    async def _check_loop(self) -> None:
        """Background task that periodically checks if container should shut down."""
        while self._running:
            try:
                await asyncio.sleep(self.check_interval)
                if self.is_idle():
                    if self.enable_shutdown:
                        logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
                        self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)
                        # Give sidecar a moment to see the exit code file
                        await asyncio.sleep(1)
                        # os._exit skips interpreter cleanup; intentional here
                        # because we've already flushed what the sidecar needs.
                        os._exit(EXIT_CODE_IDLE_SHUTDOWN)
                    else:
                        logger.info(
                            "Container is idle but shutdown is disabled (testing mode)"
                        )
            except asyncio.CancelledError:
                logger.info("Check loop cancelled")
                raise
            except Exception as e:
                logger.error("Error in lifecycle check loop: %s", e, exc_info=True)

    def _write_exit_code(self, code: int) -> None:
        """Write exit code to shared file for sidecar to read."""
        try:
            EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
            EXIT_CODE_FILE.write_text(str(code))
            logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
        except Exception as e:
            # Best effort: the sidecar also observes the process exit code,
            # so a failed write is logged but not fatal.
            logger.warning("Failed to write exit code file: %s", e)

    def setup_signal_handlers(self) -> None:
        """
        Setup signal handlers for graceful shutdown.
        On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.
        """
        def signal_handler(signum, frame):
            logger.info("Received signal %d, exiting normally", signum)
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)
# Module-level singleton so any part of the application can reach the
# one lifecycle manager for this container.
_lifecycle_manager: Optional[LifecycleManager] = None


def get_lifecycle_manager() -> LifecycleManager:
    """Get or create the global lifecycle manager instance."""
    global _lifecycle_manager
    if _lifecycle_manager is not None:
        return _lifecycle_manager
    # First call: build the singleton from environment configuration.
    env = os.environ
    _lifecycle_manager = LifecycleManager(
        idle_timeout_minutes=int(env.get("IDLE_TIMEOUT_MINUTES", "15")),
        check_interval_seconds=int(env.get("IDLE_CHECK_INTERVAL_SECONDS", "60")),
        enable_shutdown=env.get("ENABLE_IDLE_SHUTDOWN", "true").lower() == "true",
    )
    return _lifecycle_manager


async def start_lifecycle_manager() -> LifecycleManager:
    """Initialize and start the lifecycle manager."""
    mgr = get_lifecycle_manager()
    mgr.setup_signal_handlers()
    await mgr.start()
    return mgr

View File

@@ -0,0 +1,43 @@
# openclaw/auth.py
class MCPAuthMiddleware:
"""Authenticates incoming MCP connections based on configured mode."""
def __init__(self, config: AuthConfig):
self.config = config
self._jwks_client = None # lazy-loaded for platform mode
async def authenticate(self, request) -> AuthContext:
match self.config.mode:
case "local":
# stdio transport or localhost-only binding
# No auth needed — if you can exec into the container,
# you're the user
return AuthContext(user_id=self.config.local_user_id,
source="local")
case "token":
# User-generated API key (standalone remote access)
token = extract_bearer_token(request)
if not verify_token_hash(token, self.config.tokens):
raise AuthError("Invalid API token")
return AuthContext(user_id=self.config.local_user_id,
source="api_key")
case "platform":
# JWT signed by the OpenClaw platform
token = extract_bearer_token(request)
claims = await self._verify_platform_jwt(token)
if claims["sub"] != self.config.expected_user_id:
raise AuthError("User ID mismatch")
return AuthContext(user_id=claims["sub"],
source="platform",
scopes=claims.get("scopes", []))
async def _verify_platform_jwt(self, token: str) -> dict:
if not self._jwks_client:
self._jwks_client = JWKSClient(self.config.platform_jwks_url)
signing_key = await self._jwks_client.get_signing_key_from_jwt(token)
return jwt.decode(token, signing_key.key,
algorithms=["RS256"],
audience="openclaw-mcp")

View File

@@ -0,0 +1,110 @@
# ValidatingAdmissionPolicy to restrict images in dexorder-agents namespace
# Requires Kubernetes 1.30+ (or 1.28+ with feature gate)
# This is the critical security control that prevents arbitrary image execution
# even if the gateway is compromised.
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
name: dexorder-agent-image-policy
spec:
failurePolicy: Fail
matchConstraints:
namespaceSelector:
matchLabels:
dexorder.io/type: agents
resourceRules:
- apiGroups: ["apps"]
apiVersions: ["v1"]
resources: ["deployments"]
operations: ["CREATE", "UPDATE"]
validations:
# Only allow images from our approved registry with agent prefix
- expression: |
object.spec.template.spec.containers.all(c,
c.image.startsWith('ghcr.io/dexorder/agent:') ||
c.image.startsWith('ghcr.io/dexorder/agent-'))
message: "Only approved dexorder agent images are allowed in the agents namespace"
reason: Forbidden
# No privileged containers
- expression: |
object.spec.template.spec.containers.all(c,
!has(c.securityContext) ||
!has(c.securityContext.privileged) ||
c.securityContext.privileged == false)
message: "Privileged containers are not allowed"
reason: Forbidden
# No hostPath volumes
- expression: |
!has(object.spec.template.spec.volumes) ||
object.spec.template.spec.volumes.all(v,
!has(v.hostPath))
message: "hostPath volumes are not allowed"
reason: Forbidden
# No hostNetwork
- expression: |
!has(object.spec.template.spec.hostNetwork) ||
object.spec.template.spec.hostNetwork == false
message: "hostNetwork is not allowed"
reason: Forbidden
# No hostPID
- expression: |
!has(object.spec.template.spec.hostPID) ||
object.spec.template.spec.hostPID == false
message: "hostPID is not allowed"
reason: Forbidden
# Containers must run as non-root
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.runAsNonRoot) &&
c.securityContext.runAsNonRoot == true)
message: "Containers must run as non-root"
reason: Forbidden
# Must drop all capabilities
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.capabilities) &&
has(c.securityContext.capabilities.drop) &&
c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
message: "Containers must drop all capabilities"
reason: Forbidden
# Read-only root filesystem
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.readOnlyRootFilesystem) &&
c.securityContext.readOnlyRootFilesystem == true)
message: "Containers must have read-only root filesystem"
reason: Forbidden
# Resource limits must be set
- expression: |
object.spec.template.spec.containers.all(c,
has(c.resources) &&
has(c.resources.limits) &&
has(c.resources.limits.memory) &&
has(c.resources.limits.cpu))
message: "Containers must have resource limits set"
reason: Forbidden
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
name: dexorder-agent-image-policy-binding
spec:
policyName: dexorder-agent-image-policy
validationActions:
- Deny
matchResources:
namespaceSelector:
matchLabels:
dexorder.io/type: agents

View File

@@ -0,0 +1,221 @@
# Example agent deployment with lifecycle sidecar
# This would be created by the gateway for each user
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: agent-user-abc123
namespace: dexorder-agents
labels:
app.kubernetes.io/name: agent
app.kubernetes.io/component: user-agent
dexorder.io/component: agent
dexorder.io/user-id: user-abc123
dexorder.io/deployment: agent-user-abc123
spec:
replicas: 1
selector:
matchLabels:
dexorder.io/user-id: user-abc123
template:
metadata:
labels:
dexorder.io/component: agent
dexorder.io/user-id: user-abc123
dexorder.io/deployment: agent-user-abc123
spec:
serviceAccountName: agent-lifecycle
# Share PID namespace so sidecar can monitor main container
shareProcessNamespace: true
# Security context
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers:
# Main agent container
- name: agent
image: ghcr.io/dexorder/agent:latest
imagePullPolicy: Always
# Security context (required by admission policy)
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
# Resource limits (required by admission policy)
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "1Gi"
cpu: "1000m"
# Environment variables
env:
- name: USER_ID
value: "user-abc123"
- name: IDLE_TIMEOUT_MINUTES
value: "15"
- name: IDLE_CHECK_INTERVAL_SECONDS
value: "60"
- name: ENABLE_IDLE_SHUTDOWN
value: "true"
- name: MCP_SERVER_PORT
value: "3000"
- name: ZMQ_CONTROL_PORT
value: "5555"
# Ports
ports:
- name: mcp
containerPort: 3000
protocol: TCP
- name: zmq-control
containerPort: 5555
protocol: TCP
# Volume mounts
volumeMounts:
- name: agent-data
mountPath: /app/data
- name: tmp
mountPath: /tmp
- name: shared-run
mountPath: /var/run/agent
# Liveness probe (agent's MCP server)
livenessProbe:
httpGet:
path: /health
port: mcp
initialDelaySeconds: 10
periodSeconds: 30
timeoutSeconds: 5
# Readiness probe
readinessProbe:
httpGet:
path: /ready
port: mcp
initialDelaySeconds: 5
periodSeconds: 10
# Lifecycle sidecar
- name: lifecycle-sidecar
image: ghcr.io/dexorder/lifecycle-sidecar:latest
imagePullPolicy: Always
# Security context
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
# Resource limits
resources:
requests:
memory: "32Mi"
cpu: "10m"
limits:
memory: "64Mi"
cpu: "50m"
# Environment variables (injected via downward API)
env:
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DEPLOYMENT_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['dexorder.io/deployment']
- name: USER_TYPE
value: "free" # Gateway sets this based on license
- name: MAIN_CONTAINER_PID
value: "1" # NOTE(review): with shareProcessNamespace enabled, PID 1 is normally the pod's pause (infra) container, not the main container — verify what PID the agent process actually gets
# Volume mounts
volumeMounts:
- name: shared-run
mountPath: /var/run/agent
readOnly: true
# Volumes
volumes:
# Persistent data (user files, state)
- name: agent-data
persistentVolumeClaim:
claimName: agent-user-abc123-data
# Temporary writable filesystem (read-only rootfs)
- name: tmp
emptyDir:
medium: Memory
sizeLimit: 128Mi
# Shared between main container and sidecar
- name: shared-run
emptyDir:
medium: Memory
sizeLimit: 1Mi
# Restart policy
restartPolicy: Always
# Termination grace period
terminationGracePeriodSeconds: 30
---
# PVC for agent persistent data
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: agent-user-abc123-data
namespace: dexorder-agents
labels:
dexorder.io/user-id: user-abc123
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
storageClassName: standard # Or your preferred storage class
---
# Service to expose agent MCP endpoint
apiVersion: v1
kind: Service
metadata:
name: agent-user-abc123
namespace: dexorder-agents
labels:
dexorder.io/user-id: user-abc123
spec:
type: ClusterIP
selector:
dexorder.io/user-id: user-abc123
ports:
- name: mcp
port: 3000
targetPort: mcp
protocol: TCP
- name: zmq-control
port: 5555
targetPort: zmq-control
protocol: TCP

View File

@@ -0,0 +1,53 @@
# Resource constraints for the dexorder-agents namespace
# These limits apply regardless of what the gateway requests
---
# LimitRange: per-container defaults and maximums
apiVersion: v1
kind: LimitRange
metadata:
name: agent-limits
namespace: dexorder-agents
spec:
limits:
# Default limits applied if deployment doesn't specify
- type: Container
default:
memory: "512Mi"
cpu: "500m"
defaultRequest:
memory: "256Mi"
cpu: "100m"
# Maximum any single container can request
max:
memory: "2Gi"
cpu: "2000m"
min:
memory: "64Mi"
cpu: "50m"
# PVC size limits
- type: PersistentVolumeClaim
max:
storage: "10Gi"
min:
storage: "100Mi"
---
# ResourceQuota: total namespace limits
# Prevents a compromised gateway from exhausting cluster resources
apiVersion: v1
kind: ResourceQuota
metadata:
name: agent-quota
namespace: dexorder-agents
spec:
hard:
# Total compute limits for all agents combined
requests.cpu: "20"
requests.memory: "40Gi"
limits.cpu: "40"
limits.memory: "80Gi"
# Object count limits
pods: "100"
persistentvolumeclaims: "100"
services: "100"
# Storage limits
requests.storage: "500Gi"

View File

@@ -0,0 +1,65 @@
# RBAC for gateway to CREATE agent deployments only
# Principle of least privilege: gateway can ONLY create deployments/services/PVCs
# in the dexorder-agents namespace. Deletion is handled by the lifecycle sidecar.
# No pods, secrets, exec, or cross-namespace access.
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: gateway
namespace: dexorder-system
---
# Role scoped to dexorder-agents namespace only
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: agent-creator
namespace: dexorder-agents
rules:
# Deployments: create, read, and update — no delete (deletion handled by sidecar)
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["create", "get", "list", "watch", "patch", "update"]
# PVCs: create and read (deletion handled by sidecar)
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["create", "get", "list", "watch"]
# Services: create and manage agent MCP endpoints
- apiGroups: [""]
resources: ["services"]
verbs: ["create", "get", "list", "watch", "patch", "update"]
# Read-only pod access for status checks (no exec!)
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch"]
# Pod logs for debugging (read-only)
- apiGroups: [""]
resources: ["pods/log"]
verbs: ["get"]
# Explicitly NOT included:
# - deployments/delete - handled by lifecycle sidecar
# - pvc/delete - handled by lifecycle sidecar
# - services/delete - handled by lifecycle sidecar
# - pods (create/delete) - must go through deployments
# - pods/exec, pods/attach - no shell access
# - secrets, configmaps - no credential access
# - any resources in other namespaces
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: gateway-agent-creator
namespace: dexorder-agents
subjects:
- kind: ServiceAccount
name: gateway
namespace: dexorder-system
roleRef:
kind: Role
name: agent-creator
apiGroup: rbac.authorization.k8s.io

View File

@@ -1,3 +1,6 @@
# Runtime and security initialization for dexorder AI platform
# Apply this first: kubectl apply -f init.yaml
---
apiVersion: node.k8s.io/v1 apiVersion: node.k8s.io/v1
kind: RuntimeClass kind: RuntimeClass
metadata: metadata:

View File

@@ -1,5 +1,26 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: [] resources:
# ingress.yaml - removed until we have services to expose # Core initialization (runtime classes)
- init.yaml
# Namespace definitions with PodSecurity labels
- namespaces.yaml
# RBAC for gateway to create agents (creation only)
- gateway-rbac.yaml
# RBAC for lifecycle sidecar (self-deletion)
- lifecycle-sidecar-rbac.yaml
# Admission policies (image restriction, security requirements)
- admission-policy.yaml
# Resource quotas and limits for agents namespace
- agent-quotas.yaml
# Network isolation policies
- network-policies.yaml
# Gateway service (uncomment when ready)
# - gateway.yaml
# Example agent deployment (for reference, not applied by default)
# - agent-deployment-example.yaml
# Services (uncomment as needed)
# - backend.yaml
# - web.yaml
# - ingress.yaml

View File

@@ -0,0 +1,53 @@
# RBAC for lifecycle sidecar - allows self-deletion only
# Each agent pod gets this ServiceAccount and can only delete its own deployment
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: agent-lifecycle
namespace: dexorder-agents
---
# Role allowing deletion of deployments and PVCs
# This is scoped to the dexorder-agents namespace
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: agent-self-delete
namespace: dexorder-agents
rules:
# Allow getting and deleting deployments
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "delete"]
# Allow getting and deleting PVCs (for anonymous users)
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "delete"]
# Read-only access to pods (for status checking)
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: agent-self-delete
namespace: dexorder-agents
subjects:
- kind: ServiceAccount
name: agent-lifecycle
namespace: dexorder-agents
roleRef:
kind: Role
name: agent-self-delete
apiGroup: rbac.authorization.k8s.io
---
# Additional security: ValidatingWebhookConfiguration to restrict deletion
# This ensures sidecars can only delete their own deployment
# Requires a validating webhook server (can be added later)
# For now, we rely on:
# 1. Sidecar only knowing its own deployment name (from env)
# 2. RBAC limiting to dexorder-agents namespace
# 3. Admission policy restricting deployment creation (already defined)

View File

@@ -0,0 +1,24 @@
# Namespace definitions for dexorder AI platform
# - dexorder-system: gateway, flink, kafka, and other infrastructure
# - dexorder-agents: user agent containers (isolated, restricted)
---
apiVersion: v1
kind: Namespace
metadata:
name: dexorder-system
labels:
app.kubernetes.io/part-of: dexorder
dexorder.io/type: system
---
apiVersion: v1
kind: Namespace
metadata:
name: dexorder-agents
labels:
app.kubernetes.io/part-of: dexorder
dexorder.io/type: agents
# Enforce restricted pod security standards
pod-security.kubernetes.io/enforce: restricted
pod-security.kubernetes.io/enforce-version: latest
pod-security.kubernetes.io/audit: restricted
pod-security.kubernetes.io/warn: restricted

View File

@@ -0,0 +1,121 @@
# Network policies for agent isolation
# Agents can only communicate with specific services, not with each other
# or with the Kubernetes API
---
# Default deny all ingress and egress in agents namespace
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: default-deny-all
namespace: dexorder-agents
spec:
podSelector: {}
policyTypes:
- Ingress
- Egress
---
# Allow agents to receive connections from gateway (MCP)
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-gateway-ingress
namespace: dexorder-agents
spec:
podSelector:
matchLabels:
dexorder.io/component: agent
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
dexorder.io/type: system
podSelector:
matchLabels:
app: gateway
ports:
- protocol: TCP
port: 3000 # MCP server port
- protocol: TCP
port: 5555 # ZeroMQ control channel
---
# Allow agents to connect to required services
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-agent-egress
namespace: dexorder-agents
spec:
podSelector:
matchLabels:
dexorder.io/component: agent
policyTypes:
- Egress
egress:
# DNS resolution (required)
- to:
- namespaceSelector: {}
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- protocol: UDP
port: 53
- protocol: TCP
port: 53
# Gateway in system namespace (for callbacks)
- to:
- namespaceSelector:
matchLabels:
dexorder.io/type: system
podSelector:
matchLabels:
app: gateway
ports:
- protocol: TCP
port: 8080
# Kafka/Redpanda for data subscriptions
- to:
- namespaceSelector:
matchLabels:
dexorder.io/type: system
podSelector:
matchLabels:
app: redpanda
ports:
- protocol: TCP
port: 9092
# External HTTPS (for exchange APIs, LLM APIs)
- to:
- ipBlock:
cidr: 0.0.0.0/0
except:
# Block all RFC-1918 private/cluster-internal ranges (this also covers the k8s API server)
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
ports:
- protocol: TCP
port: 443
---
# System namespace: allow ingress from agents
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-agent-callbacks
namespace: dexorder-system
spec:
podSelector:
matchLabels:
app: gateway
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
dexorder.io/type: agents
ports:
- protocol: TCP
port: 8080

View File

@@ -0,0 +1,97 @@
# Dev admission policy: allow local registry images
# In dev, we also allow images from localhost/minikube registry
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
name: dexorder-agent-image-policy
spec:
failurePolicy: Fail
matchConstraints:
namespaceSelector:
matchLabels:
dexorder.io/type: agents
resourceRules:
- apiGroups: ["apps"]
apiVersions: ["v1"]
resources: ["deployments"]
operations: ["CREATE", "UPDATE"]
validations:
# Allow local dev images in addition to production registry
- expression: |
object.spec.template.spec.containers.all(c,
c.image.startsWith('ghcr.io/dexorder/agent:') ||
c.image.startsWith('ghcr.io/dexorder/agent-') ||
c.image.startsWith('localhost:5000/dexorder/agent') ||
c.image.startsWith('dexorder/agent'))
message: "Only approved dexorder agent images are allowed"
reason: Forbidden
# No privileged containers
- expression: |
object.spec.template.spec.containers.all(c,
!has(c.securityContext) ||
!has(c.securityContext.privileged) ||
c.securityContext.privileged == false)
message: "Privileged containers are not allowed"
reason: Forbidden
# No hostPath volumes
- expression: |
!has(object.spec.template.spec.volumes) ||
object.spec.template.spec.volumes.all(v,
!has(v.hostPath))
message: "hostPath volumes are not allowed"
reason: Forbidden
# No hostNetwork
- expression: |
!has(object.spec.template.spec.hostNetwork) ||
object.spec.template.spec.hostNetwork == false
message: "hostNetwork is not allowed"
reason: Forbidden
# No hostPID
- expression: |
!has(object.spec.template.spec.hostPID) ||
object.spec.template.spec.hostPID == false
message: "hostPID is not allowed"
reason: Forbidden
# Containers must run as non-root
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.runAsNonRoot) &&
c.securityContext.runAsNonRoot == true)
message: "Containers must run as non-root"
reason: Forbidden
# Must drop all capabilities
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.capabilities) &&
has(c.securityContext.capabilities.drop) &&
c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
message: "Containers must drop all capabilities"
reason: Forbidden
# Read-only root filesystem
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.readOnlyRootFilesystem) &&
c.securityContext.readOnlyRootFilesystem == true)
message: "Containers must have read-only root filesystem"
reason: Forbidden
# Resource limits must be set
- expression: |
object.spec.template.spec.containers.all(c,
has(c.resources) &&
has(c.resources.limits) &&
has(c.resources.limits.memory) &&
has(c.resources.limits.cpu))
message: "Containers must have resource limits set"
reason: Forbidden

View File

@@ -0,0 +1,19 @@
# Dev/minikube resource quota overrides
# Smaller limits appropriate for local development
---
apiVersion: v1
kind: ResourceQuota
metadata:
name: agent-quota
namespace: dexorder-agents
spec:
hard:
# Reduced for minikube
requests.cpu: "4"
requests.memory: "8Gi"
limits.cpu: "8"
limits.memory: "16Gi"
pods: "20"
persistentvolumeclaims: "20"
services: "20"
requests.storage: "50Gi"

View File

@@ -1,16 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
namespace: default # Note: namespaces are defined in base; workloads go to dexorder-system
namespace: dexorder-system
# Base resources # Base resources (includes security policies)
resources: resources:
- ../base - ../base
- infrastructure.yaml - infrastructure.yaml
# No patches needed currently # Dev-specific patches
patches: [] patches:
# ingress-dev.yaml - removed until we have services to expose # Reduced resource quotas for minikube
- path: agent-quotas-patch.yaml
# Allow local registry images
- path: admission-policy-patch.yaml
# ConfigMaps for service configs # ConfigMaps for service configs
configMapGenerator: configMapGenerator:

View File

@@ -1,9 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
namespace: default # Note: namespaces are defined in base; workloads go to dexorder-system
namespace: dexorder-system
# Base resources (backend, web, ingress, init/gVisor) # Base resources (includes all security policies)
resources: resources:
- ../base - ../base
@@ -38,3 +39,10 @@ images:
newTag: latest newTag: latest
- name: dexorder/ai-web - name: dexorder/ai-web
newTag: latest newTag: latest
- name: ghcr.io/dexorder/gateway
newTag: latest
- name: lifecycle-sidecar
newName: ghcr.io/dexorder/lifecycle-sidecar
newTag: latest
- name: ghcr.io/dexorder/agent
newTag: latest

21
doc/agent_harness_flow.md Normal file
View File

@@ -0,0 +1,21 @@
┌─────────────────────────────────────────────────┐
│ Agent Harness (your servers) │
│ │
│ on_message(user_id, message): │
│ 1. Look up user's MCP endpoint from Postgres │
│ 2. mcp.call("get_context_summary") │
│ 3. mcp.call("get_conversation_history", 20) │
│ 4. Build prompt: │
│ system = BASE_PROMPT │
│ + context_summary │
│ + user_agent_prompt (from MCP) │
│ messages = history + new message │
│ 5. LLM call (your API key) │
│ 6. While LLM wants tool calls: │
│ - Platform tools → handle locally │
│ - User tools → proxy to MCP │
│ - LLM call again with results │
│ 7. mcp.call("save_message", ...) │
│ 8. Return response to user │
│ │
└─────────────────────────────────────────────────┘

View File

@@ -1,9 +1,11 @@
Generally use skills instead of subagents, except for the analysis subagent. Generally use skills instead of subagents, except for the analysis subagent.
## User-specific files ## User-specific files and tools
* Indicators * Indicators
* Strategies * Strategies
* Watchlists * Watchlists
* Preferences * Preferences
* Trading style * Trading style
* Charting / colors * Charting / colors
* Executors (really just sub-strategies)
* tactical-level order generators e.g. TWAP, iceberg, etc.

View File

@@ -1,18 +0,0 @@
This file describes all the configuration options used by all components. All configuration is divided into regular config and secrets, and k8s will mount either or both as a yaml file accessible to the process.
# Configuration
* `flink_hostname`
* ... various zmq ports for flink ...
* `iceberg_catalog_hostname`
* `iceberg_catalog_port`
* `iceberg_catalog_database`
* etc
# Secrets
* `iceberg_catalog_username`
* `iceberg_catalog_password`
* etc.

View File

@@ -0,0 +1,313 @@
# Container Lifecycle Management
## Overview
User agent containers self-manage their lifecycle to optimize resource usage. Containers automatically shut down when idle (no triggers + no recent activity) and clean themselves up using a lifecycle sidecar.
## Architecture
```
┌──────────────────────────────────────────────────────────┐
│ Agent Pod │
│ ┌───────────────────┐ ┌──────────────────────┐ │
│ │ Agent Container │ │ Lifecycle Sidecar │ │
│ │ ─────────────── │ │ ────────────────── │ │
│ │ │ │ │ │
│ │ Lifecycle Manager │ │ Watches exit code │ │
│ │ - Track activity │ │ - Detects exit 42 │ │
│ │ - Track triggers │ │ - Calls k8s API │ │
│ │ - Exit 42 if idle │ │ - Deletes deployment │ │
│ └───────────────────┘ └──────────────────────┘ │
│ │ │ │
│ │ writes exit_code │ │
│ └────►/var/run/agent/exit_code │
│ │ │
└───────────────────────────────────────┼──────────────────┘
▼ k8s API (RBAC)
┌─────────────────────┐
│ Delete Deployment │
│ Delete PVC (if anon)│
└─────────────────────┘
```
## Components
### 1. Lifecycle Manager (Python)
**Location**: `client-py/dexorder/lifecycle_manager.py`
Runs inside the agent container and tracks:
- **Activity**: MCP tool/resource/prompt calls reset the idle timer
- **Triggers**: Data subscriptions, CEP patterns, etc.
- **Idle state**: No triggers + idle timeout exceeded
**Configuration** (via environment variables):
- `IDLE_TIMEOUT_MINUTES`: Minutes before shutdown (default: 15)
- `IDLE_CHECK_INTERVAL_SECONDS`: Check frequency (default: 60)
- `ENABLE_IDLE_SHUTDOWN`: Enable/disable shutdown (default: true)
**Usage in agent code**:
```python
from dexorder.lifecycle_manager import get_lifecycle_manager
# On startup
manager = get_lifecycle_manager()
await manager.start()
# On MCP calls (tool/resource/prompt)
manager.record_activity()
# When triggers change
manager.add_trigger("data_sub_BTC_USDT")
manager.remove_trigger("data_sub_BTC_USDT")
# Or batch update
manager.update_triggers({"trigger_1", "trigger_2"})
```
**Exit behavior**:
- Idle shutdown: Exit with code `42`
- Signal (SIGTERM/SIGINT): Exit with code `0` (allows restart)
- Errors/crashes: Exit with error code (allows restart)
### 2. Lifecycle Sidecar (Go)
**Location**: `lifecycle-sidecar/`
Runs alongside the agent container with shared PID namespace. Monitors the main container process and:
- On exit code `42`: Deletes deployment (and PVC if anonymous user)
- On any other exit code: Exits with same code (k8s restarts pod)
**Configuration** (via environment, injected by downward API):
- `NAMESPACE`: Pod's namespace
- `DEPLOYMENT_NAME`: Deployment name (from pod label)
- `USER_TYPE`: License tier (`anonymous`, `free`, `paid`, `enterprise`)
- `MAIN_CONTAINER_PID`: PID of main container (default: 1)
**RBAC**: Has permission to delete deployments and PVCs **only in dexorder-agents namespace**. Cannot delete other deployments due to:
1. Only knows its own deployment name (from env)
2. RBAC scoped to namespace
3. No cross-pod communication
### 3. Gateway (TypeScript)
**Location**: `gateway/src/harness/agent-harness.ts`
Creates agent deployments when users connect. Has permissions to:
- ✅ Create deployments, services, PVCs
- ✅ Read pod status and logs
- ✅ Update deployments (e.g., resource limits)
- ❌ Delete deployments (handled by sidecar)
- ❌ Exec into pods
- ❌ Access secrets
## Lifecycle States
```
┌─────────────┐
│ CREATED │ ← Gateway creates deployment
└──────┬──────┘
┌─────────────┐
│ RUNNING │ ← User interacts, has triggers
└──────┬──────┘
┌─────────────┐
│ IDLE │ ← No triggers + timeout exceeded
└──────┬──────┘
┌─────────────┐
│ SHUTDOWN │ ← Exit code 42
└──────┬──────┘
┌─────────────┐
│ DELETED │ ← Sidecar deletes deployment
└─────────────┘
```
## Idle Detection Logic
Container is **IDLE** when:
1. `active_triggers.isEmpty()` AND
2. `(now - last_activity) > idle_timeout`
Container is **ACTIVE** when:
1. Has any active triggers (data subscriptions, CEP patterns, etc.) OR
2. Recent user activity (MCP calls within timeout)
## Cleanup Policies by License Tier
| User Type | Idle Timeout | PVC Policy | Notes |
|--------------|--------------|------------|-------|
| Anonymous | 15 minutes | Delete | Ephemeral, no data retention |
| Free | 15 minutes | Retain | Can resume session |
| Paid | 60 minutes | Retain | Longer grace period |
| Enterprise | No shutdown | Retain | Always-on containers |
Configured via `USER_TYPE` env var in deployment.
## Security
### Principle of Least Privilege
**Gateway**:
- Can create agent resources
- Cannot delete agent resources
- Cannot access other namespaces
- Cannot exec into pods
**Lifecycle Sidecar**:
- Can delete its own deployment only
- Cannot delete other deployments
- Scoped to dexorder-agents namespace
- No exec, no secrets access
### Admission Control
All deployments in `dexorder-agents` namespace are subject to:
- Image allowlist (only approved images)
- Security context enforcement (non-root, drop caps, read-only rootfs)
- Resource limits required
- PodSecurity standards (restricted profile)
See `deploy/k8s/base/admission-policy.yaml`
### Network Isolation
Agents are network-isolated via NetworkPolicy:
- Can connect to gateway (MCP)
- Can connect to Redpanda (data streams)
- Can make outbound HTTPS (exchanges, LLM APIs)
- Cannot access k8s API
- Cannot access system namespace
- Cannot access other agent pods
See `deploy/k8s/base/network-policies.yaml`
## Deployment
### 1. Apply Security Policies
```bash
kubectl apply -k deploy/k8s/dev # or prod
```
This creates:
- Namespaces (`dexorder-system`, `dexorder-agents`)
- RBAC (gateway, lifecycle sidecar)
- Admission policies
- Network policies
- Resource quotas
### 2. Build and Push Lifecycle Sidecar
```bash
cd lifecycle-sidecar
docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
docker push ghcr.io/dexorder/lifecycle-sidecar:latest
```
### 3. Gateway Creates Agent Deployments
When a user connects, the gateway creates:
- Deployment with agent + sidecar
- PVC for persistent data
- Service for MCP endpoint
See `deploy/k8s/base/agent-deployment-example.yaml` for template.
## Testing
### Test Lifecycle Manager Locally
```python
from dexorder.lifecycle_manager import LifecycleManager
# Disable actual shutdown for testing
manager = LifecycleManager(
idle_timeout_minutes=1,
check_interval_seconds=10,
enable_shutdown=False # Only log, don't exit
)
await manager.start()
# Simulate activity
manager.record_activity()
# Simulate triggers
manager.add_trigger("test_trigger")
await asyncio.sleep(70) # Wait past timeout
manager.remove_trigger("test_trigger")
await asyncio.sleep(70) # Should detect idle
await manager.stop()
```
### Test Sidecar Locally
```bash
# Build
cd lifecycle-sidecar
go build -o lifecycle-sidecar main.go
# Run (requires k8s config)
export NAMESPACE=dexorder-agents
export DEPLOYMENT_NAME=agent-test
export USER_TYPE=free
./lifecycle-sidecar
```
### Integration Test
1. Deploy test agent with sidecar
2. Verify agent starts and is healthy
3. Stop sending MCP calls and remove all triggers
4. Wait for idle timeout + check interval
5. Verify deployment is deleted
## Troubleshooting
### Container not shutting down when idle
Check logs:
```bash
kubectl logs -n dexorder-agents agent-user-abc123 -c agent
```
Verify:
- `ENABLE_IDLE_SHUTDOWN=true`
- No active triggers: `manager.active_triggers` should be empty
- Idle timeout exceeded
### Sidecar not deleting deployment
Check sidecar logs:
```bash
kubectl logs -n dexorder-agents agent-user-abc123 -c lifecycle-sidecar
```
Verify:
- Exit code file exists: `/var/run/agent/exit_code` contains `42`
- RBAC permissions: `kubectl auth can-i delete deployments --as=system:serviceaccount:dexorder-agents:agent-lifecycle -n dexorder-agents`
- Deployment name matches: Check `DEPLOYMENT_NAME` env var
### Gateway can't create deployments
Check gateway logs and verify:
- ServiceAccount exists: `kubectl get sa gateway -n dexorder-system`
- RoleBinding exists: `kubectl get rolebinding gateway-agent-creator -n dexorder-agents`
- Admission policy allows image: Check image name matches allowlist in `admission-policy.yaml`
## Future Enhancements
1. **Graceful shutdown notifications**: Warn users before shutdown via websocket
2. **Predictive scaling**: Keep frequently-used containers warm
3. **Tiered storage**: Move old PVCs to cheaper storage class
4. **Metrics**: Expose lifecycle metrics (idle rate, shutdown count, etc.)
5. **Cost allocation**: Track resource usage per user/license tier

View File

@@ -0,0 +1,286 @@
# Gateway Container Creation
## Overview
The gateway automatically provisions user agent containers when users authenticate. This ensures each user has their own isolated environment running their MCP server with persistent storage.
## Authentication Flow with Container Creation
```
User connects (WebSocket/Telegram)
Send "Authenticating..." status
Verify token/channel link
Lookup user license from DB
Send "Starting workspace..." status
┌────────────────────────────────────┐
│ ContainerManager.ensureRunning() │
│ ┌──────────────────────────────┐ │
│ │ Check if deployment exists │ │
│ └──────────────────────────────┘ │
│ ↓ │
│ Does it exist? │
│ ↙ ↘ │
│ Yes No │
│ │ │ │
│ │ ┌──────────────────┐ │
│ │ │ Create deployment│ │
│ │ │ Create PVC │ │
│ │ │ Create service │ │
│ │ └──────────────────┘ │
│ │ │ │
│ └────────────┘ │
│ ↓ │
│ Wait for deployment ready │
│ (polls every 2s, timeout 2min) │
│ ↓ │
│ Compute MCP endpoint URL │
│ (internal k8s service DNS) │
└────────────────────────────────────┘
Update license.mcpServerUrl
Send "Connected" status
Initialize AgentHarness
Connect to user's MCP server
Ready for messages
```
## Container Naming Convention
All resources follow a consistent naming pattern based on `userId`:
```typescript
userId: "user-abc123"
deploymentName: "agent-user-abc123"
serviceName: "agent-user-abc123"
pvcName: "agent-user-abc123-data"
mcpEndpoint: "http://agent-user-abc123.dexorder-agents.svc.cluster.local:3000"
```
User IDs are sanitized to be Kubernetes-compliant (lowercase alphanumeric + hyphens).
## Templates by License Tier
Templates are located in `gateway/src/k8s/templates/`:
- `free-tier.yaml`
- `pro-tier.yaml`
- `enterprise-tier.yaml`
### Variable Substitution
Templates use simple string replacement:
- `{{userId}}` - User ID
- `{{deploymentName}}` - Computed deployment name
- `{{serviceName}}` - Computed service name
- `{{pvcName}}` - Computed PVC name
- `{{agentImage}}` - Agent container image (from env)
- `{{sidecarImage}}` - Lifecycle sidecar image (from env)
- `{{storageClass}}` - Kubernetes storage class (from env)
### Resource Limits
| Tier | Memory Request | Memory Limit | CPU Request | CPU Limit | Storage | Idle Timeout |
|------|----------------|--------------|-------------|-----------|---------|--------------|
| **Free** | 256Mi | 512Mi | 100m | 500m | 1Gi | 15min |
| **Pro** | 512Mi | 2Gi | 250m | 2000m | 10Gi | 60min |
| **Enterprise** | 1Gi | 4Gi | 500m | 4000m | 50Gi | Never (shutdown disabled) |
## Components
### KubernetesClient (`gateway/src/k8s/client.ts`)
Low-level k8s API wrapper:
- `deploymentExists(name)` - Check if deployment exists
- `createAgentDeployment(spec)` - Create deployment/service/PVC from template
- `waitForDeploymentReady(name, timeout)` - Poll until ready
- `getServiceEndpoint(name)` - Get service URL
- `deleteAgentDeployment(userId)` - Cleanup (for testing)
Static helpers:
- `getDeploymentName(userId)` - Generate deployment name
- `getServiceName(userId)` - Generate service name
- `getPvcName(userId)` - Generate PVC name
- `getMcpEndpoint(userId, namespace)` - Compute internal service URL
### ContainerManager (`gateway/src/k8s/container-manager.ts`)
High-level orchestration:
- `ensureContainerRunning(userId, license)` - Main entry point
- Returns: `{ mcpEndpoint, wasCreated }`
- Creates deployment if missing
- Waits for ready state
- Returns endpoint URL
- `getContainerStatus(userId)` - Check status without creating
- `deleteContainer(userId)` - Manual cleanup
### Authenticator (`gateway/src/auth/authenticator.ts`)
Updated to call container manager:
- `authenticateWebSocket()` - Calls `ensureContainerRunning()` before returning `AuthContext`
- `authenticateTelegram()` - Same for Telegram webhooks
### WebSocketHandler (`gateway/src/channels/websocket-handler.ts`)
Multi-phase connection protocol:
1. Send `{type: 'status', status: 'authenticating'}`
2. Authenticate (may take 30-120s if creating container)
3. Send `{type: 'status', status: 'initializing'}`
4. Initialize agent harness
5. Send `{type: 'connected', ...}`
This gives the client visibility into the startup process.
## Configuration
Environment variables:
```bash
# Kubernetes
KUBERNETES_NAMESPACE=dexorder-agents
KUBERNETES_IN_CLUSTER=true # false for local dev
KUBERNETES_CONTEXT=minikube # for local dev only
# Container images
AGENT_IMAGE=ghcr.io/dexorder/agent:latest
SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
# Storage
AGENT_STORAGE_CLASS=standard
```
## Security
The gateway uses a restricted ServiceAccount with RBAC:
**Can do:**
- ✅ Create deployments in `dexorder-agents` namespace
- ✅ Create services in `dexorder-agents` namespace
- ✅ Create PVCs in `dexorder-agents` namespace
- ✅ Read pod status and logs (debugging)
- ✅ Update deployments (future: resource scaling)
**Cannot do:**
- ❌ Delete deployments (handled by lifecycle sidecar)
- ❌ Delete PVCs (handled by lifecycle sidecar)
- ❌ Exec into pods
- ❌ Access secrets or configmaps
- ❌ Create resources in other namespaces
- ❌ Access Kubernetes API from agent containers (blocked by NetworkPolicy)
See `deploy/k8s/base/gateway-rbac.yaml` for full configuration.
## Lifecycle
### Container Creation (Gateway)
- User authenticates
- Gateway checks if deployment exists
- If missing, creates from template
- Waits for ready (2min timeout)
- Returns MCP endpoint
### Container Deletion (Lifecycle Sidecar)
- Container tracks activity and triggers
- When idle (no triggers + timeout), exits with code 42
- Sidecar detects exit code 42
- Sidecar deletes deployment + optional PVC via k8s API
- Gateway creates fresh container on next authentication
See `doc/container_lifecycle_management.md` for full lifecycle details.
## Error Handling
| Error | Gateway Action | User Experience |
|-------|----------------|-----------------|
| Deployment creation fails | Log error, return auth failure | "Authentication failed" |
| Wait timeout (image pull, etc.) | Log warning, return 503 | "Service unavailable, retry" |
| Service not found | Retry with backoff | Transparent retry |
| MCP connection fails | Return error | "Failed to connect to workspace" |
| Existing deployment not ready | Wait 30s, continue if still not ready | May connect to partially-ready container |
## Local Development
For local development (outside k8s):
1. Start minikube:
```bash
minikube start
minikube addons enable storage-provisioner
```
2. Apply security policies:
```bash
kubectl apply -k deploy/k8s/dev
```
3. Configure gateway for local k8s:
```bash
# .env
KUBERNETES_IN_CLUSTER=false
KUBERNETES_CONTEXT=minikube
KUBERNETES_NAMESPACE=dexorder-agents
```
4. Run gateway:
```bash
cd gateway
npm run dev
```
5. Connect via WebSocket:
```bash
wscat -c "ws://localhost:3000/ws/chat" -H "Authorization: Bearer your-jwt"
```
The gateway will create deployments in minikube. View with:
```bash
kubectl get deployments -n dexorder-agents
kubectl get pods -n dexorder-agents
kubectl logs -n dexorder-agents agent-user-abc123 -c agent
```
## Production Deployment
1. Build and push gateway image:
```bash
cd gateway
docker build -t ghcr.io/dexorder/gateway:latest .
docker push ghcr.io/dexorder/gateway:latest
```
2. Deploy to k8s:
```bash
kubectl apply -k deploy/k8s/prod
```
3. Gateway runs in `dexorder-system` namespace
4. Creates agent containers in `dexorder-agents` namespace
5. Admission policies enforce image allowlist and security constraints
## Monitoring
Useful metrics to track:
- Container creation latency (time from auth to ready)
- Container creation failure rate
- Active containers by license tier
- Resource usage per tier
- Idle shutdown rate
These can be exported via Prometheus or logged to monitoring service.
## Future Enhancements
1. **Pre-warming**: Create containers for active users before they connect
2. **Image updates**: Handle agent image version migrations with user consent
3. **Multi-region**: Geo-distributed container placement
4. **Cost tracking**: Per-user resource usage and billing
5. **Auto-scaling**: Scale down to 0 replicas instead of deletion (faster restart)
6. **Container pools**: Shared warm containers for anonymous users

View File

@@ -0,0 +1,80 @@
Mode A: Platform Harness → Hosted Container (internal)
Auth: mTLS + platform-signed user claim
Network: k8s internal, never hits the internet
Mode B: Platform Harness → External User Container (remote)
Auth: OAuth2 token issued by your platform
Network: public internet, TLS required
Mode C: Third-party MCP Client → External User Container (standalone)
Auth: User-managed API key or local-only (no network)
Network: localhost or user's own network
┌──────────────────────────────────────────────────────────┐
│ Platform (Postgres) │
│ │
│ users │
│ ├── id, email, password_hash, plan_tier │
│ │ │
│ containers │
│ ├── user_id │
│ ├── type: "hosted" | "external" │
│ ├── mcp_endpoint: "internal-svc:3100" | "https://..." │
│ ├── auth_method: "mtls" | "platform_token" | "api_key" │
│ └── public_key_fingerprint (for pinning external certs) │
│ │
│ api_tokens │
│ ├── user_id │
│ ├── token_hash │
│ ├── scopes: ["mcp:tools", "mcp:resources", "data:read"] │
│ ├── expires_at │
│ └── issued_for: "platform_harness" | "user_direct" │
│ │
└──────────────────────────────────────────────────────────┘
## Mode A
Harness ──mTLS──▶ k8s Service ──▶ User Container MCP
Validates: source is platform namespace
Extracts: user_id from forwarded header
## Mode B
Registration flow (one-time):
1. User provides their MCP endpoint URL in platform settings
2. Platform generates a scoped token (JWT, short-lived, auto-refreshed)
3. User configures their MCP server to accept tokens signed by your platform
4. Platform stores the endpoint + auth method
Runtime:
┌──────────┐ HTTPS + Bearer token ┌────────────────────┐
│ Harness │ ─────────────────────────▶ │ External MCP Server│
│ │ Authorization: │ │
│ │ Bearer <platform_jwt> │ Validates: │
│ │ │ - JWT signature │
│ │ │ (your public │
│ │ │ key, JWKS) │
│ │ │ - user_id claim │
│ │ │ matches self │
│ │ │ - not expired │
└──────────┘ └────────────────────┘
## Mode C
```yaml
# openclaw/config.yaml
auth:
# For local-only use (Claude Desktop, Cursor, etc via stdio)
mode: "local" # no network auth needed
# OR for remote access
mode: "token"
tokens:
- name: "my-laptop"
hash: "sha256:..." # generated by `openclaw token create`
# OR for platform integration
mode: "platform"
platform_jwks_url: "https://api.openclaw.io/.well-known/jwks.json"
expected_user_id: "user_abc123"
```

View File

@@ -0,0 +1,29 @@
MCP Tools (User Container)
├── Memory
│ ├── get_conversation_history(limit)
│ ├── save_message(role, content)
│ ├── search_memory(query) ← semantic search over past conversations
│ └── get_context_summary() ← "who is this user, what do they care about"
├── Strategies & Indicators
│ ├── list_strategies()
│ ├── read_strategy(name)
│ ├── write_strategy(name, code)
│ ├── list_indicators()
│ ├── read_indicator(name)
│ ├── write_indicator(name, code)
│ └── run_backtest(strategy, params)
├── Preferences
│ ├── get_preferences()
│ ├── set_preference(key, value)
│ └── get_agent_prompt() ← user's custom system prompt additions
├── Trading
│ ├── get_watchlist()
│ ├── execute_trade(params)
│ ├── get_positions()
│ └── get_trade_history()
└── Sandbox
└── run_python(code) ← datascience toolset, matplotlib, etc.

472
doc/user_mcp_resources.md Normal file
View File

@@ -0,0 +1,472 @@
# User MCP Server - Resource Architecture
The user's MCP server container owns **all** conversation history, RAG, and contextual data. The platform gateway is a thin, stateless orchestrator that only holds the Anthropic API key.
## Architecture Principle
**User Container = Fat Context**
- Conversation history (PostgreSQL/SQLite)
- RAG system (embeddings, vector search)
- User preferences and custom prompts
- Trading context (positions, watchlists, alerts)
- All user-specific data
**Platform Gateway = Thin Orchestrator**
- Anthropic API key (platform pays for LLM)
- Session management (WebSocket/Telegram connections)
- MCP client connection pooling
- Tool routing (platform vs user tools)
- **Zero conversation state stored**
## MCP Resources for Context Injection
Resources are **read-only** data sources that provide context to the LLM. They're fetched before each Claude API call and embedded in the conversation.
### Standard Context Resources
#### 1. `context://user-profile`
**Purpose:** User's trading background and preferences
**MIME Type:** `text/plain`
**Example Content:**
```
User Profile:
- Trading experience: Intermediate
- Preferred timeframes: 1h, 4h, 1d
- Risk tolerance: Medium
- Focus: Swing trading with technical indicators
- Favorite indicators: RSI, MACD, Bollinger Bands
- Active pairs: BTC/USDT, ETH/USDT, SOL/USDT
```
**Implementation Notes:**
- Stored in user's database `user_preferences` table
- Updated via preference management tools
- Includes inferred data from usage patterns
---
#### 2. `context://conversation-summary`
**Purpose:** Semantic summary of recent conversation with RAG-enhanced context
**MIME Type:** `text/plain`
**Example Content:**
```
Recent Conversation Summary:
Last 10 messages (summarized):
- User asked about moving average crossover strategies
- Discussed backtesting parameters for BTC/USDT
- Reviewed risk management with 2% position sizing
- Explored adding RSI filter to reduce false signals
Relevant past discussions (RAG search):
- 2 weeks ago: Similar strategy development on ETH/USDT
- 1 month ago: User prefers simple strategies over complex ones
- Past preference: Avoid strategies with >5 indicators
Current focus: Optimizing MA crossover with momentum filter
```
**Implementation Notes:**
- Last N messages stored in `conversation_history` table
- RAG search against embeddings of past conversations
- Semantic search using user's current message as query
- ChromaDB/pgvector for embedding storage
- Summary generated on-demand (can be cached for 1-5 minutes)
**RAG Integration:**
```python
async def get_conversation_summary() -> str:
    """Summarize the recent conversation, enriched with related past context.

    Pulls the last 50 messages, uses the most recent one as the semantic
    query for the RAG index, and delegates formatting to build_summary.
    """
    recent = await db.get_recent_messages(limit=50)
    # The newest message drives the relevance search.
    latest_text = recent[-1].content
    related = await rag.search_conversation_history(
        query=latest_text,
        limit=5,
        min_score=0.7,
    )
    return build_summary(recent[-10:], related)
```
---
#### 3. `context://workspace-state`
**Purpose:** Current trading workspace (chart, positions, watchlist)
**MIME Type:** `application/json`
**Example Content:**
```json
{
"currentChart": {
"ticker": "BINANCE:BTC/USDT",
"timeframe": "1h",
"indicators": ["SMA(20)", "RSI(14)", "MACD(12,26,9)"]
},
"watchlist": ["BTC/USDT", "ETH/USDT", "SOL/USDT"],
"openPositions": [
{
"ticker": "BTC/USDT",
"side": "long",
"size": 0.1,
"entryPrice": 45000,
"currentPrice": 46500,
"unrealizedPnL": 150
}
],
"recentAlerts": [
{
"type": "price_alert",
"message": "BTC/USDT crossed above $46,000",
"timestamp": "2025-01-15T10:30:00Z"
}
]
}
```
**Implementation Notes:**
- Synced from web client chart state
- Updated via WebSocket sync protocol
- Includes active indicators on current chart
- Position data from trading system
---
#### 4. `context://system-prompt`
**Purpose:** User's custom instructions and preferences for AI behavior
**MIME Type:** `text/plain`
**Example Content:**
```
Custom Instructions:
- Be concise and data-driven
- Always show risk/reward ratios
- Prefer simple strategies over complex ones
- When suggesting trades, include stop-loss and take-profit levels
- Explain your reasoning in trading decisions
```
**Implementation Notes:**
- User-editable in preferences UI
- Appended **last** to system prompt (highest priority)
- Can override platform defaults
- Stored in `user_preferences.custom_prompt` field
---
## MCP Tools for Actions
Tools are for **actions** that have side effects. These are **not** used for context fetching.
### Conversation Management
- `save_message(role, content, timestamp)` - Save message to history
- `search_conversation(query, limit)` - Explicit semantic search (for user queries like "what did we discuss about BTC?")
### Strategy & Indicators
- `list_strategies()` - List user's strategies
- `read_strategy(name)` - Get strategy code
- `write_strategy(name, code)` - Save strategy
- `run_backtest(strategy, params)` - Execute backtest
### Trading
- `get_watchlist()` - Get watchlist (action that may trigger sync)
- `execute_trade(params)` - Execute trade order
- `get_positions()` - Fetch current positions from exchange
### Sandbox
- `run_python(code)` - Execute Python code with data science libraries
---
## Gateway Harness Flow
```typescript
// gateway/src/harness/agent-harness.ts
// Handles one inbound user message end-to-end: fetch context from the user's
// MCP server, call Claude with that context embedded, persist both sides of
// the exchange back to the MCP server, then return the reply.
async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
  // 1. Fetch context resources from user's MCP
  const contextResources = await fetchContextResources([
    'context://user-profile',
    'context://conversation-summary', // <-- RAG happens here
    'context://workspace-state',
    'context://system-prompt',
  ]);
  // 2. Build system prompt from resources
  const systemPrompt = buildSystemPrompt(contextResources);
  // 3. Build messages with embedded conversation context
  const messages = buildMessages(message, contextResources);
  // 4. Get tools from MCP
  const tools = await mcpClient.listTools();
  // 5. Call Claude with embedded context
  const response = await anthropic.messages.create({
    model: 'claude-3-5-sonnet-20241022',
    system: systemPrompt, // <-- User profile + workspace + custom prompt
    messages, // <-- Conversation summary from RAG
    tools,
  });
  // 6. Save to user's MCP (tool call)
  await mcpClient.callTool('save_message', { role: 'user', content: message.content });
  // NOTE(review): this saves the full API response object as 'content' —
  // presumably the text blocks are wanted; confirm against save_message's schema.
  await mcpClient.callTool('save_message', { role: 'assistant', content: response });
  return response;
}
```
---
## User MCP Server Implementation (Python)
### Resource Handler
```python
# user-mcp/src/resources.py
from mcp.server import Server
from mcp.types import Resource, ResourceTemplate
import asyncpg
server = Server("dexorder-user")
@server.list_resources()
async def list_resources() -> list[Resource]:
    """Advertise the read-only context resources this server exposes."""
    # (uri, name, description, mime type) for each context resource.
    catalog = (
        ("context://user-profile", "User Profile",
         "Trading style, preferences, and background", "text/plain"),
        ("context://conversation-summary", "Conversation Summary",
         "Recent conversation with RAG-enhanced context", "text/plain"),
        ("context://workspace-state", "Workspace State",
         "Current chart, watchlist, positions", "application/json"),
        ("context://system-prompt", "Custom System Prompt",
         "User's custom AI instructions", "text/plain"),
    )
    return [
        Resource(uri=uri, name=name, description=description, mimeType=mime)
        for uri, name, description, mime in catalog
    ]
@server.read_resource()
async def read_resource(uri: str) -> str:
    """Resolve a context resource URI to its current content.

    Raises:
        ValueError: if the URI is not one of the advertised resources.
    """
    # NOTE(review): context.py defines build_conversation_summary(user_id) —
    # confirm the zero-arg call here matches the actual signature.
    builders = {
        "context://user-profile": build_user_profile,
        "context://conversation-summary": build_conversation_summary,
        "context://workspace-state": build_workspace_state,
        "context://system-prompt": get_custom_prompt,
    }
    builder = builders.get(uri)
    if builder is None:
        raise ValueError(f"Unknown resource: {uri}")
    return await builder()
```
### RAG Integration
```python
# user-mcp/src/rag.py
import chromadb
from sentence_transformers import SentenceTransformer
class ConversationRAG:
    """Semantic (RAG) index over a user's conversation history.

    Uses a persistent ChromaDB collection for vector storage and a local
    sentence-transformers model for embeddings.
    """

    def __init__(self, db_path: str):
        self.chroma = chromadb.PersistentClient(path=db_path)
        # Use cosine space so query distances are bounded and map cleanly to
        # similarities (similarity = 1 - distance). Chroma's default is l2,
        # whose unbounded distances would make the min_score filter below
        # meaningless. NOTE(review): a pre-existing collection created with a
        # different space keeps its original metric — migration may be needed.
        self.collection = self.chroma.get_or_create_collection(
            "conversations",
            metadata={"hnsw:space": "cosine"},
        )
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

    async def search_conversation_history(
        self,
        query: str,
        limit: int = 5,
        min_score: float = 0.7
    ) -> list[dict]:
        """Semantic search over conversation history.

        Args:
            query: Natural-language query text.
            limit: Maximum number of results to return.
            min_score: Minimum cosine *similarity* (not distance) to keep.

        Returns:
            Matches as dicts with 'content', 'metadata', and 'score'
            (cosine similarity, higher = more relevant).
        """
        # Embed query
        query_embedding = self.embedder.encode(query).tolist()
        # Search
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=limit,
        )
        # BUG FIX: collection.query returns *distances* (smaller = closer);
        # the original filtered `distance >= min_score`, which kept the LEAST
        # relevant hits. Convert distance to similarity before thresholding.
        relevant = []
        for i, distance in enumerate(results['distances'][0]):
            score = 1.0 - distance
            if score >= min_score:
                relevant.append({
                    'content': results['documents'][0][i],
                    'metadata': results['metadatas'][0][i],
                    'score': score,
                })
        return relevant

    async def add_message(self, message_id: str, role: str, content: str, metadata: dict):
        """Embed and add a message to the RAG index."""
        embedding = self.embedder.encode(content).tolist()
        self.collection.add(
            ids=[message_id],
            embeddings=[embedding],
            documents=[content],
            metadatas=[{
                'role': role,
                'timestamp': metadata.get('timestamp'),
                **metadata
            }]
        )
```
### Conversation Summary Builder
```python
# user-mcp/src/context.py
async def build_conversation_summary(user_id: str) -> str:
    """Build a conversation summary combining recent messages with RAG hits.

    Args:
        user_id: Owner of the conversation; also selects the RAG store path.

    Returns:
        A human-readable summary: the 10 most recent messages (oldest
        first), relevant past discussions surfaced by RAG, and an inferred
        current focus. A placeholder string when there is no history.
    """
    # 1. Get recent messages (order='desc' => newest first)
    recent_messages = await db.get_messages(
        user_id=user_id,
        limit=50,
        order='desc'
    )
    # 2. Current focus = most recent user message (first match, list is
    #    newest-first)
    last_user_msg = next(
        (m for m in recent_messages if m.role == 'user'),
        None
    )
    if not last_user_msg:
        return "No recent conversation history."
    # 3. RAG search for relevant context
    rag = ConversationRAG(f"/data/users/{user_id}/rag")
    relevant_context = await rag.search_conversation_history(
        query=last_user_msg.content,
        limit=5,
        min_score=0.7
    )
    # 4. Build summary
    summary = "Recent Conversation Summary:\n\n"
    # Last 10 messages. The list is newest-first, so the 10 most recent sit
    # at the *front*; the previous code sliced [-10:], which picked the
    # oldest messages of the fetched window. Reverse so they read
    # oldest -> newest.
    summary += "Last 10 messages:\n"
    for msg in reversed(recent_messages[:10]):
        summary += f"- {msg.role}: {msg.content[:100]}...\n"
    # Relevant past context
    if relevant_context:
        summary += "\nRelevant past discussions (RAG):\n"
        for ctx in relevant_context:
            timestamp = ctx['metadata'].get('timestamp', 'unknown')
            summary += f"- [{timestamp}] {ctx['content'][:150]}...\n"
    # Inferred focus
    summary += f"\nCurrent focus: {infer_topic(last_user_msg.content)}\n"
    return summary
def infer_topic(message: str) -> str:
    """Best-effort topic label for a message via keyword matching.

    Returns one of 'strategy', 'indicator', 'analysis', 'risk', or the
    fallback 'general trading discussion'. Matching is case-insensitive
    and checks topics in declaration order, first hit wins.
    """
    topic_keywords = (
        ('strategy', ('strategy', 'backtest', 'trading system')),
        ('indicator', ('indicator', 'rsi', 'macd', 'moving average')),
        ('analysis', ('analyze', 'chart', 'price action')),
        ('risk', ('risk', 'position size', 'stop loss')),
    )
    lowered = message.lower()
    for topic, needles in topic_keywords:
        for needle in needles:
            if needle in lowered:
                return topic
    return 'general trading discussion'
```
---
## Benefits of This Architecture
1. **Privacy**: Conversation history never leaves user's container
2. **Customization**: Each user controls their RAG, embeddings, prompt engineering
3. **Scalability**: Platform harness is stateless - horizontally scalable
4. **Cost Control**: Platform pays for Claude, users pay for their compute/storage
5. **Portability**: Users can export/migrate their entire context
6. **Development**: Users can test prompts/context locally without platform involvement
---
## Future Enhancements
### Dynamic Resource URIs
Support parameterized resources:
```
context://conversation/{session_id}
context://strategy/{strategy_name}
context://backtest/{backtest_id}/results
```
### Resource Templates
MCP supports resource templates for dynamic discovery:
```python
@server.list_resource_templates()
async def list_templates() -> list[ResourceTemplate]:
return [
ResourceTemplate(
uriTemplate="context://strategy/{name}",
name="Strategy Context",
description="Context for specific strategy",
)
]
```
### Streaming Resources
For large context (e.g., full backtest results), support streaming:
```python
@server.read_resource()
async def read_resource(uri: str) -> AsyncIterator[str]:
if uri.startswith("context://backtest/"):
async for chunk in stream_backtest_results(uri):
yield chunk
```
---
## Migration Path
For users with existing conversation history in platform DB:
1. **Export script**: Migrate platform history → user container DB
2. **RAG indexing**: Embed all historical messages into ChromaDB
3. **Preference migration**: Copy user preferences to container
4. **Cutover**: Switch to resource-based context fetching
Platform can keep read-only archive for compliance, but active context lives in user container.

9
gateway/.dockerignore Normal file
View File

@@ -0,0 +1,9 @@
node_modules
dist
.env
.env.*
!.env.example
*.log
.git
.gitignore
README.md

39
gateway/.env.example Normal file
View File

@@ -0,0 +1,39 @@
# Server configuration
PORT=3000
HOST=0.0.0.0
LOG_LEVEL=info
CORS_ORIGIN=*
# Database
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder
# LLM Provider API Keys (configure at least one)
# Anthropic Claude
ANTHROPIC_API_KEY=sk-ant-xxxxx
# OpenAI GPT
OPENAI_API_KEY=sk-xxxxx
# Google Gemini
GOOGLE_API_KEY=xxxxx
# OpenRouter (access to 300+ models with one key)
OPENROUTER_API_KEY=sk-or-xxxxx
# Default model (if user has no preference)
DEFAULT_MODEL_PROVIDER=anthropic
DEFAULT_MODEL=claude-3-5-sonnet-20241022
# Telegram (optional)
TELEGRAM_BOT_TOKEN=
# Kubernetes configuration
KUBERNETES_NAMESPACE=dexorder-agents
KUBERNETES_IN_CLUSTER=false
KUBERNETES_CONTEXT=minikube
AGENT_IMAGE=ghcr.io/dexorder/agent:latest
SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
AGENT_STORAGE_CLASS=standard
# Redis (for session management - future)
# REDIS_URL=redis://localhost:6379

6
gateway/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
node_modules
dist
.env
.env.local
*.log
.DS_Store

313
gateway/ARCHITECTURE.md Normal file
View File

@@ -0,0 +1,313 @@
# Gateway Architecture: LangChain.js + LangGraph
## Why LangChain.js (Not Vercel AI SDK or Direct Anthropic SDK)?
### The Decision
After evaluating Vercel AI SDK and LangChain.js, we chose **LangChain.js + LangGraph** for these reasons:
1. **Multi-model support**: 300+ models via OpenRouter, plus direct integrations
2. **Complex workflows**: LangGraph for stateful trading analysis pipelines
3. **No vendor lock-in**: Switch between Anthropic, OpenAI, Google with one line
4. **Streaming**: Same as Vercel AI SDK (`.stream()` method)
5. **Tool calling**: Unified across all providers
6. **Trading-specific**: State management, conditional branching, human-in-the-loop
**We don't need Vercel AI SDK because:**
- ❌ We use Vue (not React) - don't need React hooks
- ❌ We have Node.js servers (not edge) - don't need edge runtime
- ✅ **DO need** complex workflows (strategy analysis, backtesting, approvals)
- ✅ **DO need** stateful execution (resume from failures)
---
## Architecture Layers
### Layer 1: Model Abstraction (`src/llm/`)
**Provider Factory** (`provider.ts`)
```typescript
const factory = new LLMProviderFactory(config, logger);
// Create any model
const claude = factory.createModel({
provider: 'anthropic',
model: 'claude-3-5-sonnet-20241022',
});
const gpt4 = factory.createModel({
provider: 'openai',
model: 'gpt-4o',
});
```
**Model Router** (`router.ts`)
```typescript
const router = new ModelRouter(factory, logger);
// Intelligently route based on:
// - User license (free → Gemini Flash, pro → GPT-4, enterprise → Claude)
// - Query complexity (simple → cheap, complex → smart)
// - User preference (if set in license.preferredModel)
// - Cost optimization (always use cheapest)
const model = await router.route(
message.content,
userLicense,
RoutingStrategy.COMPLEXITY
);
```
---
### Layer 2: Agent Harness (`src/harness/`)
**Stateless Orchestrator**
The harness has **ZERO conversation state**. Everything lives in user's MCP container.
**Flow:**
```typescript
async handleMessage(message: InboundMessage) {
// 1. Fetch context from user's MCP (resources, not tools)
const resources = await mcpClient.listResources();
const context = await Promise.all([
mcpClient.readResource('context://user-profile'), // Trading style
mcpClient.readResource('context://conversation-summary'), // RAG summary
mcpClient.readResource('context://workspace-state'), // Current chart
mcpClient.readResource('context://system-prompt'), // Custom instructions
]);
// 2. Route to appropriate model
const model = await modelRouter.route(message, license);
// 3. Build messages with embedded context
const messages = buildLangChainMessages(systemPrompt, context);
// 4. Call LLM
const response = await model.invoke(messages);
// 5. Save to user's MCP (tool call)
await mcpClient.callTool('save_message', { role: 'user', content: message });
await mcpClient.callTool('save_message', { role: 'assistant', content: response });
return response;
}
```
**Streaming variant:**
```typescript
async *streamMessage(message: InboundMessage) {
const model = await modelRouter.route(message, license);
const messages = buildMessages(context, message);
const stream = await model.stream(messages);
let fullResponse = '';
for await (const chunk of stream) {
fullResponse += chunk.content;
yield chunk.content; // Stream to WebSocket/Telegram
}
// Save after streaming completes
await mcpClient.callTool('save_message', { /* ... */ });
}
```
---
### Layer 3: Workflows (`src/workflows/`)
**LangGraph for Complex Trading Analysis**
```typescript
// Example: Strategy Analysis Pipeline
const workflow = new StateGraph(StrategyAnalysisState)
.addNode('code_review', async (state) => {
const model = new ChatAnthropic({ model: 'claude-3-opus' });
const review = await model.invoke(`Review: ${state.strategyCode}`);
return { codeReview: review.content };
})
.addNode('backtest', async (state) => {
// Call user's MCP backtest tool
const results = await mcpClient.callTool('run_backtest', {
strategy: state.strategyCode,
ticker: state.ticker,
});
return { backtestResults: results };
})
.addNode('risk_assessment', async (state) => {
const model = new ChatAnthropic({ model: 'claude-3-5-sonnet' });
const assessment = await model.invoke(
`Analyze risk: ${JSON.stringify(state.backtestResults)}`
);
return { riskAssessment: assessment.content };
})
.addNode('human_approval', async (state) => {
// Pause for user review (human-in-the-loop)
return { humanApproved: await waitForUserApproval(state) };
})
.addConditionalEdges('human_approval', (state) => {
return state.humanApproved ? 'deploy' : 'reject';
})
.compile();
// Execute
const result = await workflow.invoke({
strategyCode: userCode,
ticker: 'BTC/USDT',
timeframe: '1h',
});
```
**Benefits:**
- **Stateful**: Resume if server crashes mid-analysis
- **Conditional**: Route based on results (if Sharpe > 2 → deploy, else → reject)
- **Human-in-the-loop**: Pause for user approval
- **Multi-step**: Each node can use different models
---
## User Context Architecture
### MCP Resources (Not Tools)
**User's MCP server exposes resources** (read-only context):
```
context://user-profile → Trading style, preferences
context://conversation-summary → RAG-generated summary
context://workspace-state → Current chart, positions
context://system-prompt → User's custom AI instructions
```
**Gateway fetches and embeds in LLM call:**
```typescript
const userProfile = await mcpClient.readResource('context://user-profile');
const conversationSummary = await mcpClient.readResource('context://conversation-summary');
// User's MCP server runs RAG search and returns summary
// Gateway embeds this in Claude/GPT prompt
```
**Why resources, not tools?**
- Resources = context injection (read-only)
- Tools = actions (write operations)
- Context should be fetched **before** LLM call, not during
---
## Model Routing Strategies
### 1. User Preference
```typescript
// User's license has preferred model
{
"preferredModel": {
"provider": "anthropic",
"model": "claude-3-5-sonnet-20241022"
}
}
// Router uses this if set
```
### 2. Complexity-Based
```typescript
const isComplex = message.includes('backtest') || message.length > 200;
if (isComplex) {
return { provider: 'anthropic', model: 'claude-3-opus' }; // Smart
} else {
return { provider: 'openai', model: 'gpt-4o-mini' }; // Fast
}
```
### 3. License Tier
```typescript
switch (license.licenseType) {
case 'free':
return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Cheap
case 'pro':
return { provider: 'openai', model: 'gpt-4o' }; // Balanced
case 'enterprise':
return { provider: 'anthropic', model: 'claude-3-5-sonnet' }; // Premium
}
```
### 4. Cost-Optimized
```typescript
return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Always cheapest
```
---
## When to Use What
### Simple Chat → Agent Harness
```typescript
// User: "What's the RSI on BTC?"
// → Fast streaming response via harness.streamMessage()
```
### Complex Analysis → LangGraph Workflow
```typescript
// User: "Analyze this strategy and backtest it"
// → Multi-step workflow: code review → backtest → risk → approval
```
### Direct Tool Call → MCP Client
```typescript
// User: "Get my watchlist"
// → Direct MCP tool call, no LLM needed
```
---
## Data Flow
```
User Message ("Analyze my strategy")
Gateway → Route to workflow (not harness)
LangGraph Workflow:
├─ Node 1: Code Review (Claude Opus)
│ └─ Analyzes strategy code
├─ Node 2: Backtest (MCP tool call)
│ └─ User's container runs backtest
├─ Node 3: Risk Assessment (Claude Sonnet)
│ └─ Evaluates results
├─ Node 4: Human Approval (pause)
│ └─ User reviews in UI
└─ Node 5: Recommendation (GPT-4o-mini)
└─ Final decision
Result → Return to user
```
---
## Benefits Summary
| Feature | LangChain.js | Vercel AI SDK | Direct Anthropic SDK |
|---------|--------------|---------------|----------------------|
| Multi-model | ✅ 300+ models | ✅ 100+ models | ❌ Anthropic only |
| Streaming | ✅ `.stream()` | ✅ `streamText()` | ✅ `.stream()` |
| Tool calling | ✅ Unified | ✅ Unified | ✅ Anthropic format |
| Complex workflows | ✅ LangGraph | ❌ Limited | ❌ DIY |
| Stateful agents | ✅ LangGraph | ❌ No | ❌ No |
| Human-in-the-loop | ✅ LangGraph | ❌ No | ❌ No |
| React hooks | ❌ N/A | ✅ `useChat()` | ❌ N/A |
| Bundle size | Large (101kb) | Small (30kb) | Medium (60kb) |
| **Dexorder needs** | **✅ Perfect fit** | **❌ Missing workflows** | **❌ Vendor lock-in** |
---
## Next Steps
1. **Implement tool calling** in agent harness (bind MCP tools to LangChain)
2. **Add state persistence** for LangGraph (PostgreSQL checkpointer)
3. **Build more workflows**: market scanner, portfolio optimizer
4. **Add monitoring**: Track model usage, costs, latency
5. **User container**: Implement Python MCP server with resources

40
gateway/Dockerfile Normal file
View File

@@ -0,0 +1,40 @@
FROM node:22-alpine AS builder
WORKDIR /app
# Copy package files
COPY package*.json ./
COPY tsconfig.json ./
# Install dependencies
RUN npm ci
# Copy source
COPY src ./src
# Build
RUN npm run build
# Production image
FROM node:22-alpine
WORKDIR /app
# Copy package files
COPY package*.json ./
# Install production dependencies only
RUN npm ci --omit=dev
# Copy built application
COPY --from=builder /app/dist ./dist
# Create non-root user
RUN addgroup -g 1001 -S nodejs && \
adduser -S nodejs -u 1001
USER nodejs
EXPOSE 3000
CMD ["node", "dist/main.js"]

212
gateway/README.md Normal file
View File

@@ -0,0 +1,212 @@
# Dexorder Gateway
Multi-channel gateway with agent harness for the Dexorder AI platform.
## Architecture
```
┌─────────────────────────────────────────────────────────┐
│ Platform Gateway │
│ (Node.js/Fastify) │
│ │
│ ┌────────────────────────────────────────────────┐ │
│ │ Channels │ │
│ │ - WebSocket (/ws/chat) │ │
│ │ - Telegram Webhook (/webhook/telegram) │ │
│ └────────────────────────────────────────────────┘ │
│ ↕ │
│ ┌────────────────────────────────────────────────┐ │
│ │ Authenticator │ │
│ │ - JWT verification (WebSocket) │ │
│ │ - Channel linking (Telegram) │ │
│ │ - User license lookup (PostgreSQL) │ │
│ └────────────────────────────────────────────────┘ │
│ ↕ │
│ ┌────────────────────────────────────────────────┐ │
│ │ Agent Harness (per-session) │ │
│ │ - Claude API integration │ │
│ │ - MCP client connector │ │
│ │ - Conversation state │ │
│ └────────────────────────────────────────────────┘ │
│ ↕ │
│ ┌────────────────────────────────────────────────┐ │
│ │ MCP Client │ │
│ │ - User container connection │ │
│ │ - Tool routing │ │
│ └────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────┘
┌───────────────────────────────┐
│  User MCP Server (Python)     │
│  - Strategies, indicators     │
│  - Memory, preferences        │
│  - Backtest sandbox           │
└───────────────────────────────┘
```
## Features
- **Automatic container provisioning**: Creates user agent containers on-demand via Kubernetes
- **Multi-channel support**: WebSocket and Telegram webhooks
- **Per-channel authentication**: JWT for web, channel linking for chat apps
- **User license management**: Feature flags and resource limits from PostgreSQL
- **Container lifecycle management**: Auto-shutdown on idle (handled by container sidecar)
- **License-based resources**: Different memory/CPU/storage limits per tier
- **Multi-model LLM support**: Anthropic Claude, OpenAI GPT, Google Gemini, OpenRouter (300+ models)
- **Zero vendor lock-in**: Switch models with one line, powered by LangChain.js
- **Intelligent routing**: Auto-select models based on complexity, license tier, or user preference
- **Streaming responses**: Real-time chat with WebSocket and Telegram
- **Complex workflows**: LangGraph for stateful trading analysis (backtest → risk → approval)
- **Agent harness**: Stateless orchestrator (all context lives in user's MCP container)
- **MCP resource integration**: User's RAG, conversation history, and preferences
## Container Management
When a user authenticates, the gateway:
1. **Checks for existing container**: Queries Kubernetes for deployment
2. **Creates if missing**: Renders YAML template based on license tier
3. **Waits for ready**: Polls deployment status until healthy
4. **Returns MCP endpoint**: Computed from service name
5. **Connects to MCP server**: Proceeds with normal authentication flow
Container templates by license tier:
| Tier | Memory | CPU | Storage | Idle Timeout |
|------|--------|-----|---------|--------------|
| Free | 512Mi | 500m | 1Gi | 15min |
| Pro | 2Gi | 2000m | 10Gi | 60min |
| Enterprise | 4Gi | 4000m | 50Gi | Never |
Containers self-manage their lifecycle using the lifecycle sidecar (see `../lifecycle-sidecar/`).
## Setup
### Prerequisites
- Node.js >= 22.0.0
- PostgreSQL database
- At least one LLM provider API key:
- Anthropic Claude
- OpenAI GPT
- Google Gemini
- OpenRouter (one key for 300+ models)
### Development
1. Install dependencies:
```bash
npm install
```
2. Copy environment template:
```bash
cp .env.example .env
```
3. Configure `.env` (see `.env.example`):
```bash
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder
# Configure at least one provider
ANTHROPIC_API_KEY=sk-ant-xxxxx
# OPENAI_API_KEY=sk-xxxxx
# GOOGLE_API_KEY=xxxxx
# OPENROUTER_API_KEY=sk-or-xxxxx
# Optional: Set default model
DEFAULT_MODEL_PROVIDER=anthropic
DEFAULT_MODEL=claude-3-5-sonnet-20241022
```
4. Run development server:
```bash
npm run dev
```
### Production Build
```bash
npm run build
npm start
```
### Docker
```bash
docker build -t dexorder/gateway:latest .
docker run -p 3000:3000 --env-file .env dexorder/gateway:latest
```
## Database Schema
Required PostgreSQL tables (will be documented separately):
### `user_licenses`
- `user_id` (text, primary key)
- `email` (text)
- `license_type` (text: 'free', 'pro', 'enterprise')
- `features` (jsonb)
- `resource_limits` (jsonb)
- `mcp_server_url` (text)
- `expires_at` (timestamp, nullable)
- `created_at` (timestamp)
- `updated_at` (timestamp)
### `user_channel_links`
- `id` (serial, primary key)
- `user_id` (text, foreign key)
- `channel_type` (text: 'telegram', 'slack', 'discord')
- `channel_user_id` (text)
- `created_at` (timestamp)
## API Endpoints
### WebSocket
**`GET /ws/chat`**
- WebSocket connection for web client
- Auth: Bearer token in headers
- Protocol: JSON messages
Example:
```javascript
const ws = new WebSocket('ws://localhost:3000/ws/chat', {
headers: {
'Authorization': 'Bearer your-jwt-token'
}
});
ws.on('message', (data) => {
const msg = JSON.parse(data);
console.log(msg);
});
ws.send(JSON.stringify({
type: 'message',
content: 'Hello, AI!'
}));
```
### Telegram Webhook
**`POST /webhook/telegram`**
- Telegram bot webhook endpoint
- Auth: Telegram user linked to platform user
- Automatically processes incoming messages
### Health Check
**`GET /health`**
- Returns server health status
## TODO
- [ ] Implement JWT verification with JWKS
- [ ] Implement MCP HTTP/SSE transport
- [ ] Add Redis for session persistence
- [ ] Add rate limiting per user license
- [ ] Add message usage tracking
- [ ] Add streaming responses for WebSocket
- [ ] Add Slack and Discord channel handlers
- [ ] Add session cleanup/timeout logic

42
gateway/package.json Normal file
View File

@@ -0,0 +1,42 @@
{
"name": "@dexorder/gateway",
"version": "0.1.0",
"type": "module",
"private": true,
"description": "Multi-channel gateway with agent harness for Dexorder AI platform",
"scripts": {
"dev": "tsx watch src/main.ts",
"build": "tsc",
"start": "node dist/main.js",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"@fastify/cors": "^10.0.1",
"@fastify/websocket": "^11.0.1",
"@kubernetes/client-node": "^0.21.0",
"@langchain/anthropic": "^0.3.8",
"@langchain/core": "^0.3.24",
"@langchain/google-genai": "^0.1.6",
"@langchain/langgraph": "^0.2.26",
"@langchain/openai": "^0.3.21",
"@langchain/openrouter": "^0.1.2",
"@modelcontextprotocol/sdk": "^1.0.4",
"fastify": "^5.2.0",
"ioredis": "^5.4.2",
"js-yaml": "^4.1.0",
"pg": "^8.13.1",
"pino": "^9.6.0",
"pino-pretty": "^13.0.0",
"zod": "^3.24.1"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
"@types/node": "^22.10.2",
"@types/pg": "^8.11.10",
"tsx": "^4.19.2",
"typescript": "^5.7.2"
},
"engines": {
"node": ">=22.0.0"
}
}

79
gateway/schema.sql Normal file
View File

@@ -0,0 +1,79 @@
-- User license and authorization schema
CREATE TABLE IF NOT EXISTS user_licenses (
user_id TEXT PRIMARY KEY,
email TEXT,
license_type TEXT NOT NULL CHECK (license_type IN ('free', 'pro', 'enterprise')),
features JSONB NOT NULL DEFAULT '{
"maxIndicators": 5,
"maxStrategies": 3,
"maxBacktestDays": 30,
"realtimeData": false,
"customExecutors": false,
"apiAccess": false
}',
resource_limits JSONB NOT NULL DEFAULT '{
"maxConcurrentSessions": 1,
"maxMessagesPerDay": 100,
"maxTokensPerMessage": 4096,
"rateLimitPerMinute": 10
}',
mcp_server_url TEXT NOT NULL,
preferred_model JSONB DEFAULT NULL,
expires_at TIMESTAMP WITH TIME ZONE,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
COMMENT ON COLUMN user_licenses.preferred_model IS 'Optional model preference: {"provider": "anthropic", "model": "claude-3-5-sonnet-20241022", "temperature": 0.7}';
CREATE INDEX idx_user_licenses_expires_at ON user_licenses(expires_at)
WHERE expires_at IS NOT NULL;
-- Channel linking for multi-channel support
CREATE TABLE IF NOT EXISTS user_channel_links (
id SERIAL PRIMARY KEY,
user_id TEXT NOT NULL REFERENCES user_licenses(user_id) ON DELETE CASCADE,
channel_type TEXT NOT NULL CHECK (channel_type IN ('telegram', 'slack', 'discord', 'websocket')),
channel_user_id TEXT NOT NULL,
metadata JSONB,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
UNIQUE(channel_type, channel_user_id)
);
CREATE INDEX idx_user_channel_links_user_id ON user_channel_links(user_id);
CREATE INDEX idx_user_channel_links_channel ON user_channel_links(channel_type, channel_user_id);
-- Example data for development
INSERT INTO user_licenses (user_id, email, license_type, mcp_server_url, features, resource_limits, preferred_model)
VALUES (
'dev-user-001',
'dev@example.com',
'pro',
'http://localhost:8080/mcp',
'{
"maxIndicators": 50,
"maxStrategies": 20,
"maxBacktestDays": 365,
"realtimeData": true,
"customExecutors": true,
"apiAccess": true
}',
'{
"maxConcurrentSessions": 5,
"maxMessagesPerDay": 1000,
"maxTokensPerMessage": 8192,
"rateLimitPerMinute": 60
}',
'{
"provider": "anthropic",
"model": "claude-3-5-sonnet-20241022",
"temperature": 0.7
}'
)
ON CONFLICT (user_id) DO NOTHING;
-- Example Telegram link
INSERT INTO user_channel_links (user_id, channel_type, channel_user_id)
VALUES ('dev-user-001', 'telegram', '123456789')
ON CONFLICT (channel_type, channel_user_id) DO NOTHING;

View File

@@ -0,0 +1,146 @@
import type { FastifyRequest, FastifyBaseLogger } from 'fastify';
import { UserService } from '../db/user-service.js';
import { ChannelType, type AuthContext } from '../types/user.js';
import type { ContainerManager } from '../k8s/container-manager.js';
/** Dependencies injected into the Authenticator. */
export interface AuthenticatorConfig {
  /** Looks up licenses and channel links, verifies web tokens. */
  userService: UserService;
  /** Provisions and inspects per-user Kubernetes containers. */
  containerManager: ContainerManager;
  /** Structured (pino-style) logger. */
  logger: FastifyBaseLogger;
}
/**
* Multi-channel authenticator
* Handles authentication for WebSocket, Telegram, and other channels
*/
/**
 * Multi-channel authenticator
 * Handles authentication for WebSocket, Telegram, and other channels.
 *
 * Every channel entry point resolves a platform user id for its channel,
 * then runs the shared tail (license lookup, container provisioning,
 * AuthContext assembly) in buildAuthContext().
 */
export class Authenticator {
  private config: AuthenticatorConfig;

  constructor(config: AuthenticatorConfig) {
    this.config = config;
  }

  /**
   * Authenticate WebSocket connection via JWT token.
   * Also ensures the user's container is running.
   *
   * @returns AuthContext on success, null on any auth failure (logged).
   */
  async authenticateWebSocket(
    request: FastifyRequest
  ): Promise<AuthContext | null> {
    try {
      const token = this.extractBearerToken(request);
      if (!token) {
        this.config.logger.warn('No bearer token in WebSocket connection');
        return null;
      }
      const userId = await this.config.userService.verifyWebToken(token);
      if (!userId) {
        this.config.logger.warn('Invalid JWT token');
        return null;
      }
      // For WebSocket, the channel user id is the platform user id.
      return await this.buildAuthContext(
        userId,
        ChannelType.WEBSOCKET,
        userId,
        `ws_${userId}`
      );
    } catch (error) {
      this.config.logger.error({ error }, 'WebSocket authentication error');
      return null;
    }
  }

  /**
   * Authenticate Telegram webhook.
   * Also ensures the user's container is running.
   *
   * @param telegramUserId Telegram numeric user id as a string.
   * @returns AuthContext on success, null when unlinked or on failure.
   */
  async authenticateTelegram(telegramUserId: string): Promise<AuthContext | null> {
    try {
      const userId = await this.config.userService.getUserIdFromChannel(
        'telegram',
        telegramUserId
      );
      if (!userId) {
        this.config.logger.warn(
          { telegramUserId },
          'Telegram user not linked to platform user'
        );
        return null;
      }
      return await this.buildAuthContext(
        userId,
        ChannelType.TELEGRAM,
        telegramUserId,
        `tg_${telegramUserId}`
      );
    } catch (error) {
      this.config.logger.error({ error }, 'Telegram authentication error');
      return null;
    }
  }

  /**
   * Shared tail of every channel authentication: load the user's license,
   * ensure their container is running (may block while a new container is
   * created), and assemble the AuthContext.
   *
   * @returns null when the user has no license row.
   */
  private async buildAuthContext(
    userId: string,
    channelType: ChannelType,
    channelUserId: string,
    sessionPrefix: string
  ): Promise<AuthContext | null> {
    const license = await this.config.userService.getUserLicense(userId);
    if (!license) {
      this.config.logger.warn({ userId }, 'User license not found');
      return null;
    }
    // Ensure container is running (may take time if creating new container)
    this.config.logger.info({ userId }, 'Ensuring user container is running');
    const { mcpEndpoint, wasCreated } = await this.config.containerManager.ensureContainerRunning(
      userId,
      license
    );
    this.config.logger.info(
      { userId, mcpEndpoint, wasCreated },
      'Container is ready'
    );
    // Update license with actual MCP endpoint
    license.mcpServerUrl = mcpEndpoint;
    return {
      userId,
      channelType,
      channelUserId,
      sessionId: `${sessionPrefix}_${Date.now()}`,
      license,
      authenticatedAt: new Date(),
    };
  }

  /**
   * Extract bearer token from request headers.
   */
  private extractBearerToken(request: FastifyRequest): string | null {
    const auth = request.headers.authorization;
    if (!auth || !auth.startsWith('Bearer ')) {
      return null;
    }
    return auth.substring(7);
  }
}

View File

@@ -0,0 +1,163 @@
import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
import type { Authenticator } from '../auth/authenticator.js';
import { AgentHarness } from '../harness/agent-harness.js';
import type { InboundMessage } from '../types/messages.js';
import { randomUUID } from 'crypto';
import type { ProviderConfig } from '../llm/provider.js';
/** Dependencies injected into the TelegramHandler. */
export interface TelegramHandlerConfig {
  /** Resolves Telegram users to platform users and licenses. */
  authenticator: Authenticator;
  /** LLM provider settings passed to each AgentHarness. */
  providerConfig: ProviderConfig;
  /** Bot token used for outbound sendMessage calls. */
  telegramBotToken: string;
}
/** Subset of the Telegram Bot API Update payload that this handler reads. */
interface TelegramUpdate {
  update_id: number;
  /** Present for message updates; absent for edits, callbacks, etc. */
  message?: {
    message_id: number;
    from: {
      id: number;
      first_name: string;
      username?: string;
    };
    chat: {
      id: number;
      type: string;
    };
    /** Text content; undefined for media-only messages. */
    text?: string;
    /** Photo size variants; currently ignored by handleWebhook. */
    photo?: Array<{
      file_id: string;
      file_size: number;
    }>;
  };
}
/**
* Telegram webhook handler
*/
/**
 * Telegram webhook handler
 *
 * Receives Bot API updates, authenticates the sender against the platform,
 * runs the message through a per-user AgentHarness, and replies via the
 * Bot API.
 */
export class TelegramHandler {
  private config: TelegramHandlerConfig;
  // Harness cache keyed by Telegram user id. NOTE: authenticateTelegram()
  // mints a fresh sessionId (suffixed with Date.now()) on every webhook,
  // so keying this map by sessionId — as the original did — never hit the
  // cache and leaked one initialized harness per incoming message. The
  // stable channel user id fixes both the miss and the leak.
  private sessions = new Map<string, AgentHarness>();

  constructor(config: TelegramHandlerConfig) {
    this.config = config;
  }

  /**
   * Register Telegram webhook routes
   */
  register(app: FastifyInstance): void {
    app.post('/webhook/telegram', async (request: FastifyRequest, reply: FastifyReply) => {
      await this.handleWebhook(request, reply, app);
    });
  }

  /**
   * Handle Telegram webhook: authenticate, route through the harness,
   * and reply. Always answers 200 to Telegram for handled updates so the
   * Bot API does not retry them.
   */
  private async handleWebhook(
    request: FastifyRequest,
    reply: FastifyReply,
    app: FastifyInstance
  ): Promise<void> {
    const logger = app.log;
    try {
      const update = request.body as TelegramUpdate;
      if (!update.message?.text) {
        // Ignore non-text messages for now
        reply.code(200).send({ ok: true });
        return;
      }
      const telegramUserId = update.message.from.id.toString();
      const chatId = update.message.chat.id;
      const text = update.message.text;
      logger.info({ telegramUserId, chatId, text }, 'Received Telegram message');
      // Authenticate
      const authContext = await this.config.authenticator.authenticateTelegram(telegramUserId);
      if (!authContext) {
        logger.warn({ telegramUserId }, 'Telegram user not authenticated');
        await this.sendTelegramMessage(
          chatId,
          'Please link your Telegram account to Dexorder first.'
        );
        reply.code(200).send({ ok: true });
        return;
      }
      // Get or create harness, cached per Telegram user (see field comment)
      let harness = this.sessions.get(telegramUserId);
      if (!harness) {
        harness = new AgentHarness({
          userId: authContext.userId,
          sessionId: authContext.sessionId,
          license: authContext.license,
          providerConfig: this.config.providerConfig,
          logger,
        });
        await harness.initialize();
        this.sessions.set(telegramUserId, harness);
      }
      // Process message
      const inboundMessage: InboundMessage = {
        messageId: randomUUID(),
        userId: authContext.userId,
        sessionId: authContext.sessionId,
        content: text,
        timestamp: new Date(),
      };
      const response = await harness.handleMessage(inboundMessage);
      // Send response back to Telegram
      await this.sendTelegramMessage(chatId, response.content);
      reply.code(200).send({ ok: true });
    } catch (error) {
      logger.error({ error }, 'Error handling Telegram webhook');
      reply.code(500).send({ ok: false, error: 'Internal server error' });
    }
  }

  /**
   * Send message to Telegram chat via the Bot API sendMessage method.
   * Throws on network failure or a non-2xx API response; callers decide
   * how to surface the failure.
   */
  private async sendTelegramMessage(chatId: number, text: string): Promise<void> {
    const url = `https://api.telegram.org/bot${this.config.telegramBotToken}/sendMessage`;
    try {
      const response = await fetch(url, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          chat_id: chatId,
          text,
          parse_mode: 'Markdown',
        }),
      });
      if (!response.ok) {
        throw new Error(`Telegram API error: ${response.statusText}`);
      }
    } catch (error) {
      // console here because no Fastify logger is in scope in this helper;
      // the error is rethrown for the caller to handle.
      console.error('Failed to send Telegram message:', error);
      throw error;
    }
  }

  /**
   * Cleanup old sessions (call periodically)
   */
  async cleanupSessions(maxAgeMs = 30 * 60 * 1000): Promise<void> {
    // TODO: Track session last activity and cleanup
    // For now, sessions persist until server restart
  }
}

View File

@@ -0,0 +1,161 @@
import type { FastifyInstance, FastifyRequest } from 'fastify';
import type { WebSocket } from '@fastify/websocket';
import type { Authenticator } from '../auth/authenticator.js';
import { AgentHarness } from '../harness/agent-harness.js';
import type { InboundMessage } from '../types/messages.js';
import { randomUUID } from 'crypto';
import type { ProviderConfig } from '../llm/provider.js';
/** Dependencies injected into the WebSocketHandler. */
export interface WebSocketHandlerConfig {
  /** Authenticates connections and ensures the user container is running. */
  authenticator: Authenticator;
  /** LLM provider settings passed to each AgentHarness. */
  providerConfig: ProviderConfig;
}
/**
 * WebSocket channel handler.
 *
 * Registers the /ws/chat route, authenticates each connection, creates one
 * AgentHarness per session, and relays JSON frames between the socket and
 * the harness. Sessions live only in this process's memory (see the
 * `sessions` map), so they do not survive a restart.
 */
export class WebSocketHandler {
  private config: WebSocketHandlerConfig;
  // sessionId -> harness; populated after harness.initialize() succeeds,
  // removed when the socket closes.
  private sessions = new Map<string, AgentHarness>();

  constructor(config: WebSocketHandlerConfig) {
    this.config = config;
  }

  /**
   * Register WebSocket routes
   */
  register(app: FastifyInstance): void {
    app.get(
      '/ws/chat',
      { websocket: true },
      async (socket: WebSocket, request: FastifyRequest) => {
        await this.handleConnection(socket, request, app);
      }
    );
  }

  /**
   * Handle a single WebSocket connection for its whole lifetime.
   *
   * Frame protocol (JSON): server emits {type: 'status' | 'connected' |
   * 'message' | 'error'}; the client sends {type: 'message', content,
   * attachments?}.
   *
   * @param socket - The upgraded WebSocket.
   * @param request - Original upgrade request, used for authentication.
   * @param app - Fastify instance; only its logger is used here.
   */
  private async handleConnection(
    socket: WebSocket,
    request: FastifyRequest,
    app: FastifyInstance
  ): Promise<void> {
    const logger = app.log;

    // Send initial connecting message
    socket.send(
      JSON.stringify({
        type: 'status',
        status: 'authenticating',
        message: 'Authenticating...',
      })
    );

    // Authenticate (this may take time if creating container).
    // NOTE(review): a thrown (vs. null) auth failure is not caught here and
    // would reject the route handler — confirm Fastify handles that path.
    const authContext = await this.config.authenticator.authenticateWebSocket(request);
    if (!authContext) {
      logger.warn('WebSocket authentication failed');
      socket.send(
        JSON.stringify({
          type: 'error',
          message: 'Authentication failed',
        })
      );
      // 1008 = policy violation (RFC 6455), the conventional auth-failure code.
      socket.close(1008, 'Authentication failed');
      return;
    }

    logger.info(
      { userId: authContext.userId, sessionId: authContext.sessionId },
      'WebSocket connection authenticated'
    );

    // Send workspace starting message
    socket.send(
      JSON.stringify({
        type: 'status',
        status: 'initializing',
        message: 'Starting your workspace...',
      })
    );

    // Create agent harness (one per connection/session)
    const harness = new AgentHarness({
      userId: authContext.userId,
      sessionId: authContext.sessionId,
      license: authContext.license,
      providerConfig: this.config.providerConfig,
      logger,
    });

    try {
      await harness.initialize();
      this.sessions.set(authContext.sessionId, harness);

      // Send connected message
      socket.send(
        JSON.stringify({
          type: 'connected',
          sessionId: authContext.sessionId,
          userId: authContext.userId,
          licenseType: authContext.license.licenseType,
          message: 'Connected to Dexorder AI',
        })
      );

      // Handle messages. NOTE(review): frames are processed concurrently as
      // they arrive (no per-session queue), so two rapid messages may
      // interleave harness calls — confirm whether ordering matters.
      socket.on('message', async (data: Buffer) => {
        try {
          const payload = JSON.parse(data.toString());
          if (payload.type === 'message') {
            const inboundMessage: InboundMessage = {
              messageId: randomUUID(),
              userId: authContext.userId,
              sessionId: authContext.sessionId,
              content: payload.content,
              attachments: payload.attachments,
              timestamp: new Date(),
            };
            const response = await harness.handleMessage(inboundMessage);
            socket.send(
              JSON.stringify({
                type: 'message',
                ...response,
              })
            );
          }
        } catch (error) {
          logger.error({ error }, 'Error handling WebSocket message');
          socket.send(
            JSON.stringify({
              type: 'error',
              message: 'Failed to process message',
            })
          );
        }
      });

      // Handle disconnection: tear down the harness and drop the session.
      socket.on('close', async () => {
        logger.info({ sessionId: authContext.sessionId }, 'WebSocket disconnected');
        await harness.cleanup();
        this.sessions.delete(authContext.sessionId);
      });

      socket.on('error', (error) => {
        logger.error({ error, sessionId: authContext.sessionId }, 'WebSocket error');
      });
    } catch (error) {
      logger.error({ error }, 'Failed to initialize agent harness');
      // 1011 = unexpected server condition; cleanup still runs afterwards.
      socket.close(1011, 'Internal server error');
      await harness.cleanup();
    }
  }
}

View File

@@ -0,0 +1,107 @@
import { Pool, PoolClient } from 'pg';
import type { UserLicense } from '../types/user.js';
import { UserLicenseSchema } from '../types/user.js';
/**
 * Data-access layer for user licenses and channel-identity lookups,
 * backed by a pg connection pool.
 */
export class UserService {
  private pool: Pool;

  /**
   * @param connectionString - Postgres connection string. The pool holds at
   *   most 20 clients, recycles idle clients after 30s, and fails connection
   *   attempts after 2s.
   */
  constructor(connectionString: string) {
    this.pool = new Pool({
      connectionString,
      max: 20,
      idleTimeoutMillis: 30000,
      connectionTimeoutMillis: 2000,
    });
  }

  /**
   * Get user license by user ID.
   *
   * Licenses whose expires_at is in the past are treated as absent.
   *
   * @returns The schema-validated license, or null if none is current.
   * @throws If the query fails or the row does not satisfy UserLicenseSchema.
   */
  async getUserLicense(userId: string): Promise<UserLicense | null> {
    const client = await this.pool.connect();
    try {
      const result = await client.query(
        `SELECT
user_id as "userId",
email,
license_type as "licenseType",
features,
resource_limits as "resourceLimits",
mcp_server_url as "mcpServerUrl",
preferred_model as "preferredModel",
expires_at as "expiresAt",
created_at as "createdAt",
updated_at as "updatedAt"
FROM user_licenses
WHERE user_id = $1
AND (expires_at IS NULL OR expires_at > NOW())`,
        [userId]
      );
      if (result.rows.length === 0) {
        return null;
      }
      const row = result.rows[0];
      // Parse and validate so callers always get a well-formed UserLicense.
      return UserLicenseSchema.parse({
        userId: row.userId,
        email: row.email,
        licenseType: row.licenseType,
        features: row.features,
        resourceLimits: row.resourceLimits,
        mcpServerUrl: row.mcpServerUrl,
        preferredModel: row.preferredModel,
        expiresAt: row.expiresAt,
        createdAt: row.createdAt,
        updatedAt: row.updatedAt,
      });
    } finally {
      client.release();
    }
  }

  /**
   * Get user ID from channel-specific identifier (e.g. a Telegram user ID).
   *
   * @returns The linked platform user ID, or null if no link exists.
   */
  async getUserIdFromChannel(channelType: string, channelUserId: string): Promise<string | null> {
    const client = await this.pool.connect();
    try {
      const result = await client.query(
        `SELECT user_id
FROM user_channel_links
WHERE channel_type = $1 AND channel_user_id = $2`,
        [channelType, channelUserId]
      );
      return result.rows.length > 0 ? result.rows[0].user_id : null;
    } finally {
      client.release();
    }
  }

  /**
   * Verify JWT token from web client.
   * TODO: Implement JWT verification with JWKS.
   *
   * Currently decodes the payload WITHOUT signature verification
   * (INSECURE - FOR DEV ONLY) and returns its `sub` claim.
   *
   * @returns The `sub` claim, or null for malformed/unsuitable tokens.
   */
  async verifyWebToken(token: string): Promise<string | null> {
    try {
      // A JWT is exactly three dot-separated segments; reject anything else
      // rather than attempting to decode garbage.
      const parts = token.split('.');
      if (parts.length !== 3) {
        return null;
      }
      // JWT segments are base64url-encoded (RFC 7515); decoding with plain
      // 'base64' silently mis-decodes payloads containing '-' or '_'.
      const payload = JSON.parse(
        Buffer.from(parts[1], 'base64url').toString('utf8')
      );
      // `sub` must be a string; a missing or non-string claim is rejected.
      return typeof payload.sub === 'string' ? payload.sub : null;
    } catch {
      return null;
    }
  }

  /**
   * Close database pool. Call once during shutdown.
   */
  async close(): Promise<void> {
    await this.pool.end();
  }
}

View File

@@ -0,0 +1,306 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { BaseMessage } from '@langchain/core/messages';
import { HumanMessage, AIMessage, SystemMessage } from '@langchain/core/messages';
import type { FastifyBaseLogger } from 'fastify';
import type { UserLicense } from '../types/user.js';
import type { InboundMessage, OutboundMessage } from '../types/messages.js';
import { MCPClientConnector } from './mcp-client.js';
import { CONTEXT_URIS, type ResourceContent } from '../types/resources.js';
import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js';
import { ModelRouter, RoutingStrategy } from '../llm/router.js';
/** Construction parameters for AgentHarness. */
export interface AgentHarnessConfig {
  /** Platform user ID the harness acts on behalf of. */
  userId: string;
  /** Conversation session ID (one harness instance per session). */
  sessionId: string;
  /** User's license; supplies mcpServerUrl and model entitlements. */
  license: UserLicense;
  /** LLM provider settings forwarded to LLMProviderFactory. */
  providerConfig: ProviderConfig;
  /** Structured logger (Fastify/pino-compatible). */
  logger: FastifyBaseLogger;
}
/**
 * Agent harness orchestrates between LLM and user's MCP server.
 *
 * This is a STATELESS orchestrator - all conversation history, RAG, and context
 * lives in the user's MCP server container. The harness only:
 * 1. Fetches context from user's MCP resources
 * 2. Routes to appropriate LLM model
 * 3. Calls LLM with embedded context
 * 4. Routes tool calls to user's MCP or platform tools
 * 5. Saves messages back to user's MCP
 */
export class AgentHarness {
  private config: AgentHarnessConfig;
  private modelFactory: LLMProviderFactory;
  private modelRouter: ModelRouter;
  // Connection to this user's per-container MCP server.
  private mcpClient: MCPClientConnector;

  constructor(config: AgentHarnessConfig) {
    this.config = config;
    this.modelFactory = new LLMProviderFactory(config.providerConfig, config.logger);
    this.modelRouter = new ModelRouter(this.modelFactory, config.logger);
    this.mcpClient = new MCPClientConnector({
      userId: config.userId,
      mcpServerUrl: config.license.mcpServerUrl,
      logger: config.logger,
    });
  }

  /**
   * Initialize harness and connect to user's MCP server.
   *
   * Must be awaited before handleMessage/streamMessage.
   * @throws Rethrows any connection error from the MCP client.
   */
  async initialize(): Promise<void> {
    this.config.logger.info(
      { userId: this.config.userId, sessionId: this.config.sessionId },
      'Initializing agent harness'
    );
    try {
      await this.mcpClient.connect();
      this.config.logger.info('Agent harness initialized');
    } catch (error) {
      this.config.logger.error({ error }, 'Failed to initialize agent harness');
      throw error;
    }
  }

  /**
   * Handle incoming message from user.
   *
   * Pipeline: fetch MCP context resources -> build system prompt and message
   * list -> route to a model -> invoke the LLM -> persist both sides of the
   * exchange via the MCP 'save_message' tool.
   *
   * @returns The assistant's reply as an OutboundMessage.
   * @throws Rethrows LLM or MCP errors after logging.
   */
  async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
    this.config.logger.info(
      { messageId: message.messageId, userId: message.userId },
      'Processing user message'
    );
    try {
      // 1. Fetch context resources from user's MCP server
      this.config.logger.debug('Fetching context resources from MCP');
      const contextResources = await this.fetchContextResources();
      // 2. Build system prompt from resources
      const systemPrompt = this.buildSystemPrompt(contextResources);
      // 3. Build messages with conversation context from MCP
      const messages = this.buildMessages(message, contextResources);
      // 4. Route to appropriate model
      const model = await this.modelRouter.route(
        message.content,
        this.config.license,
        RoutingStrategy.COMPLEXITY
      );
      // 5. Build LangChain messages
      const langchainMessages = this.buildLangChainMessages(systemPrompt, messages);
      // 6. Call LLM with streaming
      this.config.logger.debug('Invoking LLM');
      const response = await model.invoke(langchainMessages);
      // 7. Extract text response (tool handling TODO)
      const assistantMessage = response.content as string;
      // 8. Save messages to user's MCP server. Persistence happens only
      // after a successful LLM call; an LLM failure saves nothing.
      this.config.logger.debug('Saving messages to MCP');
      await this.mcpClient.callTool('save_message', {
        role: 'user',
        content: message.content,
        timestamp: message.timestamp.toISOString(),
      });
      await this.mcpClient.callTool('save_message', {
        role: 'assistant',
        content: assistantMessage,
        timestamp: new Date().toISOString(),
      });
      return {
        // NOTE(review): Date.now() is not collision-safe if two replies
        // complete in the same millisecond — consider randomUUID().
        messageId: `msg_${Date.now()}`,
        sessionId: message.sessionId,
        content: assistantMessage,
        timestamp: new Date(),
      };
    } catch (error) {
      this.config.logger.error({ error }, 'Error processing message');
      throw error;
    }
  }

  /**
   * Stream response from LLM.
   *
   * Same pipeline as handleMessage, but yields content chunks as they
   * arrive. Both messages are saved only after the stream completes, so an
   * interrupted stream persists nothing.
   */
  async *streamMessage(message: InboundMessage): AsyncGenerator<string> {
    try {
      // Fetch context
      const contextResources = await this.fetchContextResources();
      const systemPrompt = this.buildSystemPrompt(contextResources);
      const messages = this.buildMessages(message, contextResources);
      // Route to model
      const model = await this.modelRouter.route(
        message.content,
        this.config.license,
        RoutingStrategy.COMPLEXITY
      );
      // Build messages
      const langchainMessages = this.buildLangChainMessages(systemPrompt, messages);
      // Stream response, accumulating the full text for persistence.
      const stream = await model.stream(langchainMessages);
      let fullResponse = '';
      for await (const chunk of stream) {
        const content = chunk.content as string;
        fullResponse += content;
        yield content;
      }
      // Save after streaming completes
      await this.mcpClient.callTool('save_message', {
        role: 'user',
        content: message.content,
        timestamp: message.timestamp.toISOString(),
      });
      await this.mcpClient.callTool('save_message', {
        role: 'assistant',
        content: fullResponse,
        timestamp: new Date().toISOString(),
      });
    } catch (error) {
      this.config.logger.error({ error }, 'Error streaming message');
      throw error;
    }
  }

  /**
   * Fetch context resources from user's MCP server.
   *
   * Failures degrade gracefully: an unreadable resource becomes an empty
   * text entry rather than aborting the whole request.
   */
  private async fetchContextResources(): Promise<ResourceContent[]> {
    const contextUris = [
      CONTEXT_URIS.USER_PROFILE,
      CONTEXT_URIS.CONVERSATION_SUMMARY,
      CONTEXT_URIS.WORKSPACE_STATE,
      CONTEXT_URIS.SYSTEM_PROMPT,
    ];
    const resources = await Promise.all(
      contextUris.map(async (uri) => {
        try {
          return await this.mcpClient.readResource(uri);
        } catch (error) {
          this.config.logger.warn({ error, uri }, 'Failed to fetch resource, using empty');
          return { uri, text: '' };
        }
      })
    );
    return resources;
  }

  /**
   * Build messages array with context from resources.
   *
   * Any conversation summary is injected as a synthetic user/assistant
   * exchange ahead of the current message so the model sees prior context.
   */
  private buildMessages(
    currentMessage: InboundMessage,
    contextResources: ResourceContent[]
  ): Array<{ role: string; content: string }> {
    const conversationSummary = contextResources.find(
      (r) => r.uri === CONTEXT_URIS.CONVERSATION_SUMMARY
    );
    const messages: Array<{ role: string; content: string }> = [];
    // Add conversation context as a system-like user message
    if (conversationSummary?.text) {
      messages.push({
        role: 'user',
        content: `[Previous Conversation Context]\n${conversationSummary.text}`,
      });
      messages.push({
        role: 'assistant',
        content: 'I understand the context from our previous conversations.',
      });
    }
    // Add current user message
    messages.push({
      role: 'user',
      content: currentMessage.content,
    });
    return messages;
  }

  /**
   * Convert to LangChain message format.
   *
   * Roles other than 'user'/'assistant' are silently dropped.
   */
  private buildLangChainMessages(
    systemPrompt: string,
    messages: Array<{ role: string; content: string }>
  ): BaseMessage[] {
    const langchainMessages: BaseMessage[] = [new SystemMessage(systemPrompt)];
    for (const msg of messages) {
      if (msg.role === 'user') {
        langchainMessages.push(new HumanMessage(msg.content));
      } else if (msg.role === 'assistant') {
        langchainMessages.push(new AIMessage(msg.content));
      }
    }
    return langchainMessages;
  }

  /**
   * Build system prompt from platform base + user resources.
   *
   * Order (later sections can override earlier ones for the model):
   * platform base -> user profile -> workspace state -> user instructions.
   */
  private buildSystemPrompt(contextResources: ResourceContent[]): string {
    const userProfile = contextResources.find((r) => r.uri === CONTEXT_URIS.USER_PROFILE);
    const customPrompt = contextResources.find((r) => r.uri === CONTEXT_URIS.SYSTEM_PROMPT);
    const workspaceState = contextResources.find((r) => r.uri === CONTEXT_URIS.WORKSPACE_STATE);
    // Base platform prompt
    let prompt = `You are a helpful AI assistant for Dexorder, an AI-first trading platform.
You help users research markets, develop indicators and strategies, and analyze trading data.
User license: ${this.config.license.licenseType}
Available features: ${JSON.stringify(this.config.license.features, null, 2)}`;
    // Add user profile context
    if (userProfile?.text) {
      prompt += `\n\n# User Profile\n${userProfile.text}`;
    }
    // Add workspace context
    if (workspaceState?.text) {
      prompt += `\n\n# Current Workspace\n${workspaceState.text}`;
    }
    // Add user's custom instructions (highest priority)
    if (customPrompt?.text) {
      prompt += `\n\n# User Instructions\n${customPrompt.text}`;
    }
    return prompt;
  }

  /**
   * Get platform tools (non-user-specific tools). Currently empty.
   */
  private getPlatformTools(): Array<{ name: string; description?: string }> {
    // Platform tools that don't need user's MCP
    return [
      // TODO: Add platform tools like market data queries, chart rendering, etc.
    ];
  }

  /**
   * Cleanup resources (disconnects the MCP client). Safe to call once at
   * session end.
   */
  async cleanup(): Promise<void> {
    this.config.logger.info('Cleaning up agent harness');
    await this.mcpClient.disconnect();
  }
}

View File

@@ -0,0 +1,259 @@
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
import type { FastifyBaseLogger } from 'fastify';
/** Construction parameters for MCPClientConnector. */
export interface MCPClientConfig {
  /** Platform user whose container this client talks to. */
  userId: string;
  /** Base URL of the user's MCP server. */
  mcpServerUrl: string;
  /** Platform JWT for authenticating to the container (unused until the HTTP transport lands). */
  platformJWT?: string;
  /** Structured logger. */
  logger: FastifyBaseLogger;
}
/**
 * MCP client connector for user's container.
 * Manages connection to user-specific MCP server.
 *
 * The transport is not yet implemented: connect() currently marks the
 * connection as established without opening one, and the tool/resource
 * methods return hard-coded placeholder data.
 */
export class MCPClientConnector {
  private client: Client | null = null;
  // True once connect() has run; gates all tool/resource calls.
  private connected = false;
  private config: MCPClientConfig;

  constructor(config: MCPClientConfig) {
    this.config = config;
  }

  /**
   * Connect to user's MCP server.
   * TODO: Implement HTTP/SSE transport instead of stdio for container communication.
   *
   * Idempotent: returns immediately if already connected.
   */
  async connect(): Promise<void> {
    if (this.connected) {
      return;
    }
    try {
      this.config.logger.info(
        { userId: this.config.userId, url: this.config.mcpServerUrl },
        'Connecting to user MCP server'
      );
      this.client = new Client(
        {
          name: 'dexorder-gateway',
          version: '0.1.0',
        },
        {
          capabilities: {
            tools: {},
            resources: {},
          },
        }
      );
      // TODO: Replace with HTTP transport when user containers are ready
      // For now, this is a placeholder structure
      // const transport = new HTTPTransport(this.config.mcpServerUrl, {
      //   headers: {
      //     'Authorization': `Bearer ${this.config.platformJWT}`
      //   }
      // });
      // Placeholder: will be replaced with actual container transport
      this.config.logger.warn(
        'MCP transport not yet implemented - using placeholder'
      );
      this.connected = true;
      this.config.logger.info('Connected to user MCP server');
    } catch (error) {
      this.config.logger.error(
        { error, userId: this.config.userId },
        'Failed to connect to user MCP server'
      );
      throw error;
    }
  }

  /**
   * Call a tool on the user's MCP server.
   *
   * @param name - Tool name (see listTools for the expected set).
   * @param args - Tool arguments, passed through as-is.
   * @returns Currently a static placeholder object, not a real result.
   * @throws Error if called before connect().
   */
  async callTool(name: string, args: Record<string, unknown>): Promise<unknown> {
    if (!this.client || !this.connected) {
      throw new Error('MCP client not connected');
    }
    try {
      this.config.logger.debug({ tool: name, args }, 'Calling MCP tool');
      // TODO: Implement when MCP client is connected
      // const result = await this.client.callTool({ name, arguments: args });
      // return result;
      // Placeholder response
      return { success: true, message: 'MCP tool call placeholder' };
    } catch (error) {
      this.config.logger.error({ error, tool: name }, 'MCP tool call failed');
      throw error;
    }
  }

  /**
   * List available tools from user's MCP server.
   *
   * @returns A hard-coded placeholder tool list until the transport exists.
   * @throws Error if called before connect().
   */
  async listTools(): Promise<Array<{ name: string; description?: string }>> {
    if (!this.client || !this.connected) {
      throw new Error('MCP client not connected');
    }
    try {
      // TODO: Implement when MCP client is connected
      // const tools = await this.client.listTools();
      // return tools;
      // Placeholder tools (actions only, not context)
      return [
        { name: 'save_message', description: 'Save message to conversation history' },
        { name: 'list_strategies', description: 'List user strategies' },
        { name: 'read_strategy', description: 'Read strategy code' },
        { name: 'write_strategy', description: 'Write strategy code' },
        { name: 'run_backtest', description: 'Run backtest on strategy' },
        { name: 'get_watchlist', description: 'Get user watchlist' },
        { name: 'execute_trade', description: 'Execute trade' },
      ];
    } catch (error) {
      this.config.logger.error({ error }, 'Failed to list MCP tools');
      throw error;
    }
  }

  /**
   * List available resources from user's MCP server.
   *
   * @returns A hard-coded placeholder resource list until the transport exists.
   * @throws Error if called before connect().
   */
  async listResources(): Promise<Array<{ uri: string; name: string; description?: string; mimeType?: string }>> {
    if (!this.client || !this.connected) {
      throw new Error('MCP client not connected');
    }
    try {
      // TODO: Implement when MCP client is connected
      // const resources = await this.client.listResources();
      // return resources;
      // Placeholder resources for user context
      return [
        {
          uri: 'context://user-profile',
          name: 'User Profile',
          description: 'User trading style, preferences, and background',
          mimeType: 'text/plain',
        },
        {
          uri: 'context://conversation-summary',
          name: 'Conversation Summary',
          description: 'Semantic summary of recent conversation history with RAG',
          mimeType: 'text/plain',
        },
        {
          uri: 'context://workspace-state',
          name: 'Workspace State',
          description: 'Current chart, watchlist, and open positions',
          mimeType: 'application/json',
        },
        {
          uri: 'context://system-prompt',
          name: 'Custom System Prompt',
          description: 'User custom instructions for the assistant',
          mimeType: 'text/plain',
        },
      ];
    } catch (error) {
      this.config.logger.error({ error }, 'Failed to list MCP resources');
      throw error;
    }
  }

  /**
   * Read a resource from user's MCP server.
   *
   * Returns canned placeholder content for the four known context:// URIs
   * and an empty-text stub for any other URI.
   *
   * @throws Error if called before connect().
   */
  async readResource(uri: string): Promise<{ uri: string; mimeType?: string; text?: string; blob?: string }> {
    if (!this.client || !this.connected) {
      throw new Error('MCP client not connected');
    }
    try {
      this.config.logger.debug({ uri }, 'Reading MCP resource');
      // TODO: Implement when MCP client is connected
      // const resource = await this.client.readResource({ uri });
      // return resource;
      // Placeholder resource content
      if (uri === 'context://user-profile') {
        return {
          uri,
          mimeType: 'text/plain',
          text: `User Profile:
- Trading experience: Intermediate
- Preferred timeframes: 1h, 4h, 1d
- Risk tolerance: Medium
- Focus: Swing trading with technical indicators`,
        };
      } else if (uri === 'context://conversation-summary') {
        return {
          uri,
          mimeType: 'text/plain',
          text: `Recent Conversation Summary:
[RAG-generated summary would go here]
User recently discussed:
- Moving average crossover strategies
- Backtesting on BTC/USDT
- Risk management techniques`,
        };
      } else if (uri === 'context://workspace-state') {
        return {
          uri,
          mimeType: 'application/json',
          text: JSON.stringify({
            currentChart: { ticker: 'BINANCE:BTC/USDT', timeframe: '1h' },
            watchlist: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'],
            openPositions: [],
          }, null, 2),
        };
      } else if (uri === 'context://system-prompt') {
        return {
          uri,
          mimeType: 'text/plain',
          text: `Custom Instructions:
- Be concise and data-driven
- Always show risk/reward ratios
- Prefer simple strategies over complex ones`,
        };
      }
      return { uri, text: '' };
    } catch (error) {
      this.config.logger.error({ error, uri }, 'MCP resource read failed');
      throw error;
    }
  }

  /**
   * Disconnect from MCP server. Errors are logged, never thrown.
   *
   * NOTE(review): if client.close() throws, `connected` stays true and a
   * later connect() becomes a no-op — confirm whether the flag should be
   * cleared in a finally block.
   */
  async disconnect(): Promise<void> {
    if (this.client && this.connected) {
      try {
        await this.client.close();
        this.connected = false;
        this.config.logger.info('Disconnected from user MCP server');
      } catch (error) {
        this.config.logger.error({ error }, 'Error disconnecting from MCP server');
      }
    }
  }

  /** Whether connect() has completed successfully. */
  isConnected(): boolean {
    return this.connected;
  }
}

327
gateway/src/k8s/client.ts Normal file
View File

@@ -0,0 +1,327 @@
import * as k8s from '@kubernetes/client-node';
import type { FastifyBaseLogger } from 'fastify';
import * as yaml from 'js-yaml';
import * as fs from 'fs/promises';
import * as path from 'path';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/** Configuration for KubernetesClient. */
export interface K8sClientConfig {
  /** Namespace all agent resources are created in and read from. */
  namespace: string;
  /** Load in-cluster service-account config when true; kubeconfig otherwise. */
  inCluster: boolean;
  /** Optional kubeconfig context to select. */
  context?: string; // For local dev
  /** Structured logger. */
  logger: FastifyBaseLogger;
}
/** Inputs for rendering and creating one user's agent deployment. */
export interface DeploymentSpec {
  /** Platform user ID (sanitized into Kubernetes resource names). */
  userId: string;
  /** License tier; selects the `<tier>-tier.yaml` template. */
  licenseType: 'free' | 'pro' | 'enterprise';
  /** Image reference for the agent container. */
  agentImage: string;
  /** Image reference for the lifecycle sidecar container. */
  sidecarImage: string;
  /** StorageClass name substituted into the PVC template. */
  storageClass: string;
}
/**
 * Kubernetes client wrapper for managing agent deployments.
 *
 * Renders per-tier YAML templates into Deployment/PVC/Service resources,
 * applies them in the configured namespace, and polls for readiness.
 *
 * NOTE(review): error handling reads `error.response?.statusCode`, which
 * matches the request-based @kubernetes/client-node API (<0.21) — confirm
 * the installed client version uses that error shape.
 */
export class KubernetesClient {
  private config: K8sClientConfig;
  private k8sConfig: k8s.KubeConfig;
  private appsApi: k8s.AppsV1Api;
  private coreApi: k8s.CoreV1Api;

  constructor(config: K8sClientConfig) {
    this.config = config;
    this.k8sConfig = new k8s.KubeConfig();
    if (config.inCluster) {
      // Running inside the cluster: use the pod's service account.
      this.k8sConfig.loadFromCluster();
      this.config.logger.info('Loaded in-cluster Kubernetes config');
    } else {
      // Local development: use ~/.kube/config, optionally pinning a context.
      this.k8sConfig.loadFromDefault();
      if (config.context) {
        this.k8sConfig.setCurrentContext(config.context);
        this.config.logger.info({ context: config.context }, 'Set Kubernetes context');
      }
      this.config.logger.info('Loaded Kubernetes config from default location');
    }
    this.appsApi = this.k8sConfig.makeApiClient(k8s.AppsV1Api);
    this.coreApi = this.k8sConfig.makeApiClient(k8s.CoreV1Api);
  }

  /**
   * Generate deployment name from user ID.
   *
   * NOTE(review): the sanitization maps distinct userIds (e.g. differing
   * only in case or punctuation) to the same name — confirm userIds are
   * already collision-free under this mapping.
   */
  static getDeploymentName(userId: string): string {
    // Sanitize userId to be k8s-compliant (lowercase alphanumeric + hyphens)
    const sanitized = userId.toLowerCase().replace(/[^a-z0-9-]/g, '-');
    return `agent-${sanitized}`;
  }

  /**
   * Generate service name (same as deployment).
   */
  static getServiceName(userId: string): string {
    return this.getDeploymentName(userId);
  }

  /**
   * Generate PVC name (deployment name + "-data").
   */
  static getPvcName(userId: string): string {
    return `${this.getDeploymentName(userId)}-data`;
  }

  /**
   * Compute MCP endpoint URL from service name.
   *
   * Uses the cluster-internal DNS form `<svc>.<ns>.svc.cluster.local:3000`.
   */
  static getMcpEndpoint(userId: string, namespace: string): string {
    const serviceName = this.getServiceName(userId);
    return `http://${serviceName}.${namespace}.svc.cluster.local:3000`;
  }

  /**
   * Check if deployment exists.
   *
   * @returns true if the Deployment is readable; false on a 404.
   * @throws Any non-404 API error.
   */
  async deploymentExists(deploymentName: string): Promise<boolean> {
    try {
      await this.appsApi.readNamespacedDeployment(deploymentName, this.config.namespace);
      return true;
    } catch (error: any) {
      if (error.response?.statusCode === 404) {
        return false;
      }
      throw error;
    }
  }

  /**
   * Create agent deployment from template.
   *
   * Loads `templates/<tier>-tier.yaml` next to this module, substitutes the
   * {{...}} variables, and applies every document (Deployment, PVC, Service)
   * it contains. Already-existing resources (409) are skipped, which makes
   * the call idempotent per resource.
   *
   * @throws On template read failure or any non-409 API error.
   */
  async createAgentDeployment(spec: DeploymentSpec): Promise<void> {
    const deploymentName = KubernetesClient.getDeploymentName(spec.userId);
    const serviceName = KubernetesClient.getServiceName(spec.userId);
    const pvcName = KubernetesClient.getPvcName(spec.userId);
    this.config.logger.info(
      { userId: spec.userId, licenseType: spec.licenseType, deploymentName },
      'Creating agent deployment'
    );
    // Load template based on license type
    const templatePath = path.join(
      __dirname,
      'templates',
      `${spec.licenseType}-tier.yaml`
    );
    const templateContent = await fs.readFile(templatePath, 'utf-8');
    // Substitute variables
    const rendered = templateContent
      .replace(/\{\{userId\}\}/g, spec.userId)
      .replace(/\{\{deploymentName\}\}/g, deploymentName)
      .replace(/\{\{serviceName\}\}/g, serviceName)
      .replace(/\{\{pvcName\}\}/g, pvcName)
      .replace(/\{\{agentImage\}\}/g, spec.agentImage)
      .replace(/\{\{sidecarImage\}\}/g, spec.sidecarImage)
      .replace(/\{\{storageClass\}\}/g, spec.storageClass);
    // Parse YAML documents (deployment, pvc, service)
    const documents = yaml.loadAll(rendered) as any[];
    // Apply each resource
    for (const doc of documents) {
      if (!doc || !doc.kind) continue;
      try {
        switch (doc.kind) {
          case 'Deployment':
            await this.appsApi.createNamespacedDeployment(this.config.namespace, doc);
            this.config.logger.info({ deploymentName }, 'Created deployment');
            break;
          case 'PersistentVolumeClaim':
            await this.coreApi.createNamespacedPersistentVolumeClaim(
              this.config.namespace,
              doc
            );
            this.config.logger.info({ pvcName }, 'Created PVC');
            break;
          case 'Service':
            await this.coreApi.createNamespacedService(this.config.namespace, doc);
            this.config.logger.info({ serviceName }, 'Created service');
            break;
          default:
            this.config.logger.warn({ kind: doc.kind }, 'Unknown resource kind in template');
        }
      } catch (error: any) {
        // If resource already exists, log warning but continue
        if (error.response?.statusCode === 409) {
          this.config.logger.warn(
            { kind: doc.kind, name: doc.metadata?.name },
            'Resource already exists, skipping'
          );
        } else {
          throw error;
        }
      }
    }
    this.config.logger.info({ deploymentName }, 'Agent deployment created successfully');
  }

  /**
   * Wait for deployment to be ready.
   *
   * Polls every 2s until at least one replica is both available and ready,
   * a Progressing=False condition appears, the deployment vanishes (404),
   * or the timeout elapses.
   *
   * @param deploymentName - Deployment to watch.
   * @param timeoutMs - Maximum time to wait (default 120s).
   * @returns true if ready; false on failure, 404, or timeout.
   */
  async waitForDeploymentReady(
    deploymentName: string,
    timeoutMs: number = 120000
  ): Promise<boolean> {
    const startTime = Date.now();
    const pollInterval = 2000; // 2 seconds
    this.config.logger.info(
      { deploymentName, timeoutMs },
      'Waiting for deployment to be ready'
    );
    while (Date.now() - startTime < timeoutMs) {
      try {
        const response = await this.appsApi.readNamespacedDeployment(
          deploymentName,
          this.config.namespace
        );
        const deployment = response.body;
        const status = deployment.status;
        // Check if deployment is ready
        if (
          status?.availableReplicas &&
          status.availableReplicas > 0 &&
          status.readyReplicas &&
          status.readyReplicas > 0
        ) {
          this.config.logger.info({ deploymentName }, 'Deployment is ready');
          return true;
        }
        // Check for failure conditions
        if (status?.conditions) {
          const failedCondition = status.conditions.find(
            (c) => c.type === 'Progressing' && c.status === 'False'
          );
          if (failedCondition) {
            this.config.logger.error(
              { deploymentName, reason: failedCondition.reason, message: failedCondition.message },
              'Deployment failed to progress'
            );
            return false;
          }
        }
        this.config.logger.debug(
          {
            deploymentName,
            replicas: status?.replicas,
            ready: status?.readyReplicas,
            available: status?.availableReplicas,
          },
          'Deployment not ready yet, waiting...'
        );
        await new Promise((resolve) => setTimeout(resolve, pollInterval));
      } catch (error: any) {
        if (error.response?.statusCode === 404) {
          this.config.logger.warn({ deploymentName }, 'Deployment not found');
          return false;
        }
        throw error;
      }
    }
    this.config.logger.warn({ deploymentName, timeoutMs }, 'Deployment readiness timeout');
    return false;
  }

  /**
   * Get service endpoint URL.
   *
   * Only ClusterIP services are supported; the port named 'mcp' is used,
   * defaulting to 3000 when absent.
   *
   * @returns Internal DNS URL, or null for missing/unsupported services.
   */
  async getServiceEndpoint(serviceName: string): Promise<string | null> {
    try {
      const response = await this.coreApi.readNamespacedService(
        serviceName,
        this.config.namespace
      );
      const service = response.body;
      // For ClusterIP services, return internal DNS name
      if (service.spec?.type === 'ClusterIP') {
        const port = service.spec.ports?.find((p) => p.name === 'mcp')?.port || 3000;
        return `http://${serviceName}.${this.config.namespace}.svc.cluster.local:${port}`;
      }
      // For other service types (NodePort, LoadBalancer), would need different logic
      this.config.logger.warn(
        { serviceName, type: service.spec?.type },
        'Unexpected service type'
      );
      return null;
    } catch (error: any) {
      if (error.response?.statusCode === 404) {
        this.config.logger.warn({ serviceName }, 'Service not found');
        return null;
      }
      throw error;
    }
  }

  /**
   * Delete deployment and associated resources.
   * (Used for cleanup/testing - normally handled by lifecycle sidecar.)
   *
   * Best-effort: each delete failure (other than 404) is logged, never
   * thrown, so remaining resources are still attempted.
   */
  async deleteAgentDeployment(userId: string): Promise<void> {
    const deploymentName = KubernetesClient.getDeploymentName(userId);
    const serviceName = KubernetesClient.getServiceName(userId);
    const pvcName = KubernetesClient.getPvcName(userId);
    this.config.logger.info({ userId, deploymentName }, 'Deleting agent deployment');
    // Delete deployment
    try {
      await this.appsApi.deleteNamespacedDeployment(deploymentName, this.config.namespace);
      this.config.logger.info({ deploymentName }, 'Deleted deployment');
    } catch (error: any) {
      if (error.response?.statusCode !== 404) {
        this.config.logger.warn({ deploymentName, error }, 'Failed to delete deployment');
      }
    }
    // Delete service
    try {
      await this.coreApi.deleteNamespacedService(serviceName, this.config.namespace);
      this.config.logger.info({ serviceName }, 'Deleted service');
    } catch (error: any) {
      if (error.response?.statusCode !== 404) {
        this.config.logger.warn({ serviceName, error }, 'Failed to delete service');
      }
    }
    // Delete PVC (destroys the user's persisted data)
    try {
      await this.coreApi.deleteNamespacedPersistentVolumeClaim(pvcName, this.config.namespace);
      this.config.logger.info({ pvcName }, 'Deleted PVC');
    } catch (error: any) {
      if (error.response?.statusCode !== 404) {
        this.config.logger.warn({ pvcName, error }, 'Failed to delete PVC');
      }
    }
  }
}

View File

@@ -0,0 +1,118 @@
import type { FastifyBaseLogger } from 'fastify';
import { KubernetesClient, type DeploymentSpec } from './client.js';
import type { UserLicense } from '../types/user.js';
/** Dependencies and settings for ContainerManager. */
export interface ContainerManagerConfig {
  /** Kubernetes client used for all cluster operations. */
  k8sClient: KubernetesClient;
  /** Agent container image reference. */
  agentImage: string;
  /** Lifecycle sidecar image reference. */
  sidecarImage: string;
  /** StorageClass for per-user PVCs. */
  storageClass: string;
  /** Namespace agent deployments live in (used to build MCP endpoints). */
  namespace: string;
  /** Structured logger. */
  logger: FastifyBaseLogger;
}
/** Snapshot of a user's container deployment state. */
export interface ContainerStatus {
  /** Whether the Deployment object exists in the namespace. */
  exists: boolean;
  /** Whether the Deployment reported ready replicas within the poll window. */
  ready: boolean;
  /** Cluster-internal MCP endpoint URL (computed even when not running). */
  mcpEndpoint: string;
}
/**
 * Container manager orchestrates agent container lifecycle.
 *
 * Thin policy layer over KubernetesClient: decides whether a user's agent
 * deployment must be created or merely awaited, and derives the MCP
 * endpoint the gateway should talk to.
 */
export class ContainerManager {
  private config: ContainerManagerConfig;

  constructor(config: ContainerManagerConfig) {
    this.config = config;
  }

  /**
   * Ensure user's container is running and ready.
   *
   * @param userId - Platform user whose container is required.
   * @param license - License driving tier selection for a new deployment.
   * @returns The MCP endpoint URL plus whether a deployment was created.
   * @throws Error when a freshly created deployment never becomes ready.
   */
  async ensureContainerRunning(
    userId: string,
    license: UserLicense
  ): Promise<{ mcpEndpoint: string; wasCreated: boolean }> {
    const name = KubernetesClient.getDeploymentName(userId);
    const endpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);
    const log = this.config.logger;

    log.info(
      { userId, licenseType: license.licenseType, deploymentName: name },
      'Ensuring container is running'
    );

    // Fast path: deployment already exists — give it a short window to
    // finish starting, then hand back the endpoint either way.
    if (await this.config.k8sClient.deploymentExists(name)) {
      log.info({ userId, deploymentName: name }, 'Container deployment already exists');
      const becameReady = await this.config.k8sClient.waitForDeploymentReady(name, 30000);
      if (!becameReady) {
        log.warn(
          { userId, deploymentName: name },
          'Existing deployment not ready within timeout'
        );
        // Continue anyway — might be an image pull or other transient issue.
      }
      return { mcpEndpoint: endpoint, wasCreated: false };
    }

    // Slow path: render and apply a new deployment for this user's tier.
    log.info({ userId, licenseType: license.licenseType }, 'Creating new container');
    await this.config.k8sClient.createAgentDeployment({
      userId,
      licenseType: license.licenseType,
      agentImage: this.config.agentImage,
      sidecarImage: this.config.sidecarImage,
      storageClass: this.config.storageClass,
    });

    // A brand-new deployment must actually come up before we return.
    if (!(await this.config.k8sClient.waitForDeploymentReady(name, 120000))) {
      throw new Error(
        `Container deployment failed to become ready within timeout: ${name}`
      );
    }

    log.info({ userId, mcpEndpoint: endpoint }, 'Container is ready');
    return { mcpEndpoint: endpoint, wasCreated: true };
  }

  /**
   * Check container status without creating it.
   *
   * Uses a short (5s) readiness poll so callers are never blocked for long.
   */
  async getContainerStatus(userId: string): Promise<ContainerStatus> {
    const name = KubernetesClient.getDeploymentName(userId);
    const endpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);

    if (!(await this.config.k8sClient.deploymentExists(name))) {
      return { exists: false, ready: false, mcpEndpoint: endpoint };
    }
    const ready = await this.config.k8sClient.waitForDeploymentReady(name, 5000);
    return { exists: true, ready, mcpEndpoint: endpoint };
  }

  /**
   * Delete container (for cleanup/testing).
   */
  async deleteContainer(userId: string): Promise<void> {
    await this.config.k8sClient.deleteAgentDeployment(userId);
  }
}

View File

@@ -0,0 +1,199 @@
# Enterprise tier agent deployment template
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
# Enterprise: No idle shutdown, larger resources
#
# All template substitutions are quoted so the rendered document always
# yields YAML strings: an all-digit userId (or tag-bearing image ref) would
# otherwise be parsed as a non-string scalar and rejected by the API server
# for label values and env values, which must be strings.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{deploymentName}}"
  namespace: dexorder-agents
  labels:
    app.kubernetes.io/name: agent
    app.kubernetes.io/component: user-agent
    dexorder.io/component: agent
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/deployment: "{{deploymentName}}"
    dexorder.io/license-tier: enterprise
spec:
  replicas: 1
  selector:
    matchLabels:
      dexorder.io/user-id: "{{userId}}"
  template:
    metadata:
      labels:
        dexorder.io/component: agent
        dexorder.io/user-id: "{{userId}}"
        dexorder.io/deployment: "{{deploymentName}}"
        dexorder.io/license-tier: enterprise
    spec:
      serviceAccountName: agent-lifecycle
      # Shared PID namespace lets the lifecycle sidecar observe the agent process.
      shareProcessNamespace: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: agent
          image: "{{agentImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "4Gi"
              cpu: "4000m"
          env:
            - name: USER_ID
              value: "{{userId}}"
            # Enterprise tier: idle shutdown disabled entirely.
            - name: IDLE_TIMEOUT_MINUTES
              value: "0"
            - name: IDLE_CHECK_INTERVAL_SECONDS
              value: "60"
            - name: ENABLE_IDLE_SHUTDOWN
              value: "false"
            - name: MCP_SERVER_PORT
              value: "3000"
            - name: ZMQ_CONTROL_PORT
              value: "5555"
          ports:
            - name: mcp
              containerPort: 3000
              protocol: TCP
            - name: zmq-control
              containerPort: 5555
              protocol: TCP
          volumeMounts:
            - name: agent-data
              mountPath: /app/data
            - name: tmp
              mountPath: /tmp
            - name: shared-run
              mountPath: /var/run/agent
          livenessProbe:
            httpGet:
              path: /health
              port: mcp
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /ready
              port: mcp
            initialDelaySeconds: 5
            periodSeconds: 10
        - name: lifecycle-sidecar
          image: "{{sidecarImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "32Mi"
              cpu: "10m"
            limits:
              memory: "64Mi"
              cpu: "50m"
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['dexorder.io/deployment']
            - name: USER_TYPE
              value: "enterprise"
            # NOTE(review): with shareProcessNamespace enabled, PID 1 in the pod
            # is the pause (infra) container, not the agent process — confirm the
            # sidecar resolves the real agent PID rather than trusting this value.
            - name: MAIN_CONTAINER_PID
              value: "1"
          volumeMounts:
            - name: shared-run
              mountPath: /var/run/agent
              readOnly: true
      volumes:
        - name: agent-data
          persistentVolumeClaim:
            claimName: "{{pvcName}}"
        - name: tmp
          emptyDir:
            medium: Memory
            sizeLimit: 512Mi
        - name: shared-run
          emptyDir:
            medium: Memory
            sizeLimit: 1Mi
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{pvcName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: enterprise
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: "{{storageClass}}"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{serviceName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: enterprise
spec:
  type: ClusterIP
  selector:
    dexorder.io/user-id: "{{userId}}"
  ports:
    - name: mcp
      port: 3000
      targetPort: mcp
      protocol: TCP
    - name: zmq-control
      port: 5555
      targetPort: zmq-control
      protocol: TCP

View File

@@ -0,0 +1,198 @@
# Free tier agent deployment template
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
#
# All template substitutions are quoted so the rendered document always
# yields YAML strings: an all-digit userId (or tag-bearing image ref) would
# otherwise be parsed as a non-string scalar and rejected by the API server
# for label values and env values, which must be strings.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{deploymentName}}"
  namespace: dexorder-agents
  labels:
    app.kubernetes.io/name: agent
    app.kubernetes.io/component: user-agent
    dexorder.io/component: agent
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/deployment: "{{deploymentName}}"
    dexorder.io/license-tier: free
spec:
  replicas: 1
  selector:
    matchLabels:
      dexorder.io/user-id: "{{userId}}"
  template:
    metadata:
      labels:
        dexorder.io/component: agent
        dexorder.io/user-id: "{{userId}}"
        dexorder.io/deployment: "{{deploymentName}}"
        dexorder.io/license-tier: free
    spec:
      serviceAccountName: agent-lifecycle
      # Shared PID namespace lets the lifecycle sidecar observe the agent process.
      shareProcessNamespace: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: agent
          image: "{{agentImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          env:
            - name: USER_ID
              value: "{{userId}}"
            # Free tier: shut the agent down after 15 idle minutes.
            - name: IDLE_TIMEOUT_MINUTES
              value: "15"
            - name: IDLE_CHECK_INTERVAL_SECONDS
              value: "60"
            - name: ENABLE_IDLE_SHUTDOWN
              value: "true"
            - name: MCP_SERVER_PORT
              value: "3000"
            - name: ZMQ_CONTROL_PORT
              value: "5555"
          ports:
            - name: mcp
              containerPort: 3000
              protocol: TCP
            - name: zmq-control
              containerPort: 5555
              protocol: TCP
          volumeMounts:
            - name: agent-data
              mountPath: /app/data
            - name: tmp
              mountPath: /tmp
            - name: shared-run
              mountPath: /var/run/agent
          livenessProbe:
            httpGet:
              path: /health
              port: mcp
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /ready
              port: mcp
            initialDelaySeconds: 5
            periodSeconds: 10
        - name: lifecycle-sidecar
          image: "{{sidecarImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "32Mi"
              cpu: "10m"
            limits:
              memory: "64Mi"
              cpu: "50m"
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['dexorder.io/deployment']
            - name: USER_TYPE
              value: "free"
            # NOTE(review): with shareProcessNamespace enabled, PID 1 in the pod
            # is the pause (infra) container, not the agent process — confirm the
            # sidecar resolves the real agent PID rather than trusting this value.
            - name: MAIN_CONTAINER_PID
              value: "1"
          volumeMounts:
            - name: shared-run
              mountPath: /var/run/agent
              readOnly: true
      volumes:
        - name: agent-data
          persistentVolumeClaim:
            claimName: "{{pvcName}}"
        - name: tmp
          emptyDir:
            medium: Memory
            sizeLimit: 128Mi
        - name: shared-run
          emptyDir:
            medium: Memory
            sizeLimit: 1Mi
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{pvcName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: free
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: "{{storageClass}}"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{serviceName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: free
spec:
  type: ClusterIP
  selector:
    dexorder.io/user-id: "{{userId}}"
  ports:
    - name: mcp
      port: 3000
      targetPort: mcp
      protocol: TCP
    - name: zmq-control
      port: 5555
      targetPort: zmq-control
      protocol: TCP

View File

@@ -0,0 +1,198 @@
# Pro tier agent deployment template
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
#
# All template substitutions are quoted so the rendered document always
# yields YAML strings: an all-digit userId (or tag-bearing image ref) would
# otherwise be parsed as a non-string scalar and rejected by the API server
# for label values and env values, which must be strings.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{deploymentName}}"
  namespace: dexorder-agents
  labels:
    app.kubernetes.io/name: agent
    app.kubernetes.io/component: user-agent
    dexorder.io/component: agent
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/deployment: "{{deploymentName}}"
    dexorder.io/license-tier: pro
spec:
  replicas: 1
  selector:
    matchLabels:
      dexorder.io/user-id: "{{userId}}"
  template:
    metadata:
      labels:
        dexorder.io/component: agent
        dexorder.io/user-id: "{{userId}}"
        dexorder.io/deployment: "{{deploymentName}}"
        dexorder.io/license-tier: pro
    spec:
      serviceAccountName: agent-lifecycle
      # Shared PID namespace lets the lifecycle sidecar observe the agent process.
      shareProcessNamespace: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: agent
          image: "{{agentImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "2Gi"
              cpu: "2000m"
          env:
            - name: USER_ID
              value: "{{userId}}"
            # Pro tier: shut the agent down after 60 idle minutes.
            - name: IDLE_TIMEOUT_MINUTES
              value: "60"
            - name: IDLE_CHECK_INTERVAL_SECONDS
              value: "60"
            - name: ENABLE_IDLE_SHUTDOWN
              value: "true"
            - name: MCP_SERVER_PORT
              value: "3000"
            - name: ZMQ_CONTROL_PORT
              value: "5555"
          ports:
            - name: mcp
              containerPort: 3000
              protocol: TCP
            - name: zmq-control
              containerPort: 5555
              protocol: TCP
          volumeMounts:
            - name: agent-data
              mountPath: /app/data
            - name: tmp
              mountPath: /tmp
            - name: shared-run
              mountPath: /var/run/agent
          livenessProbe:
            httpGet:
              path: /health
              port: mcp
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /ready
              port: mcp
            initialDelaySeconds: 5
            periodSeconds: 10
        - name: lifecycle-sidecar
          image: "{{sidecarImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "32Mi"
              cpu: "10m"
            limits:
              memory: "64Mi"
              cpu: "50m"
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['dexorder.io/deployment']
            - name: USER_TYPE
              value: "pro"
            # NOTE(review): with shareProcessNamespace enabled, PID 1 in the pod
            # is the pause (infra) container, not the agent process — confirm the
            # sidecar resolves the real agent PID rather than trusting this value.
            - name: MAIN_CONTAINER_PID
              value: "1"
          volumeMounts:
            - name: shared-run
              mountPath: /var/run/agent
              readOnly: true
      volumes:
        - name: agent-data
          persistentVolumeClaim:
            claimName: "{{pvcName}}"
        - name: tmp
          emptyDir:
            medium: Memory
            sizeLimit: 256Mi
        - name: shared-run
          emptyDir:
            medium: Memory
            sizeLimit: 1Mi
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{pvcName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: pro
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: "{{storageClass}}"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{serviceName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: pro
spec:
  type: ClusterIP
  selector:
    dexorder.io/user-id: "{{userId}}"
  ports:
    - name: mcp
      port: 3000
      targetPort: mcp
      protocol: TCP
    - name: zmq-control
      port: 5555
      targetPort: zmq-control
      protocol: TCP

216
gateway/src/llm/provider.ts Normal file
View File

@@ -0,0 +1,216 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { ChatAnthropic } from '@langchain/anthropic';
import { ChatOpenAI } from '@langchain/openai';
import { ChatGoogleGenerativeAI } from '@langchain/google-genai';
import { ChatOpenRouter } from '@langchain/openrouter';
import type { FastifyBaseLogger } from 'fastify';
/**
 * Supported LLM providers
 */
export enum LLMProvider {
  ANTHROPIC = 'anthropic',
  OPENAI = 'openai',
  GOOGLE = 'google',
  OPENROUTER = 'openrouter',
}
/**
 * Model configuration
 *
 * Identifies a provider-specific model plus optional sampling parameters.
 * The factory falls back to temperature 0.7 / maxTokens 4096 when omitted.
 */
export interface ModelConfig {
  provider: LLMProvider;
  model: string;
  temperature?: number;
  maxTokens?: number;
}
/**
 * Provider configuration with API keys
 *
 * Every key is optional; the factory throws at model-creation time when the
 * requested provider's key is missing.
 */
export interface ProviderConfig {
  anthropicApiKey?: string;
  openaiApiKey?: string;
  googleApiKey?: string;
  openrouterApiKey?: string;
}
/**
 * LLM Provider factory
 *
 * Builds chat-model instances behind a single createModel() entry point so
 * callers never touch provider-specific constructors directly.
 */
export class LLMProviderFactory {
  private config: ProviderConfig;
  private logger: FastifyBaseLogger;
  constructor(config: ProviderConfig, logger: FastifyBaseLogger) {
    this.config = config;
    this.logger = logger;
  }
  /**
   * Create a chat model instance for the requested provider.
   *
   * @throws when the provider is unknown or its API key is not configured.
   */
  createModel(modelConfig: ModelConfig): BaseChatModel {
    this.logger.debug(
      { provider: modelConfig.provider, model: modelConfig.model },
      'Creating LLM model'
    );
    if (modelConfig.provider === LLMProvider.ANTHROPIC) {
      return this.buildAnthropic(modelConfig);
    }
    if (modelConfig.provider === LLMProvider.OPENAI) {
      return this.buildOpenAI(modelConfig);
    }
    if (modelConfig.provider === LLMProvider.GOOGLE) {
      return this.buildGoogle(modelConfig);
    }
    if (modelConfig.provider === LLMProvider.OPENROUTER) {
      return this.buildOpenRouter(modelConfig);
    }
    throw new Error(`Unsupported provider: ${modelConfig.provider}`);
  }
  /** Instantiate an Anthropic Claude chat model. */
  private buildAnthropic(cfg: ModelConfig): ChatAnthropic {
    const key = this.config.anthropicApiKey;
    if (!key) {
      throw new Error('Anthropic API key not configured');
    }
    return new ChatAnthropic({
      model: cfg.model,
      temperature: cfg.temperature ?? 0.7,
      maxTokens: cfg.maxTokens ?? 4096,
      anthropicApiKey: key,
    });
  }
  /** Instantiate an OpenAI GPT chat model. */
  private buildOpenAI(cfg: ModelConfig): ChatOpenAI {
    const key = this.config.openaiApiKey;
    if (!key) {
      throw new Error('OpenAI API key not configured');
    }
    return new ChatOpenAI({
      model: cfg.model,
      temperature: cfg.temperature ?? 0.7,
      maxTokens: cfg.maxTokens ?? 4096,
      openAIApiKey: key,
    });
  }
  /** Instantiate a Google Gemini chat model. */
  private buildGoogle(cfg: ModelConfig): ChatGoogleGenerativeAI {
    const key = this.config.googleApiKey;
    if (!key) {
      throw new Error('Google API key not configured');
    }
    return new ChatGoogleGenerativeAI({
      model: cfg.model,
      temperature: cfg.temperature ?? 0.7,
      maxOutputTokens: cfg.maxTokens ?? 4096,
      apiKey: key,
    });
  }
  /** Instantiate an OpenRouter chat model (access to 300+ models). */
  private buildOpenRouter(cfg: ModelConfig): ChatOpenRouter {
    const key = this.config.openrouterApiKey;
    if (!key) {
      throw new Error('OpenRouter API key not configured');
    }
    return new ChatOpenRouter({
      model: cfg.model,
      temperature: cfg.temperature ?? 0.7,
      maxTokens: cfg.maxTokens ?? 4096,
      apiKey: key,
    });
  }
  /**
   * Pick a default model from whichever provider has a key configured,
   * in priority order: Anthropic, OpenAI, Google, OpenRouter.
   *
   * @throws when no provider API key is configured at all.
   */
  getDefaultModel(): ModelConfig {
    const candidates: Array<[string | undefined, ModelConfig]> = [
      [this.config.anthropicApiKey, { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' }],
      [this.config.openaiApiKey, { provider: LLMProvider.OPENAI, model: 'gpt-4o' }],
      [this.config.googleApiKey, { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' }],
      [this.config.openrouterApiKey, { provider: LLMProvider.OPENROUTER, model: 'anthropic/claude-3.5-sonnet' }],
    ];
    for (const [key, model] of candidates) {
      if (key) {
        return model;
      }
    }
    throw new Error('No LLM API keys configured');
  }
}
/**
 * Predefined model configurations
 *
 * Convenience constants for the most common provider/model pairs; callers can
 * also construct ModelConfig values ad hoc (e.g. for OpenRouter model slugs).
 */
export const MODELS = {
  // Anthropic
  CLAUDE_SONNET: {
    provider: LLMProvider.ANTHROPIC,
    model: 'claude-3-5-sonnet-20241022',
  },
  CLAUDE_HAIKU: {
    provider: LLMProvider.ANTHROPIC,
    model: 'claude-3-5-haiku-20241022',
  },
  CLAUDE_OPUS: {
    provider: LLMProvider.ANTHROPIC,
    model: 'claude-3-opus-20240229',
  },
  // OpenAI
  GPT4O: {
    provider: LLMProvider.OPENAI,
    model: 'gpt-4o',
  },
  GPT4O_MINI: {
    provider: LLMProvider.OPENAI,
    model: 'gpt-4o-mini',
  },
  // Google
  GEMINI_2_FLASH: {
    provider: LLMProvider.GOOGLE,
    model: 'gemini-2.0-flash-exp',
  },
  GEMINI_PRO: {
    provider: LLMProvider.GOOGLE,
    model: 'gemini-1.5-pro',
  },
} as const satisfies Record<string, ModelConfig>;

202
gateway/src/llm/router.ts Normal file
View File

@@ -0,0 +1,202 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { FastifyBaseLogger } from 'fastify';
import { LLMProviderFactory, type ModelConfig, LLMProvider } from './provider.js';
import type { UserLicense } from '../types/user.js';
/**
 * Model routing strategies
 *
 * Accepted by ModelRouter.route(); USER_PREFERENCE is its default.
 */
export enum RoutingStrategy {
  /** Use user's preferred model from license */
  USER_PREFERENCE = 'user_preference',
  /** Route based on query complexity */
  COMPLEXITY = 'complexity',
  /** Route based on license tier */
  LICENSE_TIER = 'license_tier',
  /** Use cheapest available model */
  COST_OPTIMIZED = 'cost_optimized',
}
/**
 * Model router
 * Intelligently selects which model to use based on various factors
 */
export class ModelRouter {
  private factory: LLMProviderFactory;
  private logger: FastifyBaseLogger;
  // Fallback configuration resolved once at construction from available API keys.
  private defaultModel: ModelConfig;
  constructor(factory: LLMProviderFactory, logger: FastifyBaseLogger) {
    this.factory = factory;
    this.logger = logger;
    this.defaultModel = factory.getDefaultModel();
  }
  /**
   * Route to appropriate model based on context
   *
   * Resolves a ModelConfig via the chosen strategy, logs the decision, and
   * returns an instantiated chat model from the factory.
   */
  async route(
    message: string,
    license: UserLicense,
    strategy: RoutingStrategy = RoutingStrategy.USER_PREFERENCE
  ): Promise<BaseChatModel> {
    let modelConfig: ModelConfig;
    switch (strategy) {
      case RoutingStrategy.USER_PREFERENCE:
        modelConfig = this.routeByUserPreference(license);
        break;
      case RoutingStrategy.COMPLEXITY:
        modelConfig = this.routeByComplexity(message, license);
        break;
      case RoutingStrategy.LICENSE_TIER:
        modelConfig = this.routeByLicenseTier(license);
        break;
      case RoutingStrategy.COST_OPTIMIZED:
        modelConfig = this.routeByCost(license);
        break;
      default:
        modelConfig = this.defaultModel;
    }
    this.logger.info(
      {
        userId: license.userId,
        strategy,
        provider: modelConfig.provider,
        model: modelConfig.model,
      },
      'Routing to model'
    );
    return this.factory.createModel(modelConfig);
  }
  /**
   * Route based on user's preferred model (if set in license)
   */
  private routeByUserPreference(license: UserLicense): ModelConfig {
    // Check if user has custom model preference
    // NOTE(review): UserLicense already declares an optional `preferredModel`
    // (see types/user.ts); the `as any` cast presumably bridges the
    // zod-inferred literal-union provider type to the LLMProvider enum —
    // consider a typed conversion instead of erasing the type.
    const preferredModel = (license as any).preferredModel as ModelConfig | undefined;
    if (preferredModel && this.isModelAllowed(preferredModel, license)) {
      return preferredModel;
    }
    // Fall back to license tier default
    return this.routeByLicenseTier(license);
  }
  /**
   * Route based on query complexity
   *
   * Higher license tiers get stronger models for complex queries; free tier
   * always gets the efficient default regardless of complexity.
   */
  private routeByComplexity(message: string, license: UserLicense): ModelConfig {
    const isComplex = this.isComplexQuery(message);
    if (license.licenseType === 'enterprise') {
      // Enterprise users get best models for complex queries
      return isComplex
        ? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-opus-20240229' }
        : { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
    }
    if (license.licenseType === 'pro') {
      // Pro users get good models
      return isComplex
        ? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' }
        : { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
    }
    // Free users get efficient models
    return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
  }
  /**
   * Route based on license tier
   */
  private routeByLicenseTier(license: UserLicense): ModelConfig {
    switch (license.licenseType) {
      case 'enterprise':
        return { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
      case 'pro':
        return { provider: LLMProvider.OPENAI, model: 'gpt-4o' };
      case 'free':
        return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
      default:
        return this.defaultModel;
    }
  }
  /**
   * Route to cheapest available model
   */
  private routeByCost(license: UserLicense): ModelConfig {
    // Free tier: use cheapest
    if (license.licenseType === 'free') {
      return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
    }
    // Paid tiers: use GPT-4o-mini for cost efficiency
    return { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
  }
  /**
   * Check if model is allowed for user's license
   *
   * Compared by model identifier only; the provider field is not checked.
   */
  private isModelAllowed(model: ModelConfig, license: UserLicense): boolean {
    // Free tier: only cheap models
    if (license.licenseType === 'free') {
      const allowedModels = ['gemini-2.0-flash-exp', 'gpt-4o-mini', 'claude-3-5-haiku-20241022'];
      return allowedModels.includes(model.model);
    }
    // Pro: all except Opus
    if (license.licenseType === 'pro') {
      const blockedModels = ['claude-3-opus-20240229'];
      return !blockedModels.includes(model.model);
    }
    // Enterprise: all models allowed
    return true;
  }
  /**
   * Determine if query is complex
   *
   * Heuristic: case-insensitive substring markers, plus a precomputed boolean
   * entry for message length — the array deliberately mixes strings and
   * booleans and the `some` callback branches on the element type.
   */
  private isComplexQuery(message: string): boolean {
    const complexityIndicators = [
      // Multi-step analysis
      'backtest',
      'analyze',
      'compare',
      'optimize',
      // Code generation
      'write',
      'create',
      'implement',
      'build',
      // Deep reasoning
      'explain why',
      'what if',
      'how would',
      // Long messages (> 200 chars likely complex)
      message.length > 200,
    ];
    const messageLower = message.toLowerCase();
    return complexityIndicators.some((indicator) =>
      typeof indicator === 'string' ? messageLower.includes(indicator) : indicator
    );
  }
}

154
gateway/src/main.ts Normal file
View File

@@ -0,0 +1,154 @@
import Fastify from 'fastify';
import websocket from '@fastify/websocket';
import cors from '@fastify/cors';
import { UserService } from './db/user-service.js';
import { Authenticator } from './auth/authenticator.js';
import { WebSocketHandler } from './channels/websocket-handler.js';
import { TelegramHandler } from './channels/telegram-handler.js';
import { KubernetesClient } from './k8s/client.js';
import { ContainerManager } from './k8s/container-manager.js';
// Fastify instance with pretty-printed structured logging (pino-pretty).
const app = Fastify({
  logger: {
    level: process.env.LOG_LEVEL || 'info',
    transport: {
      target: 'pino-pretty',
      options: {
        colorize: true,
        translateTime: 'HH:MM:ss Z',
        ignore: 'pid,hostname',
      },
    },
  },
});
// Configuration from environment
const config = {
  port: parseInt(process.env.PORT || '3000'),
  host: process.env.HOST || '0.0.0.0',
  databaseUrl: process.env.DATABASE_URL || 'postgresql://localhost/dexorder',
  // LLM provider API keys
  providerConfig: {
    anthropicApiKey: process.env.ANTHROPIC_API_KEY,
    openaiApiKey: process.env.OPENAI_API_KEY,
    googleApiKey: process.env.GOOGLE_API_KEY,
    openrouterApiKey: process.env.OPENROUTER_API_KEY,
  },
  // NOTE(review): defaults to '' — confirm TelegramHandler tolerates an empty token.
  telegramBotToken: process.env.TELEGRAM_BOT_TOKEN || '',
  // Kubernetes configuration
  kubernetes: {
    namespace: process.env.KUBERNETES_NAMESPACE || 'dexorder-agents',
    inCluster: process.env.KUBERNETES_IN_CLUSTER === 'true',
    context: process.env.KUBERNETES_CONTEXT,
    agentImage: process.env.AGENT_IMAGE || 'ghcr.io/dexorder/agent:latest',
    sidecarImage: process.env.SIDECAR_IMAGE || 'ghcr.io/dexorder/lifecycle-sidecar:latest',
    storageClass: process.env.AGENT_STORAGE_CLASS || 'standard',
  },
};
// Validate at least one LLM provider is configured
const hasAnyProvider = Object.values(config.providerConfig).some(key => !!key);
if (!hasAnyProvider) {
  app.log.error('At least one LLM provider API key is required (ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY, or OPENROUTER_API_KEY)');
  process.exit(1);
}
// Register plugins (must happen before route registration below)
await app.register(cors, {
  origin: process.env.CORS_ORIGIN || '*',
});
await app.register(websocket, {
  options: {
    maxPayload: 1024 * 1024, // 1MB
  },
});
// Initialize services
const userService = new UserService(config.databaseUrl);
// Initialize Kubernetes client and container manager
const k8sClient = new KubernetesClient({
  namespace: config.kubernetes.namespace,
  inCluster: config.kubernetes.inCluster,
  context: config.kubernetes.context,
  logger: app.log,
});
const containerManager = new ContainerManager({
  k8sClient,
  agentImage: config.kubernetes.agentImage,
  sidecarImage: config.kubernetes.sidecarImage,
  storageClass: config.kubernetes.storageClass,
  namespace: config.kubernetes.namespace,
  logger: app.log,
});
// Authenticator resolves users and ensures their agent container on login.
const authenticator = new Authenticator({
  userService,
  containerManager,
  logger: app.log,
});
// Initialize channel handlers
const websocketHandler = new WebSocketHandler({
  authenticator,
  providerConfig: config.providerConfig,
});
const telegramHandler = new TelegramHandler({
  authenticator,
  providerConfig: config.providerConfig,
  telegramBotToken: config.telegramBotToken,
});
// Register routes
websocketHandler.register(app);
telegramHandler.register(app);
// Health check
app.get('/health', async () => {
  return {
    status: 'ok',
    timestamp: new Date().toISOString(),
  };
});
// Graceful shutdown: close the DB pool, then the HTTP server.
const shutdown = async () => {
  app.log.info('Shutting down gracefully...');
  try {
    await userService.close();
    await app.close();
    app.log.info('Shutdown complete');
    process.exit(0);
  } catch (error) {
    app.log.error({ error }, 'Error during shutdown');
    process.exit(1);
  }
};
process.on('SIGTERM', shutdown);
process.on('SIGINT', shutdown);
// Start server
try {
  await app.listen({
    port: config.port,
    host: config.host,
  });
  app.log.info(
    {
      port: config.port,
      host: config.host,
    },
    'Gateway server started'
  );
} catch (error) {
  app.log.error({ error }, 'Failed to start server');
  process.exit(1);
}
View File

@@ -0,0 +1,37 @@
import { z } from 'zod';
/**
 * Inbound user message from any channel
 */
export const InboundMessageSchema = z.object({
  messageId: z.string(),
  userId: z.string(),
  sessionId: z.string(),
  content: z.string(),
  // Optional media/links attached by the user's channel client.
  attachments: z.array(z.object({
    type: z.enum(['image', 'file', 'url']),
    url: z.string(),
    mimeType: z.string().optional(),
  })).optional(),
  timestamp: z.date(),
});
export type InboundMessage = z.infer<typeof InboundMessageSchema>;
/**
 * Outbound response to channel
 */
export const OutboundMessageSchema = z.object({
  messageId: z.string(),
  sessionId: z.string(),
  content: z.string(),
  // Note: outbound attachment types ('chart') differ from inbound ('url').
  attachments: z.array(z.object({
    type: z.enum(['image', 'chart', 'file']),
    url: z.string(),
    caption: z.string().optional(),
  })).optional(),
  // Free-form channel/provider metadata; not validated beyond being a record.
  metadata: z.record(z.unknown()).optional(),
  timestamp: z.date(),
});
export type OutboundMessage = z.infer<typeof OutboundMessageSchema>;

View File

@@ -0,0 +1,101 @@
import { z } from 'zod';
/**
 * MCP Resource types for user context
 */
/**
 * Base resource structure from MCP server
 */
export const MCPResourceSchema = z.object({
  uri: z.string(),
  mimeType: z.string().optional(),
  // Exactly one of text/blob is expected in practice; both are optional here.
  text: z.string().optional(),
  blob: z.string().optional(), // base64 encoded
});
export type MCPResource = z.infer<typeof MCPResourceSchema>;
/**
 * User profile context
 */
export const UserProfileContextSchema = z.object({
  tradingExperience: z.enum(['beginner', 'intermediate', 'advanced', 'professional']),
  preferredTimeframes: z.array(z.string()),
  riskTolerance: z.enum(['low', 'medium', 'high']),
  tradingStyle: z.string(),
  favoriteIndicators: z.array(z.string()).optional(),
  activeTradingPairs: z.array(z.string()).optional(),
  notes: z.string().optional(),
});
export type UserProfileContext = z.infer<typeof UserProfileContextSchema>;
/**
 * Workspace state (current chart, positions, etc.)
 */
export const WorkspaceStateSchema = z.object({
  currentChart: z.object({
    ticker: z.string(),
    timeframe: z.string(),
    indicators: z.array(z.string()).optional(),
  }).optional(),
  watchlist: z.array(z.string()),
  openPositions: z.array(z.object({
    ticker: z.string(),
    side: z.enum(['long', 'short']),
    size: z.number(),
    entryPrice: z.number(),
    currentPrice: z.number().optional(),
    unrealizedPnL: z.number().optional(),
  })),
  recentAlerts: z.array(z.object({
    type: z.string(),
    message: z.string(),
    // Presumably ISO-8601; schema only requires a string — confirm producers.
    timestamp: z.string(),
  })).optional(),
});
export type WorkspaceState = z.infer<typeof WorkspaceStateSchema>;
/**
 * Standard context resource URIs
 */
export const CONTEXT_URIS = {
  USER_PROFILE: 'context://user-profile',
  CONVERSATION_SUMMARY: 'context://conversation-summary',
  WORKSPACE_STATE: 'context://workspace-state',
  SYSTEM_PROMPT: 'context://system-prompt',
} as const;
/**
 * Resource content interface
 *
 * Structural twin of MCPResourceSchema for call sites that don't need zod.
 */
export interface ResourceContent {
  uri: string;
  mimeType?: string;
  text?: string;
  blob?: string;
}
/**
 * Helper to parse resource content
 *
 * JSON resources are decoded and validated against the given schema; plain
 * text resources are validated directly. Returns null when the resource has
 * no text body, or when decoding/validation fails.
 */
export function parseResource<T>(resource: ResourceContent, schema: z.ZodSchema<T>): T | null {
  if (!resource.text) {
    return null;
  }
  try {
    // JSON payloads: decode first, then validate against the schema.
    if (resource.mimeType?.includes('json')) {
      const data = JSON.parse(resource.text);
      return schema.parse(data);
    }
    // Plain-text payloads were previously returned via an unchecked `as T`
    // cast, which silently lied about the type for non-string schemas.
    // Validate through the schema so callers always receive a genuine T.
    const result = schema.safeParse(resource.text);
    return result.success ? result.data : null;
  } catch {
    // Malformed JSON or schema violation — treat as absent content.
    return null;
  }
}

66
gateway/src/types/user.ts Normal file
View File

@@ -0,0 +1,66 @@
import { z } from 'zod';
/**
 * Model preference configuration
 */
export const ModelPreferenceSchema = z.object({
  provider: z.enum(['anthropic', 'openai', 'google', 'openrouter']),
  model: z.string(),
  temperature: z.number().optional(),
});
export type ModelPreference = z.infer<typeof ModelPreferenceSchema>;
/**
 * User license and feature authorization
 */
export const UserLicenseSchema = z.object({
  userId: z.string(),
  email: z.string().email().optional(),
  licenseType: z.enum(['free', 'pro', 'enterprise']),
  // Feature gates per license tier.
  features: z.object({
    maxIndicators: z.number(),
    maxStrategies: z.number(),
    maxBacktestDays: z.number(),
    realtimeData: z.boolean(),
    customExecutors: z.boolean(),
    apiAccess: z.boolean(),
  }),
  // Quantitative quotas enforced by the gateway.
  resourceLimits: z.object({
    maxConcurrentSessions: z.number(),
    maxMessagesPerDay: z.number(),
    maxTokensPerMessage: z.number(),
    rateLimitPerMinute: z.number(),
  }),
  mcpServerUrl: z.string().url(),
  // Consumed by ModelRouter's USER_PREFERENCE strategy.
  preferredModel: ModelPreferenceSchema.optional(),
  // Absent means the license does not expire.
  expiresAt: z.date().optional(),
  createdAt: z.date(),
  updatedAt: z.date(),
});
export type UserLicense = z.infer<typeof UserLicenseSchema>;
/**
 * Channel types for multi-channel support
 */
export enum ChannelType {
  WEBSOCKET = 'websocket',
  TELEGRAM = 'telegram',
  SLACK = 'slack',
  DISCORD = 'discord',
}
/**
 * Authentication context per channel
 */
export const AuthContextSchema = z.object({
  userId: z.string(),
  channelType: z.nativeEnum(ChannelType),
  channelUserId: z.string(), // Platform-specific ID (telegram_id, discord_id, etc)
  sessionId: z.string(),
  license: UserLicenseSchema,
  authenticatedAt: z.date(),
});
export type AuthContext = z.infer<typeof AuthContextSchema>;

View File

@@ -0,0 +1,253 @@
# LangGraph Workflows for Trading
Complex, stateful workflows built with LangGraph for trading-specific tasks.
## Overview
LangGraph provides:
- **Stateful execution**: Workflow state persists across failures
- **Conditional branching**: Route based on market conditions, backtest results, etc.
- **Human-in-the-loop**: Pause for user approval before executing trades
- **Loops & retries**: Backtest with different parameters, retry failed operations
- **Multi-agent**: Different LLMs for different tasks (analysis, risk, execution)
## Workflows
### Strategy Analysis (`strategy-analysis.ts`)
Multi-step pipeline for analyzing trading strategies:
```typescript
import { buildStrategyAnalysisWorkflow } from './workflows/strategy-analysis.js';
const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn);
const result = await workflow.invoke({
strategyCode: userStrategy,
ticker: 'BTC/USDT',
timeframe: '1h',
});
console.log(result.recommendation); // Go/no-go decision
```
**Steps:**
1. **Code Review** - LLM analyzes strategy code for bugs, logic errors
2. **Backtest** - Runs backtest via user's MCP server
3. **Risk Assessment** - LLM evaluates results (drawdown, Sharpe, etc.)
4. **Human Approval** - Pauses for user review
5. **Recommendation** - Final go/no-go decision
**Benefits:**
- Stateful: Can resume if server restarts
- Human-in-the-loop: User must approve before deployment
- Multi-step reasoning: Each step builds on previous
---
## Future Workflows
### Market Scanner
Scan multiple tickers for trading opportunities:
```typescript
const scanner = buildMarketScannerWorkflow(model, logger);
const result = await scanner.invoke({
tickers: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'],
strategies: ['momentum', 'mean_reversion'],
timeframe: '1h',
});
// Returns ranked opportunities
```
**Steps:**
1. **Fetch Data** - Get OHLC for all tickers
2. **Apply Strategies** - Run each strategy on each ticker (parallel)
3. **Rank Signals** - Score by confidence, risk/reward
4. **Filter** - Apply user's risk limits
5. **Return Top N** - Best opportunities
---
### Portfolio Optimization
Optimize position sizing across multiple strategies:
```typescript
const optimizer = buildPortfolioOptimizerWorkflow(model, logger);
const result = await optimizer.invoke({
strategies: [strategy1, strategy2, strategy3],
totalCapital: 100000,
maxRiskPerTrade: 0.02,
});
// Returns optimal allocation
```
**Steps:**
1. **Backtest All** - Run backtests for each strategy
2. **Correlation Analysis** - Check strategy correlation
3. **Monte Carlo** - Simulate portfolio performance
4. **Optimize** - Find optimal weights (Sharpe maximization)
5. **Risk Check** - Validate against user limits
---
### Trade Execution Monitor
Monitor trade execution and adapt to market conditions:
```typescript
const monitor = buildTradeExecutionWorkflow(model, logger, exchange);
const result = await monitor.invoke({
tradeId: 'xyz',
targetPrice: 45000,
maxSlippage: 0.001,
timeLimit: 60, // seconds
});
```
**Steps:**
1. **Place Order** - Submit order to exchange
2. **Monitor Fill** - Check fill status every second
3. **Adapt** - If not filling, adjust price (within slippage)
4. **Retry Logic** - If rejected, retry with backoff
5. **Timeout** - Cancel if time limit exceeded
6. **Report** - Final execution report
---
## Using Workflows in Gateway
### Simple Chat vs Complex Workflow
```typescript
// gateway/src/orchestrator.ts
export class MessageOrchestrator {
async handleMessage(msg: InboundMessage) {
// Route based on complexity
if (this.isSimpleQuery(msg)) {
// Use agent harness for streaming chat
return this.harness.streamMessage(msg);
}
if (this.isWorkflowRequest(msg)) {
// Use LangGraph for complex analysis
return this.executeWorkflow(msg);
}
}
async executeWorkflow(msg: InboundMessage) {
const { type, params } = this.parseWorkflowRequest(msg);
switch (type) {
case 'analyze_strategy':
const workflow = buildStrategyAnalysisWorkflow(...);
return await workflow.invoke(params);
case 'scan_market':
const scanner = buildMarketScannerWorkflow(...);
return await scanner.invoke(params);
// ... more workflows
}
}
}
```
---
## Benefits for Trading
### vs Simple LLM Calls
| Scenario | Simple LLM | LangGraph Workflow |
|----------|-----------|-------------------|
| "What's the RSI?" | ✅ Fast, streaming | ❌ Overkill |
| "Analyze this strategy" | ❌ Limited context | ✅ Multi-step analysis |
| "Backtest 10 param combos" | ❌ No loops | ✅ Conditional loops |
| "Execute if approved" | ❌ No state | ✅ Human-in-the-loop |
| Server crashes mid-analysis | ❌ Lost progress | ✅ Resume from checkpoint |
### When to Use Workflows
**Use LangGraph when:**
- Multi-step analysis (backtest → risk → approval)
- Conditional logic (if bullish → momentum, else → mean-reversion)
- Human approval required (pause workflow)
- Loops needed (try different parameters)
- Long-running (can survive restarts)
**Use Agent Harness when:**
- Simple Q&A ("What is RSI?")
- Fast response needed (streaming chat)
- Single tool call ("Get my watchlist")
- Real-time interaction (Telegram, WebSocket)
---
## Implementation Notes
### State Persistence
LangGraph can persist state to database:
```typescript
import { MemorySaver } from '@langchain/langgraph';
const checkpointer = new MemorySaver();
const workflow = graph.compile({ checkpointer });
// Resume from checkpoint
const result = await workflow.invoke(input, {
configurable: { thread_id: 'user-123-strategy-analysis' }
});
```
### Human-in-the-Loop
Pause workflow for user input:
```typescript
const workflow = graph
.addNode('human_approval', humanApprovalNode)
.interrupt('human_approval'); // Pauses here
// User reviews in UI
const approved = await getUserApproval(workflowId);
// Resume workflow
await workflow.resume(state, { approved });
```
### Multi-Agent
Use different models for different tasks:
```typescript
const analysisModel = new ChatAnthropic({ model: 'claude-3-opus' }); // Smart
const codeModel = new ChatOpenAI({ model: 'gpt-4o' }); // Good at code
const cheapModel = new ChatOpenAI({ model: 'gpt-4o-mini' }); // Fast
const workflow = graph
.addNode('analyze', (state) => analysisModel.invoke(...))
.addNode('code_review', (state) => codeModel.invoke(...))
.addNode('summarize', (state) => cheapModel.invoke(...));
```
---
## Next Steps
1. Implement remaining workflows (scanner, optimizer, execution)
2. Add state persistence (PostgreSQL checkpointer)
3. Integrate human-in-the-loop with WebSocket
4. Add workflow monitoring dashboard
5. Performance optimization (parallel execution)

View File

@@ -0,0 +1,162 @@
import { StateGraph, Annotation } from '@langchain/langgraph';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
import type { FastifyBaseLogger } from 'fastify';
/**
 * State for strategy analysis workflow.
 *
 * Channels without a `default` (the three inputs) must be supplied by the
 * caller on `invoke()`; every derived channel starts as `null`/`false` and is
 * filled in by the corresponding workflow node.
 */
const StrategyAnalysisState = Annotation.Root({
  // Inputs supplied by the caller
  strategyCode: Annotation<string>(),
  ticker: Annotation<string>(),
  timeframe: Annotation<string>(),
  // Analysis steps — each node writes exactly one of these
  codeReview: Annotation<string | null>({
    default: () => null,
  }),
  backtestResults: Annotation<Record<string, unknown> | null>({
    default: () => null,
  }),
  riskAssessment: Annotation<string | null>({
    default: () => null,
  }),
  // Gate flag set by the human-approval node; false routes the graph to __end__
  humanApproved: Annotation<boolean>({
    default: () => false,
  }),
  // Final output of the recommendation node
  recommendation: Annotation<string | null>({
    default: () => null,
  }),
});

// Convenience alias for the inferred state shape used by every node below.
type StrategyAnalysisStateType = typeof StrategyAnalysisState.State;
/**
 * Build strategy analysis workflow using LangGraph.
 *
 * Workflow steps:
 * 1. Code review (LLM analyzes strategy code)
 * 2. Backtest (calls user's MCP backtest tool)
 * 3. Risk assessment (LLM evaluates results)
 * 4. Human approval (pause for user review)
 * 5. Final recommendation
 *
 * @param model - chat model used by every LLM node
 * @param logger - structured logger; one info line per workflow step
 * @param mcpBacktestFn - bridge to the user's MCP backtest tool
 * @returns a compiled graph; call `.invoke({ strategyCode, ticker, timeframe })`
 */
export function buildStrategyAnalysisWorkflow(
  model: BaseChatModel,
  logger: FastifyBaseLogger,
  mcpBacktestFn: (strategy: string, ticker: string, timeframe: string) => Promise<Record<string, unknown>>
) {
  // Node: Code Review — LLM critique of the raw strategy source.
  const codeReviewNode = async (state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Code review');
    const systemPrompt = `You are an expert trading strategy analyst.
Review the following strategy code for potential issues, bugs, or improvements.
Focus on: logic errors, edge cases, performance, and trading best practices.`;
    const response = await model.invoke([
      new SystemMessage(systemPrompt),
      new HumanMessage(`Review this strategy:\n\n${state.strategyCode}`),
    ]);
    return {
      codeReview: response.content as string,
    };
  };

  // Node: Backtest — delegates to the user's MCP tool; no LLM involved.
  const backtestNode = async (state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Running backtest');
    const results = await mcpBacktestFn(state.strategyCode, state.ticker, state.timeframe);
    return {
      backtestResults: results,
    };
  };

  // Node: Risk Assessment — LLM evaluates review + backtest output together.
  const riskAssessmentNode = async (state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Risk assessment');
    const systemPrompt = `You are a risk management expert for trading strategies.
Analyze the backtest results and provide a risk assessment.
Focus on: drawdown, win rate, Sharpe ratio, position sizing, and risk of ruin.`;
    const response = await model.invoke([
      new SystemMessage(systemPrompt),
      new HumanMessage(
        `Code review: ${state.codeReview}\n\nBacktest results: ${JSON.stringify(state.backtestResults, null, 2)}\n\nProvide risk assessment:`
      ),
    ]);
    return {
      riskAssessment: response.content as string,
    };
  };

  // Node: Human Approval (placeholder - would integrate with UI).
  // FIX: parameter renamed to `_state` — it is intentionally unused here, and
  // the gateway tsconfig sets "noUnusedParameters": true, so a bare `state`
  // parameter fails compilation. The underscore prefix opts out of that check.
  const humanApprovalNode = async (_state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Awaiting human approval');
    // In real implementation, this would pause and wait for user input
    // For now, auto-approve
    return {
      humanApproved: true,
    };
  };

  // Node: Final Recommendation — synthesizes everything into a go/no-go.
  const recommendationNode = async (state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Generating recommendation');
    const systemPrompt = `Provide a final recommendation on whether to deploy this trading strategy.
Summarize the code review, backtest results, and risk assessment.
Give clear go/no-go decision with reasoning.`;
    const response = await model.invoke([
      new SystemMessage(systemPrompt),
      new HumanMessage(
        `Code review: ${state.codeReview}\n\nBacktest: ${JSON.stringify(state.backtestResults)}\n\nRisk: ${state.riskAssessment}\n\nApproved: ${state.humanApproved}\n\nYour recommendation:`
      ),
    ]);
    return {
      recommendation: response.content as string,
    };
  };

  // Linear pipeline with one conditional gate: a rejected approval ends the
  // run without a recommendation (callers see `recommendation: null`).
  const workflow = new StateGraph(StrategyAnalysisState)
    .addNode('code_review', codeReviewNode)
    .addNode('backtest', backtestNode)
    .addNode('risk_assessment', riskAssessmentNode)
    .addNode('human_approval', humanApprovalNode)
    .addNode('recommendation', recommendationNode)
    .addEdge('__start__', 'code_review')
    .addEdge('code_review', 'backtest')
    .addEdge('backtest', 'risk_assessment')
    .addEdge('risk_assessment', 'human_approval')
    .addConditionalEdges('human_approval', (state) => {
      return state.humanApproved ? 'recommendation' : '__end__';
    })
    .addEdge('recommendation', '__end__');

  return workflow.compile();
}
/**
* Example usage:
*
* const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn);
*
* const result = await workflow.invoke({
* strategyCode: "strategy code here",
* ticker: "BTC/USDT",
* timeframe: "1h",
* });
*
* console.log(result.recommendation);
*/

26
gateway/tsconfig.json Normal file
View File

@@ -0,0 +1,26 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"lib": ["ES2022"],
"moduleResolution": "bundler",
"resolveJsonModule": true,
"allowJs": false,
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": true,
"allowSyntheticDefaultImports": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}

15
lifecycle-sidecar/.gitignore vendored Normal file
View File

@@ -0,0 +1,15 @@
# Binaries
lifecycle-sidecar
*.exe
*.dll
*.so
*.dylib
# Test binary
*.test
# Go workspace file
go.work
# Build output
dist/

View File

@@ -0,0 +1,40 @@
# Build stage
FROM golang:1.22-alpine AS builder

WORKDIR /app

# Build-time tooling: git for module fetches, CA certs for HTTPS proxies
RUN apk add --no-cache git ca-certificates

# Copy go mod files first so module downloads cache independently of source edits
COPY go.mod go.sum ./
RUN go mod download

# Copy source
COPY main.go ./

# Build a fully static binary. FIX: GOARCH was hard-coded to amd64, which
# silently produced non-runnable images on arm64 hosts/clusters. BuildKit
# (`docker buildx`) injects TARGETOS/TARGETARCH for each platform; the
# defaults keep classic `docker build` behavior identical (linux/amd64).
ARG TARGETOS=linux
ARG TARGETARCH=amd64
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build \
    -ldflags="-w -s" \
    -o lifecycle-sidecar \
    main.go

# Runtime stage
FROM alpine:3.19

# Install procps for process monitoring (pgrep, kill)
RUN apk add --no-cache procps ca-certificates

# Create non-root user
RUN addgroup -g 1000 sidecar && \
    adduser -D -u 1000 -G sidecar sidecar

WORKDIR /app

# Copy binary from builder
COPY --from=builder /app/lifecycle-sidecar /app/lifecycle-sidecar

# Run as non-root
USER sidecar

ENTRYPOINT ["/app/lifecycle-sidecar"]

View File

@@ -0,0 +1,94 @@
# Lifecycle Sidecar
A lightweight Kubernetes sidecar that monitors the main agent container and handles cleanup when the container exits with a specific exit code indicating idle shutdown.
## Purpose
User agent containers self-manage their lifecycle by:
1. Tracking their own activity (MCP calls, trigger status)
2. Exiting with code `42` when idle (no triggers + no recent activity)
3. Delegating deployment cleanup to this sidecar
The sidecar watches the main container and:
- On exit code `42`: Deletes the deployment (and optionally PVC)
- On any other exit code: Allows Kubernetes restart policy to handle it
## Architecture
```
┌─────────────────────────────────────────────────┐
│ Pod │
│ ┌────────────────┐ ┌──────────────────┐ │
│ │ Agent Container│ │ Lifecycle Sidecar│ │
│ │ │ │ │ │
│ │ - Track activity │ - Monitor agent │ │
│ │ - Track triggers │ - Watch exit code│ │
│ │ - Exit 42 if idle │ - Delete if 42 │ │
│ └────────────────┘ └──────────────────┘ │
│ │ │ │
│ │ writes exit_code │ │
│ └─────────►/var/run/agent/exit_code │
│ │ │
└───────────────────────────────────┼─────────────┘
▼ k8s API
┌──────────────────────┐
│ Delete Deployment │
│ (+ PVC if anonymous)│
└──────────────────────┘
```
## Environment Variables
| Variable | Required | Description |
|----------|----------|-------------|
| `NAMESPACE` | Yes | Kubernetes namespace (injected via downward API) |
| `DEPLOYMENT_NAME` | Yes | Name of the deployment to delete (from pod label) |
| `USER_TYPE` | No | User license tier: `anonymous`, `free`, `paid`, `enterprise` |
| `MAIN_CONTAINER_PID` | No | PID of main container, for precise monitoring (requires `shareProcessNamespace: true` on the pod so the sidecar can probe it) |
## Exit Code Contract
The agent container uses exit codes to signal intent:
| Exit Code | Meaning | Sidecar Action |
|-----------|---------|----------------|
| `42` | Clean idle shutdown | Delete deployment + optional PVC |
| Any other | Error or normal restart | Allow Kubernetes to restart |
## RBAC Requirements
The sidecar requires a ServiceAccount with permission to delete its own deployment:
```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "delete"]
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "delete"]
```
See `deploy/k8s/base/lifecycle-sidecar-rbac.yaml` for the full RBAC configuration.
## Building
```bash
docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
docker push ghcr.io/dexorder/lifecycle-sidecar:latest
```
## Example Usage
See `deploy/k8s/base/agent-deployment-example.yaml` for a complete example of how to configure an agent deployment with the lifecycle sidecar.
## Security Considerations
1. **Self-delete only**: The sidecar can only delete the deployment it's part of (enforced by label matching in admission policy)
2. **Non-privileged**: Runs as non-root user (UID 1000)
3. **Minimal permissions**: Only has `get` and `delete` on deployments/PVCs in the agents namespace
4. **No cross-namespace access**: Scoped to `dexorder-agents` namespace only
5. **Crash-safe**: Only triggers cleanup on exit code 42, never on crashes

16
lifecycle-sidecar/go.mod Normal file
View File

@@ -0,0 +1,16 @@
module github.com/dexorder/lifecycle-sidecar
go 1.22
require (
github.com/rs/zerolog v1.32.0
k8s.io/api v0.29.2
k8s.io/apimachinery v0.29.2
k8s.io/client-go v0.29.2
)
require (
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
golang.org/x/sys v0.17.0 // indirect
)

234
lifecycle-sidecar/main.go Normal file
View File

@@ -0,0 +1,234 @@
package main
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)
const (
	// ExitCodeIdleShutdown is the agent's contractual "idle, clean me up"
	// signal; any other exit code defers to the Kubernetes restart policy.
	ExitCodeIdleShutdown = 42

	// PollInterval is how often the sidecar probes the agent process
	// for liveness while waiting for it to exit.
	PollInterval = 5 * time.Second
)
// main wires up logging and the in-cluster Kubernetes client, blocks until
// the agent container exits, and then either tears down the deployment
// (exit code 42) or propagates the agent's exit code so Kubernetes restarts it.
func main() {
	// Console logging with unix timestamps.
	zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
	log.Info().Msg("Lifecycle sidecar starting")

	// Required/optional configuration from the pod spec.
	ns := os.Getenv("NAMESPACE")
	deployment := os.Getenv("DEPLOYMENT_NAME")
	tier := os.Getenv("USER_TYPE")
	agentPID := os.Getenv("MAIN_CONTAINER_PID")
	if ns == "" || deployment == "" {
		log.Fatal().Msg("NAMESPACE and DEPLOYMENT_NAME environment variables are required")
	}
	log.Info().
		Str("namespace", ns).
		Str("deployment", deployment).
		Str("userType", tier).
		Str("mainPID", agentPID).
		Msg("Configuration loaded")

	// Build the API client up front so credential problems surface at startup,
	// not hours later when cleanup is needed.
	cfg, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to get in-cluster config")
	}
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to create Kubernetes client")
	}

	// Block until the agent process disappears.
	code := waitForMainContainer()
	log.Info().Int("exitCode", code).Msg("Main container exited")

	// Anything other than the idle-shutdown code is Kubernetes' problem.
	if code != ExitCodeIdleShutdown {
		log.Info().
			Int("exitCode", code).
			Msg("Non-idle exit code - allowing Kubernetes to handle restart")
		os.Exit(code)
	}

	log.Info().Msg("Detected idle shutdown (exit code 42) - cleaning up deployment")
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Anonymous/temporary users get their storage reclaimed too.
	removePVC := tier == "anonymous" || tier == "temporary"
	if err := cleanupDeployment(ctx, client, ns, deployment, removePVC); err != nil {
		log.Error().Err(err).Msg("Failed to cleanup deployment")
		os.Exit(1)
	}
	log.Info().Msg("Cleanup complete - sidecar exiting")
	os.Exit(0)
}
// waitForMainContainer blocks until the main container's process is gone and
// returns the exit code recovered via getContainerExitCode.
func waitForMainContainer() int {
	// Prefer the explicit PID when the pod spec provides one
	// (requires a shared PID namespace).
	if pid := os.Getenv("MAIN_CONTAINER_PID"); pid != "" {
		return pollProcessExit(pid)
	}
	// Fallback: search for the agent process by executable name.
	log.Info().Msg("MAIN_CONTAINER_PID not set, polling for 'agent' process")
	return pollProcessByName("agent")
}
// pollProcessExit waits until the process with the given PID disappears,
// then returns the exit code recovered from the shared exit-code file.
func pollProcessExit(pidStr string) int {
	log.Info().Str("pid", pidStr).Msg("Monitoring main container process")

	ticker := time.NewTicker(PollInterval)
	defer ticker.Stop()
	for {
		// `kill -0` probes for existence without delivering a signal;
		// a non-nil error means the PID is no longer present.
		if err := exec.Command("kill", "-0", pidStr).Run(); err != nil {
			log.Info().Msg("Main container process exited")
			// The container runtime's exit code isn't visible from here,
			// so rely on the code the agent wrote to the shared volume.
			return getContainerExitCode()
		}
		<-ticker.C
	}
}
// pollProcessByName waits until no process with the given executable name
// remains, then returns the exit code recovered from the shared file.
func pollProcessByName(name string) int {
	log.Info().Str("name", name).Msg("Monitoring main container by name")

	ticker := time.NewTicker(PollInterval)
	defer ticker.Stop()
	for {
		// pgrep -x exits non-zero when no exact-name match exists.
		if err := exec.Command("pgrep", "-x", name).Run(); err != nil {
			log.Info().Msg("Main container process exited")
			return getContainerExitCode()
		}
		<-ticker.C
	}
}
// getContainerExitCode attempts to retrieve the exit code of the main container
// This is challenging in Kubernetes without direct access to container runtime
// We use a fallback approach: check a shared file or default to 0
func getContainerExitCode() int {
// Check if main container wrote exit code to shared volume
exitCodeFile := "/var/run/agent/exit_code"
data, err := os.ReadFile(exitCodeFile)
if err == nil {
var exitCode int
_, err := fmt.Sscanf(string(data), "%d", &exitCode)
if err == nil {
log.Info().Int("exitCode", exitCode).Msg("Read exit code from shared file")
return exitCode
}
}
// Default to 0 if we can't determine exit code
// This is safe because non-42 codes allow restart
log.Warn().Msg("Could not determine exit code, defaulting to 0")
return 0
}
// cleanupDeployment deletes the named deployment and, when deletePVC is set,
// the first PersistentVolumeClaim referenced by its pod template. A PVC
// deletion failure is logged but never fails the overall cleanup.
func cleanupDeployment(ctx context.Context, clientset *kubernetes.Clientset, namespace, deploymentName string, deletePVC bool) error {
	log.Info().
		Str("namespace", namespace).
		Str("deployment", deploymentName).
		Bool("deletePVC", deletePVC).
		Msg("Cleaning up deployment")

	// Resolve the PVC name from the deployment's volumes BEFORE deleting it;
	// afterwards the spec is gone. Lookup failure is non-fatal.
	pvcName := ""
	if deletePVC {
		dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
		if err != nil {
			log.Warn().Err(err).Msg("Could not get deployment for PVC lookup")
		} else {
			for _, vol := range dep.Spec.Template.Spec.Volumes {
				if vol.PersistentVolumeClaim != nil {
					pvcName = vol.PersistentVolumeClaim.ClaimName
					break
				}
			}
		}
	}

	// Foreground propagation: dependents (ReplicaSets, Pods) are removed
	// before the deployment object itself disappears.
	propagation := metav1.DeletePropagationForeground
	log.Info().Str("deployment", deploymentName).Msg("Deleting deployment")
	if err := clientset.AppsV1().Deployments(namespace).Delete(ctx, deploymentName, metav1.DeleteOptions{
		PropagationPolicy: &propagation,
	}); err != nil {
		return fmt.Errorf("failed to delete deployment: %w", err)
	}
	log.Info().Msg("Deployment deleted successfully")

	// Best-effort PVC removal.
	if deletePVC && pvcName != "" {
		log.Info().Str("pvc", pvcName).Msg("Deleting PVC")
		if err := clientset.CoreV1().PersistentVolumeClaims(namespace).Delete(ctx, pvcName, metav1.DeleteOptions{}); err != nil {
			log.Warn().Err(err).Str("pvc", pvcName).Msg("Failed to delete PVC (non-fatal)")
		} else {
			log.Info().Msg("PVC deleted successfully")
		}
	}

	return nil
}
func init() {
// Register signal handler for graceful shutdown
// If sidecar receives SIGTERM, just exit cleanly
// Don't trigger deployment deletion on sidecar termination
go func() {
sigChan := make(chan os.Signal, 1)
syscall.Signal(syscall.SIGTERM)
<-sigChan
log.Info().Msg("Received SIGTERM - sidecar exiting without cleanup")
os.Exit(0)
}()
}