diff --git a/.idea/ai.iml b/.idea/ai.iml index 1394582..0c65e39 100644 --- a/.idea/ai.iml +++ b/.idea/ai.iml @@ -8,6 +8,7 @@ + diff --git a/AGENT.md b/AGENT.md new file mode 100644 index 0000000..6061588 --- /dev/null +++ b/AGENT.md @@ -0,0 +1,15 @@ +We're building an AI-first trading platform by integrating user-facing TradingView charts and chat with an AI assistant that helps do research, develop indicators (signals), and write strategies, using the Dexorder trading framework we provide. + +This monorepo has: +bin/ scripts, mostly build and deploy +deploy/ kubernetes deployment and configuration +doc/ documentation +flink/ Apache Flink application mode processes data from Kafka +iceberg/ Apache Iceberg for historical OHLC etc +ingestor/ Data sources publish to Kafka +kafka/ Apache Kafka +protobuf/ Messaging entities +relay/ Rust+ZeroMQ stateless router +web/ Vue 3 / Pinia / PrimeVue / TradingView + +See doc/protocol.md for messaging architecture diff --git a/bin/build-all b/bin/build-all index 4ff59e0..8b6fb8c 100755 --- a/bin/build-all +++ b/bin/build-all @@ -4,6 +4,7 @@ set -e DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT_DIR="$(cd "$DIR/.." && pwd)" echo "Building all container images..." echo @@ -13,5 +14,31 @@ echo "$DIR/build" ingestor "$@" "$DIR/build" web "$@" +# Build lifecycle-sidecar (Go binary, no protobuf sync needed) +echo "Building lifecycle-sidecar..." +cd "$ROOT_DIR/lifecycle-sidecar" + +# Determine tag +if [ "$1" == "dev" ]; then + TAG="dev$(date +%Y%m%d%H%M%S)" +else + # Check for uncommitted changes + DIRTY="$(git status | grep 'Changes ' || true)" + if [ "$DIRTY" != "" ]; then + echo "lifecycle-sidecar has uncommitted changes." + echo "Use '$0 dev' to build a development-tagged version instead." + exit 1 + fi + TAG="$(git log --oneline | head -1 | cut -d ' ' -f 1)" +fi + +REMOTE=${REMOTE:-ghcr.io/dexorder} + +docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$TAG . 
+docker tag lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:$TAG +docker tag $REMOTE/lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:latest + +echo "$(date)" built $REMOTE/lifecycle-sidecar:$TAG + echo echo "All images built successfully!" diff --git a/bin/dev b/bin/dev index 5d84c91..4ec7afe 100755 --- a/bin/dev +++ b/bin/dev @@ -19,7 +19,7 @@ usage() { echo "Commands:" echo " start Start minikube and deploy all services" echo " stop Stop minikube" - echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink)" + echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink|sidecar)" echo " rebuild [svc] Rebuild all custom images, or just one" echo " deploy [svc] Deploy/update all services, or just one" echo " status Show status of all services" @@ -127,12 +127,23 @@ rebuild_images() { docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG" fi - # Save the tags for deployment (all three, preserving any we didn't rebuild) + # Build lifecycle-sidecar (Go binary) + if [ "$service" == "all" ] || [ "$service" == "lifecycle-sidecar" ] || [ "$service" == "sidecar" ]; then + echo -e "${GREEN}→${NC} Building lifecycle-sidecar..." + cd "$ROOT_DIR/lifecycle-sidecar" + SIDECAR_TAG="dev$(date +%Y%m%d%H%M%S)" + docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$SIDECAR_TAG . 
"""
Container lifecycle manager for agent containers.

Tracks activity and triggers to determine when the container should shut down.
Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
"""

import asyncio
import logging
import os
import signal
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Set

logger = logging.getLogger(__name__)

# Exit code to signal clean idle shutdown to sidecar
EXIT_CODE_IDLE_SHUTDOWN = 42

# File to write exit code for sidecar to read
EXIT_CODE_FILE = Path("/var/run/agent/exit_code")

# Environment values (case-insensitive) that enable a boolean setting.
_TRUE_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_int(name: str, default: int, *, minimum: int = 0) -> int:
    """Read an integer setting from the environment, falling back to *default*.

    A missing variable, a value that is not an integer, or a value below
    *minimum* yields *default*; the latter two cases are logged as warnings
    instead of aborting startup.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        value = int(raw.strip())
    except ValueError:
        logger.warning("Invalid integer for %s=%r; using default %d", name, raw, default)
        return default
    if value < minimum:
        logger.warning(
            "%s=%d is below the minimum %d; using default %d",
            name,
            value,
            minimum,
            default,
        )
        return default
    return value


def _env_bool(name: str, default: bool) -> bool:
    """Read a boolean setting from the environment (1/true/yes/on are true)."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in _TRUE_VALUES


class LifecycleManager:
    """
    Manages container lifecycle based on activity and triggers.

    The container shuts itself down when:
    1. No active triggers (data subscriptions, CEP patterns, etc.)
    2. No recent user activity (MCP calls)
    3. Idle timeout has elapsed
    """

    def __init__(
        self,
        idle_timeout_minutes: int = 15,
        check_interval_seconds: int = 60,
        enable_shutdown: bool = True,
    ):
        """
        Initialize lifecycle manager.

        Args:
            idle_timeout_minutes: Minutes of inactivity before shutdown
            check_interval_seconds: Interval between idle checks
            enable_shutdown: If False, only log idle state without exiting (for testing)
        """
        self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
        self.check_interval = check_interval_seconds
        self.enable_shutdown = enable_shutdown

        self.last_activity: datetime = datetime.now()
        self.active_triggers: Set[str] = set()
        self._running = False
        self._check_task: Optional[asyncio.Task] = None

        logger.info(
            "Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
            idle_timeout_minutes,
            check_interval_seconds,
            enable_shutdown,
        )

    def record_activity(self) -> None:
        """
        Record user activity (called on MCP tool/resource/prompt invocations).
        Resets the idle timer.
        """
        self.last_activity = datetime.now()
        logger.debug("Activity recorded, idle timer reset")

    def update_triggers(self, triggers: Set[str]) -> None:
        """
        Replace the set of active triggers.

        Args:
            triggers: Set of active trigger IDs (data subscriptions, CEP patterns, etc.)
        """
        if triggers == self.active_triggers:
            return

        added = triggers - self.active_triggers
        removed = self.active_triggers - triggers
        if added:
            logger.info("Triggers added: %s", added)
        if removed:
            logger.info("Triggers removed: %s", removed)

        # Store a copy: keeping the caller's object would let add_trigger /
        # remove_trigger mutate it, and let the caller change our state
        # without going through this method.
        self.active_triggers = set(triggers)
        logger.info("Active triggers: %d", len(self.active_triggers))

    def add_trigger(self, trigger_id: str) -> None:
        """Add a single trigger; a trigger that is already active is ignored."""
        if trigger_id in self.active_triggers:
            return
        self.active_triggers.add(trigger_id)
        logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))

    def remove_trigger(self, trigger_id: str) -> None:
        """Remove a single trigger; an unknown trigger is ignored."""
        if trigger_id not in self.active_triggers:
            return
        self.active_triggers.remove(trigger_id)
        logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))

    def is_idle(self) -> bool:
        """
        Check if container is idle and should shut down.

        Returns:
            True if no triggers and idle timeout exceeded
        """
        if self.active_triggers:
            logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
            return False

        idle_time = datetime.now() - self.last_activity
        if idle_time <= self.idle_timeout:
            logger.debug(
                "Not idle: last activity %s ago (timeout: %s)",
                idle_time,
                self.idle_timeout,
            )
            return False

        logger.info(
            "Container is idle: no triggers and %s since last activity", idle_time
        )
        return True

    async def start(self) -> None:
        """Start the lifecycle manager background task (no-op if already running)."""
        if self._running:
            logger.warning("Lifecycle manager already running")
            return

        self._running = True
        self._check_task = asyncio.create_task(self._check_loop())
        logger.info("Lifecycle manager started")

    async def stop(self) -> None:
        """Stop the lifecycle manager and wait for the background task to finish."""
        self._running = False
        # Detach the handle first so a later start() never sees a stale task.
        task, self._check_task = self._check_task, None
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        logger.info("Lifecycle manager stopped")

    async def _check_loop(self) -> None:
        """Background task that periodically checks if container should shut down."""
        while self._running:
            try:
                await asyncio.sleep(self.check_interval)

                if not self.is_idle():
                    continue

                if not self.enable_shutdown:
                    logger.info(
                        "Container is idle but shutdown is disabled (testing mode)"
                    )
                    continue

                logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
                self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)

                # Give sidecar a moment to see the exit code file
                await asyncio.sleep(1)

                # os._exit() skips interpreter cleanup, so flush and close the
                # log handlers explicitly or the messages above may be lost.
                logging.shutdown()

                # Exit with special code
                os._exit(EXIT_CODE_IDLE_SHUTDOWN)

            except asyncio.CancelledError:
                logger.info("Check loop cancelled")
                raise
            except Exception as e:
                # Keep the loop alive: one failed check must not disable
                # idle shutdown for the rest of the container's life.
                logger.error("Error in lifecycle check loop: %s", e, exc_info=True)

    def _write_exit_code(self, code: int) -> None:
        """Write exit code to shared file for sidecar to read (best effort)."""
        try:
            EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
            EXIT_CODE_FILE.write_text(str(code))
            logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
        except Exception as e:
            # Deliberately non-fatal: the process exit code is still set.
            logger.warning("Failed to write exit code file: %s", e)

    def setup_signal_handlers(self) -> None:
        """
        Setup signal handlers for graceful shutdown.
        On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.
        """

        def signal_handler(signum, frame):
            logger.info("Received signal %d, exiting normally", signum)
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)


# Singleton instance for easy access across the application
_lifecycle_manager: Optional[LifecycleManager] = None


def get_lifecycle_manager() -> LifecycleManager:
    """Get or create the global lifecycle manager instance.

    On first use the instance is configured from IDLE_TIMEOUT_MINUTES,
    IDLE_CHECK_INTERVAL_SECONDS and ENABLE_IDLE_SHUTDOWN. Malformed or
    out-of-range values fall back to the defaults (15, 60, true) with a
    warning; the check interval must be at least 1 second so the background
    loop cannot spin.
    """
    global _lifecycle_manager
    if _lifecycle_manager is None:
        _lifecycle_manager = LifecycleManager(
            idle_timeout_minutes=_env_int("IDLE_TIMEOUT_MINUTES", 15, minimum=0),
            check_interval_seconds=_env_int(
                "IDLE_CHECK_INTERVAL_SECONDS", 60, minimum=1
            ),
            enable_shutdown=_env_bool("ENABLE_IDLE_SHUTDOWN", True),
        )
    return _lifecycle_manager


async def start_lifecycle_manager() -> LifecycleManager:
    """Initialize and start the lifecycle manager."""
    manager = get_lifecycle_manager()
    manager.setup_signal_handlers()
    await manager.start()
    return manager
API key (standalone remote access) + token = extract_bearer_token(request) + if not verify_token_hash(token, self.config.tokens): + raise AuthError("Invalid API token") + return AuthContext(user_id=self.config.local_user_id, + source="api_key") + + case "platform": + # JWT signed by the OpenClaw platform + token = extract_bearer_token(request) + claims = await self._verify_platform_jwt(token) + if claims["sub"] != self.config.expected_user_id: + raise AuthError("User ID mismatch") + return AuthContext(user_id=claims["sub"], + source="platform", + scopes=claims.get("scopes", [])) + + async def _verify_platform_jwt(self, token: str) -> dict: + if not self._jwks_client: + self._jwks_client = JWKSClient(self.config.platform_jwks_url) + signing_key = await self._jwks_client.get_signing_key_from_jwt(token) + return jwt.decode(token, signing_key.key, + algorithms=["RS256"], + audience="openclaw-mcp") diff --git a/deploy/k8s/base/admission-policy.yaml b/deploy/k8s/base/admission-policy.yaml new file mode 100644 index 0000000..5cca3f4 --- /dev/null +++ b/deploy/k8s/base/admission-policy.yaml @@ -0,0 +1,110 @@ +# ValidatingAdmissionPolicy to restrict images in dexorder-agents namespace +# Requires Kubernetes 1.30+ (or 1.28+ with feature gate) +# This is the critical security control that prevents arbitrary image execution +# even if the gateway is compromised. 
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
  name: dexorder-agent-image-policy
spec:
  failurePolicy: Fail
  matchConstraints:
    namespaceSelector:
      matchLabels:
        dexorder.io/type: agents
    resourceRules:
      - apiGroups: ["apps"]
        apiVersions: ["v1"]
        resources: ["deployments"]
        operations: ["CREATE", "UPDATE"]
  validations:
    # Only allow images from our approved registry: the agent images and the
    # lifecycle sidecar that runs alongside the agent in the same pod.
    - expression: |
        object.spec.template.spec.containers.all(c,
          c.image.startsWith('ghcr.io/dexorder/agent:') ||
          c.image.startsWith('ghcr.io/dexorder/agent-') ||
          c.image.startsWith('ghcr.io/dexorder/lifecycle-sidecar:'))
      message: "Only approved dexorder agent images are allowed in the agents namespace"
      reason: Forbidden

    # Init containers run arbitrary images too, so they are held to the same
    # allowlist; otherwise the image restriction could be bypassed.
    - expression: |
        !has(object.spec.template.spec.initContainers) ||
        object.spec.template.spec.initContainers.all(c,
          c.image.startsWith('ghcr.io/dexorder/agent:') ||
          c.image.startsWith('ghcr.io/dexorder/agent-') ||
          c.image.startsWith('ghcr.io/dexorder/lifecycle-sidecar:'))
      message: "Only approved dexorder agent images are allowed as init containers"
      reason: Forbidden

    # No privileged containers
    - expression: |
        object.spec.template.spec.containers.all(c,
          !has(c.securityContext) ||
          !has(c.securityContext.privileged) ||
          c.securityContext.privileged == false)
      message: "Privileged containers are not allowed"
      reason: Forbidden

    # No hostPath volumes
    - expression: |
        !has(object.spec.template.spec.volumes) ||
        object.spec.template.spec.volumes.all(v,
          !has(v.hostPath))
      message: "hostPath volumes are not allowed"
      reason: Forbidden

    # No hostNetwork
    - expression: |
        !has(object.spec.template.spec.hostNetwork) ||
        object.spec.template.spec.hostNetwork == false
      message: "hostNetwork is not allowed"
      reason: Forbidden

    # No hostPID
    - expression: |
        !has(object.spec.template.spec.hostPID) ||
        object.spec.template.spec.hostPID == false
      message: "hostPID is not allowed"
      reason: Forbidden

    # Containers must run as non-root
    - expression: |
        object.spec.template.spec.containers.all(c,
          has(c.securityContext) &&
          has(c.securityContext.runAsNonRoot) &&
          c.securityContext.runAsNonRoot == true)
      message: "Containers must run as non-root"
      reason: Forbidden

    # Must drop all capabilities
    - expression: |
        object.spec.template.spec.containers.all(c,
          has(c.securityContext) &&
          has(c.securityContext.capabilities) &&
          has(c.securityContext.capabilities.drop) &&
          c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
      message: "Containers must drop all capabilities"
      reason: Forbidden

    # Read-only root filesystem
    - expression: |
        object.spec.template.spec.containers.all(c,
          has(c.securityContext) &&
          has(c.securityContext.readOnlyRootFilesystem) &&
          c.securityContext.readOnlyRootFilesystem == true)
      message: "Containers must have read-only root filesystem"
      reason: Forbidden

    # Resource limits must be set
    - expression: |
        object.spec.template.spec.containers.all(c,
          has(c.resources) &&
          has(c.resources.limits) &&
          has(c.resources.limits.memory) &&
          has(c.resources.limits.cpu))
      message: "Containers must have resource limits set"
      reason: Forbidden
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
  name: dexorder-agent-image-policy-binding
spec:
  policyName: dexorder-agent-image-policy
  validationActions:
    - Deny
  matchResources:
    namespaceSelector:
      matchLabels:
        dexorder.io/type: agents
spec: + serviceAccountName: agent-lifecycle + + # Share PID namespace so sidecar can monitor main container + shareProcessNamespace: true + + # Security context + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + + containers: + # Main agent container + - name: agent + image: ghcr.io/dexorder/agent:latest + imagePullPolicy: Always + + # Security context (required by admission policy) + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + # Resource limits (required by admission policy) + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "1000m" + + # Environment variables + env: + - name: USER_ID + value: "user-abc123" + - name: IDLE_TIMEOUT_MINUTES + value: "15" + - name: IDLE_CHECK_INTERVAL_SECONDS + value: "60" + - name: ENABLE_IDLE_SHUTDOWN + value: "true" + - name: MCP_SERVER_PORT + value: "3000" + - name: ZMQ_CONTROL_PORT + value: "5555" + + # Ports + ports: + - name: mcp + containerPort: 3000 + protocol: TCP + - name: zmq-control + containerPort: 5555 + protocol: TCP + + # Volume mounts + volumeMounts: + - name: agent-data + mountPath: /app/data + - name: tmp + mountPath: /tmp + - name: shared-run + mountPath: /var/run/agent + + # Liveness probe (agent's MCP server) + livenessProbe: + httpGet: + path: /health + port: mcp + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 5 + + # Readiness probe + readinessProbe: + httpGet: + path: /ready + port: mcp + initialDelaySeconds: 5 + periodSeconds: 10 + + # Lifecycle sidecar + - name: lifecycle-sidecar + image: ghcr.io/dexorder/lifecycle-sidecar:latest + imagePullPolicy: Always + + # Security context + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + # Resource limits + resources: + requests: + 
memory: "32Mi" + cpu: "10m" + limits: + memory: "64Mi" + cpu: "50m" + + # Environment variables (injected via downward API) + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['dexorder.io/deployment'] + - name: USER_TYPE + value: "free" # Gateway sets this based on license + - name: MAIN_CONTAINER_PID + value: "1" # In shared PID namespace, main container is typically PID 1 + + # Volume mounts + volumeMounts: + - name: shared-run + mountPath: /var/run/agent + readOnly: true + + # Volumes + volumes: + # Persistent data (user files, state) + - name: agent-data + persistentVolumeClaim: + claimName: agent-user-abc123-data + + # Temporary writable filesystem (read-only rootfs) + - name: tmp + emptyDir: + medium: Memory + sizeLimit: 128Mi + + # Shared between main container and sidecar + - name: shared-run + emptyDir: + medium: Memory + sizeLimit: 1Mi + + # Restart policy + restartPolicy: Always + + # Termination grace period + terminationGracePeriodSeconds: 30 +--- +# PVC for agent persistent data +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: agent-user-abc123-data + namespace: dexorder-agents + labels: + dexorder.io/user-id: user-abc123 +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + storageClassName: standard # Or your preferred storage class +--- +# Service to expose agent MCP endpoint +apiVersion: v1 +kind: Service +metadata: + name: agent-user-abc123 + namespace: dexorder-agents + labels: + dexorder.io/user-id: user-abc123 +spec: + type: ClusterIP + selector: + dexorder.io/user-id: user-abc123 + ports: + - name: mcp + port: 3000 + targetPort: mcp + protocol: TCP + - name: zmq-control + port: 5555 + targetPort: zmq-control + protocol: TCP diff --git a/deploy/k8s/base/agent-quotas.yaml b/deploy/k8s/base/agent-quotas.yaml new file mode 100644 index 0000000..660d8db --- /dev/null +++ 
# Resource constraints for the dexorder-agents namespace
# These limits apply regardless of what the gateway requests
---
# LimitRange: per-container defaults and maximums
apiVersion: v1
kind: LimitRange
metadata:
  name: agent-limits
  namespace: dexorder-agents
spec:
  limits:
    # Default limits applied if deployment doesn't specify
    - type: Container
      default:
        memory: "512Mi"
        cpu: "500m"
      defaultRequest:
        memory: "256Mi"
        cpu: "100m"
      # Maximum any single container can request
      max:
        memory: "2Gi"
        cpu: "2000m"
      # Minimum any single container may request. Kept at or below the
      # lifecycle sidecar's requests (32Mi / 10m); a higher floor makes the
      # API server reject every agent pod that carries the sidecar.
      min:
        memory: "32Mi"
        cpu: "10m"
    # PVC size limits
    - type: PersistentVolumeClaim
      max:
        storage: "10Gi"
      min:
        storage: "100Mi"
---
# ResourceQuota: total namespace limits
# Prevents a compromised gateway from exhausting cluster resources
apiVersion: v1
kind: ResourceQuota
metadata:
  name: agent-quota
  namespace: dexorder-agents
spec:
  hard:
    # Total compute limits for all agents combined
    requests.cpu: "20"
    requests.memory: "40Gi"
    limits.cpu: "40"
    limits.memory: "80Gi"
    # Object count limits
    pods: "100"
    persistentvolumeclaims: "100"
    services: "100"
    # Storage limits
    requests.storage: "500Gi"
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gateway
  namespace: dexorder-system
---
# Role scoped to dexorder-agents namespace only
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: agent-creator
  namespace: dexorder-agents
rules:
  # Deployments: create, read, and modify in place (patch/update).
  # No delete verb: deletion is handled by the lifecycle sidecar.
  - apiGroups: ["apps"]
    resources: ["deployments"]
    verbs: ["create", "get", "list", "watch", "patch", "update"]

  # PVCs: create and read only (deletion handled by sidecar)
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["create", "get", "list", "watch"]

  # Services: create, read, and modify agent MCP endpoints (no delete)
  - apiGroups: [""]
    resources: ["services"]
    verbs: ["create", "get", "list", "watch", "patch", "update"]

  # Read-only pod access for status checks (no exec!)
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]

  # Pod logs for debugging (read-only)
  - apiGroups: [""]
    resources: ["pods/log"]
    verbs: ["get"]

  # Explicitly NOT included:
  # - deployments/delete - handled by lifecycle sidecar
  # - pvc/delete - handled by lifecycle sidecar
  # - services/delete - not granted to the gateway
  # - pods (create/delete) - must go through deployments
  # - pods/exec, pods/attach - no shell access
  # - secrets, configmaps - no credential access
  # - any resources in other namespaces
---
# Binds the gateway ServiceAccount (dexorder-system) to the agent-creator
# Role; the binding lives in dexorder-agents, so the grant applies there only.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: gateway-agent-creator
  namespace: dexorder-agents
subjects:
  - kind: ServiceAccount
    name: gateway
    namespace: dexorder-system
roleRef:
  kind: Role
  name: agent-creator
  apiGroup: rbac.authorization.k8s.io
apiVersion: node.k8s.io/v1 kind: RuntimeClass metadata: diff --git a/deploy/k8s/base/kustomization.yaml b/deploy/k8s/base/kustomization.yaml index bae21bc..66e3b92 100644 --- a/deploy/k8s/base/kustomization.yaml +++ b/deploy/k8s/base/kustomization.yaml @@ -1,5 +1,26 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -resources: [] - # ingress.yaml - removed until we have services to expose +resources: + # Core initialization (runtime classes) + - init.yaml + # Namespace definitions with PodSecurity labels + - namespaces.yaml + # RBAC for gateway to create agents (creation only) + - gateway-rbac.yaml + # RBAC for lifecycle sidecar (self-deletion) + - lifecycle-sidecar-rbac.yaml + # Admission policies (image restriction, security requirements) + - admission-policy.yaml + # Resource quotas and limits for agents namespace + - agent-quotas.yaml + # Network isolation policies + - network-policies.yaml + # Gateway service (uncomment when ready) + # - gateway.yaml + # Example agent deployment (for reference, not applied by default) + # - agent-deployment-example.yaml + # Services (uncomment as needed) + # - backend.yaml + # - web.yaml + # - ingress.yaml diff --git a/deploy/k8s/base/lifecycle-sidecar-rbac.yaml b/deploy/k8s/base/lifecycle-sidecar-rbac.yaml new file mode 100644 index 0000000..b3b2bd3 --- /dev/null +++ b/deploy/k8s/base/lifecycle-sidecar-rbac.yaml @@ -0,0 +1,53 @@ +# RBAC for lifecycle sidecar - allows self-deletion only +# Each agent pod gets this ServiceAccount and can only delete its own deployment +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: agent-lifecycle + namespace: dexorder-agents +--- +# Role allowing deletion of deployments and PVCs +# This is scoped to the dexorder-agents namespace +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: agent-self-delete + namespace: dexorder-agents +rules: + # Allow getting and deleting deployments + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: 
["get", "delete"] + + # Allow getting and deleting PVCs (for anonymous users) + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "delete"] + + # Read-only access to pods (for status checking) + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: agent-self-delete + namespace: dexorder-agents +subjects: + - kind: ServiceAccount + name: agent-lifecycle + namespace: dexorder-agents +roleRef: + kind: Role + name: agent-self-delete + apiGroup: rbac.authorization.k8s.io +--- +# Additional security: ValidatingWebhookConfiguration to restrict deletion +# This ensures sidecars can only delete their own deployment +# Requires a validating webhook server (can be added later) +# For now, we rely on: +# 1. Sidecar only knowing its own deployment name (from env) +# 2. RBAC limiting to dexorder-agents namespace +# 3. Admission policy restricting deployment creation (already defined) diff --git a/deploy/k8s/base/namespaces.yaml b/deploy/k8s/base/namespaces.yaml new file mode 100644 index 0000000..54afbe0 --- /dev/null +++ b/deploy/k8s/base/namespaces.yaml @@ -0,0 +1,24 @@ +# Namespace definitions for dexorder AI platform +# - dexorder-system: gateway, flink, kafka, and other infrastructure +# - dexorder-agents: user agent containers (isolated, restricted) +--- +apiVersion: v1 +kind: Namespace +metadata: + name: dexorder-system + labels: + app.kubernetes.io/part-of: dexorder + dexorder.io/type: system +--- +apiVersion: v1 +kind: Namespace +metadata: + name: dexorder-agents + labels: + app.kubernetes.io/part-of: dexorder + dexorder.io/type: agents + # Enforce restricted pod security standards + pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce-version: latest + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted diff --git a/deploy/k8s/base/network-policies.yaml 
b/deploy/k8s/base/network-policies.yaml new file mode 100644 index 0000000..8e4558a --- /dev/null +++ b/deploy/k8s/base/network-policies.yaml @@ -0,0 +1,121 @@ +# Network policies for agent isolation +# Agents can only communicate with specific services, not with each other +# or with the Kubernetes API +--- +# Default deny all ingress and egress in agents namespace +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: default-deny-all + namespace: dexorder-agents +spec: + podSelector: {} + policyTypes: + - Ingress + - Egress +--- +# Allow agents to receive connections from gateway (MCP) +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-gateway-ingress + namespace: dexorder-agents +spec: + podSelector: + matchLabels: + dexorder.io/component: agent + policyTypes: + - Ingress + ingress: + - from: + - namespaceSelector: + matchLabels: + dexorder.io/type: system + podSelector: + matchLabels: + app: gateway + ports: + - protocol: TCP + port: 3000 # MCP server port + - protocol: TCP + port: 5555 # ZeroMQ control channel +--- +# Allow agents to connect to required services +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-agent-egress + namespace: dexorder-agents +spec: + podSelector: + matchLabels: + dexorder.io/component: agent + policyTypes: + - Egress + egress: + # DNS resolution (required) + - to: + - namespaceSelector: {} + podSelector: + matchLabels: + k8s-app: kube-dns + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 + # Gateway in system namespace (for callbacks) + - to: + - namespaceSelector: + matchLabels: + dexorder.io/type: system + podSelector: + matchLabels: + app: gateway + ports: + - protocol: TCP + port: 8080 + # Kafka/Redpanda for data subscriptions + - to: + - namespaceSelector: + matchLabels: + dexorder.io/type: system + podSelector: + matchLabels: + app: redpanda + ports: + - protocol: TCP + port: 9092 + # External HTTPS (for exchange APIs, LLM APIs) + - 
to: + - ipBlock: + cidr: 0.0.0.0/0 + except: + # Block access to k8s API server (common ranges) + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + ports: + - protocol: TCP + port: 443 +--- +# System namespace: allow ingress from agents +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-agent-callbacks + namespace: dexorder-system +spec: + podSelector: + matchLabels: + app: gateway + policyTypes: + - Ingress + ingress: + - from: + - namespaceSelector: + matchLabels: + dexorder.io/type: agents + ports: + - protocol: TCP + port: 8080 diff --git a/deploy/k8s/dev/admission-policy-patch.yaml b/deploy/k8s/dev/admission-policy-patch.yaml new file mode 100644 index 0000000..7a6728a --- /dev/null +++ b/deploy/k8s/dev/admission-policy-patch.yaml @@ -0,0 +1,97 @@ +# Dev admission policy: allow local registry images +# In dev, we also allow images from localhost/minikube registry +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicy +metadata: + name: dexorder-agent-image-policy +spec: + failurePolicy: Fail + matchConstraints: + namespaceSelector: + matchLabels: + dexorder.io/type: agents + resourceRules: + - apiGroups: ["apps"] + apiVersions: ["v1"] + resources: ["deployments"] + operations: ["CREATE", "UPDATE"] + validations: + # Allow local dev images in addition to production registry + - expression: | + object.spec.template.spec.containers.all(c, + c.image.startsWith('ghcr.io/dexorder/agent:') || + c.image.startsWith('ghcr.io/dexorder/agent-') || + c.image.startsWith('localhost:5000/dexorder/agent') || + c.image.startsWith('dexorder/agent')) + message: "Only approved dexorder agent images are allowed" + reason: Forbidden + + # No privileged containers + - expression: | + object.spec.template.spec.containers.all(c, + !has(c.securityContext) || + !has(c.securityContext.privileged) || + c.securityContext.privileged == false) + message: "Privileged containers are not allowed" + reason: Forbidden + + # No hostPath 
volumes + - expression: | + !has(object.spec.template.spec.volumes) || + object.spec.template.spec.volumes.all(v, + !has(v.hostPath)) + message: "hostPath volumes are not allowed" + reason: Forbidden + + # No hostNetwork + - expression: | + !has(object.spec.template.spec.hostNetwork) || + object.spec.template.spec.hostNetwork == false + message: "hostNetwork is not allowed" + reason: Forbidden + + # No hostPID + - expression: | + !has(object.spec.template.spec.hostPID) || + object.spec.template.spec.hostPID == false + message: "hostPID is not allowed" + reason: Forbidden + + # Containers must run as non-root + - expression: | + object.spec.template.spec.containers.all(c, + has(c.securityContext) && + has(c.securityContext.runAsNonRoot) && + c.securityContext.runAsNonRoot == true) + message: "Containers must run as non-root" + reason: Forbidden + + # Must drop all capabilities + - expression: | + object.spec.template.spec.containers.all(c, + has(c.securityContext) && + has(c.securityContext.capabilities) && + has(c.securityContext.capabilities.drop) && + c.securityContext.capabilities.drop.exists(cap, cap == 'ALL')) + message: "Containers must drop all capabilities" + reason: Forbidden + + # Read-only root filesystem + - expression: | + object.spec.template.spec.containers.all(c, + has(c.securityContext) && + has(c.securityContext.readOnlyRootFilesystem) && + c.securityContext.readOnlyRootFilesystem == true) + message: "Containers must have read-only root filesystem" + reason: Forbidden + + # Resource limits must be set + - expression: | + object.spec.template.spec.containers.all(c, + has(c.resources) && + has(c.resources.limits) && + has(c.resources.limits.memory) && + has(c.resources.limits.cpu)) + message: "Containers must have resource limits set" + reason: Forbidden diff --git a/deploy/k8s/dev/agent-quotas-patch.yaml b/deploy/k8s/dev/agent-quotas-patch.yaml new file mode 100644 index 0000000..34a3a57 --- /dev/null +++ b/deploy/k8s/dev/agent-quotas-patch.yaml @@ 
-0,0 +1,19 @@ +# Dev/minikube resource quota overrides +# Smaller limits appropriate for local development +--- +apiVersion: v1 +kind: ResourceQuota +metadata: + name: agent-quota + namespace: dexorder-agents +spec: + hard: + # Reduced for minikube + requests.cpu: "4" + requests.memory: "8Gi" + limits.cpu: "8" + limits.memory: "16Gi" + pods: "20" + persistentvolumeclaims: "20" + services: "20" + requests.storage: "50Gi" diff --git a/deploy/k8s/dev/kustomization.yaml b/deploy/k8s/dev/kustomization.yaml index 14168e5..c5bef4a 100644 --- a/deploy/k8s/dev/kustomization.yaml +++ b/deploy/k8s/dev/kustomization.yaml @@ -1,16 +1,20 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: default +# Note: namespaces are defined in base; workloads go to dexorder-system +namespace: dexorder-system -# Base resources +# Base resources (includes security policies) resources: - ../base - infrastructure.yaml -# No patches needed currently -patches: [] - # ingress-dev.yaml - removed until we have services to expose +# Dev-specific patches +patches: + # Reduced resource quotas for minikube + - path: agent-quotas-patch.yaml + # Allow local registry images + - path: admission-policy-patch.yaml # ConfigMaps for service configs configMapGenerator: diff --git a/deploy/k8s/prod/kustomization.yaml b/deploy/k8s/prod/kustomization.yaml index 6bd96fb..b8a746b 100644 --- a/deploy/k8s/prod/kustomization.yaml +++ b/deploy/k8s/prod/kustomization.yaml @@ -1,9 +1,10 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: default +# Note: namespaces are defined in base; workloads go to dexorder-system +namespace: dexorder-system -# Base resources (backend, web, ingress, init/gVisor) +# Base resources (includes all security policies) resources: - ../base @@ -38,3 +39,10 @@ images: newTag: latest - name: dexorder/ai-web newTag: latest + - name: ghcr.io/dexorder/gateway + newTag: latest + - name: lifecycle-sidecar + newName: 
ghcr.io/dexorder/lifecycle-sidecar + newTag: latest + - name: ghcr.io/dexorder/agent + newTag: latest diff --git a/doc/agent_harness_flow.md b/doc/agent_harness_flow.md new file mode 100644 index 0000000..24a5c72 --- /dev/null +++ b/doc/agent_harness_flow.md @@ -0,0 +1,21 @@ +┌─────────────────────────────────────────────────┐ +│ Agent Harness (your servers) │ +│ │ +│ on_message(user_id, message): │ +│ 1. Look up user's MCP endpoint from Postgres │ +│ 2. mcp.call("get_context_summary") │ +│ 3. mcp.call("get_conversation_history", 20) │ +│ 4. Build prompt: │ +│ system = BASE_PROMPT │ +│ + context_summary │ +│ + user_agent_prompt (from MCP) │ +│ messages = history + new message │ +│ 5. LLM call (your API key) │ +│ 6. While LLM wants tool calls: │ +│ - Platform tools → handle locally │ +│ - User tools → proxy to MCP │ +│ - LLM call again with results │ +│ 7. mcp.call("save_message", ...) │ +│ 8. Return response to user │ +│ │ +└─────────────────────────────────────────────────┘ diff --git a/doc/agent_redesign.md b/doc/agent_redesign.md index 159244d..d318e72 100644 --- a/doc/agent_redesign.md +++ b/doc/agent_redesign.md @@ -1,9 +1,11 @@ Generally use skills instead of subagents, except for the analysis subagent. -## User-specific files +## User-specific files and tools * Indicators * Strategies * Watchlists * Preferences * Trading style * Charting / colors +* Executors (really just sub-strategies) + * tactical-level order generators e.g. TWAP, iceberg, etc. diff --git a/doc/config.md b/doc/config.md deleted file mode 100644 index b520ac7..0000000 --- a/doc/config.md +++ /dev/null @@ -1,18 +0,0 @@ -This file describes all the configuration options used by all components. All configuration is divided into regular config and secrets, and k8s will mount either or both as a yaml file accessible to the process. - -# Configuration - -* `flink_hostname` -* ... various zmq ports for flink ... 
-* `iceberg_catalog_hostname` -* `iceberg_catalog_port` -* `iceberg_catalog_database` -* etc - - -# Secrets - -* `iceberg_catalog_username` -* `iceberg_catalog_password` -* etc. - diff --git a/doc/container_lifecycle_management.md b/doc/container_lifecycle_management.md new file mode 100644 index 0000000..bf2ed3d --- /dev/null +++ b/doc/container_lifecycle_management.md @@ -0,0 +1,313 @@ +# Container Lifecycle Management + +## Overview + +User agent containers self-manage their lifecycle to optimize resource usage. Containers automatically shut down when idle (no triggers + no recent activity) and clean themselves up using a lifecycle sidecar. + +## Architecture + +``` +┌──────────────────────────────────────────────────────────┐ +│ Agent Pod │ +│ ┌───────────────────┐ ┌──────────────────────┐ │ +│ │ Agent Container │ │ Lifecycle Sidecar │ │ +│ │ ─────────────── │ │ ────────────────── │ │ +│ │ │ │ │ │ +│ │ Lifecycle Manager │ │ Watches exit code │ │ +│ │ - Track activity │ │ - Detects exit 42 │ │ +│ │ - Track triggers │ │ - Calls k8s API │ │ +│ │ - Exit 42 if idle │ │ - Deletes deployment │ │ +│ └───────────────────┘ └──────────────────────┘ │ +│ │ │ │ +│ │ writes exit_code │ │ +│ └────►/var/run/agent/exit_code │ +│ │ │ +└───────────────────────────────────────┼──────────────────┘ + │ + ▼ k8s API (RBAC) + ┌─────────────────────┐ + │ Delete Deployment │ + │ Delete PVC (if anon)│ + └─────────────────────┘ +``` + +## Components + +### 1. Lifecycle Manager (Python) + +**Location**: `client-py/dexorder/lifecycle_manager.py` + +Runs inside the agent container and tracks: +- **Activity**: MCP tool/resource/prompt calls reset the idle timer +- **Triggers**: Data subscriptions, CEP patterns, etc. 
+- **Idle state**: No triggers + idle timeout exceeded + +**Configuration** (via environment variables): +- `IDLE_TIMEOUT_MINUTES`: Minutes before shutdown (default: 15) +- `IDLE_CHECK_INTERVAL_SECONDS`: Check frequency (default: 60) +- `ENABLE_IDLE_SHUTDOWN`: Enable/disable shutdown (default: true) + +**Usage in agent code**: +```python +from dexorder.lifecycle_manager import get_lifecycle_manager + +# On startup +manager = get_lifecycle_manager() +await manager.start() + +# On MCP calls (tool/resource/prompt) +manager.record_activity() + +# When triggers change +manager.add_trigger("data_sub_BTC_USDT") +manager.remove_trigger("data_sub_BTC_USDT") + +# Or batch update +manager.update_triggers({"trigger_1", "trigger_2"}) +``` + +**Exit behavior**: +- Idle shutdown: Exit with code `42` +- Signal (SIGTERM/SIGINT): Exit with code `0` (allows restart) +- Errors/crashes: Exit with error code (allows restart) + +### 2. Lifecycle Sidecar (Go) + +**Location**: `lifecycle-sidecar/` + +Runs alongside the agent container with shared PID namespace. Monitors the main container process and: +- On exit code `42`: Deletes deployment (and PVC if anonymous user) +- On any other exit code: Exits with same code (k8s restarts pod) + +**Configuration** (via environment, injected by downward API): +- `NAMESPACE`: Pod's namespace +- `DEPLOYMENT_NAME`: Deployment name (from pod label) +- `USER_TYPE`: License tier (`anonymous`, `free`, `paid`, `enterprise`) +- `MAIN_CONTAINER_PID`: PID of main container (default: 1) + +**RBAC**: Has permission to delete deployments and PVCs **only in dexorder-agents namespace**. Cannot delete other deployments due to: +1. Only knows its own deployment name (from env) +2. RBAC scoped to namespace +3. No cross-pod communication + +### 3. Gateway (TypeScript) + +**Location**: `gateway/src/harness/agent-harness.ts` + +Creates agent deployments when users connect. 
Permissions (✅ granted, ❌ denied): +- ✅ Create deployments, services, PVCs +- ✅ Read pod status and logs +- ✅ Update deployments (e.g., resource limits) +- ❌ Delete deployments (handled by sidecar) +- ❌ Exec into pods +- ❌ Access secrets + +## Lifecycle States + +``` +┌─────────────┐ +│ CREATED │ ← Gateway creates deployment +└──────┬──────┘ + │ + ▼ +┌─────────────┐ +│ RUNNING │ ← User interacts, has triggers +└──────┬──────┘ + │ + ▼ +┌─────────────┐ +│ IDLE │ ← No triggers + timeout exceeded +└──────┬──────┘ + │ + ▼ +┌─────────────┐ +│ SHUTDOWN │ ← Exit code 42 +└──────┬──────┘ + │ + ▼ +┌─────────────┐ +│ DELETED │ ← Sidecar deletes deployment +└─────────────┘ +``` + +## Idle Detection Logic + +A container is **IDLE** when: +1. `active_triggers.isEmpty()` AND +2. `(now - last_activity) > idle_timeout` + +A container is **ACTIVE** when: +1. It has any active triggers (data subscriptions, CEP patterns, etc.) OR +2. There has been recent user activity (MCP calls within the idle timeout) + +## Cleanup Policies by License Tier + +| User Type | Idle Timeout | PVC Policy | Notes | +|--------------|--------------|------------|-------| +| Anonymous | 15 minutes | Delete | Ephemeral, no data retention | +| Free | 15 minutes | Retain | Can resume session | +| Paid | 60 minutes | Retain | Longer grace period | +| Enterprise | No shutdown | Retain | Always-on containers | + +The policy is selected via the `USER_TYPE` env var in the deployment. 
+ +## Security + +### Principle of Least Privilege + +**Gateway**: +- Can create agent resources +- Cannot delete agent resources +- Cannot access other namespaces +- Cannot exec into pods + +**Lifecycle Sidecar**: +- Can delete its own deployment only +- Cannot delete other deployments +- Scoped to dexorder-agents namespace +- No exec, no secrets access + +### Admission Control + +All deployments in `dexorder-agents` namespace are subject to: +- Image allowlist (only approved images) +- Security context enforcement (non-root, drop caps, read-only rootfs) +- Resource limits required +- PodSecurity standards (restricted profile) + +See `deploy/k8s/base/admission-policy.yaml` + +### Network Isolation + +Agents are network-isolated via NetworkPolicy: +- Can connect to gateway (MCP) +- Can connect to Redpanda (data streams) +- Can make outbound HTTPS (exchanges, LLM APIs) +- Cannot access k8s API +- Cannot access system namespace +- Cannot access other agent pods + +See `deploy/k8s/base/network-policies.yaml` + +## Deployment + +### 1. Apply Security Policies + +```bash +kubectl apply -k deploy/k8s/dev # or prod +``` + +This creates: +- Namespaces (`dexorder-system`, `dexorder-agents`) +- RBAC (gateway, lifecycle sidecar) +- Admission policies +- Network policies +- Resource quotas + +### 2. Build and Push Lifecycle Sidecar + +```bash +cd lifecycle-sidecar +docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest . +docker push ghcr.io/dexorder/lifecycle-sidecar:latest +``` + +### 3. Gateway Creates Agent Deployments + +When a user connects, the gateway creates: +- Deployment with agent + sidecar +- PVC for persistent data +- Service for MCP endpoint + +See `deploy/k8s/base/agent-deployment-example.yaml` for template. 
+ +## Testing + +### Test Lifecycle Manager Locally + +```python +from dexorder.lifecycle_manager import LifecycleManager + +# Disable actual shutdown for testing +manager = LifecycleManager( + idle_timeout_minutes=1, + check_interval_seconds=10, + enable_shutdown=False # Only log, don't exit +) + +await manager.start() + +# Simulate activity +manager.record_activity() + +# Simulate triggers +manager.add_trigger("test_trigger") +await asyncio.sleep(70) # Wait past timeout +manager.remove_trigger("test_trigger") +await asyncio.sleep(70) # Should detect idle + +await manager.stop() +``` + +### Test Sidecar Locally + +```bash +# Build +cd lifecycle-sidecar +go build -o lifecycle-sidecar main.go + +# Run (requires k8s config) +export NAMESPACE=dexorder-agents +export DEPLOYMENT_NAME=agent-test +export USER_TYPE=free +./lifecycle-sidecar +``` + +### Integration Test + +1. Deploy test agent with sidecar +2. Verify agent starts and is healthy +3. Stop sending MCP calls and remove all triggers +4. Wait for idle timeout + check interval +5. 
Verify deployment is deleted + +## Troubleshooting + +### Container not shutting down when idle + +Check logs: +```bash +kubectl logs -n dexorder-agents agent-user-abc123 -c agent +``` + +Verify: +- `ENABLE_IDLE_SHUTDOWN=true` +- No active triggers: `manager.active_triggers` should be empty +- Idle timeout exceeded + +### Sidecar not deleting deployment + +Check sidecar logs: +```bash +kubectl logs -n dexorder-agents agent-user-abc123 -c lifecycle-sidecar +``` + +Verify: +- Exit code file exists: `/var/run/agent/exit_code` contains `42` +- RBAC permissions: `kubectl auth can-i delete deployments --as=system:serviceaccount:dexorder-agents:agent-lifecycle -n dexorder-agents` +- Deployment name matches: Check `DEPLOYMENT_NAME` env var + +### Gateway can't create deployments + +Check gateway logs and verify: +- ServiceAccount exists: `kubectl get sa gateway -n dexorder-system` +- RoleBinding exists: `kubectl get rolebinding gateway-agent-creator -n dexorder-agents` +- Admission policy allows image: Check image name matches allowlist in `admission-policy.yaml` + +## Future Enhancements + +1. **Graceful shutdown notifications**: Warn users before shutdown via websocket +2. **Predictive scaling**: Keep frequently-used containers warm +3. **Tiered storage**: Move old PVCs to cheaper storage class +4. **Metrics**: Expose lifecycle metrics (idle rate, shutdown count, etc.) +5. **Cost allocation**: Track resource usage per user/license tier diff --git a/doc/gateway_container_creation.md b/doc/gateway_container_creation.md new file mode 100644 index 0000000..dc938a9 --- /dev/null +++ b/doc/gateway_container_creation.md @@ -0,0 +1,286 @@ +# Gateway Container Creation + +## Overview + +The gateway automatically provisions user agent containers when users authenticate. This ensures each user has their own isolated environment running their MCP server with persistent storage. 
+ +## Authentication Flow with Container Creation + +``` +User connects (WebSocket/Telegram) + ↓ + Send "Authenticating..." status + ↓ + Verify token/channel link + ↓ + Lookup user license from DB + ↓ + Send "Starting workspace..." status + ↓ +┌────────────────────────────────────┐ +│ ContainerManager.ensureRunning() │ +│ ┌──────────────────────────────┐ │ +│ │ Check if deployment exists │ │ +│ └──────────────────────────────┘ │ +│ ↓ │ +│ Does it exist? │ +│ ↙ ↘ │ +│ Yes No │ +│ │ │ │ +│ │ ┌──────────────────┐ │ +│ │ │ Create deployment│ │ +│ │ │ Create PVC │ │ +│ │ │ Create service │ │ +│ │ └──────────────────┘ │ +│ │ │ │ +│ └────────────┘ │ +│ ↓ │ +│ Wait for deployment ready │ +│ (polls every 2s, timeout 2min) │ +│ ↓ │ +│ Compute MCP endpoint URL │ +│ (internal k8s service DNS) │ +└────────────────────────────────────┘ + ↓ + Update license.mcpServerUrl + ↓ + Send "Connected" status + ↓ + Initialize AgentHarness + ↓ + Connect to user's MCP server + ↓ + Ready for messages +``` + +## Container Naming Convention + +All resources follow a consistent naming pattern based on `userId`: + +```typescript +userId: "user-abc123" + ↓ +deploymentName: "agent-user-abc123" +serviceName: "agent-user-abc123" +pvcName: "agent-user-abc123-data" +mcpEndpoint: "http://agent-user-abc123.dexorder-agents.svc.cluster.local:3000" +``` + +User IDs are sanitized to be Kubernetes-compliant (lowercase alphanumeric + hyphens). 
+ +## Templates by License Tier + +Templates are located in `gateway/src/k8s/templates/`: +- `free-tier.yaml` +- `pro-tier.yaml` +- `enterprise-tier.yaml` + +### Variable Substitution + +Templates use simple string replacement: +- `{{userId}}` - User ID +- `{{deploymentName}}` - Computed deployment name +- `{{serviceName}}` - Computed service name +- `{{pvcName}}` - Computed PVC name +- `{{agentImage}}` - Agent container image (from env) +- `{{sidecarImage}}` - Lifecycle sidecar image (from env) +- `{{storageClass}}` - Kubernetes storage class (from env) + +### Resource Limits + +| Tier | Memory Request | Memory Limit | CPU Request | CPU Limit | Storage | Idle Timeout | +|------|----------------|--------------|-------------|-----------|---------|--------------| +| **Free** | 256Mi | 512Mi | 100m | 500m | 1Gi | 15min | +| **Pro** | 512Mi | 2Gi | 250m | 2000m | 10Gi | 60min | +| **Enterprise** | 1Gi | 4Gi | 500m | 4000m | 50Gi | Never (shutdown disabled) | + +## Components + +### KubernetesClient (`gateway/src/k8s/client.ts`) + +Low-level k8s API wrapper: +- `deploymentExists(name)` - Check if deployment exists +- `createAgentDeployment(spec)` - Create deployment/service/PVC from template +- `waitForDeploymentReady(name, timeout)` - Poll until ready +- `getServiceEndpoint(name)` - Get service URL +- `deleteAgentDeployment(userId)` - Cleanup (for testing) + +Static helpers: +- `getDeploymentName(userId)` - Generate deployment name +- `getServiceName(userId)` - Generate service name +- `getPvcName(userId)` - Generate PVC name +- `getMcpEndpoint(userId, namespace)` - Compute internal service URL + +### ContainerManager (`gateway/src/k8s/container-manager.ts`) + +High-level orchestration: +- `ensureContainerRunning(userId, license)` - Main entry point + - Returns: `{ mcpEndpoint, wasCreated }` + - Creates deployment if missing + - Waits for ready state + - Returns endpoint URL +- `getContainerStatus(userId)` - Check status without creating +- `deleteContainer(userId)` - 
Manual cleanup + +### Authenticator (`gateway/src/auth/authenticator.ts`) + +Updated to call container manager: +- `authenticateWebSocket()` - Calls `ensureContainerRunning()` before returning `AuthContext` +- `authenticateTelegram()` - Same for Telegram webhooks + +### WebSocketHandler (`gateway/src/channels/websocket-handler.ts`) + +Multi-phase connection protocol: +1. Send `{type: 'status', status: 'authenticating'}` +2. Authenticate (may take 30-120s if creating container) +3. Send `{type: 'status', status: 'initializing'}` +4. Initialize agent harness +5. Send `{type: 'connected', ...}` + +This gives the client visibility into the startup process. + +## Configuration + +Environment variables: + +```bash +# Kubernetes +KUBERNETES_NAMESPACE=dexorder-agents +KUBERNETES_IN_CLUSTER=true # false for local dev +KUBERNETES_CONTEXT=minikube # for local dev only + +# Container images +AGENT_IMAGE=ghcr.io/dexorder/agent:latest +SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest + +# Storage +AGENT_STORAGE_CLASS=standard +``` + +## Security + +The gateway uses a restricted ServiceAccount with RBAC: + +**Can do:** +- ✅ Create deployments in `dexorder-agents` namespace +- ✅ Create services in `dexorder-agents` namespace +- ✅ Create PVCs in `dexorder-agents` namespace +- ✅ Read pod status and logs (debugging) +- ✅ Update deployments (future: resource scaling) + +**Cannot do:** +- ❌ Delete deployments (handled by lifecycle sidecar) +- ❌ Delete PVCs (handled by lifecycle sidecar) +- ❌ Exec into pods +- ❌ Access secrets or configmaps +- ❌ Create resources in other namespaces +- ❌ Note: agent containers (not the gateway) are also denied access to the Kubernetes API (blocked by NetworkPolicy) + +See `deploy/k8s/base/gateway-rbac.yaml` for full configuration. 
+ +## Lifecycle + +### Container Creation (Gateway) +- User authenticates +- Gateway checks if deployment exists +- If missing, creates from template +- Waits for ready (2min timeout) +- Returns MCP endpoint + +### Container Deletion (Lifecycle Sidecar) +- Container tracks activity and triggers +- When idle (no triggers + timeout), exits with code 42 +- Sidecar detects exit code 42 +- Sidecar deletes deployment + optional PVC via k8s API +- Gateway creates fresh container on next authentication + +See `doc/container_lifecycle_management.md` for full lifecycle details. + +## Error Handling + +| Error | Gateway Action | User Experience | +|-------|----------------|-----------------| +| Deployment creation fails | Log error, return auth failure | "Authentication failed" | +| Wait timeout (image pull, etc.) | Log warning, return 503 | "Service unavailable, retry" | +| Service not found | Retry with backoff | Transparent retry | +| MCP connection fails | Return error | "Failed to connect to workspace" | +| Existing deployment not ready | Wait 30s, continue if still not ready | May connect to partially-ready container | + +## Local Development + +For local development (outside k8s): + +1. Start minikube: +```bash +minikube start +minikube addons enable storage-provisioner +``` + +2. Apply security policies: +```bash +kubectl apply -k deploy/k8s/dev +``` + +3. Configure gateway for local k8s: +```bash +# .env +KUBERNETES_IN_CLUSTER=false +KUBERNETES_CONTEXT=minikube +KUBERNETES_NAMESPACE=dexorder-agents +``` + +4. Run gateway: +```bash +cd gateway +npm run dev +``` + +5. Connect via WebSocket: +```bash +wscat -c "ws://localhost:3000/ws/chat" -H "Authorization: Bearer your-jwt" +``` + +The gateway will create deployments in minikube. View with: +```bash +kubectl get deployments -n dexorder-agents +kubectl get pods -n dexorder-agents +kubectl logs -n dexorder-agents agent-user-abc123 -c agent +``` + +## Production Deployment + +1. 
Build and push gateway image: +```bash +cd gateway +docker build -t ghcr.io/dexorder/gateway:latest . +docker push ghcr.io/dexorder/gateway:latest +``` + +2. Deploy to k8s: +```bash +kubectl apply -k deploy/k8s/prod +``` + +3. The gateway runs in the `dexorder-system` namespace +4. It creates agent containers in the `dexorder-agents` namespace +5. Admission policies enforce the image allowlist and security constraints + +## Monitoring + +Useful metrics to track: +- Container creation latency (time from auth to ready) +- Container creation failure rate +- Active containers by license tier +- Resource usage per tier +- Idle shutdown rate + +These can be exported via Prometheus or logged to a monitoring service. + +## Future Enhancements + +1. **Pre-warming**: Create containers for active users before they connect +2. **Image updates**: Handle agent image version migrations with user consent +3. **Multi-region**: Geo-distributed container placement +4. **Cost tracking**: Per-user resource usage and billing +5. **Auto-scaling**: Scale down to 0 replicas instead of deletion (faster restart) +6. 
**Container pools**: Shared warm containers for anonymous users diff --git a/doc/m_c_p_client_authentication_modes.md b/doc/m_c_p_client_authentication_modes.md new file mode 100644 index 0000000..a3c56c8 --- /dev/null +++ b/doc/m_c_p_client_authentication_modes.md @@ -0,0 +1,80 @@ +Mode A: Platform Harness → Hosted Container (internal) + Auth: mTLS + platform-signed user claim + Network: k8s internal, never hits the internet + +Mode B: Platform Harness → External User Container (remote) + Auth: OAuth2 token issued by your platform + Network: public internet, TLS required + +Mode C: Third-party MCP Client → External User Container (standalone) + Auth: User-managed API key or local-only (no network) + Network: localhost or user's own network + +┌──────────────────────────────────────────────────────────┐ +│ Platform (Postgres) │ +│ │ +│ users │ +│ ├── id, email, password_hash, plan_tier │ +│ │ │ +│ containers │ +│ ├── user_id │ +│ ├── type: "hosted" | "external" │ +│ ├── mcp_endpoint: "internal-svc:3100" | "https://..." │ +│ ├── auth_method: "mtls" | "platform_token" | "api_key" │ +│ └── public_key_fingerprint (for pinning external certs) │ +│ │ +│ api_tokens │ +│ ├── user_id │ +│ ├── token_hash │ +│ ├── scopes: ["mcp:tools", "mcp:resources", "data:read"] │ +│ ├── expires_at │ +│ └── issued_for: "platform_harness" | "user_direct" │ +│ │ +└──────────────────────────────────────────────────────────┘ + +## Mode A + +Harness ──mTLS──▶ k8s Service ──▶ User Container MCP +Validates: source is platform namespace +Extracts: user_id from forwarded header + +## Mode B + +Registration flow (one-time): +1. User provides their MCP endpoint URL in platform settings +2. Platform generates a scoped token (JWT, short-lived, auto-refreshed) +3. User configures their MCP server to accept tokens signed by your platform +4. 
Platform stores the endpoint + auth method + +Runtime: +┌──────────┐ HTTPS + Bearer token ┌────────────────────┐ +│ Harness │ ─────────────────────────▶ │ External MCP Server│ +│ │ Authorization: │ │ +│ │ Bearer <token> │ Validates: │ +│ │ │ - JWT signature │ +│ │ │ (your public │ +│ │ │ key, JWKS) │ +│ │ │ - user_id claim │ +│ │ │ matches self │ +│ │ │ - not expired │ +└──────────┘ └────────────────────┘ + +## Mode C + +```yaml +# openclaw/config.yaml +auth: + # For local-only use (Claude Desktop, Cursor, etc via stdio) + mode: "local" # no network auth needed + + # OR for remote access + mode: "token" + tokens: + - name: "my-laptop" + hash: "sha256:..." # generated by `openclaw token create` + + # OR for platform integration + mode: "platform" + platform_jwks_url: "https://api.openclaw.io/.well-known/jwks.json" + expected_user_id: "user_abc123" +``` diff --git a/doc/m_c_p_tools_architecture.md b/doc/m_c_p_tools_architecture.md new file mode 100644 index 0000000..8c56f34 --- /dev/null +++ b/doc/m_c_p_tools_architecture.md @@ -0,0 +1,29 @@ +MCP Tools (User Container) +├── Memory +│ ├── get_conversation_history(limit) +│ ├── save_message(role, content) +│ ├── search_memory(query) ← semantic search over past conversations +│ └── get_context_summary() ← "who is this user, what do they care about" +│ +├── Strategies & Indicators +│ ├── list_strategies() +│ ├── read_strategy(name) +│ ├── write_strategy(name, code) +│ ├── list_indicators() +│ ├── read_indicator(name) +│ ├── write_indicator(name, code) +│ └── run_backtest(strategy, params) +│ +├── Preferences +│ ├── get_preferences() +│ ├── set_preference(key, value) +│ └── get_agent_prompt() ← user's custom system prompt additions +│ +├── Trading +│ ├── get_watchlist() +│ ├── execute_trade(params) +│ ├── get_positions() +│ └── get_trade_history() +│ +└── Sandbox + └── run_python(code) ← datascience toolset, matplotlib, etc. 
diff --git a/protobuf/protocol.md b/doc/protocol.md similarity index 100% rename from protobuf/protocol.md rename to doc/protocol.md diff --git a/doc/user_mcp_resources.md b/doc/user_mcp_resources.md new file mode 100644 index 0000000..9decf89 --- /dev/null +++ b/doc/user_mcp_resources.md @@ -0,0 +1,472 @@ +# User MCP Server - Resource Architecture + +The user's MCP server container owns **all** conversation history, RAG, and contextual data. The platform gateway is a thin, stateless orchestrator that only holds the Anthropic API key. + +## Architecture Principle + +**User Container = Fat Context** +- Conversation history (PostgreSQL/SQLite) +- RAG system (embeddings, vector search) +- User preferences and custom prompts +- Trading context (positions, watchlists, alerts) +- All user-specific data + +**Platform Gateway = Thin Orchestrator** +- Anthropic API key (platform pays for LLM) +- Session management (WebSocket/Telegram connections) +- MCP client connection pooling +- Tool routing (platform vs user tools) +- **Zero conversation state stored** + +## MCP Resources for Context Injection + +Resources are **read-only** data sources that provide context to the LLM. They're fetched before each Claude API call and embedded in the conversation. + +### Standard Context Resources + +#### 1. `context://user-profile` +**Purpose:** User's trading background and preferences + +**MIME Type:** `text/plain` + +**Example Content:** +``` +User Profile: +- Trading experience: Intermediate +- Preferred timeframes: 1h, 4h, 1d +- Risk tolerance: Medium +- Focus: Swing trading with technical indicators +- Favorite indicators: RSI, MACD, Bollinger Bands +- Active pairs: BTC/USDT, ETH/USDT, SOL/USDT +``` + +**Implementation Notes:** +- Stored in user's database `user_preferences` table +- Updated via preference management tools +- Includes inferred data from usage patterns + +--- + +#### 2. 
`context://conversation-summary` +**Purpose:** Semantic summary of recent conversation with RAG-enhanced context + +**MIME Type:** `text/plain` + +**Example Content:** +``` +Recent Conversation Summary: + +Last 10 messages (summarized): +- User asked about moving average crossover strategies +- Discussed backtesting parameters for BTC/USDT +- Reviewed risk management with 2% position sizing +- Explored adding RSI filter to reduce false signals + +Relevant past discussions (RAG search): +- 2 weeks ago: Similar strategy development on ETH/USDT +- 1 month ago: User prefers simple strategies over complex ones +- Past preference: Avoid strategies with >5 indicators + +Current focus: Optimizing MA crossover with momentum filter +``` + +**Implementation Notes:** +- Last N messages stored in `conversation_history` table +- RAG search against embeddings of past conversations +- Semantic search using user's current message as query +- ChromaDB/pgvector for embedding storage +- Summary generated on-demand (can be cached for 1-5 minutes) + +**RAG Integration:** +```python +async def get_conversation_summary() -> str: + # Get recent messages + recent = await db.get_recent_messages(limit=50) + + # Semantic search for relevant context + relevant = await rag.search_conversation_history( + query=recent[-1].content, # Last user message + limit=5, + min_score=0.7 + ) + + # Build summary + return build_summary(recent[-10:], relevant) +``` + +--- + +#### 3. 
`context://workspace-state` +**Purpose:** Current trading workspace (chart, positions, watchlist) + +**MIME Type:** `application/json` + +**Example Content:** +```json +{ + "currentChart": { + "ticker": "BINANCE:BTC/USDT", + "timeframe": "1h", + "indicators": ["SMA(20)", "RSI(14)", "MACD(12,26,9)"] + }, + "watchlist": ["BTC/USDT", "ETH/USDT", "SOL/USDT"], + "openPositions": [ + { + "ticker": "BTC/USDT", + "side": "long", + "size": 0.1, + "entryPrice": 45000, + "currentPrice": 46500, + "unrealizedPnL": 150 + } + ], + "recentAlerts": [ + { + "type": "price_alert", + "message": "BTC/USDT crossed above $46,000", + "timestamp": "2025-01-15T10:30:00Z" + } + ] +} +``` + +**Implementation Notes:** +- Synced from web client chart state +- Updated via WebSocket sync protocol +- Includes active indicators on current chart +- Position data from trading system + +--- + +#### 4. `context://system-prompt` +**Purpose:** User's custom instructions and preferences for AI behavior + +**MIME Type:** `text/plain` + +**Example Content:** +``` +Custom Instructions: +- Be concise and data-driven +- Always show risk/reward ratios +- Prefer simple strategies over complex ones +- When suggesting trades, include stop-loss and take-profit levels +- Explain your reasoning in trading decisions +``` + +**Implementation Notes:** +- User-editable in preferences UI +- Appended **last** to system prompt (highest priority) +- Can override platform defaults +- Stored in `user_preferences.custom_prompt` field + +--- + +## MCP Tools for Actions + +Tools are for **actions** that have side effects. These are **not** used for context fetching. 
+ +### Conversation Management +- `save_message(role, content, timestamp)` - Save message to history +- `search_conversation(query, limit)` - Explicit semantic search (for user queries like "what did we discuss about BTC?") + +### Strategy & Indicators +- `list_strategies()` - List user's strategies +- `read_strategy(name)` - Get strategy code +- `write_strategy(name, code)` - Save strategy +- `run_backtest(strategy, params)` - Execute backtest + +### Trading +- `get_watchlist()` - Get watchlist (action that may trigger sync) +- `execute_trade(params)` - Execute trade order +- `get_positions()` - Fetch current positions from exchange + +### Sandbox +- `run_python(code)` - Execute Python code with data science libraries + +--- + +## Gateway Harness Flow + +```typescript +// gateway/src/harness/agent-harness.ts + +async handleMessage(message: InboundMessage): Promise { + // 1. Fetch context resources from user's MCP + const contextResources = await fetchContextResources([ + 'context://user-profile', + 'context://conversation-summary', // <-- RAG happens here + 'context://workspace-state', + 'context://system-prompt', + ]); + + // 2. Build system prompt from resources + const systemPrompt = buildSystemPrompt(contextResources); + + // 3. Build messages with embedded conversation context + const messages = buildMessages(message, contextResources); + + // 4. Get tools from MCP + const tools = await mcpClient.listTools(); + + // 5. Call Claude with embedded context + const response = await anthropic.messages.create({ + model: 'claude-3-5-sonnet-20241022', + system: systemPrompt, // <-- User profile + workspace + custom prompt + messages, // <-- Conversation summary from RAG + tools, + }); + + // 6. 
Save to user's MCP (tool call) + await mcpClient.callTool('save_message', { role: 'user', content: message.content }); + await mcpClient.callTool('save_message', { role: 'assistant', content: response }); + + return response; +} +``` + +--- + +## User MCP Server Implementation (Python) + +### Resource Handler + +```python +# user-mcp/src/resources.py + +from mcp.server import Server +from mcp.types import Resource, ResourceTemplate +import asyncpg + +server = Server("dexorder-user") + +@server.list_resources() +async def list_resources() -> list[Resource]: + return [ + Resource( + uri="context://user-profile", + name="User Profile", + description="Trading style, preferences, and background", + mimeType="text/plain", + ), + Resource( + uri="context://conversation-summary", + name="Conversation Summary", + description="Recent conversation with RAG-enhanced context", + mimeType="text/plain", + ), + Resource( + uri="context://workspace-state", + name="Workspace State", + description="Current chart, watchlist, positions", + mimeType="application/json", + ), + Resource( + uri="context://system-prompt", + name="Custom System Prompt", + description="User's custom AI instructions", + mimeType="text/plain", + ), + ] + +@server.read_resource() +async def read_resource(uri: str) -> str: + if uri == "context://user-profile": + return await build_user_profile() + elif uri == "context://conversation-summary": + return await build_conversation_summary() + elif uri == "context://workspace-state": + return await build_workspace_state() + elif uri == "context://system-prompt": + return await get_custom_prompt() + else: + raise ValueError(f"Unknown resource: {uri}") +``` + +### RAG Integration + +```python +# user-mcp/src/rag.py + +import chromadb +from sentence_transformers import SentenceTransformer + +class ConversationRAG: + def __init__(self, db_path: str): + self.chroma = chromadb.PersistentClient(path=db_path) + self.collection = 
self.chroma.get_or_create_collection("conversations") + self.embedder = SentenceTransformer('all-MiniLM-L6-v2') + + async def search_conversation_history( + self, + query: str, + limit: int = 5, + min_score: float = 0.7 + ) -> list[dict]: + """Semantic search over conversation history""" + # Embed query + query_embedding = self.embedder.encode(query).tolist() + + # Search + results = self.collection.query( + query_embeddings=[query_embedding], + n_results=limit, + ) + + # Filter by score and format + relevant = [] + for i, score in enumerate(results['distances'][0]): + if score >= min_score: + relevant.append({ + 'content': results['documents'][0][i], + 'metadata': results['metadatas'][0][i], + 'score': score, + }) + + return relevant + + async def add_message(self, message_id: str, role: str, content: str, metadata: dict): + """Add message to RAG index""" + embedding = self.embedder.encode(content).tolist() + + self.collection.add( + ids=[message_id], + embeddings=[embedding], + documents=[content], + metadatas=[{ + 'role': role, + 'timestamp': metadata.get('timestamp'), + **metadata + }] + ) +``` + +### Conversation Summary Builder + +```python +# user-mcp/src/context.py + +async def build_conversation_summary(user_id: str) -> str: + """Build conversation summary with RAG""" + # 1. Get recent messages + recent_messages = await db.get_messages( + user_id=user_id, + limit=50, + order='desc' + ) + + # 2. Get current focus (last user message) + last_user_msg = next( + (m for m in recent_messages if m.role == 'user'), + None + ) + + if not last_user_msg: + return "No recent conversation history." + + # 3. RAG search for relevant context + rag = ConversationRAG(f"/data/users/{user_id}/rag") + relevant_context = await rag.search_conversation_history( + query=last_user_msg.content, + limit=5, + min_score=0.7 + ) + + # 4. 
Build summary + summary = f"Recent Conversation Summary:\n\n" + + # Recent messages (last 10) + summary += "Last 10 messages:\n" + for msg in recent_messages[-10:]: + summary += f"- {msg.role}: {msg.content[:100]}...\n" + + # Relevant past context + if relevant_context: + summary += "\nRelevant past discussions (RAG):\n" + for ctx in relevant_context: + timestamp = ctx['metadata'].get('timestamp', 'unknown') + summary += f"- [{timestamp}] {ctx['content'][:150]}...\n" + + # Inferred focus + summary += f"\nCurrent focus: {infer_topic(last_user_msg.content)}\n" + + return summary + +def infer_topic(message: str) -> str: + """Simple topic extraction""" + keywords = { + 'strategy': ['strategy', 'backtest', 'trading system'], + 'indicator': ['indicator', 'rsi', 'macd', 'moving average'], + 'analysis': ['analyze', 'chart', 'price action'], + 'risk': ['risk', 'position size', 'stop loss'], + } + + message_lower = message.lower() + for topic, words in keywords.items(): + if any(word in message_lower for word in words): + return topic + + return 'general trading discussion' +``` + +--- + +## Benefits of This Architecture + +1. **Privacy**: Conversation history never leaves user's container +2. **Customization**: Each user controls their RAG, embeddings, prompt engineering +3. **Scalability**: Platform harness is stateless - horizontally scalable +4. **Cost Control**: Platform pays for Claude, users pay for their compute/storage +5. **Portability**: Users can export/migrate their entire context +6. 
**Development**: Users can test prompts/context locally without platform involvement + +--- + +## Future Enhancements + +### Dynamic Resource URIs + +Support parameterized resources: +``` +context://conversation/{session_id} +context://strategy/{strategy_name} +context://backtest/{backtest_id}/results +``` + +### Resource Templates + +MCP supports resource templates for dynamic discovery: +```python +@server.list_resource_templates() +async def list_templates() -> list[ResourceTemplate]: + return [ + ResourceTemplate( + uriTemplate="context://strategy/{name}", + name="Strategy Context", + description="Context for specific strategy", + ) + ] +``` + +### Streaming Resources + +For large context (e.g., full backtest results), support streaming: +```python +@server.read_resource() +async def read_resource(uri: str) -> AsyncIterator[str]: + if uri.startswith("context://backtest/"): + async for chunk in stream_backtest_results(uri): + yield chunk +``` + +--- + +## Migration Path + +For users with existing conversation history in platform DB: + +1. **Export script**: Migrate platform history → user container DB +2. **RAG indexing**: Embed all historical messages into ChromaDB +3. **Preference migration**: Copy user preferences to container +4. **Cutover**: Switch to resource-based context fetching + +Platform can keep read-only archive for compliance, but active context lives in user container. 
diff --git a/gateway/.dockerignore b/gateway/.dockerignore new file mode 100644 index 0000000..fdf9854 --- /dev/null +++ b/gateway/.dockerignore @@ -0,0 +1,9 @@ +node_modules +dist +.env +.env.* +!.env.example +*.log +.git +.gitignore +README.md diff --git a/gateway/.env.example b/gateway/.env.example new file mode 100644 index 0000000..7db60eb --- /dev/null +++ b/gateway/.env.example @@ -0,0 +1,39 @@ +# Server configuration +PORT=3000 +HOST=0.0.0.0 +LOG_LEVEL=info +CORS_ORIGIN=* + +# Database +DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder + +# LLM Provider API Keys (configure at least one) +# Anthropic Claude +ANTHROPIC_API_KEY=sk-ant-xxxxx + +# OpenAI GPT +OPENAI_API_KEY=sk-xxxxx + +# Google Gemini +GOOGLE_API_KEY=xxxxx + +# OpenRouter (access to 300+ models with one key) +OPENROUTER_API_KEY=sk-or-xxxxx + +# Default model (if user has no preference) +DEFAULT_MODEL_PROVIDER=anthropic +DEFAULT_MODEL=claude-3-5-sonnet-20241022 + +# Telegram (optional) +TELEGRAM_BOT_TOKEN= + +# Kubernetes configuration +KUBERNETES_NAMESPACE=dexorder-agents +KUBERNETES_IN_CLUSTER=false +KUBERNETES_CONTEXT=minikube +AGENT_IMAGE=ghcr.io/dexorder/agent:latest +SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest +AGENT_STORAGE_CLASS=standard + +# Redis (for session management - future) +# REDIS_URL=redis://localhost:6379 diff --git a/gateway/.gitignore b/gateway/.gitignore new file mode 100644 index 0000000..3a5c1d0 --- /dev/null +++ b/gateway/.gitignore @@ -0,0 +1,6 @@ +node_modules +dist +.env +.env.local +*.log +.DS_Store diff --git a/gateway/ARCHITECTURE.md b/gateway/ARCHITECTURE.md new file mode 100644 index 0000000..1a7a77b --- /dev/null +++ b/gateway/ARCHITECTURE.md @@ -0,0 +1,313 @@ +# Gateway Architecture: LangChain.js + LangGraph + +## Why LangChain.js (Not Vercel AI SDK or Direct Anthropic SDK)? + +### The Decision + +After evaluating Vercel AI SDK and LangChain.js, we chose **LangChain.js + LangGraph** for these reasons: + +1. 
**Multi-model support**: 300+ models via OpenRouter, plus direct integrations +2. **Complex workflows**: LangGraph for stateful trading analysis pipelines +3. **No vendor lock-in**: Switch between Anthropic, OpenAI, Google with one line +4. **Streaming**: Same as Vercel AI SDK (`.stream()` method) +5. **Tool calling**: Unified across all providers +6. **Trading-specific**: State management, conditional branching, human-in-the-loop + +**We don't need Vercel AI SDK because:** +- ❌ We use Vue (not React) - don't need React hooks +- ❌ We have Node.js servers (not edge) - don't need edge runtime +- ✅ **DO need** complex workflows (strategy analysis, backtesting, approvals) +- ✅ **DO need** stateful execution (resume from failures) + +--- + +## Architecture Layers + +### Layer 1: Model Abstraction (`src/llm/`) + +**Provider Factory** (`provider.ts`) +```typescript +const factory = new LLMProviderFactory(config, logger); + +// Create any model +const claude = factory.createModel({ + provider: 'anthropic', + model: 'claude-3-5-sonnet-20241022', +}); + +const gpt4 = factory.createModel({ + provider: 'openai', + model: 'gpt-4o', +}); +``` + +**Model Router** (`router.ts`) +```typescript +const router = new ModelRouter(factory, logger); + +// Intelligently route based on: +// - User license (free → Gemini Flash, pro → GPT-4, enterprise → Claude) +// - Query complexity (simple → cheap, complex → smart) +// - User preference (if set in license.preferredModel) +// - Cost optimization (always use cheapest) + +const model = await router.route( + message.content, + userLicense, + RoutingStrategy.COMPLEXITY +); +``` + +--- + +### Layer 2: Agent Harness (`src/harness/`) + +**Stateless Orchestrator** + +The harness has **ZERO conversation state**. Everything lives in user's MCP container. + +**Flow:** +```typescript +async handleMessage(message: InboundMessage) { + // 1. 
Fetch context from user's MCP (resources, not tools) + const resources = await mcpClient.listResources(); + const context = await Promise.all([ + mcpClient.readResource('context://user-profile'), // Trading style + mcpClient.readResource('context://conversation-summary'), // RAG summary + mcpClient.readResource('context://workspace-state'), // Current chart + mcpClient.readResource('context://system-prompt'), // Custom instructions + ]); + + // 2. Route to appropriate model + const model = await modelRouter.route(message, license); + + // 3. Build messages with embedded context + const messages = buildLangChainMessages(systemPrompt, context); + + // 4. Call LLM + const response = await model.invoke(messages); + + // 5. Save to user's MCP (tool call) + await mcpClient.callTool('save_message', { role: 'user', content: message }); + await mcpClient.callTool('save_message', { role: 'assistant', content: response }); + + return response; +} +``` + +**Streaming variant:** +```typescript +async *streamMessage(message: InboundMessage) { + const model = await modelRouter.route(message, license); + const messages = buildMessages(context, message); + + const stream = await model.stream(messages); + + let fullResponse = ''; + for await (const chunk of stream) { + fullResponse += chunk.content; + yield chunk.content; // Stream to WebSocket/Telegram + } + + // Save after streaming completes + await mcpClient.callTool('save_message', { /* ... 
*/ }); +} +``` + +--- + +### Layer 3: Workflows (`src/workflows/`) + +**LangGraph for Complex Trading Analysis** + +```typescript +// Example: Strategy Analysis Pipeline +const workflow = new StateGraph(StrategyAnalysisState) + .addNode('code_review', async (state) => { + const model = new ChatAnthropic({ model: 'claude-3-opus' }); + const review = await model.invoke(`Review: ${state.strategyCode}`); + return { codeReview: review.content }; + }) + .addNode('backtest', async (state) => { + // Call user's MCP backtest tool + const results = await mcpClient.callTool('run_backtest', { + strategy: state.strategyCode, + ticker: state.ticker, + }); + return { backtestResults: results }; + }) + .addNode('risk_assessment', async (state) => { + const model = new ChatAnthropic({ model: 'claude-3-5-sonnet' }); + const assessment = await model.invoke( + `Analyze risk: ${JSON.stringify(state.backtestResults)}` + ); + return { riskAssessment: assessment.content }; + }) + .addNode('human_approval', async (state) => { + // Pause for user review (human-in-the-loop) + return { humanApproved: await waitForUserApproval(state) }; + }) + .addConditionalEdges('human_approval', (state) => { + return state.humanApproved ? 
'deploy' : 'reject'; + }) + .compile(); + +// Execute +const result = await workflow.invoke({ + strategyCode: userCode, + ticker: 'BTC/USDT', + timeframe: '1h', +}); +``` + +**Benefits:** +- **Stateful**: Resume if server crashes mid-analysis +- **Conditional**: Route based on results (if Sharpe > 2 → deploy, else → reject) +- **Human-in-the-loop**: Pause for user approval +- **Multi-step**: Each node can use different models + +--- + +## User Context Architecture + +### MCP Resources (Not Tools) + +**User's MCP server exposes resources** (read-only context): + +``` +context://user-profile → Trading style, preferences +context://conversation-summary → RAG-generated summary +context://workspace-state → Current chart, positions +context://system-prompt → User's custom AI instructions +``` + +**Gateway fetches and embeds in LLM call:** +```typescript +const userProfile = await mcpClient.readResource('context://user-profile'); +const conversationSummary = await mcpClient.readResource('context://conversation-summary'); + +// User's MCP server runs RAG search and returns summary +// Gateway embeds this in Claude/GPT prompt +``` + +**Why resources, not tools?** +- Resources = context injection (read-only) +- Tools = actions (write operations) +- Context should be fetched **before** LLM call, not during + +--- + +## Model Routing Strategies + +### 1. User Preference +```typescript +// User's license has preferred model +{ + "preferredModel": { + "provider": "anthropic", + "model": "claude-3-5-sonnet-20241022" + } +} + +// Router uses this if set +``` + +### 2. Complexity-Based +```typescript +const isComplex = message.includes('backtest') || message.length > 200; + +if (isComplex) { + return { provider: 'anthropic', model: 'claude-3-opus' }; // Smart +} else { + return { provider: 'openai', model: 'gpt-4o-mini' }; // Fast +} +``` + +### 3. 
License Tier +```typescript +switch (license.licenseType) { + case 'free': + return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Cheap + case 'pro': + return { provider: 'openai', model: 'gpt-4o' }; // Balanced + case 'enterprise': + return { provider: 'anthropic', model: 'claude-3-5-sonnet' }; // Premium +} +``` + +### 4. Cost-Optimized +```typescript +return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Always cheapest +``` + +--- + +## When to Use What + +### Simple Chat → Agent Harness +```typescript +// User: "What's the RSI on BTC?" +// → Fast streaming response via harness.streamMessage() +``` + +### Complex Analysis → LangGraph Workflow +```typescript +// User: "Analyze this strategy and backtest it" +// → Multi-step workflow: code review → backtest → risk → approval +``` + +### Direct Tool Call → MCP Client +```typescript +// User: "Get my watchlist" +// → Direct MCP tool call, no LLM needed +``` + +--- + +## Data Flow + +``` +User Message ("Analyze my strategy") + ↓ +Gateway → Route to workflow (not harness) + ↓ +LangGraph Workflow: + ├─ Node 1: Code Review (Claude Opus) + │ └─ Analyzes strategy code + ├─ Node 2: Backtest (MCP tool call) + │ └─ User's container runs backtest + ├─ Node 3: Risk Assessment (Claude Sonnet) + │ └─ Evaluates results + ├─ Node 4: Human Approval (pause) + │ └─ User reviews in UI + └─ Node 5: Recommendation (GPT-4o-mini) + └─ Final decision + +Result → Return to user +``` + +--- + +## Benefits Summary + +| Feature | LangChain.js | Vercel AI SDK | Direct Anthropic SDK | +|---------|--------------|---------------|----------------------| +| Multi-model | ✅ 300+ models | ✅ 100+ models | ❌ Anthropic only | +| Streaming | ✅ `.stream()` | ✅ `streamText()` | ✅ `.stream()` | +| Tool calling | ✅ Unified | ✅ Unified | ✅ Anthropic format | +| Complex workflows | ✅ LangGraph | ❌ Limited | ❌ DIY | +| Stateful agents | ✅ LangGraph | ❌ No | ❌ No | +| Human-in-the-loop | ✅ LangGraph | ❌ No | ❌ No | +| React hooks | ❌ N/A | 
✅ `useChat()` | ❌ N/A | +| Bundle size | Large (101kb) | Small (30kb) | Medium (60kb) | +| **Dexorder needs** | **✅ Perfect fit** | **❌ Missing workflows** | **❌ Vendor lock-in** | + +--- + +## Next Steps + +1. **Implement tool calling** in agent harness (bind MCP tools to LangChain) +2. **Add state persistence** for LangGraph (PostgreSQL checkpointer) +3. **Build more workflows**: market scanner, portfolio optimizer +4. **Add monitoring**: Track model usage, costs, latency +5. **User container**: Implement Python MCP server with resources diff --git a/gateway/Dockerfile b/gateway/Dockerfile new file mode 100644 index 0000000..4b7a63f --- /dev/null +++ b/gateway/Dockerfile @@ -0,0 +1,40 @@ +FROM node:22-alpine AS builder + +WORKDIR /app + +# Copy package files +COPY package*.json ./ +COPY tsconfig.json ./ + +# Install dependencies +RUN npm ci + +# Copy source +COPY src ./src + +# Build +RUN npm run build + +# Production image +FROM node:22-alpine + +WORKDIR /app + +# Copy package files +COPY package*.json ./ + +# Install production dependencies only +RUN npm ci --omit=dev + +# Copy built application +COPY --from=builder /app/dist ./dist + +# Create non-root user +RUN addgroup -g 1001 -S nodejs && \ + adduser -S nodejs -u 1001 + +USER nodejs + +EXPOSE 3000 + +CMD ["node", "dist/main.js"] diff --git a/gateway/README.md b/gateway/README.md new file mode 100644 index 0000000..15c20c6 --- /dev/null +++ b/gateway/README.md @@ -0,0 +1,212 @@ +# Dexorder Gateway + +Multi-channel gateway with agent harness for the Dexorder AI platform. 
+ +## Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ Platform Gateway │ +│ (Node.js/Fastify) │ +│ │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ Channels │ │ +│ │ - WebSocket (/ws/chat) │ │ +│ │ - Telegram Webhook (/webhook/telegram) │ │ +│ └────────────────────────────────────────────────┘ │ +│ ↕ │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ Authenticator │ │ +│ │ - JWT verification (WebSocket) │ │ +│ │ - Channel linking (Telegram) │ │ +│ │ - User license lookup (PostgreSQL) │ │ +│ └────────────────────────────────────────────────┘ │ +│ ↕ │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ Agent Harness (per-session) │ │ +│ │ - LLM calls via LangChain.js (multi-model) │ │ +│ │ - MCP client connector │ │ +│ │ - Stateless (context lives in user MCP) │ │ +│ └────────────────────────────────────────────────┘ │ +│ ↕ │ +│ ┌────────────────────────────────────────────────┐ │ +│ │ MCP Client │ │ +│ │ - User container connection │ │ +│ │ - Tool routing │ │ +│ └────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ + ↕ + ┌───────────────────────────────┐ + │ User MCP Server (Python) │ + │ - Strategies, indicators │ + │ - Memory, preferences │ + │ - Backtest sandbox │ + └───────────────────────────────┘ +``` + +## Features + +- **Automatic container provisioning**: Creates user agent containers on-demand via Kubernetes +- **Multi-channel support**: WebSocket and Telegram webhooks +- **Per-channel authentication**: JWT for web, channel linking for chat apps +- **User license management**: Feature flags and resource limits from PostgreSQL +- **Container lifecycle management**: Auto-shutdown on idle (handled by container sidecar) +- **License-based resources**: Different memory/CPU/storage limits per tier +- **Multi-model LLM support**: Anthropic Claude, OpenAI GPT, Google Gemini, OpenRouter (300+ models) +- **Zero vendor lock-in**: Switch models with one line, powered by 
LangChain.js +- **Intelligent routing**: Auto-select models based on complexity, license tier, or user preference +- **Streaming responses**: Real-time chat with WebSocket and Telegram +- **Complex workflows**: LangGraph for stateful trading analysis (backtest → risk → approval) +- **Agent harness**: Stateless orchestrator (all context lives in user's MCP container) +- **MCP resource integration**: User's RAG, conversation history, and preferences + +## Container Management + +When a user authenticates, the gateway: + +1. **Checks for existing container**: Queries Kubernetes for deployment +2. **Creates if missing**: Renders YAML template based on license tier +3. **Waits for ready**: Polls deployment status until healthy +4. **Returns MCP endpoint**: Computed from service name +5. **Connects to MCP server**: Proceeds with normal authentication flow + +Container templates by license tier: + +| Tier | Memory | CPU | Storage | Idle Timeout | +|------|--------|-----|---------|--------------| +| Free | 512Mi | 500m | 1Gi | 15min | +| Pro | 2Gi | 2000m | 10Gi | 60min | +| Enterprise | 4Gi | 4000m | 50Gi | Never | + +Containers self-manage their lifecycle using the lifecycle sidecar (see `../lifecycle-sidecar/`) + +## Setup + +### Prerequisites + +- Node.js >= 22.0.0 +- PostgreSQL database +- At least one LLM provider API key: + - Anthropic Claude + - OpenAI GPT + - Google Gemini + - OpenRouter (one key for 300+ models) + +### Development + +1. Install dependencies: +```bash +npm install +``` + +2. Copy environment template: +```bash +cp .env.example .env +``` + +3. Configure `.env` (see `.env.example`): +```bash +DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder + +# Configure at least one provider +ANTHROPIC_API_KEY=sk-ant-xxxxx +# OPENAI_API_KEY=sk-xxxxx +# GOOGLE_API_KEY=xxxxx +# OPENROUTER_API_KEY=sk-or-xxxxx + +# Optional: Set default model +DEFAULT_MODEL_PROVIDER=anthropic +DEFAULT_MODEL=claude-3-5-sonnet-20241022 +``` + +4. 
Run development server: +```bash +npm run dev +``` + +### Production Build + +```bash +npm run build +npm start +``` + +### Docker + +```bash +docker build -t dexorder/gateway:latest . +docker run -p 3000:3000 --env-file .env dexorder/gateway:latest +``` + +## Database Schema + +Required PostgreSQL tables (summary only; the authoritative DDL, including `preferred_model` and `metadata` columns, is in `schema.sql`): + +### `user_licenses` +- `user_id` (text, primary key) +- `email` (text) +- `license_type` (text: 'free', 'pro', 'enterprise') +- `features` (jsonb) +- `resource_limits` (jsonb) +- `mcp_server_url` (text) +- `expires_at` (timestamp, nullable) +- `created_at` (timestamp) +- `updated_at` (timestamp) + +### `user_channel_links` +- `id` (serial, primary key) +- `user_id` (text, foreign key) +- `channel_type` (text: 'telegram', 'slack', 'discord', 'websocket') +- `channel_user_id` (text) +- `created_at` (timestamp) + +## API Endpoints + +### WebSocket + +**`GET /ws/chat`** +- WebSocket connection for web client +- Auth: Bearer token in headers +- Protocol: JSON messages + +Example (Node.js `ws` client; the browser `WebSocket` API cannot set custom headers): +```javascript +const ws = new WebSocket('ws://localhost:3000/ws/chat', { + headers: { + 'Authorization': 'Bearer your-jwt-token' + } +}); + +ws.on('message', (data) => { + const msg = JSON.parse(data); + console.log(msg); +}); + +ws.send(JSON.stringify({ + type: 'message', + content: 'Hello, AI!' 
+})); +``` + +### Telegram Webhook + +**`POST /webhook/telegram`** +- Telegram bot webhook endpoint +- Auth: Telegram user linked to platform user +- Automatically processes incoming messages + +### Health Check + +**`GET /health`** +- Returns server health status + +## TODO + +- [ ] Implement JWT verification with JWKS +- [ ] Implement MCP HTTP/SSE transport +- [ ] Add Redis for session persistence +- [ ] Add rate limiting per user license +- [ ] Add message usage tracking +- [ ] Add streaming responses for WebSocket +- [ ] Add Slack and Discord channel handlers +- [ ] Add session cleanup/timeout logic diff --git a/gateway/package.json b/gateway/package.json new file mode 100644 index 0000000..5655783 --- /dev/null +++ b/gateway/package.json @@ -0,0 +1,42 @@ +{ + "name": "@dexorder/gateway", + "version": "0.1.0", + "type": "module", + "private": true, + "description": "Multi-channel gateway with agent harness for Dexorder AI platform", + "scripts": { + "dev": "tsx watch src/main.ts", + "build": "tsc", + "start": "node dist/main.js", + "typecheck": "tsc --noEmit" + }, + "dependencies": { + "@fastify/cors": "^10.0.1", + "@fastify/websocket": "^11.0.1", + "@kubernetes/client-node": "^0.21.0", + "@langchain/anthropic": "^0.3.8", + "@langchain/core": "^0.3.24", + "@langchain/google-genai": "^0.1.6", + "@langchain/langgraph": "^0.2.26", + "@langchain/openai": "^0.3.21", + "@langchain/openrouter": "^0.1.2", + "@modelcontextprotocol/sdk": "^1.0.4", + "fastify": "^5.2.0", + "ioredis": "^5.4.2", + "js-yaml": "^4.1.0", + "pg": "^8.13.1", + "pino": "^9.6.0", + "pino-pretty": "^13.0.0", + "zod": "^3.24.1" + }, + "devDependencies": { + "@types/js-yaml": "^4.0.9", + "@types/node": "^22.10.2", + "@types/pg": "^8.11.10", + "tsx": "^4.19.2", + "typescript": "^5.7.2" + }, + "engines": { + "node": ">=22.0.0" + } +} diff --git a/gateway/schema.sql b/gateway/schema.sql new file mode 100644 index 0000000..d1ae4aa --- /dev/null +++ b/gateway/schema.sql @@ -0,0 +1,79 @@ +-- User license 
and authorization schema + +CREATE TABLE IF NOT EXISTS user_licenses ( + user_id TEXT PRIMARY KEY, + email TEXT, + license_type TEXT NOT NULL CHECK (license_type IN ('free', 'pro', 'enterprise')), + features JSONB NOT NULL DEFAULT '{ + "maxIndicators": 5, + "maxStrategies": 3, + "maxBacktestDays": 30, + "realtimeData": false, + "customExecutors": false, + "apiAccess": false + }', + resource_limits JSONB NOT NULL DEFAULT '{ + "maxConcurrentSessions": 1, + "maxMessagesPerDay": 100, + "maxTokensPerMessage": 4096, + "rateLimitPerMinute": 10 + }', + mcp_server_url TEXT NOT NULL, + preferred_model JSONB DEFAULT NULL, + expires_at TIMESTAMP WITH TIME ZONE, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() +); + +COMMENT ON COLUMN user_licenses.preferred_model IS 'Optional model preference: {"provider": "anthropic", "model": "claude-3-5-sonnet-20241022", "temperature": 0.7}'; + +CREATE INDEX IF NOT EXISTS idx_user_licenses_expires_at ON user_licenses(expires_at) + WHERE expires_at IS NOT NULL; + +-- Channel linking for multi-channel support (each channel identity maps to one platform user) +CREATE TABLE IF NOT EXISTS user_channel_links ( + id SERIAL PRIMARY KEY, + user_id TEXT NOT NULL REFERENCES user_licenses(user_id) ON DELETE CASCADE, + channel_type TEXT NOT NULL CHECK (channel_type IN ('telegram', 'slack', 'discord', 'websocket')), + channel_user_id TEXT NOT NULL, + metadata JSONB, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + UNIQUE(channel_type, channel_user_id) +); + +CREATE INDEX IF NOT EXISTS idx_user_channel_links_user_id ON user_channel_links(user_id); +CREATE INDEX IF NOT EXISTS idx_user_channel_links_channel ON user_channel_links(channel_type, channel_user_id); + +-- Example data for development +INSERT INTO user_licenses (user_id, email, license_type, mcp_server_url, features, resource_limits, preferred_model) +VALUES ( + 'dev-user-001', + 'dev@example.com', + 'pro', + 'http://localhost:8080/mcp', + '{ + "maxIndicators": 50, + "maxStrategies": 20, + 
"maxBacktestDays": 365, + "realtimeData": true, + "customExecutors": true, + "apiAccess": true + }', + '{ + "maxConcurrentSessions": 5, + "maxMessagesPerDay": 1000, + "maxTokensPerMessage": 8192, + "rateLimitPerMinute": 60 + }', + '{ + "provider": "anthropic", + "model": "claude-3-5-sonnet-20241022", + "temperature": 0.7 + }' +) +ON CONFLICT (user_id) DO NOTHING; + +-- Example Telegram link +INSERT INTO user_channel_links (user_id, channel_type, channel_user_id) +VALUES ('dev-user-001', 'telegram', '123456789') +ON CONFLICT (channel_type, channel_user_id) DO NOTHING; diff --git a/gateway/src/auth/authenticator.ts b/gateway/src/auth/authenticator.ts new file mode 100644 index 0000000..3a7b23d --- /dev/null +++ b/gateway/src/auth/authenticator.ts @@ -0,0 +1,146 @@ +import type { FastifyRequest, FastifyBaseLogger } from 'fastify'; +import { UserService } from '../db/user-service.js'; +import { ChannelType, type AuthContext } from '../types/user.js'; +import type { ContainerManager } from '../k8s/container-manager.js'; + +export interface AuthenticatorConfig { + userService: UserService; + containerManager: ContainerManager; + logger: FastifyBaseLogger; +} + +/** + * Multi-channel authenticator + * Handles authentication for WebSocket, Telegram, and other channels + */ +export class Authenticator { + private config: AuthenticatorConfig; + + constructor(config: AuthenticatorConfig) { + this.config = config; + } + + /** + * Authenticate WebSocket connection via JWT token + * Also ensures the user's container is running + */ + async authenticateWebSocket( + request: FastifyRequest + ): Promise { + try { + const token = this.extractBearerToken(request); + if (!token) { + this.config.logger.warn('No bearer token in WebSocket connection'); + return null; + } + + const userId = await this.config.userService.verifyWebToken(token); + if (!userId) { + this.config.logger.warn('Invalid JWT token'); + return null; + } + + const license = await 
this.config.userService.getUserLicense(userId); + if (!license) { + this.config.logger.warn({ userId }, 'User license not found'); + return null; + } + + // Ensure container is running (may take time if creating new container) + this.config.logger.info({ userId }, 'Ensuring user container is running'); + const { mcpEndpoint, wasCreated } = await this.config.containerManager.ensureContainerRunning( + userId, + license + ); + + this.config.logger.info( + { userId, mcpEndpoint, wasCreated }, + 'Container is ready' + ); + + // Update license with actual MCP endpoint + license.mcpServerUrl = mcpEndpoint; + + const sessionId = `ws_${userId}_${Date.now()}`; + + return { + userId, + channelType: ChannelType.WEBSOCKET, + channelUserId: userId, // For WebSocket, same as userId + sessionId, + license, + authenticatedAt: new Date(), + }; + } catch (error) { + this.config.logger.error({ error }, 'WebSocket authentication error'); + return null; + } + } + + /** + * Authenticate Telegram webhook + * Also ensures the user's container is running + */ + async authenticateTelegram(telegramUserId: string): Promise { + try { + const userId = await this.config.userService.getUserIdFromChannel( + 'telegram', + telegramUserId + ); + + if (!userId) { + this.config.logger.warn( + { telegramUserId }, + 'Telegram user not linked to platform user' + ); + return null; + } + + const license = await this.config.userService.getUserLicense(userId); + if (!license) { + this.config.logger.warn({ userId }, 'User license not found'); + return null; + } + + // Ensure container is running + this.config.logger.info({ userId }, 'Ensuring user container is running'); + const { mcpEndpoint, wasCreated } = await this.config.containerManager.ensureContainerRunning( + userId, + license + ); + + this.config.logger.info( + { userId, mcpEndpoint, wasCreated }, + 'Container is ready' + ); + + // Update license with actual MCP endpoint + license.mcpServerUrl = mcpEndpoint; + + const sessionId = 
`tg_${telegramUserId}_${Date.now()}`; + + return { + userId, + channelType: ChannelType.TELEGRAM, + channelUserId: telegramUserId, + sessionId, + license, + authenticatedAt: new Date(), + }; + } catch (error) { + this.config.logger.error({ error }, 'Telegram authentication error'); + return null; + } + } + + /** + * Extract bearer token from request headers + */ + private extractBearerToken(request: FastifyRequest): string | null { + const auth = request.headers.authorization; + if (!auth || !auth.startsWith('Bearer ')) { + return null; + } + return auth.substring(7); + } +} diff --git a/gateway/src/channels/telegram-handler.ts b/gateway/src/channels/telegram-handler.ts new file mode 100644 index 0000000..8e00dd0 --- /dev/null +++ b/gateway/src/channels/telegram-handler.ts @@ -0,0 +1,163 @@ +import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify'; +import type { Authenticator } from '../auth/authenticator.js'; +import { AgentHarness } from '../harness/agent-harness.js'; +import type { InboundMessage } from '../types/messages.js'; +import { randomUUID } from 'crypto'; + +import type { ProviderConfig } from '../llm/provider.js'; + +export interface TelegramHandlerConfig { + authenticator: Authenticator; + providerConfig: ProviderConfig; + telegramBotToken: string; +} + +interface TelegramUpdate { + update_id: number; + message?: { + message_id: number; + from: { + id: number; + first_name: string; + username?: string; + }; + chat: { + id: number; + type: string; + }; + text?: string; + photo?: Array<{ + file_id: string; + file_size: number; + }>; + }; +} + +/** + * Telegram webhook handler + */ +export class TelegramHandler { + private config: TelegramHandlerConfig; + private sessions = new Map(); + + constructor(config: TelegramHandlerConfig) { + this.config = config; + } + + /** + * Register Telegram webhook routes + */ + register(app: FastifyInstance): void { + app.post('/webhook/telegram', async (request: FastifyRequest, reply: FastifyReply) 
=> { + await this.handleWebhook(request, reply, app); + }); + } + + /** + * Handle Telegram webhook + */ + private async handleWebhook( + request: FastifyRequest, + reply: FastifyReply, + app: FastifyInstance + ): Promise { + const logger = app.log; + + try { + const update = request.body as TelegramUpdate; + + if (!update.message?.text) { + // Ignore non-text messages for now + reply.code(200).send({ ok: true }); + return; + } + + const telegramUserId = update.message.from.id.toString(); + const chatId = update.message.chat.id; + const text = update.message.text; + + logger.info({ telegramUserId, chatId, text }, 'Received Telegram message'); + + // Authenticate + const authContext = await this.config.authenticator.authenticateTelegram(telegramUserId); + if (!authContext) { + logger.warn({ telegramUserId }, 'Telegram user not authenticated'); + await this.sendTelegramMessage( + chatId, + 'Please link your Telegram account to Dexorder first.' + ); + reply.code(200).send({ ok: true }); + return; + } + + // Get or create harness + let harness = this.sessions.get(authContext.sessionId); + if (!harness) { + harness = new AgentHarness({ + userId: authContext.userId, + sessionId: authContext.sessionId, + license: authContext.license, + providerConfig: this.config.providerConfig, + logger, + }); + await harness.initialize(); + this.sessions.set(authContext.sessionId, harness); + } + + // Process message + const inboundMessage: InboundMessage = { + messageId: randomUUID(), + userId: authContext.userId, + sessionId: authContext.sessionId, + content: text, + timestamp: new Date(), + }; + + const response = await harness.handleMessage(inboundMessage); + + // Send response back to Telegram + await this.sendTelegramMessage(chatId, response.content); + + reply.code(200).send({ ok: true }); + } catch (error) { + logger.error({ error }, 'Error handling Telegram webhook'); + reply.code(500).send({ ok: false, error: 'Internal server error' }); + } + } + + /** + * Send message to 
Telegram chat + */ + private async sendTelegramMessage(chatId: number, text: string): Promise { + const url = `https://api.telegram.org/bot${this.config.telegramBotToken}/sendMessage`; + + try { + const response = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + chat_id: chatId, + text, + parse_mode: 'Markdown', + }), + }); + + if (!response.ok) { + throw new Error(`Telegram API error: ${response.statusText}`); + } + } catch (error) { + console.error('Failed to send Telegram message:', error); + throw error; + } + } + + /** + * Cleanup old sessions (call periodically) + */ + async cleanupSessions(maxAgeMs = 30 * 60 * 1000): Promise { + // TODO: Track session last activity and cleanup + // For now, sessions persist until server restart + } +} diff --git a/gateway/src/channels/websocket-handler.ts b/gateway/src/channels/websocket-handler.ts new file mode 100644 index 0000000..3cbcd3e --- /dev/null +++ b/gateway/src/channels/websocket-handler.ts @@ -0,0 +1,161 @@ +import type { FastifyInstance, FastifyRequest } from 'fastify'; +import type { WebSocket } from '@fastify/websocket'; +import type { Authenticator } from '../auth/authenticator.js'; +import { AgentHarness } from '../harness/agent-harness.js'; +import type { InboundMessage } from '../types/messages.js'; +import { randomUUID } from 'crypto'; + +import type { ProviderConfig } from '../llm/provider.js'; + +export interface WebSocketHandlerConfig { + authenticator: Authenticator; + providerConfig: ProviderConfig; +} + +/** + * WebSocket channel handler + */ +export class WebSocketHandler { + private config: WebSocketHandlerConfig; + private sessions = new Map(); + + constructor(config: WebSocketHandlerConfig) { + this.config = config; + } + + /** + * Register WebSocket routes + */ + register(app: FastifyInstance): void { + app.get( + '/ws/chat', + { websocket: true }, + async (socket: WebSocket, request: FastifyRequest) => { + await 
this.handleConnection(socket, request, app); + } + ); + } + + /** + * Handle WebSocket connection + */ + private async handleConnection( + socket: WebSocket, + request: FastifyRequest, + app: FastifyInstance + ): Promise { + const logger = app.log; + + // Send initial connecting message + socket.send( + JSON.stringify({ + type: 'status', + status: 'authenticating', + message: 'Authenticating...', + }) + ); + + // Authenticate (this may take time if creating container) + const authContext = await this.config.authenticator.authenticateWebSocket(request); + if (!authContext) { + logger.warn('WebSocket authentication failed'); + socket.send( + JSON.stringify({ + type: 'error', + message: 'Authentication failed', + }) + ); + socket.close(1008, 'Authentication failed'); + return; + } + + logger.info( + { userId: authContext.userId, sessionId: authContext.sessionId }, + 'WebSocket connection authenticated' + ); + + // Send workspace starting message + socket.send( + JSON.stringify({ + type: 'status', + status: 'initializing', + message: 'Starting your workspace...', + }) + ); + + // Create agent harness + const harness = new AgentHarness({ + userId: authContext.userId, + sessionId: authContext.sessionId, + license: authContext.license, + providerConfig: this.config.providerConfig, + logger, + }); + + try { + await harness.initialize(); + this.sessions.set(authContext.sessionId, harness); + + // Send connected message + socket.send( + JSON.stringify({ + type: 'connected', + sessionId: authContext.sessionId, + userId: authContext.userId, + licenseType: authContext.license.licenseType, + message: 'Connected to Dexorder AI', + }) + ); + + // Handle messages + socket.on('message', async (data: Buffer) => { + try { + const payload = JSON.parse(data.toString()); + + if (payload.type === 'message') { + const inboundMessage: InboundMessage = { + messageId: randomUUID(), + userId: authContext.userId, + sessionId: authContext.sessionId, + content: payload.content, + attachments: 
payload.attachments, + timestamp: new Date(), + }; + + const response = await harness.handleMessage(inboundMessage); + + socket.send( + JSON.stringify({ + type: 'message', + ...response, + }) + ); + } + } catch (error) { + logger.error({ error }, 'Error handling WebSocket message'); + socket.send( + JSON.stringify({ + type: 'error', + message: 'Failed to process message', + }) + ); + } + }); + + // Handle disconnection + socket.on('close', async () => { + logger.info({ sessionId: authContext.sessionId }, 'WebSocket disconnected'); + await harness.cleanup(); + this.sessions.delete(authContext.sessionId); + }); + + socket.on('error', (error) => { + logger.error({ error, sessionId: authContext.sessionId }, 'WebSocket error'); + }); + } catch (error) { + logger.error({ error }, 'Failed to initialize agent harness'); + socket.close(1011, 'Internal server error'); + await harness.cleanup(); + } + } +} diff --git a/gateway/src/db/user-service.ts b/gateway/src/db/user-service.ts new file mode 100644 index 0000000..3a1805e --- /dev/null +++ b/gateway/src/db/user-service.ts @@ -0,0 +1,107 @@ +import { Pool, PoolClient } from 'pg'; +import type { UserLicense } from '../types/user.js'; +import { UserLicenseSchema } from '../types/user.js'; + +export class UserService { + private pool: Pool; + + constructor(connectionString: string) { + this.pool = new Pool({ + connectionString, + max: 20, + idleTimeoutMillis: 30000, + connectionTimeoutMillis: 2000, + }); + } + + /** + * Get user license by user ID + */ + async getUserLicense(userId: string): Promise { + const client = await this.pool.connect(); + try { + const result = await client.query( + `SELECT + user_id as "userId", + email, + license_type as "licenseType", + features, + resource_limits as "resourceLimits", + mcp_server_url as "mcpServerUrl", + preferred_model as "preferredModel", + expires_at as "expiresAt", + created_at as "createdAt", + updated_at as "updatedAt" + FROM user_licenses + WHERE user_id = $1 + AND 
(expires_at IS NULL OR expires_at > NOW())`, + [userId] + ); + + if (result.rows.length === 0) { + return null; + } + + const row = result.rows[0]; + + // Parse and validate + return UserLicenseSchema.parse({ + userId: row.userId, + email: row.email, + licenseType: row.licenseType, + features: row.features, + resourceLimits: row.resourceLimits, + mcpServerUrl: row.mcpServerUrl, + preferredModel: row.preferredModel, + expiresAt: row.expiresAt, + createdAt: row.createdAt, + updatedAt: row.updatedAt, + }); + } finally { + client.release(); + } + } + + /** + * Get user ID from channel-specific identifier + */ + async getUserIdFromChannel(channelType: string, channelUserId: string): Promise { + const client = await this.pool.connect(); + try { + const result = await client.query( + `SELECT user_id + FROM user_channel_links + WHERE channel_type = $1 AND channel_user_id = $2`, + [channelType, channelUserId] + ); + + return result.rows.length > 0 ? result.rows[0].user_id : null; + } finally { + client.release(); + } + } + + /** + * Verify JWT token from web client + * TODO: Implement JWT verification with JWKS + */ + async verifyWebToken(token: string): Promise { + // Placeholder - implement JWT verification + // For now, decode without verification (INSECURE - FOR DEV ONLY) + try { + const payload = JSON.parse( + Buffer.from(token.split('.')[1], 'base64').toString() + ); + return payload.sub || null; + } catch { + return null; + } + } + + /** + * Close database pool + */ + async close(): Promise { + await this.pool.end(); + } +} diff --git a/gateway/src/harness/agent-harness.ts b/gateway/src/harness/agent-harness.ts new file mode 100644 index 0000000..9721c2a --- /dev/null +++ b/gateway/src/harness/agent-harness.ts @@ -0,0 +1,306 @@ +import type { BaseChatModel } from '@langchain/core/language_models/chat_models'; +import type { BaseMessage } from '@langchain/core/messages'; +import { HumanMessage, AIMessage, SystemMessage } from '@langchain/core/messages'; +import type { 
FastifyBaseLogger } from 'fastify'; +import type { UserLicense } from '../types/user.js'; +import type { InboundMessage, OutboundMessage } from '../types/messages.js'; +import { MCPClientConnector } from './mcp-client.js'; +import { CONTEXT_URIS, type ResourceContent } from '../types/resources.js'; +import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js'; +import { ModelRouter, RoutingStrategy } from '../llm/router.js'; + +export interface AgentHarnessConfig { + userId: string; + sessionId: string; + license: UserLicense; + providerConfig: ProviderConfig; + logger: FastifyBaseLogger; +} + +/** + * Agent harness orchestrates between LLM and user's MCP server. + * + * This is a STATELESS orchestrator - all conversation history, RAG, and context + * lives in the user's MCP server container. The harness only: + * 1. Fetches context from user's MCP resources + * 2. Routes to appropriate LLM model + * 3. Calls LLM with embedded context + * 4. Routes tool calls to user's MCP or platform tools + * 5. 
Saves messages back to user's MCP + */ +export class AgentHarness { + private config: AgentHarnessConfig; + private modelFactory: LLMProviderFactory; + private modelRouter: ModelRouter; + private mcpClient: MCPClientConnector; + + constructor(config: AgentHarnessConfig) { + this.config = config; + + this.modelFactory = new LLMProviderFactory(config.providerConfig, config.logger); + this.modelRouter = new ModelRouter(this.modelFactory, config.logger); + + this.mcpClient = new MCPClientConnector({ + userId: config.userId, + mcpServerUrl: config.license.mcpServerUrl, + logger: config.logger, + }); + } + + /** + * Initialize harness and connect to user's MCP server + */ + async initialize(): Promise { + this.config.logger.info( + { userId: this.config.userId, sessionId: this.config.sessionId }, + 'Initializing agent harness' + ); + + try { + await this.mcpClient.connect(); + this.config.logger.info('Agent harness initialized'); + } catch (error) { + this.config.logger.error({ error }, 'Failed to initialize agent harness'); + throw error; + } + } + + /** + * Handle incoming message from user + */ + async handleMessage(message: InboundMessage): Promise { + this.config.logger.info( + { messageId: message.messageId, userId: message.userId }, + 'Processing user message' + ); + + try { + // 1. Fetch context resources from user's MCP server + this.config.logger.debug('Fetching context resources from MCP'); + const contextResources = await this.fetchContextResources(); + + // 2. Build system prompt from resources + const systemPrompt = this.buildSystemPrompt(contextResources); + + // 3. Build messages with conversation context from MCP + const messages = this.buildMessages(message, contextResources); + + // 4. Route to appropriate model + const model = await this.modelRouter.route( + message.content, + this.config.license, + RoutingStrategy.COMPLEXITY + ); + + // 5. 
Build LangChain messages + const langchainMessages = this.buildLangChainMessages(systemPrompt, messages); + + // 6. Call LLM with streaming + this.config.logger.debug('Invoking LLM'); + const response = await model.invoke(langchainMessages); + + // 7. Extract text response (tool handling TODO) + const assistantMessage = response.content as string; + + // 8. Save messages to user's MCP server + this.config.logger.debug('Saving messages to MCP'); + await this.mcpClient.callTool('save_message', { + role: 'user', + content: message.content, + timestamp: message.timestamp.toISOString(), + }); + await this.mcpClient.callTool('save_message', { + role: 'assistant', + content: assistantMessage, + timestamp: new Date().toISOString(), + }); + + return { + messageId: `msg_${Date.now()}`, + sessionId: message.sessionId, + content: assistantMessage, + timestamp: new Date(), + }; + } catch (error) { + this.config.logger.error({ error }, 'Error processing message'); + throw error; + } + } + + /** + * Stream response from LLM + */ + async *streamMessage(message: InboundMessage): AsyncGenerator { + try { + // Fetch context + const contextResources = await this.fetchContextResources(); + const systemPrompt = this.buildSystemPrompt(contextResources); + const messages = this.buildMessages(message, contextResources); + + // Route to model + const model = await this.modelRouter.route( + message.content, + this.config.license, + RoutingStrategy.COMPLEXITY + ); + + // Build messages + const langchainMessages = this.buildLangChainMessages(systemPrompt, messages); + + // Stream response + const stream = await model.stream(langchainMessages); + + let fullResponse = ''; + for await (const chunk of stream) { + const content = chunk.content as string; + fullResponse += content; + yield content; + } + + // Save after streaming completes + await this.mcpClient.callTool('save_message', { + role: 'user', + content: message.content, + timestamp: message.timestamp.toISOString(), + }); + await 
this.mcpClient.callTool('save_message', { + role: 'assistant', + content: fullResponse, + timestamp: new Date().toISOString(), + }); + } catch (error) { + this.config.logger.error({ error }, 'Error streaming message'); + throw error; + } + } + + /** + * Fetch context resources from user's MCP server + */ + private async fetchContextResources(): Promise { + const contextUris = [ + CONTEXT_URIS.USER_PROFILE, + CONTEXT_URIS.CONVERSATION_SUMMARY, + CONTEXT_URIS.WORKSPACE_STATE, + CONTEXT_URIS.SYSTEM_PROMPT, + ]; + + const resources = await Promise.all( + contextUris.map(async (uri) => { + try { + return await this.mcpClient.readResource(uri); + } catch (error) { + this.config.logger.warn({ error, uri }, 'Failed to fetch resource, using empty'); + return { uri, text: '' }; + } + }) + ); + + return resources; + } + + /** + * Build messages array with context from resources + */ + private buildMessages( + currentMessage: InboundMessage, + contextResources: ResourceContent[] + ): Array<{ role: string; content: string }> { + const conversationSummary = contextResources.find( + (r) => r.uri === CONTEXT_URIS.CONVERSATION_SUMMARY + ); + + const messages: Array<{ role: string; content: string }> = []; + + // Add conversation context as a system-like user message + if (conversationSummary?.text) { + messages.push({ + role: 'user', + content: `[Previous Conversation Context]\n${conversationSummary.text}`, + }); + messages.push({ + role: 'assistant', + content: 'I understand the context from our previous conversations.', + }); + } + + // Add current user message + messages.push({ + role: 'user', + content: currentMessage.content, + }); + + return messages; + } + + /** + * Convert to LangChain message format + */ + private buildLangChainMessages( + systemPrompt: string, + messages: Array<{ role: string; content: string }> + ): BaseMessage[] { + const langchainMessages: BaseMessage[] = [new SystemMessage(systemPrompt)]; + + for (const msg of messages) { + if (msg.role === 'user') { 
+ langchainMessages.push(new HumanMessage(msg.content)); + } else if (msg.role === 'assistant') { + langchainMessages.push(new AIMessage(msg.content)); + } + } + + return langchainMessages; + } + + /** + * Build system prompt from platform base + user resources + */ + private buildSystemPrompt(contextResources: ResourceContent[]): string { + const userProfile = contextResources.find((r) => r.uri === CONTEXT_URIS.USER_PROFILE); + const customPrompt = contextResources.find((r) => r.uri === CONTEXT_URIS.SYSTEM_PROMPT); + const workspaceState = contextResources.find((r) => r.uri === CONTEXT_URIS.WORKSPACE_STATE); + + // Base platform prompt + let prompt = `You are a helpful AI assistant for Dexorder, an AI-first trading platform. +You help users research markets, develop indicators and strategies, and analyze trading data. + +User license: ${this.config.license.licenseType} +Available features: ${JSON.stringify(this.config.license.features, null, 2)}`; + + // Add user profile context + if (userProfile?.text) { + prompt += `\n\n# User Profile\n${userProfile.text}`; + } + + // Add workspace context + if (workspaceState?.text) { + prompt += `\n\n# Current Workspace\n${workspaceState.text}`; + } + + // Add user's custom instructions (highest priority) + if (customPrompt?.text) { + prompt += `\n\n# User Instructions\n${customPrompt.text}`; + } + + return prompt; + } + + /** + * Get platform tools (non-user-specific tools) + */ + private getPlatformTools(): Array<{ name: string; description?: string }> { + // Platform tools that don't need user's MCP + return [ + // TODO: Add platform tools like market data queries, chart rendering, etc. 
+ ]; + } + + /** + * Cleanup resources + */ + async cleanup(): Promise { + this.config.logger.info('Cleaning up agent harness'); + await this.mcpClient.disconnect(); + } +} diff --git a/gateway/src/harness/mcp-client.ts b/gateway/src/harness/mcp-client.ts new file mode 100644 index 0000000..9980feb --- /dev/null +++ b/gateway/src/harness/mcp-client.ts @@ -0,0 +1,259 @@ +import { Client } from '@modelcontextprotocol/sdk/client/index.js'; +import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js'; +import type { FastifyBaseLogger } from 'fastify'; + +export interface MCPClientConfig { + userId: string; + mcpServerUrl: string; + platformJWT?: string; + logger: FastifyBaseLogger; +} + +/** + * MCP client connector for user's container + * Manages connection to user-specific MCP server + */ +export class MCPClientConnector { + private client: Client | null = null; + private connected = false; + private config: MCPClientConfig; + + constructor(config: MCPClientConfig) { + this.config = config; + } + + /** + * Connect to user's MCP server + * TODO: Implement HTTP/SSE transport instead of stdio for container communication + */ + async connect(): Promise { + if (this.connected) { + return; + } + + try { + this.config.logger.info( + { userId: this.config.userId, url: this.config.mcpServerUrl }, + 'Connecting to user MCP server' + ); + + this.client = new Client( + { + name: 'dexorder-gateway', + version: '0.1.0', + }, + { + capabilities: { + tools: {}, + resources: {}, + }, + } + ); + + // TODO: Replace with HTTP transport when user containers are ready + // For now, this is a placeholder structure + // const transport = new HTTPTransport(this.config.mcpServerUrl, { + // headers: { + // 'Authorization': `Bearer ${this.config.platformJWT}` + // } + // }); + + // Placeholder: will be replaced with actual container transport + this.config.logger.warn( + 'MCP transport not yet implemented - using placeholder' + ); + + this.connected = true; + 
this.config.logger.info('Connected to user MCP server'); + } catch (error) { + this.config.logger.error( + { error, userId: this.config.userId }, + 'Failed to connect to user MCP server' + ); + throw error; + } + } + + /** + * Call a tool on the user's MCP server + */ + async callTool(name: string, args: Record): Promise { + if (!this.client || !this.connected) { + throw new Error('MCP client not connected'); + } + + try { + this.config.logger.debug({ tool: name, args }, 'Calling MCP tool'); + + // TODO: Implement when MCP client is connected + // const result = await this.client.callTool({ name, arguments: args }); + // return result; + + // Placeholder response + return { success: true, message: 'MCP tool call placeholder' }; + } catch (error) { + this.config.logger.error({ error, tool: name }, 'MCP tool call failed'); + throw error; + } + } + + /** + * List available tools from user's MCP server + */ + async listTools(): Promise> { + if (!this.client || !this.connected) { + throw new Error('MCP client not connected'); + } + + try { + // TODO: Implement when MCP client is connected + // const tools = await this.client.listTools(); + // return tools; + + // Placeholder tools (actions only, not context) + return [ + { name: 'save_message', description: 'Save message to conversation history' }, + { name: 'list_strategies', description: 'List user strategies' }, + { name: 'read_strategy', description: 'Read strategy code' }, + { name: 'write_strategy', description: 'Write strategy code' }, + { name: 'run_backtest', description: 'Run backtest on strategy' }, + { name: 'get_watchlist', description: 'Get user watchlist' }, + { name: 'execute_trade', description: 'Execute trade' }, + ]; + } catch (error) { + this.config.logger.error({ error }, 'Failed to list MCP tools'); + throw error; + } + } + + /** + * List available resources from user's MCP server + */ + async listResources(): Promise> { + if (!this.client || !this.connected) { + throw new Error('MCP client not 
connected'); + } + + try { + // TODO: Implement when MCP client is connected + // const resources = await this.client.listResources(); + // return resources; + + // Placeholder resources for user context + return [ + { + uri: 'context://user-profile', + name: 'User Profile', + description: 'User trading style, preferences, and background', + mimeType: 'text/plain', + }, + { + uri: 'context://conversation-summary', + name: 'Conversation Summary', + description: 'Semantic summary of recent conversation history with RAG', + mimeType: 'text/plain', + }, + { + uri: 'context://workspace-state', + name: 'Workspace State', + description: 'Current chart, watchlist, and open positions', + mimeType: 'application/json', + }, + { + uri: 'context://system-prompt', + name: 'Custom System Prompt', + description: 'User custom instructions for the assistant', + mimeType: 'text/plain', + }, + ]; + } catch (error) { + this.config.logger.error({ error }, 'Failed to list MCP resources'); + throw error; + } + } + + /** + * Read a resource from user's MCP server + */ + async readResource(uri: string): Promise<{ uri: string; mimeType?: string; text?: string; blob?: string }> { + if (!this.client || !this.connected) { + throw new Error('MCP client not connected'); + } + + try { + this.config.logger.debug({ uri }, 'Reading MCP resource'); + + // TODO: Implement when MCP client is connected + // const resource = await this.client.readResource({ uri }); + // return resource; + + // Placeholder resource content + if (uri === 'context://user-profile') { + return { + uri, + mimeType: 'text/plain', + text: `User Profile: +- Trading experience: Intermediate +- Preferred timeframes: 1h, 4h, 1d +- Risk tolerance: Medium +- Focus: Swing trading with technical indicators`, + }; + } else if (uri === 'context://conversation-summary') { + return { + uri, + mimeType: 'text/plain', + text: `Recent Conversation Summary: +[RAG-generated summary would go here] + +User recently discussed: +- Moving average 
crossover strategies +- Backtesting on BTC/USDT +- Risk management techniques`, + }; + } else if (uri === 'context://workspace-state') { + return { + uri, + mimeType: 'application/json', + text: JSON.stringify({ + currentChart: { ticker: 'BINANCE:BTC/USDT', timeframe: '1h' }, + watchlist: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'], + openPositions: [], + }, null, 2), + }; + } else if (uri === 'context://system-prompt') { + return { + uri, + mimeType: 'text/plain', + text: `Custom Instructions: +- Be concise and data-driven +- Always show risk/reward ratios +- Prefer simple strategies over complex ones`, + }; + } + + return { uri, text: '' }; + } catch (error) { + this.config.logger.error({ error, uri }, 'MCP resource read failed'); + throw error; + } + } + + /** + * Disconnect from MCP server + */ + async disconnect(): Promise { + if (this.client && this.connected) { + try { + await this.client.close(); + this.connected = false; + this.config.logger.info('Disconnected from user MCP server'); + } catch (error) { + this.config.logger.error({ error }, 'Error disconnecting from MCP server'); + } + } + } + + isConnected(): boolean { + return this.connected; + } +} diff --git a/gateway/src/k8s/client.ts b/gateway/src/k8s/client.ts new file mode 100644 index 0000000..767b727 --- /dev/null +++ b/gateway/src/k8s/client.ts @@ -0,0 +1,327 @@ +import * as k8s from '@kubernetes/client-node'; +import type { FastifyBaseLogger } from 'fastify'; +import * as yaml from 'js-yaml'; +import * as fs from 'fs/promises'; +import * as path from 'path'; +import { fileURLToPath } from 'url'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +export interface K8sClientConfig { + namespace: string; + inCluster: boolean; + context?: string; // For local dev + logger: FastifyBaseLogger; +} + +export interface DeploymentSpec { + userId: string; + licenseType: 'free' | 'pro' | 'enterprise'; + agentImage: string; + sidecarImage: string; + storageClass: 
string; +} +
+/**
+ * Kubernetes client wrapper for managing agent deployments.
+ *
+ * Holds an AppsV1 client (Deployments) and a CoreV1 client (Services,
+ * PersistentVolumeClaims); every call targets `config.namespace`.
+ */
+export class KubernetesClient {
+  private config: K8sClientConfig;
+  private k8sConfig: k8s.KubeConfig;
+  private appsApi: k8s.AppsV1Api;
+  private coreApi: k8s.CoreV1Api;
+
+  /**
+   * Load cluster credentials and create the API clients.
+   *
+   * Uses the in-cluster configuration when `config.inCluster` is set;
+   * otherwise loads the default kubeconfig and, when `config.context` is
+   * given, switches to that context.
+   */
+  constructor(config: K8sClientConfig) {
+    this.config = config;
+    this.k8sConfig = new k8s.KubeConfig();
+
+    if (config.inCluster) {
+      this.k8sConfig.loadFromCluster();
+      this.config.logger.info('Loaded in-cluster Kubernetes config');
+    } else {
+      this.k8sConfig.loadFromDefault();
+      if (config.context) {
+        this.k8sConfig.setCurrentContext(config.context);
+        this.config.logger.info({ context: config.context }, 'Set Kubernetes context');
+      }
+      this.config.logger.info('Loaded Kubernetes config from default location');
+    }
+
+    this.appsApi = this.k8sConfig.makeApiClient(k8s.AppsV1Api);
+    this.coreApi = this.k8sConfig.makeApiClient(k8s.CoreV1Api);
+  }
+
+  /**
+   * Generate deployment name from user ID.
+   *
+   * The ID is lower-cased and every character outside `[a-z0-9-]` becomes a
+   * hyphen, so distinct IDs such as `A_b` and `a-b` map to the same name.
+   */
+  static getDeploymentName(userId: string): string {
+    // Sanitize userId to be k8s-compliant (lowercase alphanumeric + hyphens)
+    const sanitized = userId.toLowerCase().replace(/[^a-z0-9-]/g, '-');
+    return `agent-${sanitized}`;
+  }
+
+  /**
+   * Generate service name (same as deployment)
+   */
+  static getServiceName(userId: string): string {
+    return this.getDeploymentName(userId);
+  }
+
+  /**
+   * Generate PVC name (deployment name plus a `-data` suffix)
+   */
+  static getPvcName(userId: string): string {
+    return `${this.getDeploymentName(userId)}-data`;
+  }
+
+  /**
+   * Compute MCP endpoint URL from service name.
+   *
+   * @returns the cluster-internal DNS URL of the user's service on port 3000
+   */
+  static getMcpEndpoint(userId: string, namespace: string): string {
+    const serviceName = this.getServiceName(userId);
+    return `http://${serviceName}.${namespace}.svc.cluster.local:3000`;
+  }
+
+  /**
+   * Tell whether a caught API error carries the given HTTP status code.
+   */
+  private static hasStatus(error: any, statusCode: number): boolean {
+    return error?.response?.statusCode === statusCode;
+  }
+
+  /**
+   * Check if deployment exists.
+   *
+   * @returns `true` when the read succeeds, `false` on HTTP 404
+   * @throws any other API error, unchanged
+   */
+  async deploymentExists(deploymentName: string): Promise<boolean> {
+    try {
+      await this.appsApi.readNamespacedDeployment(deploymentName, this.config.namespace);
+      return true;
+    } catch (error: any) {
+      if (KubernetesClient.hasStatus(error, 404)) {
+        return false;
+      }
+      throw error;
+    }
+  }
+
+  /**
+   * Create agent deployment from template.
+   *
+   * Reads `templates/<licenseType>-tier.yaml`, fills in the `{{name}}`
+   * placeholders and creates every Deployment, PersistentVolumeClaim and
+   * Service document it contains. A resource that already exists (HTTP 409)
+   * is logged and skipped; any other API error is re-thrown.
+   */
+  async createAgentDeployment(spec: DeploymentSpec): Promise<void> {
+    const deploymentName = KubernetesClient.getDeploymentName(spec.userId);
+    const serviceName = KubernetesClient.getServiceName(spec.userId);
+    const pvcName = KubernetesClient.getPvcName(spec.userId);
+
+    this.config.logger.info(
+      { userId: spec.userId, licenseType: spec.licenseType, deploymentName },
+      'Creating agent deployment'
+    );
+
+    // Load template based on license type
+    const templatePath = path.join(
+      __dirname,
+      'templates',
+      `${spec.licenseType}-tier.yaml`
+    );
+
+    const templateContent = await fs.readFile(templatePath, 'utf-8');
+
+    // Substitute variables in a single pass. A replacer function is used so
+    // that `$&`, `$1`, `$$` ... inside a value are inserted literally instead
+    // of being interpreted as replacement patterns, and so that a value which
+    // itself contains `{{...}}` is not substituted a second time.
+    const variables: Record<string, string> = {
+      userId: spec.userId,
+      deploymentName,
+      serviceName,
+      pvcName,
+      agentImage: spec.agentImage,
+      sidecarImage: spec.sidecarImage,
+      storageClass: spec.storageClass,
+    };
+    const rendered = templateContent.replace(
+      /\{\{(\w+)\}\}/g,
+      (placeholder: string, key: string) =>
+        Object.prototype.hasOwnProperty.call(variables, key) ? variables[key] : placeholder
+    );
+
+    // Parse YAML documents (deployment, pvc, service)
+    const documents = yaml.loadAll(rendered) as any[];
+
+    // Apply each resource
+    for (const doc of documents) {
+      if (!doc || !doc.kind) continue;
+
+      // The templates hard-code a namespace, but the request below targets
+      // the configured one; the API server rejects a body whose namespace
+      // differs from the request namespace, so align them here.
+      doc.metadata = { ...doc.metadata, namespace: this.config.namespace };
+
+      try {
+        switch (doc.kind) {
+          case 'Deployment':
+            await this.appsApi.createNamespacedDeployment(this.config.namespace, doc);
+            this.config.logger.info({ deploymentName }, 'Created deployment');
+            break;
+
+          case 'PersistentVolumeClaim':
+            await this.coreApi.createNamespacedPersistentVolumeClaim(
+              this.config.namespace,
+              doc
+            );
+            this.config.logger.info({ pvcName }, 'Created PVC');
+            break;
+
+          case 'Service':
+            await this.coreApi.createNamespacedService(this.config.namespace, doc);
+            this.config.logger.info({ serviceName }, 'Created service');
+            break;
+
+          default:
+            this.config.logger.warn({ kind: doc.kind }, 'Unknown resource kind in template');
+        }
+      } catch (error: any) {
+        // If resource already exists, log warning but continue
+        if (KubernetesClient.hasStatus(error, 409)) {
+          this.config.logger.warn(
+            { kind: doc.kind, name: doc.metadata?.name },
+            'Resource already exists, skipping'
+          );
+        } else {
+          throw error;
+        }
+      }
+    }
+
+    this.config.logger.info({ deploymentName }, 'Agent deployment created successfully');
+  }
+
+  /**
+   * Wait for deployment to be ready.
+   *
+   * Polls the deployment every 2 seconds until it reports at least one
+   * available and one ready replica.
+   *
+   * @param timeoutMs maximum time to wait, in milliseconds
+   * @returns `true` once ready; `false` when the deployment is missing
+   *          (HTTP 404), reports `Progressing=False`, or the timeout expires
+   * @throws any other API error, unchanged
+   */
+  async waitForDeploymentReady(
+    deploymentName: string,
+    timeoutMs: number = 120000
+  ): Promise<boolean> {
+    const startTime = Date.now();
+    const pollInterval = 2000; // 2 seconds
+
+    this.config.logger.info(
+      { deploymentName, timeoutMs },
+      'Waiting for deployment to be ready'
+    );
+
+    while (Date.now() - startTime < timeoutMs) {
+      try {
+        const response = await this.appsApi.readNamespacedDeployment(
+          deploymentName,
+          this.config.namespace
+        );
+
+        const status = response.body.status;
+
+        // Check if deployment is ready
+        if ((status?.availableReplicas ?? 0) > 0 && (status?.readyReplicas ?? 0) > 0) {
+          this.config.logger.info({ deploymentName }, 'Deployment is ready');
+          return true;
+        }
+
+        // Check for failure conditions
+        const failedCondition = status?.conditions?.find(
+          (c) => c.type === 'Progressing' && c.status === 'False'
+        );
+        if (failedCondition) {
+          this.config.logger.error(
+            { deploymentName, reason: failedCondition.reason, message: failedCondition.message },
+            'Deployment failed to progress'
+          );
+          return false;
+        }
+
+        this.config.logger.debug(
+          {
+            deploymentName,
+            replicas: status?.replicas,
+            ready: status?.readyReplicas,
+            available: status?.availableReplicas,
+          },
+          'Deployment not ready yet, waiting...'
+        );
+
+        // Never sleep past the deadline: short timeouts would otherwise
+        // overshoot by up to one full poll interval.
+        const remaining = timeoutMs - (Date.now() - startTime);
+        if (remaining <= 0) break;
+        await new Promise((resolve) => setTimeout(resolve, Math.min(pollInterval, remaining)));
+      } catch (error: any) {
+        if (KubernetesClient.hasStatus(error, 404)) {
+          this.config.logger.warn({ deploymentName }, 'Deployment not found');
+          return false;
+        }
+        throw error;
+      }
+    }
+
+    this.config.logger.warn({ deploymentName, timeoutMs }, 'Deployment readiness timeout');
+    return false;
+  }
+
+  /**
+   * Get service endpoint URL.
+   *
+   * @returns the cluster-internal URL of a ClusterIP service, using its port
+   *          named `mcp` (3000 when absent); `null` when the service is
+   *          missing (HTTP 404) or has another service type
+   * @throws any other API error, unchanged
+   */
+  async getServiceEndpoint(serviceName: string): Promise<string | null> {
+    try {
+      const response = await this.coreApi.readNamespacedService(
+        serviceName,
+        this.config.namespace
+      );
+
+      const service = response.body;
+
+      // For ClusterIP services, return internal DNS name
+      if (service.spec?.type === 'ClusterIP') {
+        const port = service.spec.ports?.find((p) => p.name === 'mcp')?.port || 3000;
+        return `http://${serviceName}.${this.config.namespace}.svc.cluster.local:${port}`;
+      }
+
+      // For other service types (NodePort, LoadBalancer), would need different logic
+      this.config.logger.warn(
+        { serviceName, type: service.spec?.type },
+        'Unexpected service type'
+      );
+      return null;
+    } catch (error: any) {
+      if (KubernetesClient.hasStatus(error, 404)) {
+        this.config.logger.warn({ serviceName }, 'Service not found');
+        return null;
+      }
+      throw error;
+    }
+  }
+
+ /** + * Delete deployment and associated resources + * (Used for cleanup/testing - normally handled by lifecycle sidecar) + */ + async deleteAgentDeployment(userId: string): Promise { + const deploymentName = KubernetesClient.getDeploymentName(userId); + const serviceName = KubernetesClient.getServiceName(userId); + const pvcName = KubernetesClient.getPvcName(userId); + + this.config.logger.info({ userId, deploymentName }, 'Deleting agent deployment'); + + // Delete deployment + try { + await this.appsApi.deleteNamespacedDeployment(deploymentName, this.config.namespace); + this.config.logger.info({ deploymentName }, 'Deleted deployment'); + } catch (error: any) { + if (error.response?.statusCode !== 404) { +
this.config.logger.warn({ deploymentName, error }, 'Failed to delete deployment'); + } + } + + // Delete service + try { + await this.coreApi.deleteNamespacedService(serviceName, this.config.namespace); + this.config.logger.info({ serviceName }, 'Deleted service'); + } catch (error: any) { + if (error.response?.statusCode !== 404) { + this.config.logger.warn({ serviceName, error }, 'Failed to delete service'); + } + } + + // Delete PVC + try { + await this.coreApi.deleteNamespacedPersistentVolumeClaim(pvcName, this.config.namespace); + this.config.logger.info({ pvcName }, 'Deleted PVC'); + } catch (error: any) { + if (error.response?.statusCode !== 404) { + this.config.logger.warn({ pvcName, error }, 'Failed to delete PVC'); + } + } + } +} diff --git a/gateway/src/k8s/container-manager.ts b/gateway/src/k8s/container-manager.ts new file mode 100644 index 0000000..67d5e2f --- /dev/null +++ b/gateway/src/k8s/container-manager.ts @@ -0,0 +1,118 @@ +import type { FastifyBaseLogger } from 'fastify'; +import { KubernetesClient, type DeploymentSpec } from './client.js'; +import type { UserLicense } from '../types/user.js'; +
+/**
+ * Dependencies and settings of the container manager: the Kubernetes
+ * client, the images and storage class copied into each deployment spec,
+ * the namespace used to build MCP endpoint URLs, and the logger.
+ */
+export interface ContainerManagerConfig {
+  k8sClient: KubernetesClient;
+  agentImage: string;
+  sidecarImage: string;
+  storageClass: string;
+  namespace: string;
+  logger: FastifyBaseLogger;
+}
+
+/**
+ * Snapshot returned by `getContainerStatus`.
+ */
+export interface ContainerStatus {
+  exists: boolean;
+  ready: boolean;
+  mcpEndpoint: string;
+}
+
+/**
+ * Container manager orchestrates agent container lifecycle
+ */
+export class ContainerManager {
+  private config: ContainerManagerConfig;
+
+  constructor(config: ContainerManagerConfig) {
+    this.config = config;
+  }
+
+  /**
+   * Ensure user's container is running and ready.
+   *
+   * An existing deployment is given 30 seconds to become ready and its
+   * endpoint is returned even when it does not. A missing deployment is
+   * created and given 120 seconds.
+   *
+   * @returns the MCP endpoint URL and whether the deployment was created by
+   *          this call
+   * @throws Error when a newly created deployment is not ready in time
+   */
+  async ensureContainerRunning(
+    userId: string,
+    license: UserLicense
+  ): Promise<{ mcpEndpoint: string; wasCreated: boolean }> {
+    const { k8sClient, logger } = this.config;
+    const deploymentName = KubernetesClient.getDeploymentName(userId);
+    const mcpEndpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);
+
+    logger.info(
+      { userId, licenseType: license.licenseType, deploymentName },
+      'Ensuring container is running'
+    );
+
+    // Check if deployment already exists
+    if (await k8sClient.deploymentExists(deploymentName)) {
+      logger.info({ userId, deploymentName }, 'Container deployment already exists');
+
+      // Wait for it to be ready (in case it's starting up)
+      const alreadyReady = await k8sClient.waitForDeploymentReady(deploymentName, 30000);
+
+      if (!alreadyReady) {
+        logger.warn(
+          { userId, deploymentName },
+          'Existing deployment not ready within timeout'
+        );
+        // Continue anyway - might be an image pull or other transient issue
+      }
+
+      return { mcpEndpoint, wasCreated: false };
+    }
+
+    // Create new deployment
+    logger.info({ userId, licenseType: license.licenseType }, 'Creating new container');
+
+    const spec: DeploymentSpec = {
+      userId,
+      licenseType: license.licenseType,
+      agentImage: this.config.agentImage,
+      sidecarImage: this.config.sidecarImage,
+      storageClass: this.config.storageClass,
+    };
+
+    await k8sClient.createAgentDeployment(spec);
+
+    // Wait for deployment to be ready
+    const ready = await k8sClient.waitForDeploymentReady(deploymentName, 120000);
+
+    if (!ready) {
+      throw new Error(
+        `Container deployment failed to become ready within timeout: ${deploymentName}`
+      );
+    }
+
+    logger.info({ userId, mcpEndpoint }, 'Container is ready');
+
+    return { mcpEndpoint, wasCreated: true };
+  }
+
+  /**
+   * Check container status without creating it.
+   *
+   * When the deployment exists, readiness is polled with a 5 second
+   * timeout, so this call can take that long for a deployment that is not
+   * ready.
+   */
+  async getContainerStatus(userId: string): Promise<ContainerStatus> {
+    const deploymentName = KubernetesClient.getDeploymentName(userId);
+    const mcpEndpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);
+
+    const exists = await this.config.k8sClient.deploymentExists(deploymentName);
+
+    if (!exists) {
+      return { exists: false, ready: false, mcpEndpoint };
+    }
+
+    // Check if ready (with short timeout)
+    const ready = await this.config.k8sClient.waitForDeploymentReady(deploymentName, 5000);
+
+    return { exists: true, ready, mcpEndpoint };
+  }
+
+  /**
+   * Delete container (for cleanup/testing)
+   */
+  async deleteContainer(userId: string): Promise<void> {
+    await this.config.k8sClient.deleteAgentDeployment(userId);
+  }
+}
diff --git a/gateway/src/k8s/templates/enterprise-tier.yaml b/gateway/src/k8s/templates/enterprise-tier.yaml new file mode 100644 index 0000000..04db77a --- /dev/null +++ b/gateway/src/k8s/templates/enterprise-tier.yaml @@ -0,0 +1,199 @@ +# Enterprise tier agent deployment template +# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}} +# Enterprise: No idle shutdown, larger resources +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{deploymentName}} + namespace: dexorder-agents + labels: + app.kubernetes.io/name: agent + app.kubernetes.io/component: user-agent + dexorder.io/component: agent + dexorder.io/user-id: {{userId}} + dexorder.io/deployment: {{deploymentName}} + dexorder.io/license-tier: enterprise +spec: + replicas: 1 + selector: + matchLabels: + dexorder.io/user-id: {{userId}} + template: + metadata: + labels: + dexorder.io/component: agent + dexorder.io/user-id: {{userId}} + dexorder.io/deployment: {{deploymentName}} + dexorder.io/license-tier: enterprise + spec: + serviceAccountName: agent-lifecycle + shareProcessNamespace: true + + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + + containers: + - name: agent + image: {{agentImage}} + imagePullPolicy: Always + + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "4000m" + + env: + - name: USER_ID + value:
{{userId}} + - name: IDLE_TIMEOUT_MINUTES + value: "0" + - name: IDLE_CHECK_INTERVAL_SECONDS + value: "60" + - name: ENABLE_IDLE_SHUTDOWN + value: "false" + - name: MCP_SERVER_PORT + value: "3000" + - name: ZMQ_CONTROL_PORT + value: "5555" + + ports: + - name: mcp + containerPort: 3000 + protocol: TCP + - name: zmq-control + containerPort: 5555 + protocol: TCP + + volumeMounts: + - name: agent-data + mountPath: /app/data + - name: tmp + mountPath: /tmp + - name: shared-run + mountPath: /var/run/agent + + livenessProbe: + httpGet: + path: /health + port: mcp + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 5 + + readinessProbe: + httpGet: + path: /ready + port: mcp + initialDelaySeconds: 5 + periodSeconds: 10 + + - name: lifecycle-sidecar + image: {{sidecarImage}} + imagePullPolicy: Always + + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + resources: + requests: + memory: "32Mi" + cpu: "10m" + limits: + memory: "64Mi" + cpu: "50m" + + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['dexorder.io/deployment'] + - name: USER_TYPE + value: "enterprise" + - name: MAIN_CONTAINER_PID + value: "1" + + volumeMounts: + - name: shared-run + mountPath: /var/run/agent + readOnly: true + + volumes: + - name: agent-data + persistentVolumeClaim: + claimName: {{pvcName}} + - name: tmp + emptyDir: + medium: Memory + sizeLimit: 512Mi + - name: shared-run + emptyDir: + medium: Memory + sizeLimit: 1Mi + + restartPolicy: Always + terminationGracePeriodSeconds: 30 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{pvcName}} + namespace: dexorder-agents + labels: + dexorder.io/user-id: {{userId}} + dexorder.io/license-tier: enterprise +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: 
{{storageClass}} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{serviceName}} + namespace: dexorder-agents + labels: + dexorder.io/user-id: {{userId}} + dexorder.io/license-tier: enterprise +spec: + type: ClusterIP + selector: + dexorder.io/user-id: {{userId}} + ports: + - name: mcp + port: 3000 + targetPort: mcp + protocol: TCP + - name: zmq-control + port: 5555 + targetPort: zmq-control + protocol: TCP diff --git a/gateway/src/k8s/templates/free-tier.yaml b/gateway/src/k8s/templates/free-tier.yaml new file mode 100644 index 0000000..3ea0415 --- /dev/null +++ b/gateway/src/k8s/templates/free-tier.yaml @@ -0,0 +1,198 @@ +# Free tier agent deployment template +# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{deploymentName}} + namespace: dexorder-agents + labels: + app.kubernetes.io/name: agent + app.kubernetes.io/component: user-agent + dexorder.io/component: agent + dexorder.io/user-id: {{userId}} + dexorder.io/deployment: {{deploymentName}} + dexorder.io/license-tier: free +spec: + replicas: 1 + selector: + matchLabels: + dexorder.io/user-id: {{userId}} + template: + metadata: + labels: + dexorder.io/component: agent + dexorder.io/user-id: {{userId}} + dexorder.io/deployment: {{deploymentName}} + dexorder.io/license-tier: free + spec: + serviceAccountName: agent-lifecycle + shareProcessNamespace: true + + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + + containers: + - name: agent + image: {{agentImage}} + imagePullPolicy: Always + + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + + env: + - name: USER_ID + value: {{userId}} + - name: IDLE_TIMEOUT_MINUTES + value: "15" + - name: 
IDLE_CHECK_INTERVAL_SECONDS + value: "60" + - name: ENABLE_IDLE_SHUTDOWN + value: "true" + - name: MCP_SERVER_PORT + value: "3000" + - name: ZMQ_CONTROL_PORT + value: "5555" + + ports: + - name: mcp + containerPort: 3000 + protocol: TCP + - name: zmq-control + containerPort: 5555 + protocol: TCP + + volumeMounts: + - name: agent-data + mountPath: /app/data + - name: tmp + mountPath: /tmp + - name: shared-run + mountPath: /var/run/agent + + livenessProbe: + httpGet: + path: /health + port: mcp + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 5 + + readinessProbe: + httpGet: + path: /ready + port: mcp + initialDelaySeconds: 5 + periodSeconds: 10 + + - name: lifecycle-sidecar + image: {{sidecarImage}} + imagePullPolicy: Always + + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + resources: + requests: + memory: "32Mi" + cpu: "10m" + limits: + memory: "64Mi" + cpu: "50m" + + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['dexorder.io/deployment'] + - name: USER_TYPE + value: "free" + - name: MAIN_CONTAINER_PID + value: "1" + + volumeMounts: + - name: shared-run + mountPath: /var/run/agent + readOnly: true + + volumes: + - name: agent-data + persistentVolumeClaim: + claimName: {{pvcName}} + - name: tmp + emptyDir: + medium: Memory + sizeLimit: 128Mi + - name: shared-run + emptyDir: + medium: Memory + sizeLimit: 1Mi + + restartPolicy: Always + terminationGracePeriodSeconds: 30 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{pvcName}} + namespace: dexorder-agents + labels: + dexorder.io/user-id: {{userId}} + dexorder.io/license-tier: free +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + storageClassName: {{storageClass}} +--- +apiVersion: v1 +kind: Service +metadata: + name: 
{{serviceName}} + namespace: dexorder-agents + labels: + dexorder.io/user-id: {{userId}} + dexorder.io/license-tier: free +spec: + type: ClusterIP + selector: + dexorder.io/user-id: {{userId}} + ports: + - name: mcp + port: 3000 + targetPort: mcp + protocol: TCP + - name: zmq-control + port: 5555 + targetPort: zmq-control + protocol: TCP diff --git a/gateway/src/k8s/templates/pro-tier.yaml b/gateway/src/k8s/templates/pro-tier.yaml new file mode 100644 index 0000000..a99abac --- /dev/null +++ b/gateway/src/k8s/templates/pro-tier.yaml @@ -0,0 +1,198 @@ +# Pro tier agent deployment template +# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{deploymentName}} + namespace: dexorder-agents + labels: + app.kubernetes.io/name: agent + app.kubernetes.io/component: user-agent + dexorder.io/component: agent + dexorder.io/user-id: {{userId}} + dexorder.io/deployment: {{deploymentName}} + dexorder.io/license-tier: pro +spec: + replicas: 1 + selector: + matchLabels: + dexorder.io/user-id: {{userId}} + template: + metadata: + labels: + dexorder.io/component: agent + dexorder.io/user-id: {{userId}} + dexorder.io/deployment: {{deploymentName}} + dexorder.io/license-tier: pro + spec: + serviceAccountName: agent-lifecycle + shareProcessNamespace: true + + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + + containers: + - name: agent + image: {{agentImage}} + imagePullPolicy: Always + + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "2000m" + + env: + - name: USER_ID + value: {{userId}} + - name: IDLE_TIMEOUT_MINUTES + value: "60" + - name: IDLE_CHECK_INTERVAL_SECONDS + value: "60" + - name: ENABLE_IDLE_SHUTDOWN + value: "true" + - name: 
MCP_SERVER_PORT + value: "3000" + - name: ZMQ_CONTROL_PORT + value: "5555" + + ports: + - name: mcp + containerPort: 3000 + protocol: TCP + - name: zmq-control + containerPort: 5555 + protocol: TCP + + volumeMounts: + - name: agent-data + mountPath: /app/data + - name: tmp + mountPath: /tmp + - name: shared-run + mountPath: /var/run/agent + + livenessProbe: + httpGet: + path: /health + port: mcp + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 5 + + readinessProbe: + httpGet: + path: /ready + port: mcp + initialDelaySeconds: 5 + periodSeconds: 10 + + - name: lifecycle-sidecar + image: {{sidecarImage}} + imagePullPolicy: Always + + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + + resources: + requests: + memory: "32Mi" + cpu: "10m" + limits: + memory: "64Mi" + cpu: "50m" + + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['dexorder.io/deployment'] + - name: USER_TYPE + value: "pro" + - name: MAIN_CONTAINER_PID + value: "1" + + volumeMounts: + - name: shared-run + mountPath: /var/run/agent + readOnly: true + + volumes: + - name: agent-data + persistentVolumeClaim: + claimName: {{pvcName}} + - name: tmp + emptyDir: + medium: Memory + sizeLimit: 256Mi + - name: shared-run + emptyDir: + medium: Memory + sizeLimit: 1Mi + + restartPolicy: Always + terminationGracePeriodSeconds: 30 +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{pvcName}} + namespace: dexorder-agents + labels: + dexorder.io/user-id: {{userId}} + dexorder.io/license-tier: pro +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + storageClassName: {{storageClass}} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{serviceName}} + namespace: dexorder-agents + labels: + dexorder.io/user-id: {{userId}} + 
dexorder.io/license-tier: pro +spec: + type: ClusterIP + selector: + dexorder.io/user-id: {{userId}} + ports: + - name: mcp + port: 3000 + targetPort: mcp + protocol: TCP + - name: zmq-control + port: 5555 + targetPort: zmq-control + protocol: TCP diff --git a/gateway/src/llm/provider.ts b/gateway/src/llm/provider.ts new file mode 100644 index 0000000..efdb9df --- /dev/null +++ b/gateway/src/llm/provider.ts @@ -0,0 +1,216 @@ +import type { BaseChatModel } from '@langchain/core/language_models/chat_models'; +import { ChatAnthropic } from '@langchain/anthropic'; +import { ChatOpenAI } from '@langchain/openai'; +import { ChatGoogleGenerativeAI } from '@langchain/google-genai'; +import { ChatOpenRouter } from '@langchain/openrouter'; +import type { FastifyBaseLogger } from 'fastify'; +
+/**
+ * Supported LLM providers
+ */
+export enum LLMProvider {
+  ANTHROPIC = 'anthropic',
+  OPENAI = 'openai',
+  GOOGLE = 'google',
+  OPENROUTER = 'openrouter',
+}
+
+/**
+ * Model configuration
+ */
+export interface ModelConfig {
+  provider: LLMProvider;
+  model: string;
+  temperature?: number;
+  maxTokens?: number;
+}
+
+/**
+ * Provider configuration with API keys
+ */
+export interface ProviderConfig {
+  anthropicApiKey?: string;
+  openaiApiKey?: string;
+  googleApiKey?: string;
+  openrouterApiKey?: string;
+}
+
+/** Sampling temperature applied when a ModelConfig does not set one. */
+const DEFAULT_TEMPERATURE = 0.7;
+
+/** Output-token limit applied when a ModelConfig does not set one. */
+const DEFAULT_MAX_TOKENS = 4096;
+
+/**
+ * LLM Provider factory
+ * Creates model instances with unified interface across providers
+ */
+export class LLMProviderFactory {
+  private config: ProviderConfig;
+  private logger: FastifyBaseLogger;
+
+  constructor(config: ProviderConfig, logger: FastifyBaseLogger) {
+    this.config = config;
+    this.logger = logger;
+  }
+
+  /**
+   * Create a chat model instance.
+   *
+   * @throws Error when the provider is not one of `LLMProvider`, or when
+   *         the API key for the requested provider is not configured
+   */
+  createModel(modelConfig: ModelConfig): BaseChatModel {
+    this.logger.debug(
+      { provider: modelConfig.provider, model: modelConfig.model },
+      'Creating LLM model'
+    );
+
+    switch (modelConfig.provider) {
+      case LLMProvider.ANTHROPIC:
+        return this.createAnthropicModel(modelConfig);
+
+      case LLMProvider.OPENAI:
+        return this.createOpenAIModel(modelConfig);
+
+      case LLMProvider.GOOGLE:
+        return this.createGoogleModel(modelConfig);
+
+      case LLMProvider.OPENROUTER:
+        return this.createOpenRouterModel(modelConfig);
+
+      default:
+        throw new Error(`Unsupported provider: ${modelConfig.provider}`);
+    }
+  }
+
+  /**
+   * Create Anthropic Claude model
+   */
+  private createAnthropicModel(config: ModelConfig): ChatAnthropic {
+    if (!this.config.anthropicApiKey) {
+      throw new Error('Anthropic API key not configured');
+    }
+
+    return new ChatAnthropic({
+      model: config.model,
+      temperature: config.temperature ?? DEFAULT_TEMPERATURE,
+      maxTokens: config.maxTokens ?? DEFAULT_MAX_TOKENS,
+      anthropicApiKey: this.config.anthropicApiKey,
+    });
+  }
+
+  /**
+   * Create OpenAI GPT model
+   */
+  private createOpenAIModel(config: ModelConfig): ChatOpenAI {
+    if (!this.config.openaiApiKey) {
+      throw new Error('OpenAI API key not configured');
+    }
+
+    return new ChatOpenAI({
+      model: config.model,
+      temperature: config.temperature ?? DEFAULT_TEMPERATURE,
+      maxTokens: config.maxTokens ?? DEFAULT_MAX_TOKENS,
+      openAIApiKey: this.config.openaiApiKey,
+    });
+  }
+
+  /**
+   * Create Google Gemini model
+   */
+  private createGoogleModel(config: ModelConfig): ChatGoogleGenerativeAI {
+    if (!this.config.googleApiKey) {
+      throw new Error('Google API key not configured');
+    }
+
+    return new ChatGoogleGenerativeAI({
+      model: config.model,
+      temperature: config.temperature ?? DEFAULT_TEMPERATURE,
+      maxOutputTokens: config.maxTokens ?? DEFAULT_MAX_TOKENS,
+      apiKey: this.config.googleApiKey,
+    });
+  }
+
+  /**
+   * Create OpenRouter model (access to 300+ models)
+   */
+  private createOpenRouterModel(config: ModelConfig): ChatOpenRouter {
+    if (!this.config.openrouterApiKey) {
+      throw new Error('OpenRouter API key not configured');
+    }
+
+    return new ChatOpenRouter({
+      model: config.model,
+      temperature: config.temperature ?? DEFAULT_TEMPERATURE,
+      maxTokens: config.maxTokens ?? DEFAULT_MAX_TOKENS,
+      apiKey: this.config.openrouterApiKey,
+    });
+  }
+
+  /**
+   * Get default model based on which API keys are configured.
+   *
+   * Providers are tried in the order Anthropic, OpenAI, Google, OpenRouter;
+   * the first one with a key wins.
+   *
+   * @throws Error when no API key is configured at all
+   */
+  getDefaultModel(): ModelConfig {
+    // Check which API keys are available
+    if (this.config.anthropicApiKey) {
+      return {
+        provider: LLMProvider.ANTHROPIC,
+        model: 'claude-3-5-sonnet-20241022',
+      };
+    }
+
+    if (this.config.openaiApiKey) {
+      return {
+        provider: LLMProvider.OPENAI,
+        model: 'gpt-4o',
+      };
+    }
+
+    if (this.config.googleApiKey) {
+      return {
+        provider: LLMProvider.GOOGLE,
+        model: 'gemini-2.0-flash-exp',
+      };
+    }
+
+    if (this.config.openrouterApiKey) {
+      return {
+        provider: LLMProvider.OPENROUTER,
+        model: 'anthropic/claude-3.5-sonnet',
+      };
+    }
+
+    throw new Error('No LLM API keys configured');
+  }
+}
+
+/**
+ * Predefined model configurations
+ */
+export const MODELS = {
+  // Anthropic
+  CLAUDE_SONNET: {
+    provider: LLMProvider.ANTHROPIC,
+    model: 'claude-3-5-sonnet-20241022',
+  },
+  CLAUDE_HAIKU: {
+    provider: LLMProvider.ANTHROPIC,
+    model: 'claude-3-5-haiku-20241022',
+  },
+  CLAUDE_OPUS: {
+    provider: LLMProvider.ANTHROPIC,
+    model: 'claude-3-opus-20240229',
+  },
+
+  // OpenAI
+  GPT4O: {
+    provider: LLMProvider.OPENAI,
+    model: 'gpt-4o',
+  },
+  GPT4O_MINI: {
+    provider: LLMProvider.OPENAI,
+    model: 'gpt-4o-mini',
+  },
+
+  // Google
+  GEMINI_2_FLASH: {
+    provider: LLMProvider.GOOGLE,
+    model: 'gemini-2.0-flash-exp',
+  },
+  GEMINI_PRO: {
+    provider: LLMProvider.GOOGLE,
+    model: 'gemini-1.5-pro',
+  },
+} as const satisfies Record<string, ModelConfig>;
diff --git a/gateway/src/llm/router.ts b/gateway/src/llm/router.ts new file mode 100644 index 0000000..ef529bb --- /dev/null +++ b/gateway/src/llm/router.ts @@ -0,0 +1,202 @@ +import type { BaseChatModel } from '@langchain/core/language_models/chat_models'; +import type { FastifyBaseLogger } from 'fastify'; +import { LLMProviderFactory, type ModelConfig, LLMProvider } from './provider.js'; +import type { UserLicense } from '../types/user.js'; + +/** + * Model routing strategies + */ +export enum
RoutingStrategy { + /** Use user's preferred model from license */ + USER_PREFERENCE = 'user_preference', + /** Route based on query complexity */ + COMPLEXITY = 'complexity', + /** Route based on license tier */ + LICENSE_TIER = 'license_tier', + /** Use cheapest available model */ + COST_OPTIMIZED = 'cost_optimized', +} +
+/**
+ * Model router
+ * Intelligently selects which model to use based on various factors
+ */
+export class ModelRouter {
+  /** Substrings (matched case-insensitively) that mark a query as complex. */
+  private static readonly COMPLEXITY_KEYWORDS: readonly string[] = [
+    // Multi-step analysis
+    'backtest',
+    'analyze',
+    'compare',
+    'optimize',
+
+    // Code generation
+    'write',
+    'create',
+    'implement',
+    'build',
+
+    // Deep reasoning
+    'explain why',
+    'what if',
+    'how would',
+  ];
+
+  /** Messages longer than this many characters are treated as complex. */
+  private static readonly COMPLEX_MESSAGE_LENGTH = 200;
+
+  private factory: LLMProviderFactory;
+  private logger: FastifyBaseLogger;
+  private defaultModel: ModelConfig;
+
+  /**
+   * @throws whatever `factory.getDefaultModel()` throws; the default model
+   *         is resolved once, at construction time
+   */
+  constructor(factory: LLMProviderFactory, logger: FastifyBaseLogger) {
+    this.factory = factory;
+    this.logger = logger;
+    this.defaultModel = factory.getDefaultModel();
+  }
+
+  /**
+   * Route to appropriate model based on context.
+   *
+   * The routing tables name providers unconditionally, while the factory can
+   * only build models for providers it has a key for. When the selected
+   * model cannot be created, the router logs a warning and falls back to the
+   * factory's default model instead of failing the request.
+   *
+   * @param message  user message, inspected only by the COMPLEXITY strategy
+   * @param license  the user's license (tier and optional preferred model)
+   * @param strategy how to pick the model; defaults to USER_PREFERENCE
+   * @throws the factory's error when the default model itself cannot be
+   *         created
+   */
+  async route(
+    message: string,
+    license: UserLicense,
+    strategy: RoutingStrategy = RoutingStrategy.USER_PREFERENCE
+  ): Promise<BaseChatModel> {
+    const modelConfig = this.selectModel(message, license, strategy);
+
+    this.logger.info(
+      {
+        userId: license.userId,
+        strategy,
+        provider: modelConfig.provider,
+        model: modelConfig.model,
+      },
+      'Routing to model'
+    );
+
+    try {
+      return this.factory.createModel(modelConfig);
+    } catch (error) {
+      const isDefault =
+        modelConfig.provider === this.defaultModel.provider &&
+        modelConfig.model === this.defaultModel.model;
+      if (isDefault) {
+        throw error;
+      }
+
+      this.logger.warn(
+        {
+          userId: license.userId,
+          provider: modelConfig.provider,
+          model: modelConfig.model,
+          fallbackProvider: this.defaultModel.provider,
+          fallbackModel: this.defaultModel.model,
+          error,
+        },
+        'Selected model unavailable, falling back to default model'
+      );
+      return this.factory.createModel(this.defaultModel);
+    }
+  }
+
+  /**
+   * Pick the model configuration for the given strategy.
+   */
+  private selectModel(
+    message: string,
+    license: UserLicense,
+    strategy: RoutingStrategy
+  ): ModelConfig {
+    switch (strategy) {
+      case RoutingStrategy.USER_PREFERENCE:
+        return this.routeByUserPreference(license);
+
+      case RoutingStrategy.COMPLEXITY:
+        return this.routeByComplexity(message, license);
+
+      case RoutingStrategy.LICENSE_TIER:
+        return this.routeByLicenseTier(license);
+
+      case RoutingStrategy.COST_OPTIMIZED:
+        return this.routeByCost(license);
+
+      default:
+        return this.defaultModel;
+    }
+  }
+
+  /**
+   * Route based on user's preferred model (if set in license)
+   */
+  private routeByUserPreference(license: UserLicense): ModelConfig {
+    // Check if user has custom model preference
+    const preferredModel = (license as any).preferredModel as ModelConfig | undefined;
+
+    if (preferredModel && this.isModelAllowed(preferredModel, license)) {
+      return preferredModel;
+    }
+
+    // Fall back to license tier default
+    return this.routeByLicenseTier(license);
+  }
+
+  /**
+   * Route based on query complexity
+   */
+  private routeByComplexity(message: string, license: UserLicense): ModelConfig {
+    const isComplex = this.isComplexQuery(message);
+
+    if (license.licenseType === 'enterprise') {
+      // Enterprise users get best models for complex queries
+      return isComplex
+        ? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-opus-20240229' }
+        : { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
+    }
+
+    if (license.licenseType === 'pro') {
+      // Pro users get good models
+      return isComplex
+        ? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' }
+        : { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
+    }
+
+    // Free users get efficient models
+    return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
+  }
+
+  /**
+   * Route based on license tier
+   */
+  private routeByLicenseTier(license: UserLicense): ModelConfig {
+    switch (license.licenseType) {
+      case 'enterprise':
+        return { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
+
+      case 'pro':
+        return { provider: LLMProvider.OPENAI, model: 'gpt-4o' };
+
+      case 'free':
+        return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
+
+      default:
+        return this.defaultModel;
+    }
+  }
+
+  /**
+   * Route to cheapest available model
+   */
+  private routeByCost(license: UserLicense): ModelConfig {
+    // Free tier: use cheapest
+    if (license.licenseType === 'free') {
+      return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
+    }
+
+    // Paid tiers: use GPT-4o-mini for cost efficiency
+    return { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
+  }
+
+  /**
+   * Check if model is allowed for user's license
+   */
+  private isModelAllowed(model: ModelConfig, license: UserLicense): boolean {
+    // Free tier: only cheap models
+    if (license.licenseType === 'free') {
+      const allowedModels = ['gemini-2.0-flash-exp', 'gpt-4o-mini', 'claude-3-5-haiku-20241022'];
+      return allowedModels.includes(model.model);
+    }
+
+    // Pro: all except Opus
+    if (license.licenseType === 'pro') {
+      const blockedModels = ['claude-3-opus-20240229'];
+      return !blockedModels.includes(model.model);
+    }
+
+    // Enterprise: all models allowed
+    return true;
+  }
+
+  /**
+   * Determine if query is complex: either longer than 200 characters or
+   * containing one of the complexity keywords as a substring.
+   */
+  private isComplexQuery(message: string): boolean {
+    // Long messages (> 200 chars likely complex)
+    if (message.length > ModelRouter.COMPLEX_MESSAGE_LENGTH) {
+      return true;
+    }
+
+    const messageLower = message.toLowerCase();
+    return ModelRouter.COMPLEXITY_KEYWORDS.some((keyword) => messageLower.includes(keyword));
+  }
+}
diff --git a/gateway/src/main.ts b/gateway/src/main.ts new file mode 100644 index 0000000..057478b --- /dev/null +++ b/gateway/src/main.ts @@ -0,0 +1,154 @@ +import Fastify from 'fastify'; +import websocket from '@fastify/websocket'; +import cors from '@fastify/cors'; +import { UserService } from './db/user-service.js'; +import { Authenticator } from './auth/authenticator.js'; +import { WebSocketHandler } from './channels/websocket-handler.js'; +import { TelegramHandler } from './channels/telegram-handler.js'; +import { KubernetesClient } from './k8s/client.js'; +import { ContainerManager } from './k8s/container-manager.js'; + +const app = Fastify({ + logger: { + level: process.env.LOG_LEVEL || 'info', + transport: { + target: 'pino-pretty', + options: { + colorize: true, + translateTime: 'HH:MM:ss Z', + ignore: 'pid,hostname', + }, + }, + }, +}); + +// Configuration from environment +const config = { + port: parseInt(process.env.PORT || '3000'), + host: process.env.HOST || '0.0.0.0', +
databaseUrl: process.env.DATABASE_URL || 'postgresql://localhost/dexorder', + + // LLM provider API keys + providerConfig: { + anthropicApiKey: process.env.ANTHROPIC_API_KEY, + openaiApiKey: process.env.OPENAI_API_KEY, + googleApiKey: process.env.GOOGLE_API_KEY, + openrouterApiKey: process.env.OPENROUTER_API_KEY, + }, + + telegramBotToken: process.env.TELEGRAM_BOT_TOKEN || '', + + // Kubernetes configuration + kubernetes: { + namespace: process.env.KUBERNETES_NAMESPACE || 'dexorder-agents', + inCluster: process.env.KUBERNETES_IN_CLUSTER === 'true', + context: process.env.KUBERNETES_CONTEXT, + agentImage: process.env.AGENT_IMAGE || 'ghcr.io/dexorder/agent:latest', + sidecarImage: process.env.SIDECAR_IMAGE || 'ghcr.io/dexorder/lifecycle-sidecar:latest', + storageClass: process.env.AGENT_STORAGE_CLASS || 'standard', + }, +}; + +// Validate at least one LLM provider is configured +const hasAnyProvider = Object.values(config.providerConfig).some(key => !!key); +if (!hasAnyProvider) { + app.log.error('At least one LLM provider API key is required (ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY, or OPENROUTER_API_KEY)'); + process.exit(1); +} + +// Register plugins +await app.register(cors, { + origin: process.env.CORS_ORIGIN || '*', +}); + +await app.register(websocket, { + options: { + maxPayload: 1024 * 1024, // 1MB + }, +}); + +// Initialize services +const userService = new UserService(config.databaseUrl); + +// Initialize Kubernetes client and container manager +const k8sClient = new KubernetesClient({ + namespace: config.kubernetes.namespace, + inCluster: config.kubernetes.inCluster, + context: config.kubernetes.context, + logger: app.log, +}); + +const containerManager = new ContainerManager({ + k8sClient, + agentImage: config.kubernetes.agentImage, + sidecarImage: config.kubernetes.sidecarImage, + storageClass: config.kubernetes.storageClass, + namespace: config.kubernetes.namespace, + logger: app.log, +}); + +const authenticator = new Authenticator({ + 
userService, + containerManager, + logger: app.log, +}); + +// Initialize channel handlers +const websocketHandler = new WebSocketHandler({ + authenticator, + providerConfig: config.providerConfig, +}); + +const telegramHandler = new TelegramHandler({ + authenticator, + providerConfig: config.providerConfig, + telegramBotToken: config.telegramBotToken, +}); + +// Register routes +websocketHandler.register(app); +telegramHandler.register(app); + +// Health check +app.get('/health', async () => { + return { + status: 'ok', + timestamp: new Date().toISOString(), + }; +}); + +// Graceful shutdown +const shutdown = async () => { + app.log.info('Shutting down gracefully...'); + try { + await userService.close(); + await app.close(); + app.log.info('Shutdown complete'); + process.exit(0); + } catch (error) { + app.log.error({ error }, 'Error during shutdown'); + process.exit(1); + } +}; + +process.on('SIGTERM', shutdown); +process.on('SIGINT', shutdown); + +// Start server +try { + await app.listen({ + port: config.port, + host: config.host, + }); + + app.log.info( + { + port: config.port, + host: config.host, + }, + 'Gateway server started' + ); +} catch (error) { + app.log.error({ error }, 'Failed to start server'); + process.exit(1); +} diff --git a/gateway/src/types/messages.ts b/gateway/src/types/messages.ts new file mode 100644 index 0000000..97642fe --- /dev/null +++ b/gateway/src/types/messages.ts @@ -0,0 +1,37 @@ +import { z } from 'zod'; + +/** + * Inbound user message from any channel (the InboundMessage type below is inferred from this schema) + */ +export const InboundMessageSchema = z.object({ + messageId: z.string(), + userId: z.string(), + sessionId: z.string(), + content: z.string(), + attachments: z.array(z.object({ + type: z.enum(['image', 'file', 'url']), + url: z.string(), + mimeType: z.string().optional(), + })).optional(), + timestamp: z.date(), +}); + +export type InboundMessage = z.infer<typeof InboundMessageSchema>; + +/** + * Outbound response to channel + */ +export const OutboundMessageSchema = z.object({ + messageId: z.string(), + 
sessionId: z.string(), + content: z.string(), + attachments: z.array(z.object({ + type: z.enum(['image', 'chart', 'file']), + url: z.string(), + caption: z.string().optional(), + })).optional(), + metadata: z.record(z.unknown()).optional(), + timestamp: z.date(), +}); + +export type OutboundMessage = z.infer<typeof OutboundMessageSchema>; diff --git a/gateway/src/types/resources.ts b/gateway/src/types/resources.ts new file mode 100644 index 0000000..516a7de --- /dev/null +++ b/gateway/src/types/resources.ts @@ -0,0 +1,101 @@ +import { z } from 'zod'; + +/** + * MCP Resource types for user context + */ + +/** + * Base resource structure from MCP server (the MCPResource type below is inferred from this schema) + */ +export const MCPResourceSchema = z.object({ + uri: z.string(), + mimeType: z.string().optional(), + text: z.string().optional(), + blob: z.string().optional(), // base64 encoded +}); + +export type MCPResource = z.infer<typeof MCPResourceSchema>; + +/** + * User profile context (the UserProfileContext type below is inferred from this schema) + */ +export const UserProfileContextSchema = z.object({ + tradingExperience: z.enum(['beginner', 'intermediate', 'advanced', 'professional']), + preferredTimeframes: z.array(z.string()), + riskTolerance: z.enum(['low', 'medium', 'high']), + tradingStyle: z.string(), + favoriteIndicators: z.array(z.string()).optional(), + activeTradingPairs: z.array(z.string()).optional(), + notes: z.string().optional(), +}); + +export type UserProfileContext = z.infer<typeof UserProfileContextSchema>; + +/** + * Workspace state (current chart, positions, etc.) 
+ */ +export const WorkspaceStateSchema = z.object({ + currentChart: z.object({ + ticker: z.string(), + timeframe: z.string(), + indicators: z.array(z.string()).optional(), + }).optional(), + watchlist: z.array(z.string()), + openPositions: z.array(z.object({ + ticker: z.string(), + side: z.enum(['long', 'short']), + size: z.number(), + entryPrice: z.number(), + currentPrice: z.number().optional(), + unrealizedPnL: z.number().optional(), + })), + recentAlerts: z.array(z.object({ + type: z.string(), + message: z.string(), + timestamp: z.string(), + })).optional(), +}); + +export type WorkspaceState = z.infer<typeof WorkspaceStateSchema>; + +/** + * Standard context resource URIs + */ +export const CONTEXT_URIS = { + USER_PROFILE: 'context://user-profile', + CONVERSATION_SUMMARY: 'context://conversation-summary', + WORKSPACE_STATE: 'context://workspace-state', + SYSTEM_PROMPT: 'context://system-prompt', +} as const; + +/** + * Resource content interface + */ +export interface ResourceContent { + uri: string; + mimeType?: string; + text?: string; + blob?: string; +} + +/** + * Parse a resource's text: JSON mime types are JSON.parsed and validated against `schema`; any other text is returned as-is WITHOUT validation (cast to T). Returns null when `text` is empty/missing or when parsing/validation throws. + */ +export function parseResource<T>(resource: ResourceContent, schema: z.ZodSchema<T>): T | null { + if (!resource.text) { + return null; + } + + try { + // Try JSON parsing if mime type is JSON + if (resource.mimeType?.includes('json')) { + const data = JSON.parse(resource.text); + return schema.parse(data); + } + + // Otherwise return as-is for text resources + return resource.text as T; + } catch { + return null; + } +} diff --git a/gateway/src/types/user.ts b/gateway/src/types/user.ts new file mode 100644 index 0000000..13e0764 --- /dev/null +++ b/gateway/src/types/user.ts @@ -0,0 +1,66 @@ +import { z } from 'zod'; + +/** + * Model preference configuration + */ +export const ModelPreferenceSchema = z.object({ + provider: z.enum(['anthropic', 'openai', 'google', 'openrouter']), + model: z.string(), + temperature: z.number().optional(), +}); + +export type ModelPreference = z.infer<typeof ModelPreferenceSchema>; + +/** + 
* User license and feature authorization (the UserLicense type below is inferred from this schema) + */ +export const UserLicenseSchema = z.object({ + userId: z.string(), + email: z.string().email().optional(), + licenseType: z.enum(['free', 'pro', 'enterprise']), + features: z.object({ + maxIndicators: z.number(), + maxStrategies: z.number(), + maxBacktestDays: z.number(), + realtimeData: z.boolean(), + customExecutors: z.boolean(), + apiAccess: z.boolean(), + }), + resourceLimits: z.object({ + maxConcurrentSessions: z.number(), + maxMessagesPerDay: z.number(), + maxTokensPerMessage: z.number(), + rateLimitPerMinute: z.number(), + }), + mcpServerUrl: z.string().url(), + preferredModel: ModelPreferenceSchema.optional(), + expiresAt: z.date().optional(), + createdAt: z.date(), + updatedAt: z.date(), +}); + +export type UserLicense = z.infer<typeof UserLicenseSchema>; + +/** + * Channel types for multi-channel support + */ +export enum ChannelType { + WEBSOCKET = 'websocket', + TELEGRAM = 'telegram', + SLACK = 'slack', + DISCORD = 'discord', +} + +/** + * Authentication context per channel (the AuthContext type below is inferred from this schema) + */ +export const AuthContextSchema = z.object({ + userId: z.string(), + channelType: z.nativeEnum(ChannelType), + channelUserId: z.string(), // Platform-specific ID (telegram_id, discord_id, etc) + sessionId: z.string(), + license: UserLicenseSchema, + authenticatedAt: z.date(), +}); + +export type AuthContext = z.infer<typeof AuthContextSchema>; diff --git a/gateway/src/workflows/README.md b/gateway/src/workflows/README.md new file mode 100644 index 0000000..d40484a --- /dev/null +++ b/gateway/src/workflows/README.md @@ -0,0 +1,253 @@ +# LangGraph Workflows for Trading + +Complex, stateful workflows built with LangGraph for trading-specific tasks. + +## Overview + +LangGraph provides: +- **Stateful execution**: Workflow state persists across failures +- **Conditional branching**: Route based on market conditions, backtest results, etc. 
+- **Human-in-the-loop**: Pause for user approval before executing trades +- **Loops & retries**: Backtest with different parameters, retry failed operations +- **Multi-agent**: Different LLMs for different tasks (analysis, risk, execution) + +## Workflows + +### Strategy Analysis (`strategy-analysis.ts`) + +Multi-step pipeline for analyzing trading strategies: + +```typescript +import { buildStrategyAnalysisWorkflow } from './workflows/strategy-analysis.js'; + +const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn); + +const result = await workflow.invoke({ + strategyCode: userStrategy, + ticker: 'BTC/USDT', + timeframe: '1h', +}); + +console.log(result.recommendation); // Go/no-go decision +``` + +**Steps:** +1. **Code Review** - LLM analyzes strategy code for bugs, logic errors +2. **Backtest** - Runs backtest via user's MCP server +3. **Risk Assessment** - LLM evaluates results (drawdown, Sharpe, etc.) +4. **Human Approval** - Pauses for user review +5. **Recommendation** - Final go/no-go decision + +**Benefits:** +- Stateful: Can resume if server restarts +- Human-in-the-loop: User must approve before deployment +- Multi-step reasoning: Each step builds on previous + +--- + +## Future Workflows + +### Market Scanner + +Scan multiple tickers for trading opportunities: + +```typescript +const scanner = buildMarketScannerWorkflow(model, logger); + +const result = await scanner.invoke({ + tickers: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'], + strategies: ['momentum', 'mean_reversion'], + timeframe: '1h', +}); + +// Returns ranked opportunities +``` + +**Steps:** +1. **Fetch Data** - Get OHLC for all tickers +2. **Apply Strategies** - Run each strategy on each ticker (parallel) +3. **Rank Signals** - Score by confidence, risk/reward +4. **Filter** - Apply user's risk limits +5. 
**Return Top N** - Best opportunities + +--- + +### Portfolio Optimization + +Optimize position sizing across multiple strategies: + +```typescript +const optimizer = buildPortfolioOptimizerWorkflow(model, logger); + +const result = await optimizer.invoke({ + strategies: [strategy1, strategy2, strategy3], + totalCapital: 100000, + maxRiskPerTrade: 0.02, +}); + +// Returns optimal allocation +``` + +**Steps:** +1. **Backtest All** - Run backtests for each strategy +2. **Correlation Analysis** - Check strategy correlation +3. **Monte Carlo** - Simulate portfolio performance +4. **Optimize** - Find optimal weights (Sharpe maximization) +5. **Risk Check** - Validate against user limits + +--- + +### Trade Execution Monitor + +Monitor trade execution and adapt to market conditions: + +```typescript +const monitor = buildTradeExecutionWorkflow(model, logger, exchange); + +const result = await monitor.invoke({ + tradeId: 'xyz', + targetPrice: 45000, + maxSlippage: 0.001, + timeLimit: 60, // seconds +}); +``` + +**Steps:** +1. **Place Order** - Submit order to exchange +2. **Monitor Fill** - Check fill status every second +3. **Adapt** - If not filling, adjust price (within slippage) +4. **Retry Logic** - If rejected, retry with backoff +5. **Timeout** - Cancel if time limit exceeded +6. 
**Report** - Final execution report + +--- + +## Using Workflows in Gateway + +### Simple Chat vs Complex Workflow + +```typescript +// gateway/src/orchestrator.ts + +export class MessageOrchestrator { + async handleMessage(msg: InboundMessage) { + // Route based on complexity + if (this.isSimpleQuery(msg)) { + // Use agent harness for streaming chat + return this.harness.streamMessage(msg); + } + + if (this.isWorkflowRequest(msg)) { + // Use LangGraph for complex analysis + return this.executeWorkflow(msg); + } + } + + async executeWorkflow(msg: InboundMessage) { + const { type, params } = this.parseWorkflowRequest(msg); + + switch (type) { + case 'analyze_strategy': + const workflow = buildStrategyAnalysisWorkflow(...); + return await workflow.invoke(params); + + case 'scan_market': + const scanner = buildMarketScannerWorkflow(...); + return await scanner.invoke(params); + + // ... more workflows + } + } +} +``` + +--- + +## Benefits for Trading + +### vs Simple LLM Calls + +| Scenario | Simple LLM | LangGraph Workflow | +|----------|-----------|-------------------| +| "What's the RSI?" 
| ✅ Fast, streaming | ❌ Overkill | +| "Analyze this strategy" | ❌ Limited context | ✅ Multi-step analysis | +| "Backtest 10 param combos" | ❌ No loops | ✅ Conditional loops | +| "Execute if approved" | ❌ No state | ✅ Human-in-the-loop | +| Server crashes mid-analysis | ❌ Lost progress | ✅ Resume from checkpoint | + +### When to Use Workflows + +**Use LangGraph when:** +- Multi-step analysis (backtest → risk → approval) +- Conditional logic (if bullish → momentum, else → mean-reversion) +- Human approval required (pause workflow) +- Loops needed (try different parameters) +- Long-running (can survive restarts) + +**Use Agent Harness when:** +- Simple Q&A ("What is RSI?") +- Fast response needed (streaming chat) +- Single tool call ("Get my watchlist") +- Real-time interaction (Telegram, WebSocket) + +--- + +## Implementation Notes + +### State Persistence + +LangGraph can persist state to database: + +```typescript +import { MemorySaver } from '@langchain/langgraph'; + +const checkpointer = new MemorySaver(); + +const workflow = graph.compile({ checkpointer }); + +// Resume from checkpoint +const result = await workflow.invoke(input, { + configurable: { thread_id: 'user-123-strategy-analysis' } +}); +``` + +### Human-in-the-Loop + +Pause workflow for user input: + +```typescript +const workflow = graph + .addNode('human_approval', humanApprovalNode) + .interrupt('human_approval'); // Pauses here + +// User reviews in UI +const approved = await getUserApproval(workflowId); + +// Resume workflow +await workflow.resume(state, { approved }); +``` + +### Multi-Agent + +Use different models for different tasks: + +```typescript +const analysisModel = new ChatAnthropic({ model: 'claude-3-opus' }); // Smart +const codeModel = new ChatOpenAI({ model: 'gpt-4o' }); // Good at code +const cheapModel = new ChatOpenAI({ model: 'gpt-4o-mini' }); // Fast + +const workflow = graph + .addNode('analyze', (state) => analysisModel.invoke(...)) + .addNode('code_review', (state) => 
codeModel.invoke(...)) + .addNode('summarize', (state) => cheapModel.invoke(...)); +``` + +--- + +## Next Steps + +1. Implement remaining workflows (scanner, optimizer, execution) +2. Add state persistence (PostgreSQL checkpointer) +3. Integrate human-in-the-loop with WebSocket +4. Add workflow monitoring dashboard +5. Performance optimization (parallel execution) diff --git a/gateway/src/workflows/strategy-analysis.ts b/gateway/src/workflows/strategy-analysis.ts new file mode 100644 index 0000000..b4925f0 --- /dev/null +++ b/gateway/src/workflows/strategy-analysis.ts @@ -0,0 +1,162 @@ +import { StateGraph, Annotation } from '@langchain/langgraph'; +import type { BaseChatModel } from '@langchain/core/language_models/chat_models'; +import { HumanMessage, SystemMessage } from '@langchain/core/messages'; +import type { FastifyBaseLogger } from 'fastify'; + +/** + * State for strategy analysis workflow + */ +const StrategyAnalysisState = Annotation.Root({ + strategyCode: Annotation(), + ticker: Annotation(), + timeframe: Annotation(), + + // Analysis steps + codeReview: Annotation({ + default: () => null, + }), + backtestResults: Annotation | null>({ + default: () => null, + }), + riskAssessment: Annotation({ + default: () => null, + }), + humanApproved: Annotation({ + default: () => false, + }), + + // Final output + recommendation: Annotation({ + default: () => null, + }), +}); + +type StrategyAnalysisStateType = typeof StrategyAnalysisState.State; + +/** + * Build strategy analysis workflow using LangGraph + * + * Workflow steps: + * 1. Code review (LLM analyzes strategy code) + * 2. Backtest (calls user's MCP backtest tool) + * 3. Risk assessment (LLM evaluates results) + * 4. Human approval (pause for user review) + * 5. 
Final recommendation + */ +export function buildStrategyAnalysisWorkflow( + model: BaseChatModel, + logger: FastifyBaseLogger, + mcpBacktestFn: (strategy: string, ticker: string, timeframe: string) => Promise> +) { + // Node: Code Review + const codeReviewNode = async (state: StrategyAnalysisStateType) => { + logger.info('Strategy workflow: Code review'); + + const systemPrompt = `You are an expert trading strategy analyst. +Review the following strategy code for potential issues, bugs, or improvements. +Focus on: logic errors, edge cases, performance, and trading best practices.`; + + const response = await model.invoke([ + new SystemMessage(systemPrompt), + new HumanMessage(`Review this strategy:\n\n${state.strategyCode}`), + ]); + + return { + codeReview: response.content as string, + }; + }; + + // Node: Backtest + const backtestNode = async (state: StrategyAnalysisStateType) => { + logger.info('Strategy workflow: Running backtest'); + + const results = await mcpBacktestFn(state.strategyCode, state.ticker, state.timeframe); + + return { + backtestResults: results, + }; + }; + + // Node: Risk Assessment + const riskAssessmentNode = async (state: StrategyAnalysisStateType) => { + logger.info('Strategy workflow: Risk assessment'); + + const systemPrompt = `You are a risk management expert for trading strategies. +Analyze the backtest results and provide a risk assessment. 
+Focus on: drawdown, win rate, Sharpe ratio, position sizing, and risk of ruin.`; + + const response = await model.invoke([ + new SystemMessage(systemPrompt), + new HumanMessage( + `Code review: ${state.codeReview}\n\nBacktest results: ${JSON.stringify(state.backtestResults, null, 2)}\n\nProvide risk assessment:` + ), + ]); + + return { + riskAssessment: response.content as string, + }; + }; + + // Node: Human Approval (placeholder - would integrate with UI) + const humanApprovalNode = async (state: StrategyAnalysisStateType) => { + logger.info('Strategy workflow: Awaiting human approval'); + + // In real implementation, this would pause and wait for user input + // For now, auto-approve + return { + humanApproved: true, + }; + }; + + // Node: Final Recommendation + const recommendationNode = async (state: StrategyAnalysisStateType) => { + logger.info('Strategy workflow: Generating recommendation'); + + const systemPrompt = `Provide a final recommendation on whether to deploy this trading strategy. +Summarize the code review, backtest results, and risk assessment. 
+Give clear go/no-go decision with reasoning.`; + + const response = await model.invoke([ + new SystemMessage(systemPrompt), + new HumanMessage( + `Code review: ${state.codeReview}\n\nBacktest: ${JSON.stringify(state.backtestResults)}\n\nRisk: ${state.riskAssessment}\n\nApproved: ${state.humanApproved}\n\nYour recommendation:` + ), + ]); + + return { + recommendation: response.content as string, + }; + }; + + // Build graph + const workflow = new StateGraph(StrategyAnalysisState) + .addNode('code_review', codeReviewNode) + .addNode('backtest', backtestNode) + .addNode('risk_assessment', riskAssessmentNode) + .addNode('human_approval', humanApprovalNode) + .addNode('recommendation', recommendationNode) + .addEdge('__start__', 'code_review') + .addEdge('code_review', 'backtest') + .addEdge('backtest', 'risk_assessment') + .addEdge('risk_assessment', 'human_approval') + .addConditionalEdges('human_approval', (state) => { + return state.humanApproved ? 'recommendation' : '__end__'; + }) + .addEdge('recommendation', '__end__'); + + return workflow.compile(); +} + +/** + * Example usage: + * + * const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn); + * + * const result = await workflow.invoke({ + * strategyCode: "strategy code here", + * ticker: "BTC/USDT", + * timeframe: "1h", + * }); + * + * console.log(result.recommendation); + */ diff --git a/gateway/tsconfig.json b/gateway/tsconfig.json new file mode 100644 index 0000000..4f09770 --- /dev/null +++ b/gateway/tsconfig.json @@ -0,0 +1,26 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "lib": ["ES2022"], + "moduleResolution": "bundler", + "resolveJsonModule": true, + "allowJs": false, + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + 
"noImplicitReturns": true, + "noFallthroughCasesInSwitch": true, + "allowSyntheticDefaultImports": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} diff --git a/lifecycle-sidecar/.gitignore b/lifecycle-sidecar/.gitignore new file mode 100644 index 0000000..a3d200d --- /dev/null +++ b/lifecycle-sidecar/.gitignore @@ -0,0 +1,15 @@ +# Binaries +lifecycle-sidecar +*.exe +*.dll +*.so +*.dylib + +# Test binary +*.test + +# Go workspace file +go.work + +# Build output +dist/ diff --git a/lifecycle-sidecar/Dockerfile b/lifecycle-sidecar/Dockerfile new file mode 100644 index 0000000..5dfa04f --- /dev/null +++ b/lifecycle-sidecar/Dockerfile @@ -0,0 +1,40 @@ +# Build stage +FROM golang:1.22-alpine AS builder + +WORKDIR /app + +# Install build dependencies +RUN apk add --no-cache git ca-certificates + +# Copy go mod files +COPY go.mod go.sum ./ +RUN go mod download + +# Copy source +COPY main.go ./ + +# Build static binary +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \ + -ldflags="-w -s" \ + -o lifecycle-sidecar \ + main.go + +# Runtime stage +FROM alpine:3.19 + +# Install procps for process monitoring (pgrep, kill) +RUN apk add --no-cache procps ca-certificates + +# Create non-root user +RUN addgroup -g 1000 sidecar && \ + adduser -D -u 1000 -G sidecar sidecar + +WORKDIR /app + +# Copy binary from builder +COPY --from=builder /app/lifecycle-sidecar /app/lifecycle-sidecar + +# Run as non-root +USER sidecar + +ENTRYPOINT ["/app/lifecycle-sidecar"] diff --git a/lifecycle-sidecar/README.md b/lifecycle-sidecar/README.md new file mode 100644 index 0000000..bbb2097 --- /dev/null +++ b/lifecycle-sidecar/README.md @@ -0,0 +1,94 @@ +# Lifecycle Sidecar + +A lightweight Kubernetes sidecar that monitors the main agent container and handles cleanup when the container exits with a specific exit code indicating idle shutdown. + +## Purpose + +User agent containers self-manage their lifecycle by: +1. 
Tracking their own activity (MCP calls, trigger status) +2. Exiting with code `42` when idle (no triggers + no recent activity) +3. Delegating deployment cleanup to this sidecar + +The sidecar watches the main container and: +- On exit code `42`: Deletes the deployment (and optionally PVC) +- On any other exit code: Allows Kubernetes restart policy to handle it + +## Architecture + +``` +┌─────────────────────────────────────────────────┐ +│ Pod │ +│ ┌────────────────┐ ┌──────────────────┐ │ +│ │ Agent Container│ │ Lifecycle Sidecar│ │ +│ │ │ │ │ │ +│ │ - Track activity │ - Monitor agent │ │ +│ │ - Track triggers │ - Watch exit code│ │ +│ │ - Exit 42 if idle │ - Delete if 42 │ │ +│ └────────────────┘ └──────────────────┘ │ +│ │ │ │ +│ │ writes exit_code │ │ +│ └─────────►/var/run/agent/exit_code │ +│ │ │ +└───────────────────────────────────┼─────────────┘ + │ + ▼ k8s API + ┌──────────────────────┐ + │ Delete Deployment │ + │ (+ PVC if anonymous)│ + └──────────────────────┘ +``` + +## Environment Variables + +| Variable | Required | Description | +|----------|----------|-------------| +| `NAMESPACE` | Yes | Kubernetes namespace (injected via downward API) | +| `DEPLOYMENT_NAME` | Yes | Name of the deployment to delete (from pod label) | +| `USER_TYPE` | No | User license tier: `anonymous`, `free`, `paid`, `enterprise` | +| `MAIN_CONTAINER_PID` | No | PID of main container (for precise monitoring) | + +## Exit Code Contract + +The agent container uses exit codes to signal intent: + +| Exit Code | Meaning | Sidecar Action | +|-----------|---------|----------------| +| `42` | Clean idle shutdown | Delete deployment + optional PVC | +| Any other | Error or normal restart | Allow Kubernetes to restart | + +## RBAC Requirements + +The sidecar requires a ServiceAccount with permission to delete its own deployment: + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +rules: + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "delete"] + - 
apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "delete"] +``` + +See `deploy/k8s/base/lifecycle-sidecar-rbac.yaml` for the full RBAC configuration. + +## Building + +```bash +docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest . +docker push ghcr.io/dexorder/lifecycle-sidecar:latest +``` + +## Example Usage + +See `deploy/k8s/base/agent-deployment-example.yaml` for a complete example of how to configure an agent deployment with the lifecycle sidecar. + +## Security Considerations + +1. **Self-delete only**: The sidecar can only delete the deployment it's part of (enforced by label matching in admission policy) +2. **Non-privileged**: Runs as non-root user (UID 1000) +3. **Minimal permissions**: Only has `get` and `delete` on deployments/PVCs in the agents namespace +4. **No cross-namespace access**: Scoped to `dexorder-agents` namespace only +5. **Crash-safe**: Only triggers cleanup on exit code 42, never on crashes diff --git a/lifecycle-sidecar/go.mod b/lifecycle-sidecar/go.mod new file mode 100644 index 0000000..bc4a623 --- /dev/null +++ b/lifecycle-sidecar/go.mod @@ -0,0 +1,16 @@ +module github.com/dexorder/lifecycle-sidecar + +go 1.22 + +require ( + github.com/rs/zerolog v1.32.0 + k8s.io/api v0.29.2 + k8s.io/apimachinery v0.29.2 + k8s.io/client-go v0.29.2 +) + +require ( + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.19 // indirect + golang.org/x/sys v0.17.0 // indirect +) diff --git a/lifecycle-sidecar/main.go b/lifecycle-sidecar/main.go new file mode 100644 index 0000000..ea538ce --- /dev/null +++ b/lifecycle-sidecar/main.go @@ -0,0 +1,234 @@ +package main + +import ( + "context" + "fmt" + "os" + "os/exec" + "os/signal" + "syscall" + "time" + + "github.com/rs/zerolog" + "github.com/rs/zerolog/log" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +const ( + // Exit code indicating clean idle shutdown + ExitCodeIdleShutdown = 42 + + // Poll 
interval for checking main container status + PollInterval = 5 * time.Second +) + +func main() { + // Setup logging + zerolog.TimeFieldFormat = zerolog.TimeFormatUnix + log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr}) + + log.Info().Msg("Lifecycle sidecar starting") + + // Get environment configuration + namespace := os.Getenv("NAMESPACE") + deploymentName := os.Getenv("DEPLOYMENT_NAME") + userType := os.Getenv("USER_TYPE") + mainContainerPID := os.Getenv("MAIN_CONTAINER_PID") + + if namespace == "" || deploymentName == "" { + log.Fatal().Msg("NAMESPACE and DEPLOYMENT_NAME environment variables are required") + } + + log.Info(). + Str("namespace", namespace). + Str("deployment", deploymentName). + Str("userType", userType). + Str("mainPID", mainContainerPID). + Msg("Configuration loaded") + + // Create Kubernetes client + config, err := rest.InClusterConfig() + if err != nil { + log.Fatal().Err(err).Msg("Failed to get in-cluster config") + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + log.Fatal().Err(err).Msg("Failed to create Kubernetes client") + } + + // Wait for main container to exit + exitCode := waitForMainContainer() + + log.Info().Int("exitCode", exitCode).Msg("Main container exited") + + // Handle exit code + if exitCode == ExitCodeIdleShutdown { + log.Info().Msg("Detected idle shutdown (exit code 42) - cleaning up deployment") + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // Delete PVC if anonymous user + deletePVC := userType == "anonymous" || userType == "temporary" + + if err := cleanupDeployment(ctx, clientset, namespace, deploymentName, deletePVC); err != nil { + log.Error().Err(err).Msg("Failed to cleanup deployment") + os.Exit(1) + } + + log.Info().Msg("Cleanup complete - sidecar exiting") + os.Exit(0) + } else { + // Any other exit code - let Kubernetes restart policy handle it + log.Info(). + Int("exitCode", exitCode). 
+			Msg("Non-idle exit code - allowing Kubernetes to handle restart")
+		os.Exit(exitCode)
+	}
+}
+
+// waitForMainContainer blocks until the main container process is no longer
+// detected and returns the exit code reported by getContainerExitCode.
+//
+// If the MAIN_CONTAINER_PID environment variable is set, that PID is polled.
+// Otherwise the function falls back to polling for a process named exactly
+// "agent".
+func waitForMainContainer() int {
+	// Method 1: poll a known PID (the original comment indicates this relies
+	// on a shared PID namespace between the containers).
+	mainPID := os.Getenv("MAIN_CONTAINER_PID")
+	if mainPID != "" {
+		return pollProcessExit(mainPID)
+	}
+
+	// Method 2: poll for the agent process by name (fallback).
+	log.Info().Msg("MAIN_CONTAINER_PID not set, polling for 'agent' process")
+	return pollProcessByName("agent")
+}
+
+// pollProcessExit polls every PollInterval until the process with the given
+// PID can no longer be signalled, then returns getContainerExitCode().
+//
+// pidStr is passed to kill(1) verbatim; it is not validated here.
+func pollProcessExit(pidStr string) int {
+	log.Info().Str("pid", pidStr).Msg("Monitoring main container process")
+
+	for {
+		// `kill -0` delivers no signal; it only reports whether the PID can be
+		// signalled. Any failure of the command is treated as "process gone".
+		cmd := exec.Command("kill", "-0", pidStr)
+		err := cmd.Run()
+
+		if err != nil {
+			log.Info().Msg("Main container process exited")
+
+			// The real exit status is not observable from here, so rely on
+			// the best-effort lookup in getContainerExitCode (shared file,
+			// defaulting to 0).
+			return getContainerExitCode()
+		}
+
+		time.Sleep(PollInterval)
+	}
+}
+
+// pollProcessByName polls every PollInterval until `pgrep -x name` stops
+// succeeding, then returns getContainerExitCode().
+//
+// NOTE(review): a process that has not started yet is indistinguishable from
+// one that has exited, so this returns immediately if the sidecar starts
+// polling before the named process exists.
+func pollProcessByName(name string) int {
+	log.Info().Str("name", name).Msg("Monitoring main container by name")
+
+	for {
+		// -x restricts pgrep to exact process-name matches.
+		cmd := exec.Command("pgrep", "-x", name)
+		err := cmd.Run()
+
+		if err != nil {
+			log.Info().Msg("Main container process exited")
+			return getContainerExitCode()
+		}
+
+		time.Sleep(PollInterval)
+	}
+}
+
+// getContainerExitCode attempts to retrieve the exit code of the main
+// container. Without direct access to the container runtime this is
+// best-effort: it reads a decimal integer from /var/run/agent/exit_code and
+// falls back to 0 when the file is missing, unreadable, or not a number.
+func getContainerExitCode() int {
+	// Check if the main container wrote its exit code to the shared volume.
+	exitCodeFile := "/var/run/agent/exit_code"
+	data, err := os.ReadFile(exitCodeFile)
+	if err == nil {
+		var exitCode int
+		_, err := fmt.Sscanf(string(data), "%d", &exitCode)
+		if err == nil {
+			log.Info().Int("exitCode", exitCode).Msg("Read exit code from shared file")
+			return exitCode
+		}
+	}
+
+	// Default to 0 if we can't determine the exit code.
+	// This is safe because non-42 codes allow restart.
+	log.Warn().Msg("Could not determine exit code, defaulting to 0")
+	return 0
+}
+
+// cleanupDeployment deletes the named deployment and, when deletePVC is true,
+// the first PersistentVolumeClaim referenced by the deployment's pod template.
+//
+// The PVC name is looked up before the deployment is deleted, because the
+// volume list is no longer available afterwards. A failed lookup or a failed
+// PVC deletion is logged and otherwise ignored.
+//
+// Returns a wrapped error only when deleting the deployment itself fails.
+func cleanupDeployment(ctx context.Context, clientset *kubernetes.Clientset, namespace, deploymentName string, deletePVC bool) error {
+	log.Info().
+		Str("namespace", namespace).
+		Str("deployment", deploymentName).
+		Bool("deletePVC", deletePVC).
+		Msg("Cleaning up deployment")
+
+	// Get deployment to find PVC name if needed.
+	var pvcName string
+	if deletePVC {
+		deployment, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
+		if err != nil {
+			log.Warn().Err(err).Msg("Could not get deployment for PVC lookup")
+		} else {
+			// Use the first volume backed by a PVC; ranging over an empty
+			// slice is a no-op, so no length check is needed.
+			for _, vol := range deployment.Spec.Template.Spec.Volumes {
+				if vol.PersistentVolumeClaim != nil {
+					pvcName = vol.PersistentVolumeClaim.ClaimName
+					break
+				}
+			}
+		}
+	}
+
+	// Delete the deployment with foreground propagation.
+	deletePolicy := metav1.DeletePropagationForeground
+	deleteOptions := metav1.DeleteOptions{
+		PropagationPolicy: &deletePolicy,
+	}
+
+	log.Info().Str("deployment", deploymentName).Msg("Deleting deployment")
+	err := clientset.AppsV1().Deployments(namespace).Delete(ctx, deploymentName, deleteOptions)
+	if err != nil {
+		return fmt.Errorf("failed to delete deployment: %w", err)
+	}
+
+	log.Info().Msg("Deployment deleted successfully")
+
+	// Delete PVC if requested and found.
+	if deletePVC && pvcName != "" {
+		log.Info().Str("pvc", pvcName).Msg("Deleting PVC")
+		err := clientset.CoreV1().PersistentVolumeClaims(namespace).Delete(ctx, pvcName, metav1.DeleteOptions{})
+		if err != nil {
+			log.Warn().Err(err).Str("pvc", pvcName).Msg("Failed to delete PVC (non-fatal)")
+		} else {
+			log.Info().Msg("PVC deleted successfully")
+		}
+	}
+
+	return nil
+}
+
+// init installs a SIGTERM handler for graceful shutdown.
+//
+// If the sidecar itself receives SIGTERM it exits with status 0 and does not
+// trigger deployment deletion.
+func init() {
+	// The previous code evaluated the conversion syscall.Signal(syscall.SIGTERM),
+	// which subscribes to nothing, so the handler could never fire.
+	// signal.Notify requires "os/signal" in this file's import block.
+	//
+	// Subscribe before starting the goroutine so a SIGTERM arriving right
+	// after init returns is still delivered; the channel is buffered because
+	// signal.Notify does not block when sending.
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGTERM)
+
+	go func() {
+		<-sigChan
+		log.Info().Msg("Received SIGTERM - sidecar exiting without cleanup")
+		os.Exit(0)
+	}()
+}