prod deployment

This commit is contained in:
2026-04-01 18:34:08 -04:00
parent ca44e68f64
commit eab581f8cb
62 changed files with 1922 additions and 286 deletions

View File

@@ -1,4 +1,4 @@
# ValidatingAdmissionPolicy to restrict images in dexorder-sandboxes namespace
# ValidatingAdmissionPolicy to restrict images in sandbox namespace
# Requires Kubernetes 1.30+ (or 1.28+ with feature gate)
# This is the critical security control that prevents arbitrary image execution
# even if the gateway is compromised.
@@ -26,7 +26,9 @@ spec:
c.image.startsWith('ghcr.io/dexorder/sandbox-') ||
c.image.startsWith('ghcr.io/dexorder/lifecycle-sidecar:') ||
c.image.startsWith('dexorder/ai-sandbox:') ||
c.image.startsWith('dexorder/ai-lifecycle-sidecar:'))
c.image.startsWith('dexorder/ai-lifecycle-sidecar:') ||
c.image.startsWith('git.dxod.org/dexorder/dexorder/ai-sandbox:') ||
c.image.startsWith('git.dxod.org/dexorder/dexorder/ai-lifecycle-sidecar:'))
message: "Only approved dexorder sandbox images are allowed in the sandbox namespace"
reason: Forbidden

View File

@@ -1,6 +1,6 @@
# RBAC for gateway to CREATE sandbox deployments only
# Principle of least privilege: gateway can ONLY create deployments/services/PVCs
# in the dexorder-sandboxes namespace. Deletion is handled by the lifecycle sidecar.
# in the sandbox namespace. Deletion is handled by the lifecycle sidecar.
# No pods, secrets, exec, or cross-namespace access.
---
apiVersion: v1
@@ -8,12 +8,12 @@ kind: ServiceAccount
metadata:
name: gateway
---
# Role scoped to dexorder-sandboxes namespace only
# Role scoped to sandbox namespace only
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: sandbox-creator
namespace: dexorder-sandboxes
namespace: sandbox
rules:
# Deployments: create and read only (deletion handled by sidecar)
- apiGroups: ["apps"]
@@ -53,7 +53,7 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: gateway-sandbox-creator
namespace: dexorder-sandboxes
namespace: sandbox
subjects:
- kind: ServiceAccount
name: gateway

View File

@@ -5,15 +5,15 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: sandbox-lifecycle
namespace: dexorder-sandboxes
namespace: sandbox
---
# Role allowing deletion of deployments and PVCs
# This is scoped to the dexorder-sandboxes namespace
# This is scoped to the sandbox namespace
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: sandbox-self-delete
namespace: dexorder-sandboxes
namespace: sandbox
rules:
# Allow getting and deleting deployments
- apiGroups: ["apps"]
@@ -34,11 +34,11 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: sandbox-self-delete
namespace: dexorder-sandboxes
namespace: sandbox
subjects:
- kind: ServiceAccount
name: sandbox-lifecycle
namespace: dexorder-sandboxes
namespace: sandbox
roleRef:
kind: Role
name: sandbox-self-delete
@@ -49,5 +49,5 @@ roleRef:
# Requires a validating webhook server (can be added later)
# For now, we rely on:
# 1. Sidecar only knowing its own deployment name (from env)
# 2. RBAC limiting to dexorder-sandboxes namespace
# 2. RBAC limiting to sandbox namespace
# 3. Admission policy restricting deployment creation (already defined)

View File

@@ -1,11 +1,11 @@
# Namespace definitions for dexorder AI platform
# - default: gateway, web, and infrastructure services
# - dexorder-sandboxes: per-user sandbox containers (isolated, restricted)
# - sandbox: per-user sandbox containers (isolated, restricted)
---
apiVersion: v1
kind: Namespace
metadata:
name: dexorder-sandboxes
name: sandbox
labels:
app.kubernetes.io/part-of: dexorder
dexorder.io/type: sandboxes

View File

@@ -2,12 +2,12 @@
# Sandboxes can only communicate with specific services, not with each other
# or with the Kubernetes API
---
# Default deny all ingress and egress in sandboxes namespace
# Default deny all ingress and egress in sandbox namespace
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: default-deny-all
namespace: dexorder-sandboxes
namespace: sandbox
spec:
podSelector: {}
policyTypes:
@@ -19,7 +19,7 @@ apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-gateway-ingress
namespace: dexorder-sandboxes
namespace: sandbox
spec:
podSelector:
matchLabels:
@@ -42,7 +42,7 @@ apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-sandbox-egress
namespace: dexorder-sandboxes
namespace: sandbox
spec:
podSelector:
matchLabels:
@@ -69,17 +69,6 @@ spec:
ports:
- protocol: TCP
port: 3000
# Kafka/Redpanda for data subscriptions
- to:
- namespaceSelector:
matchLabels:
dexorder.io/type: system
podSelector:
matchLabels:
app: redpanda
ports:
- protocol: TCP
port: 9092
# External HTTPS (for exchange APIs, LLM APIs)
- to:
- ipBlock:
@@ -93,7 +82,8 @@ spec:
- protocol: TCP
port: 443
---
# Default namespace: allow ingress from sandboxes to gateway
# Allow ingress from sandboxes to gateway (no explicit namespace = context default)
# In dev: applies to 'default' namespace. In prod: applies to 'ai' namespace.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:

View File

@@ -5,7 +5,7 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: sandbox-user-abc123
namespace: dexorder-sandboxes
namespace: sandbox
labels:
app.kubernetes.io/name: sandbox
app.kubernetes.io/component: user-sandbox
@@ -187,7 +187,7 @@ apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: sandbox-user-abc123-data
namespace: dexorder-sandboxes
namespace: sandbox
labels:
dexorder.io/user-id: user-abc123
spec:
@@ -203,7 +203,7 @@ apiVersion: v1
kind: Service
metadata:
name: sandbox-user-abc123
namespace: dexorder-sandboxes
namespace: sandbox
labels:
dexorder.io/user-id: user-abc123
spec:

View File

@@ -1,4 +1,4 @@
# Resource constraints for the dexorder-sandboxes namespace
# Resource constraints for the sandbox namespace
# These limits apply regardless of what the gateway requests
---
# LimitRange: per-container defaults and maximums
@@ -6,7 +6,7 @@ apiVersion: v1
kind: LimitRange
metadata:
name: sandbox-limits
namespace: dexorder-sandboxes
namespace: sandbox
spec:
limits:
# Default limits applied if deployment doesn't specify
@@ -37,7 +37,7 @@ apiVersion: v1
kind: ResourceQuota
metadata:
name: sandbox-quota
namespace: dexorder-sandboxes
namespace: sandbox
spec:
hard:
# Total compute limits for all sandboxes combined

View File

@@ -53,7 +53,8 @@ data:
# Kubernetes configuration
kubernetes:
namespace: dexorder-sandboxes
namespace: sandbox
service_namespace: default
in_cluster: true
sandbox_image: dexorder/ai-sandbox:SANDBOX_TAG_PLACEHOLDER
sidecar_image: dexorder/ai-lifecycle-sidecar:SIDECAR_TAG_PLACEHOLDER

View File

@@ -8,7 +8,7 @@ resources:
- storage-class.yaml
- configs/gateway-config.yaml
- gateway-health-ingress.yaml
- sandbox-config.yaml # ConfigMap for sandbox pods in dexorder-sandboxes namespace
- sandbox-config.yaml
# Dev-specific patches
patches:
@@ -275,6 +275,12 @@ generatorOptions:

View File

@@ -1,11 +1,11 @@
# Sandbox ConfigMap in dexorder-sandboxes namespace
# Sandbox ConfigMap in sandbox namespace
# This is mounted into dynamically created sandbox pods
---
apiVersion: v1
kind: ConfigMap
metadata:
name: sandbox-config
namespace: dexorder-sandboxes
namespace: sandbox
labels:
app.kubernetes.io/name: sandbox
app.kubernetes.io/component: config

View File

@@ -5,7 +5,7 @@ apiVersion: v1
kind: ResourceQuota
metadata:
name: sandbox-quota
namespace: dexorder-sandboxes
namespace: sandbox
spec:
hard:
# Reduced for minikube

View File

@@ -4,22 +4,21 @@ metadata:
name: gateway-config
data:
config.yaml: |
# Gateway Configuration
# Gateway Configuration (production)
# Server configuration
server:
port: 3000
host: 0.0.0.0
log_level: info
cors_origin: "https://app.dexorder.com"
base_url: https://api.dexorder.com
cors_origin: "https://dexorder.ai"
base_url: https://dexorder.ai
trusted_origins:
- https://app.dexorder.com
- https://api.dexorder.com
- https://dexorder.ai
# Database
database:
url: postgresql://postgres:postgres@postgres:5432/iceberg
url: postgresql://postgres:{{ op://AI Prod/PostgreSQL/password }}@postgres:5432/iceberg
# Default model (if user has no preference)
defaults:
@@ -28,12 +27,13 @@ data:
# Kubernetes configuration
kubernetes:
namespace: dexorder-sandboxes
namespace: sandbox
service_namespace: ai
in_cluster: true
sandbox_image: dexorder/ai-sandbox:latest
sidecar_image: dexorder/ai-lifecycle-sidecar:latest
sandbox_image: git.dxod.org/dexorder/dexorder/ai-sandbox:latest
sidecar_image: git.dxod.org/dexorder/dexorder/ai-lifecycle-sidecar:latest
storage_class: standard
image_pull_policy: Always # For production - always pull from registry
image_pull_policy: Always
# DragonflyDB (Redis-compatible, for hot storage and session management)
redis:
@@ -62,4 +62,4 @@ data:
# Email service configuration
email:
from_address: noreply@dexorder.com
from_address: noreply@dexorder.ai

View File

@@ -0,0 +1,678 @@
---
# DragonflyDB (Redis-compatible in-memory datastore)
apiVersion: v1
kind: Service
metadata:
name: dragonfly
spec:
selector:
app: dragonfly
ports:
- protocol: TCP
port: 6379
targetPort: 6379
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dragonfly
spec:
replicas: 1
selector:
matchLabels:
app: dragonfly
template:
metadata:
labels:
app: dragonfly
spec:
containers:
- name: dragonfly
image: docker.dragonflydb.io/dragonflydb/dragonfly:latest
ports:
- containerPort: 6379
name: dragonfly
args:
- --logtostderr
- --alsologtostderr=false
- --cache_mode=true
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "500m"
---
# Qdrant (Vector database for RAG)
apiVersion: v1
kind: Service
metadata:
name: qdrant
spec:
selector:
app: qdrant
ports:
- name: http
protocol: TCP
port: 6333
targetPort: 6333
- name: grpc
protocol: TCP
port: 6334
targetPort: 6334
type: ClusterIP
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: qdrant
spec:
serviceName: qdrant
replicas: 1
selector:
matchLabels:
app: qdrant
template:
metadata:
labels:
app: qdrant
spec:
containers:
- name: qdrant
image: qdrant/qdrant:latest
ports:
- containerPort: 6333
name: http
- containerPort: 6334
name: grpc
resources:
requests:
memory: "512Mi"
cpu: "200m"
limits:
memory: "1Gi"
cpu: "1000m"
volumeMounts:
- name: qdrant-data
mountPath: /qdrant/storage
volumeClaimTemplates:
- metadata:
name: qdrant-data
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
---
# Kafka (KRaft mode - no Zookeeper needed)
apiVersion: v1
kind: Service
metadata:
name: kafka
spec:
selector:
app: kafka
ports:
- name: broker
protocol: TCP
port: 9092
targetPort: 9092
- name: controller
protocol: TCP
port: 9093
targetPort: 9093
type: ClusterIP
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: kafka
spec:
serviceName: kafka
replicas: 1
selector:
matchLabels:
app: kafka
template:
metadata:
labels:
app: kafka
spec:
containers:
- name: kafka
image: apache/kafka:3.9.0
ports:
- containerPort: 9092
name: broker
- containerPort: 9093
name: controller
command:
- sh
- -c
- |
# NOTE(review): KRaft requires the cluster ID to be a base64-encoded UUID
# (generate once with `kafka-storage.sh random-uuid` and pin it here);
# `kafka-storage.sh format -t` rejects an arbitrary string like this one,
# so the broker will fail to format its storage on first boot — confirm.
CLUSTER_ID="dexorder-prod-cluster"
LOG_DIR="/var/lib/kafka/data"
# Ensure log directory exists
mkdir -p $LOG_DIR
# Create temporary config with custom log.dirs for formatting
cp /opt/kafka/config/kraft/server.properties /tmp/server.properties
echo "log.dirs=$LOG_DIR" >> /tmp/server.properties
# Format storage if not already formatted
if [ ! -f $LOG_DIR/meta.properties ]; then
echo "Formatting Kafka storage with cluster ID: $CLUSTER_ID"
/opt/kafka/bin/kafka-storage.sh format -t $CLUSTER_ID -c /tmp/server.properties
else
echo "Kafka storage already formatted, skipping format step"
fi
# Start Kafka server
/opt/kafka/bin/kafka-server-start.sh /opt/kafka/config/kraft/server.properties \
--override node.id=1 \
--override process.roles=broker,controller \
--override listeners=PLAINTEXT://:9092,CONTROLLER://:9093 \
--override advertised.listeners=PLAINTEXT://kafka:9092 \
--override controller.quorum.voters=1@kafka:9093 \
--override controller.listener.names=CONTROLLER \
--override listener.security.protocol.map=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT \
--override log.dirs=$LOG_DIR \
--override offsets.topic.replication.factor=1 \
--override transaction.state.log.replication.factor=1 \
--override transaction.state.log.min.isr=1
env: []
volumeMounts:
- name: kafka-data
mountPath: /var/lib/kafka/data
volumeClaimTemplates:
- metadata:
name: kafka-data
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 20Gi
---
# PostgreSQL (for Iceberg catalog metadata and gateway user data)
apiVersion: v1
kind: Service
metadata:
name: postgres
spec:
selector:
app: postgres
ports:
- protocol: TCP
port: 5432
targetPort: 5432
type: ClusterIP
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgres
spec:
serviceName: postgres
replicas: 1
selector:
matchLabels:
app: postgres
template:
metadata:
labels:
app: postgres
spec:
containers:
- name: postgres
image: postgres:15
ports:
- containerPort: 5432
env:
- name: POSTGRES_USER
value: postgres
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: postgres-secret
key: password
- name: POSTGRES_DB
value: iceberg
volumeMounts:
- name: postgres-data
mountPath: /var/lib/postgresql/data
volumeClaimTemplates:
- metadata:
name: postgres-data
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
---
# MinIO (S3-compatible object storage)
apiVersion: v1
kind: Service
metadata:
name: minio
spec:
selector:
app: minio
ports:
- name: api
protocol: TCP
port: 9000
targetPort: 9000
- name: console
protocol: TCP
port: 9001
targetPort: 9001
type: ClusterIP
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: minio
spec:
serviceName: minio
replicas: 1
selector:
matchLabels:
app: minio
template:
metadata:
labels:
app: minio
spec:
containers:
- name: minio
image: minio/minio:latest
args:
- server
- /data
- --console-address
- ":9001"
ports:
- containerPort: 9000
name: api
- containerPort: 9001
name: console
env:
- name: MINIO_ROOT_USER
valueFrom:
secretKeyRef:
name: minio-secret
key: root-user
- name: MINIO_ROOT_PASSWORD
valueFrom:
secretKeyRef:
name: minio-secret
key: root-password
volumeMounts:
- name: minio-data
mountPath: /data
volumeClaimTemplates:
- metadata:
name: minio-data
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 50Gi
---
# MinIO bucket initialization job
apiVersion: batch/v1
kind: Job
metadata:
name: minio-init-buckets
spec:
ttlSecondsAfterFinished: 100
template:
spec:
restartPolicy: OnFailure
containers:
- name: create-buckets
image: minio/mc:latest
command:
- sh
- -c
- |
echo "Waiting for MinIO to be ready..."
until mc alias set minio http://minio:9000 $MINIO_ROOT_USER $MINIO_ROOT_PASSWORD; do
sleep 2
done
echo "Creating warehouse bucket..."
mc mb minio/warehouse --ignore-existing
echo "Buckets initialized successfully"
env:
- name: MINIO_ROOT_USER
valueFrom:
secretKeyRef:
name: minio-secret
key: root-user
- name: MINIO_ROOT_PASSWORD
valueFrom:
secretKeyRef:
name: minio-secret
key: root-password
---
# Iceberg REST Catalog
apiVersion: v1
kind: Service
metadata:
name: iceberg-catalog
spec:
selector:
app: iceberg-catalog
ports:
- protocol: TCP
port: 8181
targetPort: 8181
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: iceberg-catalog
spec:
replicas: 1
selector:
matchLabels:
app: iceberg-catalog
template:
metadata:
labels:
app: iceberg-catalog
spec:
initContainers:
- name: wait-for-postgres
image: busybox:1.36
command: ['sh', '-c', 'until nc -z postgres 5432; do echo waiting for postgres; sleep 2; done;']
- name: wait-for-minio
image: busybox:1.36
command: ['sh', '-c', 'until nc -z minio 9000; do echo waiting for minio; sleep 2; done;']
containers:
- name: iceberg-catalog
image: tabulario/iceberg-rest:latest
ports:
- containerPort: 8181
env:
- name: CATALOG_WAREHOUSE
value: "s3://warehouse/"
- name: CATALOG_IO__IMPL
value: "org.apache.iceberg.aws.s3.S3FileIO"
- name: CATALOG_S3_ENDPOINT
value: "http://minio:9000"
- name: CATALOG_S3_ACCESS__KEY__ID
valueFrom:
secretKeyRef:
name: minio-secret
key: root-user
- name: CATALOG_S3_SECRET__ACCESS__KEY
valueFrom:
secretKeyRef:
name: minio-secret
key: root-password
- name: CATALOG_S3_PATH__STYLE__ACCESS
value: "true"
- name: AWS_REGION
value: "us-east-1"
---
# Flink JobManager
apiVersion: v1
kind: Service
metadata:
name: flink-jobmanager
spec:
selector:
app: flink-jobmanager
ports:
- name: rpc
protocol: TCP
port: 6123
targetPort: 6123
- name: ui
protocol: TCP
port: 8081
targetPort: 8081
- name: zmq-market-data
protocol: TCP
port: 5558
targetPort: 5558
- name: zmq-notif-pull
protocol: TCP
port: 5561
targetPort: 5561
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: flink-jobmanager
spec:
replicas: 1
selector:
matchLabels:
app: flink-jobmanager
template:
metadata:
labels:
app: flink-jobmanager
spec:
initContainers:
- name: wait-for-kafka
image: busybox:1.36
command: ['sh', '-c', 'until nc -z kafka 9092; do echo waiting for kafka; sleep 2; done;']
- name: wait-for-iceberg-catalog
image: busybox:1.36
command: ['sh', '-c', 'until nc -z iceberg-catalog 8181; do echo waiting for iceberg-catalog; sleep 2; done;']
containers:
- name: flink-jobmanager
image: dexorder/ai-flink
imagePullPolicy: Always
args: ["standalone-job", "--job-classname", "com.dexorder.flink.TradingFlinkApp"]
ports:
- containerPort: 6123
name: rpc
- containerPort: 8081
name: ui
- containerPort: 5558
name: zmq-market-data
- containerPort: 5561
name: zmq-notif-pull
env:
- name: JOB_MANAGER_RPC_ADDRESS
value: flink-jobmanager
- name: AWS_REGION
value: us-east-1
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: flink-secrets
key: minio-access-key
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: flink-secrets
key: minio-secret-key
volumeMounts:
- name: flink-config
mountPath: /etc/config/config.yaml
subPath: config.yaml
- name: flink-secrets
mountPath: /etc/secrets
volumes:
- name: flink-config
configMap:
name: flink-config
- name: flink-secrets
secret:
secretName: flink-secrets
---
# Flink TaskManager
apiVersion: apps/v1
kind: Deployment
metadata:
name: flink-taskmanager
spec:
replicas: 1
selector:
matchLabels:
app: flink-taskmanager
template:
metadata:
labels:
app: flink-taskmanager
spec:
initContainers:
- name: wait-for-jobmanager
image: busybox:1.36
command: ['sh', '-c', 'until nc -z flink-jobmanager 6123; do echo waiting for jobmanager; sleep 2; done;']
containers:
- name: flink-taskmanager
image: dexorder/ai-flink
imagePullPolicy: Always
args: ["taskmanager"]
env:
- name: JOB_MANAGER_RPC_ADDRESS
value: flink-jobmanager
- name: AWS_REGION
value: us-east-1
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: flink-secrets
key: minio-access-key
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: flink-secrets
key: minio-secret-key
volumeMounts:
- name: flink-config
mountPath: /etc/config/config.yaml
subPath: config.yaml
- name: flink-secrets
mountPath: /etc/secrets
volumes:
- name: flink-config
configMap:
name: flink-config
- name: flink-secrets
secret:
secretName: flink-secrets
---
# Relay (ZMQ router)
apiVersion: v1
kind: Service
metadata:
name: relay
spec:
selector:
app: relay
ports:
- name: work-queue
protocol: TCP
port: 5555
targetPort: 5555
- name: responses
protocol: TCP
port: 5556
targetPort: 5556
- name: market-data
protocol: TCP
port: 5558
targetPort: 5558
- name: client-requests
protocol: TCP
port: 5559
targetPort: 5559
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: relay
spec:
replicas: 1
selector:
matchLabels:
app: relay
template:
metadata:
labels:
app: relay
spec:
containers:
- name: relay
image: dexorder/ai-relay
imagePullPolicy: Always
ports:
- containerPort: 5555
name: work-queue
- containerPort: 5556
name: responses
- containerPort: 5558
name: market-data
- containerPort: 5559
name: client-requests
env:
- name: RUST_LOG
value: relay=info
- name: CONFIG_PATH
value: /config/config.yaml
volumeMounts:
- name: relay-config
mountPath: /config
volumes:
- name: relay-config
configMap:
name: relay-config
---
# Ingestor (CCXT data fetcher)
apiVersion: apps/v1
kind: Deployment
metadata:
name: ingestor
spec:
replicas: 1
selector:
matchLabels:
app: ingestor
template:
metadata:
labels:
app: ingestor
spec:
initContainers:
- name: wait-for-relay
image: busybox:1.36
command: ['sh', '-c', 'until nc -z relay 5555; do echo waiting for relay; sleep 2; done;']
- name: wait-for-kafka
image: busybox:1.36
command: ['sh', '-c', 'until nc -z kafka 9092; do echo waiting for kafka; sleep 2; done;']
containers:
- name: ingestor
image: dexorder/ai-ingestor
imagePullPolicy: Always
env:
- name: LOG_LEVEL
value: info
- name: CONFIG_PATH
value: /config/config.yaml
volumeMounts:
- name: ingestor-config
mountPath: /config
- name: ingestor-secrets
mountPath: /secrets
volumes:
- name: ingestor-config
configMap:
name: ingestor-config
- name: ingestor-secrets
secret:
secretName: ingestor-secrets

View File

@@ -1,18 +1,27 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Base resources (includes all security policies)
# No namespace: transformer — kubectl --context=prod is configured with 'ai'
# as its default namespace, so all resources without an explicit namespace
# land in 'ai' automatically.
resources:
# Base: init.yaml, sandbox namespace, RBAC, admission-policy, sandbox-quotas,
# network-policies, gateway.yaml, web.yaml, ingress.yaml, gateway-ingress.yaml
- ../base
- configs/gateway-config.yaml
# Add the 'ai' namespace (base only creates 'sandbox')
- namespaces.yaml
# Prod infrastructure (postgres, minio, kafka, flink, relay, ingestor, qdrant, dragonfly, iceberg)
- infrastructure.yaml
# gateway-config ConfigMap is intentionally excluded from kustomize.
# It contains an op:// reference for the DB password. Apply via:
# bin/config-update prod gateway-config
# Production patches
patches:
- path: patches.yaml
- path: patch-gateway-rbac-subject.yaml
- path: patch-web.yaml
- path: patch-gateway-ingress.yaml
# ConfigMaps for service configs
# In production, these might come from external sources
# or be managed separately, but we'll include them here for consistency
configMapGenerator:
- name: relay-config
files:
@@ -24,23 +33,28 @@ configMapGenerator:
files:
- config.yaml=configs/flink-config.yaml
# Secrets (managed via kubectl, not committed)
# These are created by bin/secret-update prod
# Secrets managed via bin/secret-update prod (op inject | kubectl apply)
secretGenerator: []
generatorOptions:
disableNameSuffixHash: true
# Images
images:
- name: dexorder/ai-backend
newTag: latest
- name: dexorder/ai-web
- name: dexorder/ai-gateway
newName: git.dxod.org/dexorder/dexorder/ai-gateway
newTag: latest
- name: ghcr.io/dexorder/gateway
newName: git.dxod.org/dexorder/dexorder/ai-gateway
newTag: latest
- name: lifecycle-sidecar
newName: ghcr.io/dexorder/lifecycle-sidecar
- name: dexorder/ai-web
newName: git.dxod.org/dexorder/dexorder/ai-web
newTag: latest
- name: ghcr.io/dexorder/agent
- name: dexorder/ai-flink
newName: git.dxod.org/dexorder/dexorder/ai-flink
newTag: latest
- name: dexorder/ai-relay
newName: git.dxod.org/dexorder/dexorder/ai-relay
newTag: latest
- name: dexorder/ai-ingestor
newName: git.dxod.org/dexorder/dexorder/ai-ingestor
newTag: latest

View File

@@ -0,0 +1,10 @@
# Production namespace: ai (for all platform services)
# The 'sandbox' namespace is defined in base/namespaces.yaml
---
apiVersion: v1
kind: Namespace
metadata:
name: ai
labels:
app.kubernetes.io/part-of: dexorder
dexorder.io/type: system

View File

@@ -0,0 +1,10 @@
# Production gateway ingress: WebSocket timeout annotations
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: gateway-ingress
annotations:
nginx.ingress.kubernetes.io/websocket-services: gateway
nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "60"

View File

@@ -0,0 +1,10 @@
# Fix gateway ServiceAccount namespace in RoleBinding subject (base uses 'default', prod uses 'ai')
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: gateway-sandbox-creator
namespace: sandbox
subjects:
- kind: ServiceAccount
name: gateway
namespace: ai

View File

@@ -0,0 +1,18 @@
# Production web: imagePullPolicy and resources
apiVersion: apps/v1
kind: Deployment
metadata:
name: ai-web
spec:
template:
spec:
containers:
- name: ai-web
imagePullPolicy: Always
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "500m"

View File

@@ -1,52 +0,0 @@
---
# Production backend patches
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: ai-backend
spec:
replicas: 2
template:
spec:
runtimeClassName: gvisor
containers:
- name: ai-backend
image: dexorder/ai-backend:latest
imagePullPolicy: Always
env:
- name: CONFIG
value: "prod"
resources:
requests:
memory: "2Gi"
cpu: "1000m"
limits:
memory: "4Gi"
cpu: "2000m"
---
# Production web patches
apiVersion: apps/v1
kind: Deployment
metadata:
name: ai-web
spec:
replicas: 2
template:
spec:
runtimeClassName: gvisor
containers:
- name: ai-web
image: dexorder/ai-web:latest
imagePullPolicy: Always
env:
- name: VITE_GATEWAY_URL
value: "https://dexorder.ai/api"
- name: VITE_WS_URL
value: "wss://dexorder.ai/ws"
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"

View File

@@ -0,0 +1,8 @@
apiVersion: v1
kind: Secret
metadata:
name: ai-secrets
namespace: ai
type: Opaque
stringData:
anthropic-api-key: "{{ op://AI Prod/Gateway/anthropic_api_key }}"

View File

@@ -0,0 +1,9 @@
apiVersion: v1
kind: Secret
metadata:
name: flink-secrets
namespace: ai
type: Opaque
stringData:
minio-access-key: "{{ op://AI Prod/MinIO/access_key }}"
minio-secret-key: "{{ op://AI Prod/MinIO/secret_key }}"

View File

@@ -0,0 +1,45 @@
apiVersion: v1
kind: Secret
metadata:
name: gateway-secrets
namespace: ai
type: Opaque
stringData:
secrets.yaml: |
# Gateway Secrets (production)
# Authentication secret for JWT signing
auth:
secret: "{{ op://AI Prod/Gateway/jwt_secret }}"
# LLM Provider API Keys
llm_providers:
anthropic_api_key: "{{ op://AI Prod/Gateway/anthropic_api_key }}"
openai_api_key: "{{ op://AI Prod/Gateway/openai_api_key }}"
google_api_key: "{{ op://AI Prod/Gateway/google_api_key }}"
openrouter_api_key: "{{ op://AI Prod/Gateway/openrouter_api_key }}"
# Telegram (optional)
telegram:
bot_token: "{{ op://AI Prod/Telegram/bot_token }}"
# Email service (optional)
email:
service_key: ""
# Push notification service (optional)
push:
service_key: ""
# Qdrant API key (optional, for hosted Qdrant)
qdrant:
api_key: ""
# Iceberg S3 credentials (must match minio-secret)
iceberg:
s3_access_key: "{{ op://AI Prod/MinIO/access_key }}"
s3_secret_key: "{{ op://AI Prod/MinIO/secret_key }}"
# Embedding API key (if using external provider)
embedding:
api_key: ""

View File

@@ -0,0 +1,13 @@
apiVersion: v1
kind: Secret
metadata:
name: ingestor-secrets
namespace: ai
type: Opaque
stringData:
binance-api-key: "{{ op://AI Prod/Ingestor/binance_api_key }}"
binance-api-secret: "{{ op://AI Prod/Ingestor/binance_api_secret }}"
coinbase-api-key: "{{ op://AI Prod/Ingestor/coinbase_api_key }}"
coinbase-api-secret: "{{ op://AI Prod/Ingestor/coinbase_api_secret }}"
kraken-api-key: "{{ op://AI Prod/Ingestor/kraken_api_key }}"
kraken-api-secret: "{{ op://AI Prod/Ingestor/kraken_api_secret }}"

View File

@@ -0,0 +1,9 @@
apiVersion: v1
kind: Secret
metadata:
name: minio-secret
namespace: ai
type: Opaque
stringData:
root-user: "{{ op://AI Prod/MinIO/access_key }}"
root-password: "{{ op://AI Prod/MinIO/secret_key }}"

View File

@@ -0,0 +1,8 @@
apiVersion: v1
kind: Secret
metadata:
name: postgres-secret
namespace: ai
type: Opaque
stringData:
password: "{{ op://AI Prod/PostgreSQL/password }}"

View File

@@ -0,0 +1,14 @@
apiVersion: v1
kind: Secret
metadata:
name: sandbox-secrets
namespace: sandbox
labels:
app.kubernetes.io/name: sandbox
app.kubernetes.io/component: secrets
type: Opaque
stringData:
secrets.yaml: |
# S3/MinIO credentials for Iceberg data access (must match minio-secret)
s3_access_key: "{{ op://AI Prod/MinIO/access_key }}"
s3_secret_key: "{{ op://AI Prod/MinIO/secret_key }}"