container lifecycle management

This commit is contained in:
2026-03-12 15:13:38 -04:00
parent e99ef5d2dd
commit b9cc397e05
61 changed files with 6880 additions and 31 deletions

View File

@@ -0,0 +1,110 @@
# ValidatingAdmissionPolicy to restrict workloads in the dexorder-agents namespace
# Requires Kubernetes 1.30+ (or 1.28+ with feature gate)
# This is the critical security control that prevents arbitrary image execution
# even if the gateway is compromised.
#
# Every per-container rule below is evaluated against BOTH containers and
# initContainers (via variables.allContainers). An init container runs inside
# the same pod, so leaving it unchecked would let an arbitrary image or an
# unrestricted securityContext slip past this policy.
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
  name: dexorder-agent-image-policy
spec:
  # Reject the request if the policy itself cannot be evaluated
  failurePolicy: Fail
  matchConstraints:
    namespaceSelector:
      matchLabels:
        dexorder.io/type: agents
    resourceRules:
      - apiGroups: ["apps"]
        apiVersions: ["v1"]
        resources: ["deployments"]
        operations: ["CREATE", "UPDATE"]
  variables:
    # containers + initContainers of the pod template (initContainers is optional)
    - name: allContainers
      expression: |
        object.spec.template.spec.containers +
        (has(object.spec.template.spec.initContainers)
          ? object.spec.template.spec.initContainers
          : [])
  validations:
    # Only allow images from our approved registry: the agent image(s) and the
    # lifecycle sidecar that is deployed alongside every agent
    - expression: |
        variables.allContainers.all(c,
          c.image.startsWith('ghcr.io/dexorder/agent:') ||
          c.image.startsWith('ghcr.io/dexorder/agent-') ||
          c.image.startsWith('ghcr.io/dexorder/lifecycle-sidecar:'))
      message: "Only approved dexorder agent images are allowed in the agents namespace"
      reason: Forbidden
    # No privileged containers
    - expression: |
        variables.allContainers.all(c,
          !has(c.securityContext) ||
          !has(c.securityContext.privileged) ||
          c.securityContext.privileged == false)
      message: "Privileged containers are not allowed"
      reason: Forbidden
    # No hostPath volumes
    - expression: |
        !has(object.spec.template.spec.volumes) ||
        object.spec.template.spec.volumes.all(v,
          !has(v.hostPath))
      message: "hostPath volumes are not allowed"
      reason: Forbidden
    # No hostNetwork
    - expression: |
        !has(object.spec.template.spec.hostNetwork) ||
        object.spec.template.spec.hostNetwork == false
      message: "hostNetwork is not allowed"
      reason: Forbidden
    # No hostPID
    - expression: |
        !has(object.spec.template.spec.hostPID) ||
        object.spec.template.spec.hostPID == false
      message: "hostPID is not allowed"
      reason: Forbidden
    # Containers must run as non-root (must be set explicitly on each container)
    - expression: |
        variables.allContainers.all(c,
          has(c.securityContext) &&
          has(c.securityContext.runAsNonRoot) &&
          c.securityContext.runAsNonRoot == true)
      message: "Containers must run as non-root"
      reason: Forbidden
    # Must drop all capabilities
    - expression: |
        variables.allContainers.all(c,
          has(c.securityContext) &&
          has(c.securityContext.capabilities) &&
          has(c.securityContext.capabilities.drop) &&
          c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
      message: "Containers must drop all capabilities"
      reason: Forbidden
    # Read-only root filesystem
    - expression: |
        variables.allContainers.all(c,
          has(c.securityContext) &&
          has(c.securityContext.readOnlyRootFilesystem) &&
          c.securityContext.readOnlyRootFilesystem == true)
      message: "Containers must have read-only root filesystem"
      reason: Forbidden
    # Resource limits must be set (both memory and cpu)
    - expression: |
        variables.allContainers.all(c,
          has(c.resources) &&
          has(c.resources.limits) &&
          has(c.resources.limits.memory) &&
          has(c.resources.limits.cpu))
      message: "Containers must have resource limits set"
      reason: Forbidden
---
# Activates dexorder-agent-image-policy for namespaces labelled
# dexorder.io/type=agents; a failed validation denies the request.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
  name: dexorder-agent-image-policy-binding
spec:
  policyName: dexorder-agent-image-policy
  matchResources:
    namespaceSelector:
      matchLabels:
        dexorder.io/type: agents
  validationActions: [Deny]

View File

@@ -0,0 +1,221 @@
# Example agent deployment with lifecycle sidecar
# This would be created by the gateway for each user
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: agent-user-abc123
  namespace: dexorder-agents
  labels:
    app.kubernetes.io/name: agent
    app.kubernetes.io/component: user-agent
    dexorder.io/component: agent
    dexorder.io/user-id: user-abc123
    dexorder.io/deployment: agent-user-abc123
spec:
  replicas: 1
  selector:
    matchLabels:
      dexorder.io/user-id: user-abc123
  template:
    metadata:
      labels:
        dexorder.io/component: agent
        dexorder.io/user-id: user-abc123
        # Read by the sidecar through the downward API (DEPLOYMENT_NAME)
        dexorder.io/deployment: agent-user-abc123
    spec:
      serviceAccountName: agent-lifecycle
      # Share PID namespace so sidecar can monitor main container
      shareProcessNamespace: true
      # Security context
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        # Main agent container
        - name: agent
          image: ghcr.io/dexorder/agent:latest
          imagePullPolicy: Always
          # Security context (required by admission policy)
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          # Resource limits (required by admission policy)
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
          # Environment variables
          env:
            - name: USER_ID
              value: "user-abc123"
            - name: IDLE_TIMEOUT_MINUTES
              value: "15"
            - name: IDLE_CHECK_INTERVAL_SECONDS
              value: "60"
            - name: ENABLE_IDLE_SHUTDOWN
              value: "true"
            - name: MCP_SERVER_PORT
              value: "3000"
            - name: ZMQ_CONTROL_PORT
              value: "5555"
          # Ports
          ports:
            - name: mcp
              containerPort: 3000
              protocol: TCP
            - name: zmq-control
              containerPort: 5555
              protocol: TCP
          # Volume mounts
          volumeMounts:
            - name: agent-data
              mountPath: /app/data
            - name: tmp
              mountPath: /tmp
            - name: shared-run
              mountPath: /var/run/agent
          # Liveness probe (agent's MCP server)
          livenessProbe:
            httpGet:
              path: /health
              port: mcp
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
          # Readiness probe
          readinessProbe:
            httpGet:
              path: /ready
              port: mcp
            initialDelaySeconds: 5
            periodSeconds: 10
        # Lifecycle sidecar
        # NOTE(review): this image must be accepted by the namespace admission
        # policy's image allow-list - confirm the two stay in sync.
        - name: lifecycle-sidecar
          image: ghcr.io/dexorder/lifecycle-sidecar:latest
          imagePullPolicy: Always
          # Security context
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          # Resource limits
          # Requests must not fall below the namespace LimitRange minimum
          # (memory 64Mi, cpu 50m), otherwise the pod is rejected at admission.
          resources:
            requests:
              memory: "64Mi"
              cpu: "50m"
            limits:
              memory: "64Mi"
              cpu: "50m"
          # Environment variables (injected via downward API)
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['dexorder.io/deployment']
            - name: USER_TYPE
              value: "free"  # Gateway sets this based on license
            # NOTE(review): with shareProcessNamespace: true, PID 1 belongs to the
            # pod's infrastructure (pause) process, not to the agent container.
            # Confirm how the sidecar uses this value before relying on it.
            - name: MAIN_CONTAINER_PID
              value: "1"
          # Volume mounts
          volumeMounts:
            - name: shared-run
              mountPath: /var/run/agent
              readOnly: true
      # Volumes
      volumes:
        # Persistent data (user files, state)
        - name: agent-data
          persistentVolumeClaim:
            claimName: agent-user-abc123-data
        # Temporary writable filesystem (read-only rootfs)
        - name: tmp
          emptyDir:
            medium: Memory
            sizeLimit: 128Mi
        # Shared between main container and sidecar
        - name: shared-run
          emptyDir:
            medium: Memory
            sizeLimit: 1Mi
      # Restart policy
      restartPolicy: Always
      # Termination grace period
      terminationGracePeriodSeconds: 30
---
# Per-user claim that backs the agent container's /app/data mount
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: agent-user-abc123-data
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: user-abc123
spec:
  # Swap in whichever storage class the cluster provides
  storageClassName: standard
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 1Gi
---
# Cluster-internal Service in front of one user's agent pod
# (MCP endpoint plus the ZeroMQ control channel)
apiVersion: v1
kind: Service
metadata:
  name: agent-user-abc123
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: user-abc123
spec:
  type: ClusterIP
  # Routes to the pod carrying this user's id label
  selector:
    dexorder.io/user-id: user-abc123
  ports:
    # targetPort values refer to the named container ports of the agent
    - name: mcp
      protocol: TCP
      port: 3000
      targetPort: mcp
    - name: zmq-control
      protocol: TCP
      port: 5555
      targetPort: zmq-control

View File

@@ -0,0 +1,53 @@
# Resource constraints for the dexorder-agents namespace
# These limits apply regardless of what the gateway requests
---
# LimitRange: per-container bounds and defaults, plus PVC size bounds
apiVersion: v1
kind: LimitRange
metadata:
  name: agent-limits
  namespace: dexorder-agents
spec:
  limits:
    - type: Container
      # Lower and upper bounds for any single container
      min:
        memory: "64Mi"
        cpu: "50m"
      max:
        memory: "2Gi"
        cpu: "2000m"
      # Filled in when a container omits its requests
      defaultRequest:
        memory: "256Mi"
        cpu: "100m"
      # Filled in when a container omits its limits
      default:
        memory: "512Mi"
        cpu: "500m"
    # Allowed size window for each PersistentVolumeClaim
    - type: PersistentVolumeClaim
      min:
        storage: "100Mi"
      max:
        storage: "10Gi"
---
# ResourceQuota: aggregate ceiling for the whole dexorder-agents namespace
# Keeps a compromised gateway from exhausting cluster resources
apiVersion: v1
kind: ResourceQuota
metadata:
  name: agent-quota
  namespace: dexorder-agents
spec:
  hard:
    # Object counts
    pods: "100"
    services: "100"
    persistentvolumeclaims: "100"
    # Combined compute across all agents
    requests.cpu: "20"
    limits.cpu: "40"
    requests.memory: "40Gi"
    limits.memory: "80Gi"
    # Combined storage requested by all PVCs
    requests.storage: "500Gi"

View File

@@ -0,0 +1,65 @@
# RBAC for gateway to create and manage (but never delete) agent resources
# Principle of least privilege: gateway can only create/read/update deployments and
# services, and create/read PVCs, in the dexorder-agents namespace. Deletion is handled by the lifecycle sidecar.
# Pods and pod logs are read-only; no secrets, exec, or cross-namespace access.
---
# Identity the gateway runs under; it lives in the system namespace
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: dexorder-system
  name: gateway
---
# Role scoped to dexorder-agents namespace only
# Grants the gateway create/read/update on agent workloads; no delete verb is
# granted on any resource.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: agent-creator
  namespace: dexorder-agents
rules:
  # Deployments: create, read and update (deletion handled by sidecar)
  - apiGroups: ["apps"]
    resources: ["deployments"]
    verbs: ["create", "get", "list", "watch", "patch", "update"]
  # PVCs: create and read (deletion handled by sidecar)
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["create", "get", "list", "watch"]
  # Services: create and manage agent MCP endpoints
  - apiGroups: [""]
    resources: ["services"]
    verbs: ["create", "get", "list", "watch", "patch", "update"]
  # Read-only pod access for status checks (no exec!)
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
  # Pod logs for debugging (read-only)
  - apiGroups: [""]
    resources: ["pods/log"]
    verbs: ["get"]
# Explicitly NOT included:
# - delete on deployments - handled by lifecycle sidecar
# - delete on PVCs - handled by lifecycle sidecar
# - delete on services - not granted to the gateway
# - pods (create/delete) - must go through deployments
# - pods/exec, pods/attach - no shell access
# - secrets, configmaps - no credential access
# - any resources in other namespaces
---
# Grants the agent-creator Role (in dexorder-agents) to the gateway
# ServiceAccount, which lives in dexorder-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: gateway-agent-creator
  namespace: dexorder-agents
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: agent-creator
subjects:
  - kind: ServiceAccount
    name: gateway
    namespace: dexorder-system

View File

@@ -1,3 +1,6 @@
# Runtime and security initialization for dexorder AI platform
# Apply this first: kubectl apply -f init.yaml
---
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:

View File

@@ -1,5 +1,26 @@
# Kustomize entry point for the dexorder platform manifests.
# A single `resources` key is kept: a second (empty) `resources: []` entry
# would be a duplicate mapping key, which most parsers resolve silently.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  # Core initialization (runtime classes)
  - init.yaml
  # Namespace definitions with PodSecurity labels
  - namespaces.yaml
  # RBAC for gateway to create agents (creation only)
  - gateway-rbac.yaml
  # RBAC for lifecycle sidecar (self-deletion)
  - lifecycle-sidecar-rbac.yaml
  # Admission policies (image restriction, security requirements)
  - admission-policy.yaml
  # Resource quotas and limits for agents namespace
  - agent-quotas.yaml
  # Network isolation policies
  - network-policies.yaml
  # Gateway service (uncomment when ready)
  # - gateway.yaml
  # Example agent deployment (for reference, not applied by default)
  # - agent-deployment-example.yaml
  # Services (uncomment as needed)
  # - backend.yaml
  # - web.yaml
  # ingress.yaml stays disabled until we have services to expose
  # - ingress.yaml

View File

@@ -0,0 +1,53 @@
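# RBAC for lifecycle sidecar - allows deleting agent deployments and PVCs
# Each agent pod runs as this ServiceAccount. RBAC alone does not limit a pod to its own
# deployment: the Role below grants get/delete on every deployment and PVC in the namespace.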
---
# Shared identity for agent pods in the agents namespace
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: dexorder-agents
  name: agent-lifecycle
---
# Role used by the lifecycle sidecar to tear down agent resources.
# Namespaced to dexorder-agents; no resourceNames are listed, so each rule
# covers every object of that kind in the namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: agent-self-delete
  namespace: dexorder-agents
rules:
  # Deployments: look up and delete
  - apiGroups: [apps]
    resources: [deployments]
    verbs: [get, delete]
  # PVCs: look up and delete (for anonymous users)
  - apiGroups: [""]
    resources: [persistentvolumeclaims]
    verbs: [get, delete]
  # Pods: read-only, for status checking
  - apiGroups: [""]
    resources: [pods]
    verbs: [get, list]
---
# Attaches the agent-self-delete Role to the agent-lifecycle ServiceAccount
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: agent-self-delete
  namespace: dexorder-agents
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: agent-self-delete
subjects:
  - kind: ServiceAccount
    name: agent-lifecycle
    namespace: dexorder-agents
---
# Additional security (TODO, not implemented yet): a ValidatingWebhookConfiguration to restrict deletion
# It would ensure sidecars can only delete their own deployment
# It requires a validating webhook server (can be added later)
# For now, we rely on:
# 1. Sidecar only knowing its own deployment name (from env)
# 2. RBAC limiting to dexorder-agents namespace
# 3. Admission policy restricting deployment creation (already defined)

View File

@@ -0,0 +1,24 @@
# Namespace definitions for dexorder AI platform
# - dexorder-system: gateway, flink, kafka, and other infrastructure
# - dexorder-agents: user agent containers (isolated, restricted)
---
# Infrastructure namespace; the dexorder.io/type label is what the
# NetworkPolicy namespaceSelectors match on.
kind: Namespace
apiVersion: v1
metadata:
  name: dexorder-system
  labels:
    dexorder.io/type: system
    app.kubernetes.io/part-of: dexorder
---
# Namespace for user agent containers; the dexorder.io/type label is what the
# admission policy and NetworkPolicy selectors match on.
kind: Namespace
apiVersion: v1
metadata:
  name: dexorder-agents
  labels:
    dexorder.io/type: agents
    app.kubernetes.io/part-of: dexorder
    # Pod Security Admission: the "restricted" standard is enforced,
    # and also applied in audit and warn modes
    pod-security.kubernetes.io/enforce: restricted
    pod-security.kubernetes.io/enforce-version: latest
    pod-security.kubernetes.io/warn: restricted
    pod-security.kubernetes.io/audit: restricted

View File

@@ -0,0 +1,121 @@
# Network policies for agent isolation
# Agents can only communicate with specific services, not with each other
# or with the Kubernetes API
---
# Baseline for the agents namespace: the empty podSelector selects every pod,
# and no ingress/egress rules are listed, so all traffic is denied unless
# another policy allows it.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny-all
  namespace: dexorder-agents
spec:
  podSelector: {}
  policyTypes: [Ingress, Egress]
---
# Lets the gateway reach agent pods on the MCP and ZeroMQ ports
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-gateway-ingress
  namespace: dexorder-agents
spec:
  podSelector:
    matchLabels:
      dexorder.io/component: agent
  policyTypes: [Ingress]
  ingress:
    - ports:
        # MCP server port
        - port: 3000
          protocol: TCP
        # ZeroMQ control channel
        - port: 5555
          protocol: TCP
      from:
        # Both selectors sit in one entry: the peer must be a gateway pod
        # AND live in a namespace labelled as system
        - podSelector:
            matchLabels:
              app: gateway
          namespaceSelector:
            matchLabels:
              dexorder.io/type: system
---
# Outbound allow-list for agent pods: DNS, gateway callbacks, Redpanda,
# and public HTTPS.
# NOTE(review): the lifecycle sidecar runs in these same pods and is granted
# RBAC to delete deployments, which needs the Kubernetes API. No rule here
# allows that, and the private ranges are excluded below - confirm the sidecar
# can actually reach the API server.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-agent-egress
  namespace: dexorder-agents
spec:
  podSelector:
    matchLabels:
      dexorder.io/component: agent
  policyTypes: [Egress]
  egress:
    # DNS resolution (required): kube-dns pods in any namespace
    - ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
      to:
        - podSelector:
            matchLabels:
              k8s-app: kube-dns
          namespaceSelector: {}
    # Gateway in system namespace (for callbacks)
    - ports:
        - port: 8080
          protocol: TCP
      to:
        - podSelector:
            matchLabels:
              app: gateway
          namespaceSelector:
            matchLabels:
              dexorder.io/type: system
    # Kafka/Redpanda for data subscriptions
    - ports:
        - port: 9092
          protocol: TCP
      to:
        - podSelector:
            matchLabels:
              app: redpanda
          namespaceSelector:
            matchLabels:
              dexorder.io/type: system
    # External HTTPS (for exchange APIs, LLM APIs)
    - ports:
        - port: 443
          protocol: TCP
      to:
        - ipBlock:
            cidr: 0.0.0.0/0
            # Private ranges are carved out to block access to the
            # k8s API server (common ranges)
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16
---
# System namespace: gateway pods accept callbacks from the agents namespace
# NOTE(review): selecting the gateway pods with an Ingress policy isolates
# them for ingress. Unless another policy allows it, traffic to the gateway
# from any other source is dropped - confirm that is intended.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-agent-callbacks
  namespace: dexorder-system
spec:
  podSelector:
    matchLabels:
      app: gateway
  policyTypes: [Ingress]
  ingress:
    - ports:
        - port: 8080
          protocol: TCP
      from:
        # Any pod in a namespace labelled as agents
        - namespaceSelector:
            matchLabels:
              dexorder.io/type: agents