feat: add @tag model override support and remove Qdrant dependencies

- Add model-tags parser for @tag syntax in chat messages
- Support Anthropic models (Sonnet, Haiku, Opus) via @tag
- Remove Qdrant vector database from infrastructure and configs
- Simplify license model config to use null fallbacks
- Add greeting stream after model switch via @tag
- Fix protobuf field names to camelCase for v7 compatibility
- Add 429 rate limit retry logic with exponential backoff
- Remove RAG references from agent harness documentation
This commit is contained in:
2026-04-27 20:55:18 -04:00
parent 6f937f9e5e
commit d41fcd0499
50 changed files with 956 additions and 798 deletions

View File

@@ -44,9 +44,6 @@ spec:
- name: wait-for-dragonfly
image: busybox:1.36
command: ['sh', '-c', 'until nc -z dragonfly 6379; do echo waiting for dragonfly; sleep 2; done;']
- name: wait-for-qdrant
image: busybox:1.36
command: ['sh', '-c', 'until nc -z qdrant 6333; do echo waiting for qdrant; sleep 2; done;']
- name: wait-for-iceberg-catalog
image: busybox:1.36
command: ['sh', '-c', 'until nc -z iceberg-catalog 8181; do echo waiting for iceberg-catalog; sleep 2; done;']

View File

@@ -27,29 +27,22 @@ data:
model_provider: deepinfra
model: zai-org/GLM-5
# License tier model configuration
# License tier model configuration (null = fall back to defaults.model)
license_models:
# Free tier models
free:
default: zai-org/GLM-5
cost_optimized: zai-org/GLM-5
complex: zai-org/GLM-5
allowed_models:
- zai-org/GLM-5
default: ~
cost_optimized: ~
complex: ~
# Pro tier models
pro:
default: zai-org/GLM-5
cost_optimized: zai-org/GLM-5
complex: zai-org/GLM-5
blocked_models:
- Qwen/Qwen3-235B-A22B-Instruct-2507
default: ~
cost_optimized: ~
complex: ~
# Enterprise tier models
enterprise:
default: zai-org/GLM-5
cost_optimized: zai-org/GLM-5
complex: Qwen/Qwen3-235B-A22B-Instruct-2507
default: ~
cost_optimized: ~
complex: ~
# Kubernetes configuration
kubernetes:
@@ -70,11 +63,6 @@ data:
redis:
url: redis://dragonfly:6379
# Qdrant (for RAG vector search)
qdrant:
url: http://qdrant:6333
collection: gateway_memory
# Iceberg (for durable storage via REST catalog)
iceberg:
catalog_uri: http://iceberg-catalog:8181

View File

@@ -45,68 +45,6 @@ spec:
memory: "512Mi"
cpu: "500m"
---
# Qdrant (Vector database for RAG)
apiVersion: v1
kind: Service
metadata:
name: qdrant
spec:
selector:
app: qdrant
ports:
- name: http
protocol: TCP
port: 6333
targetPort: 6333
- name: grpc
protocol: TCP
port: 6334
targetPort: 6334
type: ClusterIP
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: qdrant
spec:
serviceName: qdrant
replicas: 1
selector:
matchLabels:
app: qdrant
template:
metadata:
labels:
app: qdrant
spec:
containers:
- name: qdrant
image: qdrant/qdrant:latest
ports:
- containerPort: 6333
name: http
- containerPort: 6334
name: grpc
resources:
requests:
memory: "512Mi"
cpu: "200m"
limits:
memory: "1Gi"
cpu: "1000m"
volumeMounts:
- name: qdrant-data
mountPath: /qdrant/storage
volumeClaimTemplates:
- metadata:
name: qdrant-data
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: dev-ephemeral
resources:
requests:
storage: 10Gi
---
# Kafka (KRaft mode - no Zookeeper needed)
# Using apache/kafka:3.9.0 instead of confluentinc/cp-kafka because:
# - cp-kafka's entrypoint script has issues with KRaft configuration

View File

@@ -21,30 +21,6 @@ data:
model_provider: deepinfra
model: zai-org/GLM-5
# License tier model configuration
license_models:
# Free tier models
free:
default: zai-org/GLM-5
cost_optimized: zai-org/GLM-5
complex: zai-org/GLM-5
allowed_models:
- zai-org/GLM-5
# Pro tier models
pro:
default: zai-org/GLM-5
cost_optimized: zai-org/GLM-5
complex: zai-org/GLM-5
blocked_models:
- Qwen/Qwen3-235B-A22B-Instruct-2507
# Enterprise tier models
enterprise:
default: zai-org/GLM-5
cost_optimized: zai-org/GLM-5
complex: Qwen/Qwen3-235B-A22B-Instruct-2507
# Kubernetes configuration
kubernetes:
namespace: sandbox
@@ -59,11 +35,6 @@ data:
redis:
url: redis://dragonfly:6379
# Qdrant (for RAG vector search)
qdrant:
url: http://qdrant:6333
collection: gateway_memory
# Agent configuration
agent:
# Number of prior conversation turns loaded as LLM context and flushed to Iceberg at session end

View File

@@ -45,67 +45,6 @@ spec:
memory: "512Mi"
cpu: "500m"
---
# Qdrant (Vector database for RAG)
apiVersion: v1
kind: Service
metadata:
name: qdrant
spec:
selector:
app: qdrant
ports:
- name: http
protocol: TCP
port: 6333
targetPort: 6333
- name: grpc
protocol: TCP
port: 6334
targetPort: 6334
type: ClusterIP
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: qdrant
spec:
serviceName: qdrant
replicas: 1
selector:
matchLabels:
app: qdrant
template:
metadata:
labels:
app: qdrant
spec:
containers:
- name: qdrant
image: qdrant/qdrant:latest
ports:
- containerPort: 6333
name: http
- containerPort: 6334
name: grpc
resources:
requests:
memory: "512Mi"
cpu: "200m"
limits:
memory: "1Gi"
cpu: "1000m"
volumeMounts:
- name: qdrant-data
mountPath: /qdrant/storage
volumeClaimTemplates:
- metadata:
name: qdrant-data
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
---
# Kafka (KRaft mode - no Zookeeper needed)
apiVersion: v1
kind: Service

View File

@@ -11,7 +11,7 @@ resources:
- ../base
# Add the 'ai' namespace (base only creates 'sandbox')
- namespaces.yaml
# Prod infrastructure (postgres, minio, kafka, flink, relay, ingestor, qdrant, dragonfly, iceberg)
# Prod infrastructure (postgres, minio, kafka, flink, relay, ingestor, dragonfly, iceberg)
- infrastructure.yaml
# Sandbox namespace resources (go to sandbox namespace, not ai)
- sandbox-config.yaml

View File

@@ -19,6 +19,7 @@ stringData:
# LLM Provider API Keys
llm_providers:
deepinfra_api_key: "{{ op://AI Prod/Gateway/deepinfra_api_key }}"
anthropic_api_key: "{{ op://AI Prod/Gateway/anthropic_api_key }}"
# Search API Keys
search:
@@ -36,10 +37,6 @@ stringData:
push:
service_key: ""
# Qdrant API key (optional, for hosted Qdrant)
qdrant:
api_key: ""
# Iceberg S3 credentials (must match minio-secret)
iceberg:
s3_access_key: "{{ op://AI Prod/MinIO/access_key }}"