feat: add @tag model override support and remove Qdrant dependencies
- Add model-tags parser for @Tag syntax in chat messages
- Support Anthropic models (Sonnet, Haiku, Opus) via @tag
- Remove Qdrant vector database from infrastructure and configs
- Simplify license model config to use null fallbacks
- Add greeting stream after model switch via @tag
- Fix protobuf field names to camelCase for v7 compatibility
- Add 429 rate limit retry logic with exponential backoff
- Remove RAG references from agent harness documentation
@@ -38,10 +38,6 @@ SANDBOX_STORAGE_CLASS=standard
# Redis (for hot storage and session management)
REDIS_URL=redis://localhost:6379

# Qdrant (for RAG vector search)
QDRANT_URL=http://localhost:6333
QDRANT_API_KEY= # optional, leave empty for local dev

# Iceberg (for durable storage via REST catalog)
ICEBERG_CATALOG_URI=http://iceberg-catalog:8181
ICEBERG_NAMESPACE=gateway

@@ -58,7 +58,6 @@ Multi-channel gateway with agent harness for the Dexorder AI platform.
- **Streaming responses**: Real-time chat with WebSocket and Telegram
- **Complex workflows**: LangGraph for stateful trading analysis (backtest → risk → approval)
- **Agent harness**: Stateless orchestrator (all context lives in user's MCP container)
- **MCP resource integration**: User's RAG, conversation history, and preferences

## Container Management

@@ -91,9 +90,7 @@ Containers self-manage their lifecycle using the lifecycle sidecar (see `../life
- OpenAI GPT
- Google Gemini
- OpenRouter (one key for 300+ models)
- Ollama (for embeddings): https://ollama.com/download
- Redis (for session/hot storage)
- Qdrant (for RAG vector search)
- Kafka + Flink + Iceberg (for durable storage)

### Development
@@ -123,20 +120,7 @@ DEFAULT_MODEL_PROVIDER=anthropic
DEFAULT_MODEL=claude-sonnet-4-6
```

4. Start Ollama and pull embedding model:
```bash
# Install Ollama (one-time): https://ollama.com/download
# Or with Docker: docker run -d -p 11434:11434 ollama/ollama

# Pull the all-minilm embedding model (90MB, CPU-friendly)
ollama pull all-minilm

# Alternative models:
# ollama pull nomic-embed-text # 8K context length
# ollama pull mxbai-embed-large # Higher accuracy, slower
```

5. Run development server:
4. Run development server:
```bash
npm run dev
```
@@ -217,138 +201,6 @@ ws.send(JSON.stringify({
**`GET /health`**
- Returns server health status

## Ollama Deployment Options

The gateway requires Ollama for embedding generation in RAG queries. You have two deployment options:

### Option 1: Ollama in Gateway Container (Recommended for simplicity)

Install Ollama directly in the gateway container. This keeps all dependencies local and simplifies networking.

**Dockerfile additions:**
```dockerfile
FROM node:22-slim

# Install Ollama
RUN curl -fsSL https://ollama.com/install.sh | sh

# Pull embedding model at build time
RUN ollama serve & \
    sleep 5 && \
    ollama pull all-minilm && \
    pkill ollama

# ... rest of your gateway Dockerfile
```

**Start script (entrypoint.sh):**
```bash
#!/bin/bash
# Start Ollama in background
ollama serve &

# Start gateway
node dist/main.js
```

**Pros:**
- Simple networking (localhost:11434)
- No extra K8s resources
- Self-contained deployment

**Cons:**
- Larger container image (~200MB extra)
- CPU/memory shared with gateway process

**Resource requirements:**
- Add +200MB memory
- Add +0.2 CPU cores for embedding inference

### Option 2: Ollama as Separate Pod/Sidecar

Deploy Ollama as a separate container in the same pod (sidecar) or as its own deployment.

**K8s Deployment (sidecar pattern):**
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gateway
spec:
  template:
    spec:
      containers:
        - name: gateway
          image: ghcr.io/dexorder/gateway:latest
          env:
            - name: OLLAMA_URL
              value: http://localhost:11434

        - name: ollama
          image: ollama/ollama:latest
          command: ["/bin/sh", "-c"]
          args:
            - |
              ollama serve &
              sleep 5
              ollama pull all-minilm
              wait
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
```

**K8s Deployment (separate service):**
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
spec:
  replicas: 1
  template:
    spec:
      containers:
        - name: ollama
          image: ollama/ollama:latest
          # ... same as above
---
apiVersion: v1
kind: Service
metadata:
  name: ollama
spec:
  selector:
    app: ollama
  ports:
    - port: 11434
```

Gateway `.env`:
```bash
OLLAMA_URL=http://ollama:11434
```

**Pros:**
- Isolated resource limits
- Can scale separately
- Easier to monitor/debug

**Cons:**
- More K8s resources
- Network hop (minimal latency)
- More complex deployment

### Recommendation

For most deployments: **Use Option 1 (in-container)** for simplicity, unless you need to:
- Share Ollama across multiple services
- Scale embedding inference independently
- Run Ollama on GPU nodes (gateway on CPU nodes)

## TODO

@@ -58,11 +58,6 @@ kubernetes:
redis:
  url: redis://localhost:6379

# Qdrant (for RAG vector search)
qdrant:
  url: http://localhost:6333
  collection: gateway_memory

# Iceberg (for durable storage via REST catalog)
iceberg:
  catalog_uri: http://iceberg-catalog:8181

@@ -1,6 +1,6 @@
# Dexorder Knowledge Base

This directory contains global knowledge documents that are automatically loaded into the RAG system as platform-wide knowledge (user_id="0").
This directory contains global knowledge documents that are automatically loaded into the agent's context at startup.

## Structure

@@ -40,9 +40,7 @@ Content here...

1. At gateway startup, the DocumentLoader scans this directory
2. Each markdown file is chunked by headers (max ~1000 tokens per chunk)
3. Chunks are embedded using the configured embedding service
4. Embeddings are stored in Qdrant with user_id="0" (global namespace)
5. Content hash tracking enables incremental updates
3. Content hash tracking enables incremental updates
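
The change-detection step can be pictured as a straight hash comparison. A minimal sketch (the function names and store shown here are illustrative, not the actual DocumentLoader internals):

```typescript
import { createHash } from 'crypto';

// Hash the raw markdown of a knowledge file.
function contentHash(markdown: string): string {
  return createHash('sha256').update(markdown).digest('hex');
}

// A file is re-chunked and re-loaded only when its hash differs from the one
// recorded at the previous startup.
function needsReload(markdown: string, storedHash: string | undefined): boolean {
  return storedHash !== contentHash(markdown);
}
```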

## Updating Documents

@@ -55,14 +53,6 @@ Content here...
- Deploy new version
- Gateway will detect changes and update vectors automatically

## RAG Integration

When users query the agent:
1. Their query is embedded
2. Qdrant searches both global (user_id="0") and user-specific vectors
3. Relevant chunks from these docs are included in context
4. LLM generates response with platform knowledge

## Adding New Documents

1. Create markdown file in appropriate subdirectory
@@ -90,12 +80,3 @@ Check logs for load statistics:
```
Knowledge documents loaded: { loaded: 5, updated: 2, skipped: 3 }
```

Monitor Qdrant collection stats:
```
GET /health
{
  "qdrantVectors": 1234,
  "qdrantIndexed": 1234
}
```

@@ -19,6 +19,7 @@
"@langchain/community": "^1.1.27",
"@langchain/core": "latest",
"@langchain/langgraph": "latest",
"@langchain/anthropic": "latest",
"@langchain/openai": "^1.4.2",
"@modelcontextprotocol/sdk": "^1.0.4",
"@types/pdf-parse": "^1.1.5",

@@ -21,6 +21,8 @@ Delegate specialized tasks to subagents using the `Spawn` tool. Each subagent ha
- Multi-symbol comparisons
- Custom calculations using Python (pandas, numpy, scipy, matplotlib, etc.)

**Always begin the instruction with:** `Research script name: "<Descriptive Name>"` — e.g. `Research script name: "Monday Tuesday Session Overlap"`. The research agent uses this name when calling `PythonWrite` or `PythonEdit`.

Do **NOT** include time range, history length, bar count, period size, or resolution guidance in the instruction unless the user explicitly specifies such. The research agent selects its own optimal window and period otherwise.
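
A complete research spawn that follows both rules might look like this (the instruction text is illustrative, and the agent name `research` is assumed):

`Spawn({agent: "research", instruction: 'Research script name: "BTC Weekend Volatility" — compare weekend vs weekday realized volatility for BTC/USDT.'})`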

**`Spawn({agent: "indicator", instruction: "..."})`** — for ANYTHING indicator-related on the chart:

@@ -15,6 +15,10 @@ dynamic_imports:

You are a specialized assistant that creates Python research scripts for market data analysis and visualization.

## CRITICAL RULE

**You MUST call `PythonWrite` (new script) or `PythonEdit` (existing script) as your FIRST tool call. NEVER write analysis text without first creating or updating a script.** If you find yourself about to generate analysis text without a tool call, stop and call `PythonWrite` or `PythonEdit` first. A text-only response is always wrong.

## Your Purpose

Create Python scripts that:

@@ -26,10 +26,6 @@ email:
push:
  service_key: ""

# Qdrant API key (optional, for hosted Qdrant)
qdrant:
  api_key: ""

# Iceberg S3 credentials
iceberg:
  s3_access_key: minioadmin

@@ -105,36 +105,36 @@ export class AuthService {
asResponse: true,
});

// Extract bearer token from response headers (set by bearer plugin)
const token = response.headers.get('set-auth-token');

if (!token) {
this.config.logger.error('Bearer token not found in response headers');
return {
token: '',
userId: '',
error: 'Authentication token not generated',
};
}

// Parse the response body to get user info
const result = await response.json() as {
user?: { id: string; email: string; name: string };
error?: string;
};

if (!response.ok) {
this.config.logger.warn({ status: response.status }, 'Sign in rejected by auth provider');
return {
token: '',
userId: '',
error: 'Invalid email or password.',
};
}

// Extract bearer token from response headers (set by bearer plugin)
const token = response.headers.get('set-auth-token');

this.config.logger.debug({
hasUser: !!result.user,
userId: result.user?.id,
hasToken: !!token,
}, 'Sign in result');

if (!result.user) {
this.config.logger.warn('Sign in failed: no user in result');
if (!token || !result.user) {
this.config.logger.error({ hasToken: !!token, hasUser: !!result.user }, 'Sign in succeeded but session data missing');
return {
token: '',
userId: '',
error: 'Invalid credentials',
error: 'Login failed. Please try again.',
};
}

@@ -147,7 +147,7 @@ export class AuthService {
return {
token: '',
userId: '',
error: error.message || 'Sign in failed',
error: 'Login failed. Please try again.',
};
}
}

@@ -5,6 +5,8 @@ import type { AgentHarness, HarnessFactory } from '../harness/agent-harness.js';
import type { HarnessEvent } from '../harness/harness-events.js';
import type { InboundMessage } from '../types/messages.js';
import { randomUUID } from 'crypto';
import { parseModelTag, MODEL_TAGS } from '../llm/model-tags.js';
import type { LLMProvider } from '../llm/provider.js';
import type { SessionRegistry, EventSubscriber, Session } from '../events/index.js';
import type { OHLCService, BarUpdateCallback } from '../services/ohlc-service.js';
import type { SymbolIndexService } from '../services/symbol-index-service.js';
@@ -30,6 +32,24 @@ function jsonStringifySafe(obj: any): string {
);
}

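/**
 * Coalesces streamed text and sends at most one agent_chunk frame per delayMs,
 * trading a little latency for far fewer WebSocket sends. Callers flush()
 * before emitting any non-chunk event so event ordering is preserved.
 */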
function makeChunkDebouncer(send: (content: string) => void, delayMs = 200) {
  let buffer = '';
  let timer: ReturnType<typeof setTimeout> | null = null;

  function flush() {
    if (timer !== null) { clearTimeout(timer); timer = null; }
    if (buffer.length > 0) { send(buffer); buffer = ''; }
  }

  function add(content: string) {
    buffer += content;
    if (timer !== null) clearTimeout(timer);
    timer = setTimeout(flush, delayMs);
  }

  return { add, flush };
}

export type SessionStatus = 'authenticating' | 'spinning_up' | 'initializing' | 'ready' | 'error'

function sendStatus(socket: WebSocket, status: SessionStatus, message: string): void {
@@ -257,6 +277,7 @@ export class WebSocketHandler {
userId: authContext.userId,
licenseType: authContext.license.licenseType,
message: 'Connected to Dexorder AI',
modelTags: MODEL_TAGS.map(m => m.tag),
})
);

@@ -272,25 +293,32 @@ export class WebSocketHandler {
} else {
// First conversation — auto-send greeting prompt and stream the response
socket.send(JSON.stringify({ type: 'agent_chunk', content: '', done: false }));
const greetingDebouncer = makeChunkDebouncer(content =>
socket.send(JSON.stringify({ type: 'agent_chunk', content, done: false }))
);
for await (const event of harness!.streamGreeting()) {
const e = event as HarnessEvent;
switch (e.type) {
case 'chunk':
socket.send(JSON.stringify({ type: 'agent_chunk', content: e.content, done: false }));
greetingDebouncer.add(e.content);
break;
case 'tool_call':
greetingDebouncer.flush();
socket.send(JSON.stringify({ type: 'agent_tool_call', toolName: e.toolName, label: e.label }));
break;
case 'image':
greetingDebouncer.flush();
socket.send(JSON.stringify({ type: 'image', data: e.data, mimeType: e.mimeType, caption: e.caption }));
break;
case 'error':
greetingDebouncer.flush();
socket.send(JSON.stringify({ type: 'text', text: `An error occurred during greeting.` }));
break;
case 'done':
break;
}
}
greetingDebouncer.flush();
socket.send(JSON.stringify({ type: 'agent_chunk', content: '', done: true }));
}
}
@@ -304,47 +332,75 @@ export class WebSocketHandler {

// Route based on message type
if (payload.type === 'message' || payload.type === 'agent_user_message') {
// Chat message - send to agent harness with streaming
const inboundMessage: InboundMessage = {
messageId: randomUUID(),
userId: authContext.userId,
sessionId: authContext.sessionId,
content: payload.content,
attachments: payload.attachments,
timestamp: new Date(),
};

if (!harness) {
logger.error('Harness not initialized');
socket.send(JSON.stringify({ type: 'error', message: 'Session not ready' }));
return;
}

// Check for @ModelTag at the start of the message
const parsedTag = parseModelTag(payload.content ?? '');
let messageContent: string = payload.content ?? '';
let modelOverride: { modelId: string; provider?: LLMProvider } | undefined;

if (parsedTag) {
await harness.clearHistory();
socket.send(JSON.stringify({ type: 'model_switched', tag: parsedTag.tag, modelId: parsedTag.modelId, rest: parsedTag.rest }));
messageContent = parsedTag.rest;
modelOverride = { modelId: parsedTag.modelId, provider: parsedTag.provider };
logger.info({ tag: parsedTag.tag, modelId: parsedTag.modelId }, 'Model tag switch');
}

// Chat message - send to agent harness with streaming
const inboundMessage: InboundMessage = {
messageId: randomUUID(),
userId: authContext.userId,
sessionId: authContext.sessionId,
content: messageContent,
attachments: payload.attachments,
timestamp: new Date(),
};

try {
// Acknowledge receipt immediately so the client can show the seen indicator
socket.send(JSON.stringify({ type: 'agent_chunk', content: '', done: false }));

logger.info('Streaming harness response');
let fatalError = false;
for await (const event of harness.streamMessage(inboundMessage)) {
const msgDebouncer = makeChunkDebouncer(content =>
socket.send(JSON.stringify({ type: 'agent_chunk', content, done: false }))
);
const stream = (parsedTag && !messageContent)
? harness.streamGreeting(modelOverride)
: harness.streamMessage(inboundMessage, { modelOverride });
for await (const event of stream) {
const e = event as HarnessEvent;
switch (e.type) {
case 'chunk':
socket.send(JSON.stringify({ type: 'agent_chunk', content: e.content, done: false }));
msgDebouncer.add(e.content);
break;
case 'tool_call':
msgDebouncer.flush();
socket.send(JSON.stringify({ type: 'agent_tool_call', toolName: e.toolName, label: e.label }));
break;
case 'subagent_tool_call':
msgDebouncer.flush();
socket.send(JSON.stringify({ type: 'subagent_tool_call', agentName: e.agentName, toolName: e.toolName, label: e.label }));
break;
case 'subagent_chunk':
msgDebouncer.flush();
socket.send(JSON.stringify({ type: 'subagent_chunk', agentName: e.agentName, content: e.content }));
break;
case 'subagent_thinking':
msgDebouncer.flush();
socket.send(JSON.stringify({ type: 'subagent_thinking', agentName: e.agentName, content: e.content }));
break;
case 'image':
msgDebouncer.flush();
socket.send(JSON.stringify({ type: 'image', data: e.data, mimeType: e.mimeType, caption: e.caption }));
break;
case 'error':
msgDebouncer.flush();
socket.send(JSON.stringify({ type: 'text', text: `An unrecoverable error occurred in the ${e.source}.` }));
if (e.fatal) fatalError = true;
break;
@@ -352,6 +408,7 @@ export class WebSocketHandler {
break;
}
}
msgDebouncer.flush();

if (fatalError) {
socket.close(1011, 'Fatal error');
@@ -451,6 +508,9 @@ export class WebSocketHandler {
case 'subagent_tool_call':
socket.send(JSON.stringify({ type: 'subagent_tool_call', agentName: e.agentName, toolName: e.toolName, label: e.label }));
break;
case 'subagent_thinking':
socket.send(JSON.stringify({ type: 'subagent_thinking', agentName: e.agentName, content: e.content }));
break;
case 'tool_call':
socket.send(JSON.stringify({ type: 'agent_tool_call', toolName: e.toolName, label: e.label }));
break;
@@ -730,6 +790,13 @@ export class WebSocketHandler {
// Create a per-subscription callback that forwards bars to this socket
const barCallback: BarUpdateCallback = (bar) => {
if (socket.readyState !== 1 /* OPEN */) return;
const symbolMeta = symbolIndexService?.getSymbolByTicker(bar.ticker);
const priceDivisor = (symbolMeta?.price_precision ?? 0) > 0
? Math.pow(10, symbolMeta!.price_precision!)
: 1;
const sizeDivisor = (symbolMeta?.size_precision ?? 0) > 0
? Math.pow(10, symbolMeta!.size_precision!)
: 1;
socket.send(JSON.stringify({
type: 'bar_update',
subscription_id: payload.subscription_id,
@@ -739,11 +806,11 @@ export class WebSocketHandler {
bar: {
// Convert nanoseconds → seconds for client compatibility
time: Number(bar.timestamp / 1_000_000_000n),
open: bar.open,
high: bar.high,
low: bar.low,
close: bar.close,
volume: bar.volume,
open: bar.open / priceDivisor,
high: bar.high / priceDivisor,
low: bar.low / priceDivisor,
close: bar.close / priceDivisor,
volume: bar.volume / sizeDivisor,
},
}));
};

@@ -151,7 +151,7 @@ export function deserializeUserEvent(data: Buffer): UserEvent {
eventId: json.event_id,
timestamp: json.timestamp,
eventType: json.event_type as EventType,
payload: Buffer.from(json.payload, 'base64'),
payload: json.payload ? Buffer.from(json.payload, 'base64') : Buffer.alloc(0),
delivery: {
priority: json.delivery.priority as Priority,
channels: json.delivery.channels.map(

@@ -7,7 +7,7 @@ Comprehensive agent orchestration system for Dexorder AI platform, built on Lang
```
gateway/src/
├── harness/
│   ├── memory/ # Storage layer (Redis + Iceberg + Qdrant)
│   ├── memory/ # Storage layer (Redis + Iceberg)
│   ├── subagents/ # Specialized agents with multi-file memory
│   ├── workflows/ # LangGraph state machines
│   ├── prompts/ # System prompts
@@ -27,13 +27,10 @@ Tiered storage architecture:

- **Redis**: Hot state (active sessions, checkpoints)
- **Iceberg**: Cold storage (durable conversations, analytics)
- **Qdrant**: Vector search (RAG, semantic memory)

**Key Files:**
- `checkpoint-saver.ts`: LangGraph checkpoint persistence
- `conversation-store.ts`: Message history management
- `rag-retriever.ts`: Vector similarity search
- `embedding-service.ts`: Text→vector conversion
- `session-context.ts`: User context with channel metadata

### 2. Tools (`../tools/`)
@@ -176,19 +173,11 @@ Based on [harness-rag.txt discussion](../../chat/harness-rag.txt):
- Time-travel queries
- GDPR-compliant deletion with compaction

### Vector Search (Qdrant)
- Conversation embeddings
- Long-term memory
- RAG retrieval
- Payload-indexed by user_id for fast GDPR deletion
- **Global knowledge base** (user_id="0") loaded from markdown files

### GDPR Compliance

```typescript
// Delete user data across all stores
await conversationStore.deleteUserData(userId);
await ragRetriever.deleteUserData(userId);
await checkpointSaver.delete(userId);
await containerManager.deleteContainer(userId);

@@ -247,19 +236,13 @@ Already in `gateway/package.json`:
import Redis from 'ioredis';
import {
TieredCheckpointSaver,
ConversationStore,
EmbeddingService,
RAGRetriever
ConversationStore
} from './harness/memory';

const redis = new Redis(process.env.REDIS_URL);

const checkpointSaver = new TieredCheckpointSaver(redis, logger);
const conversationStore = new ConversationStore(redis, logger);
const embeddings = new EmbeddingService({ provider: 'openai', apiKey }, logger);
const ragRetriever = new RAGRetriever({ url: QDRANT_URL }, logger);

await ragRetriever.initialize();
```

### 3. Create Subagents
@@ -309,56 +292,6 @@ const analysis = await skill.execute({
});
```

## Global Knowledge System

The harness includes a document loader that automatically loads markdown files from `gateway/knowledge/` into Qdrant as global knowledge (user_id="0").

### Directory Structure
```
gateway/knowledge/
├── platform/ # Platform capabilities and architecture
├── trading/ # Trading concepts and fundamentals
├── indicators/ # Indicator development guides
└── strategies/ # Strategy patterns and examples
```

### How It Works

1. **Startup**: Documents are loaded automatically when gateway starts
2. **Chunking**: Intelligent splitting by markdown headers (~1000 tokens/chunk)
3. **Embedding**: Chunks are embedded using configured embedding service
4. **Storage**: Stored in Qdrant with user_id="0" (global namespace)
5. **Updates**: Content hashing detects changes for incremental updates

### RAG Query Flow

When a user sends a message:
1. Query is embedded using same embedding service
2. Qdrant searches vectors with filter: `user_id = current_user OR user_id = "0"`
3. Results include both user-specific and global knowledge
4. Relevant chunks are added to LLM context
5. LLM generates response with platform knowledge

### Managing Knowledge

**Add new documents**:
```bash
# Create markdown file in appropriate directory
echo "# New Topic" > gateway/knowledge/platform/new-topic.md

# Reload knowledge (development)
curl -X POST http://localhost:3000/admin/reload-knowledge
```

**Check stats**:
```bash
curl http://localhost:3000/admin/knowledge-stats
```

**In production**: Just deploy updated markdown files - they'll be loaded on startup.

See [gateway/knowledge/README.md](../../knowledge/README.md) for detailed documentation.

## Next Steps

1. **Implement Iceberg Integration**: Complete TODOs in checkpoint-saver.ts and conversation-store.ts
@@ -371,5 +304,4 @@ See [gateway/knowledge/README.md](../../knowledge/README.md) for detailed docume

- Architecture discussion: [chat/harness-rag.txt](../../chat/harness-rag.txt)
- LangGraph docs: https://langchain-ai.github.io/langgraphjs/
- Qdrant docs: https://qdrant.tech/documentation/
- Apache Iceberg: https://iceberg.apache.org/docs/latest/

@@ -7,7 +7,7 @@ import type { ConversationStore } from './memory/conversation-store.js';
import type { BlobStore } from './memory/blob-store.js';
import type { InboundMessage, OutboundMessage } from '../types/messages.js';
import { MCPClientConnector } from './mcp-client.js';
import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js';
import { LLMProvider, LLMProviderFactory, type ProviderConfig } from '../llm/provider.js';
import { ModelRouter, RoutingStrategy } from '../llm/router.js';
import type { ModelMiddleware } from '../llm/middleware.js';
import type { WorkspaceManager } from '../workspace/workspace-manager.js';
@@ -107,13 +107,10 @@ export class AgentHarness {
this.wikiLoader,
getToolRegistry(),
async (maxTokens?: number) => {
const { model } = await this.modelRouter.route(
'analyze and backtest research data',
this.config.license,
RoutingStrategy.COMPLEXITY,
this.config.userId,
maxTokens,
);
const { model } = this.modelRouter.createModel({
...this.modelFactory.getDefaultModel(),
...(maxTokens !== undefined && { maxTokens }),
});
return model;
},
config.logger,
@@ -363,34 +360,50 @@ export class AgentHarness {

this.config.logger.debug('Streaming model response...');
let response: any = null;
try {
const stream = await model.stream(messagesCopy, { signal });
for await (const chunk of stream) {
if (typeof chunk.content === 'string' && chunk.content.length > 0) {
this.config.logger.trace({ content: chunk.content }, 'raw chunk');
yield { type: 'chunk', content: chunk.content };
} else if (Array.isArray(chunk.content)) {
for (const block of chunk.content) {
if (block.type === 'text' && block.text) {
this.config.logger.trace({ content: block.text }, 'raw chunk');
yield { type: 'chunk', content: block.text };
const MAX_STREAM_ATTEMPTS = 4;
for (let attempt = 1; attempt <= MAX_STREAM_ATTEMPTS; attempt++) {
response = null;
try {
const stream = await model.stream(messagesCopy, { signal });
for await (const chunk of stream) {
if (typeof chunk.content === 'string' && chunk.content.length > 0) {
this.config.logger.trace({ content: chunk.content }, 'raw chunk');
yield { type: 'chunk', content: chunk.content };
} else if (Array.isArray(chunk.content)) {
for (const block of chunk.content) {
if (block.type === 'text' && block.text) {
this.config.logger.trace({ content: block.text }, 'raw chunk');
yield { type: 'chunk', content: block.text };
}
}
}
response = response ? response.concat(chunk) : chunk;
}
response = response ? response.concat(chunk) : chunk;
break; // success — exit retry loop
} catch (invokeError: any) {
const is429 = invokeError?.status === 429 || invokeError?.lc_error_code === 'MODEL_RATE_LIMIT';
if (is429 && attempt < MAX_STREAM_ATTEMPTS) {
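// Prefer the server's Retry-After hint when present; otherwise back off
// linearly (5s, 10s, 15s), capped at 30s.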
const delaySec = parseRetryAfter(invokeError?.headers);
const delayMs = delaySec != null ? delaySec * 1000 : Math.min(5000 * attempt, 30000);
this.config.logger.warn(
{ attempt, delayMs, iteration: iterations, messageCount: messagesCopy.length },
'Model rate limited (429), retrying after delay'
);
await new Promise(resolve => setTimeout(resolve, delayMs));
continue;
}
this.config.logger.error(
{
error: invokeError,
errorMessage: invokeError?.message,
errorStack: invokeError?.stack,
iteration: iterations,
messageCount: messagesCopy.length,
},
'Model streaming failed in tool calling loop'
);
throw invokeError;
}
} catch (invokeError: any) {
this.config.logger.error(
{
error: invokeError,
errorMessage: invokeError?.message,
errorStack: invokeError?.stack,
iteration: iterations,
messageCount: messagesCopy.length,
},
'Model streaming failed in tool calling loop'
);
throw invokeError;
}

this.config.logger.info(
@@ -684,7 +697,7 @@ export class AgentHarness {
 * Yields typed HarnessEvents (chunk, tool_call, image, done) and saves the
 * conversation to the store once the done event has been emitted.
 */
async *streamMessage(message: InboundMessage, options?: { saveUserMessage?: boolean }): AsyncGenerator<HarnessEvent> {
async *streamMessage(message: InboundMessage, options?: { saveUserMessage?: boolean; modelOverride?: { modelId: string; provider?: LLMProvider } }): AsyncGenerator<HarnessEvent> {
this.config.logger.info(
{ messageId: message.messageId, userId: message.userId, content: message.content.substring(0, 100) },
'Processing user message'
@@ -725,12 +738,24 @@ export class AgentHarness {

// 4. Get the configured model
this.config.logger.debug('Routing to model');
const { model, middleware } = await this.modelRouter.route(
message.content,
this.config.license,
RoutingStrategy.COMPLEXITY,
this.config.userId
);
let model, middleware;
if (options?.modelOverride) {
const defaultConfig = this.modelRouter.getDefaultModelConfig();
({ model, middleware } = this.modelRouter.createModel({
...defaultConfig,
model: options.modelOverride.modelId,
provider: options.modelOverride.provider ?? defaultConfig.provider,
thinking: undefined,
}));
this.config.logger.info({ modelId: options.modelOverride.modelId, provider: options.modelOverride.provider }, 'Using @tag model override');
} else {
({ model, middleware } = await this.modelRouter.route(
message.content,
this.config.license,
RoutingStrategy.COMPLEXITY,
this.config.userId
));
}
this.middleware = middleware;
this.config.logger.info({ modelName: model.constructor.name }, 'Model selected');

@@ -837,11 +862,18 @@ export class AgentHarness {
}
}

async clearHistory(): Promise<void> {
  if (this.conversationStore) {
    const channelKey = this.config.channelType ?? ChannelType.WEBSOCKET;
    await this.conversationStore.deleteSession(this.config.userId, this.config.sessionId, channelKey);
  }
}

/**
 * Stream a greeting response for first-time users.
 * Stream a greeting response for first-time users (or after a model switch).
 * Sends "Who are you and what can you do?" through the normal message pipeline.
 */
async *streamGreeting(): AsyncGenerator<HarnessEvent> {
async *streamGreeting(modelOverride?: { modelId: string; provider?: LLMProvider }): AsyncGenerator<HarnessEvent> {
const content = await AgentHarness.loadWelcomePrompt();
const greetingMessage: InboundMessage = {
messageId: `greeting_${Date.now()}`,
@@ -850,7 +882,7 @@ export class AgentHarness {
content,
timestamp: new Date(),
};
yield* this.streamMessage(greetingMessage, { saveUserMessage: false });
yield* this.streamMessage(greetingMessage, { saveUserMessage: false, modelOverride });
}

/**
@@ -1040,9 +1072,25 @@ export class AgentHarness {
}

// =============================================================================
// Details update helpers (module-level, no class dependency)
// Helpers (module-level, no class dependency)
// =============================================================================

/**
 * Parse the Retry-After header value into seconds.
 * Accepts both delta-seconds ("30") and HTTP-date ("Mon, 01 Jan 2026 00:00:00 GMT").
 * Returns null if the header is absent or unparseable.
 */
function parseRetryAfter(headers: Record<string, string> | undefined): number | null {
  if (!headers) return null;
  const value = headers['retry-after'] ?? headers['Retry-After'];
  if (!value) return null;
  const num = parseFloat(value);
  if (!isNaN(num)) return Math.max(0, num);
  const date = new Date(value);
  if (!isNaN(date.getTime())) return Math.max(0, (date.getTime() - Date.now()) / 1000);
  return null;
}
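// e.g. parseRetryAfter({ 'retry-after': '30' }) → 30, while an HTTP-date value
// yields the non-negative number of seconds remaining until that date.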

/**
 * Produce a minimal unified diff between two strings, suitable for passing to
 * an LLM as a change description. Returns an empty string when there is no diff.

@@ -48,4 +48,10 @@ export interface ErrorEvent {
fatal: boolean;
}

export type HarnessEvent = ChunkEvent | ToolCallEvent | ImageEvent | DoneEvent | SubagentChunkEvent | SubagentThinkingEvent | SubagentToolCallEvent | ErrorEvent;
export interface ModelSwitchedEvent {
  type: 'model_switched';
  tag: string;
  modelId: string;
}

export type HarnessEvent = ChunkEvent | ToolCallEvent | ImageEvent | DoneEvent | SubagentChunkEvent | SubagentThinkingEvent | SubagentToolCallEvent | ErrorEvent | ModelSwitchedEvent;

@@ -1,8 +1,5 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { SystemMessage, HumanMessage } from '@langchain/core/messages';

/** All platform tool names available to every subagent. */
const ALL_PLATFORM_TOOLS = ['SymbolLookup', 'GetChartData', 'GetTicker24h', 'WebSearch', 'FetchPage', 'ArxivSearch'];
import type { FastifyBaseLogger } from 'fastify';
import { createReactAgent } from '@langchain/langgraph/prebuilt';
import type { HarnessEvent, SubagentChunkEvent, SubagentThinkingEvent } from '../harness-events.js';
@@ -13,6 +10,62 @@ import type { ToolRegistry } from '../../tools/tool-registry.js';
import type { MCPToolInfo } from '../../tools/mcp/mcp-tool-wrapper.js';
import { WikiLoader, type SpawnContext } from './wiki-loader.js';

/** All platform tool names available to every subagent. */
const ALL_PLATFORM_TOOLS = ['SymbolLookup', 'GetChartData', 'GetTicker24h', 'WebSearch', 'FetchPage', 'ArxivSearch'];

/**
 * Streaming filter that strips triple-backtick fenced code blocks from text as it
 * arrives in chunks. Holds back at most 2 characters of look-ahead so normal text
 * streams through with no perceptible delay.
 */
class FenceFilter {
  private buf = '';
  private inFence = false;

  write(chunk: string): string {
    this.buf += chunk;
    return this.drain(false);
  }

  end(): string {
    return this.drain(true);
  }

  private drain(final: boolean): string {
    let out = '';
    while (true) {
      if (!this.inFence) {
        const start = this.buf.indexOf('```');
        if (start === -1) {
          const keep = final ? this.buf.length : Math.max(0, this.buf.length - 2);
          out += this.buf.slice(0, keep);
          this.buf = this.buf.slice(keep);
          break;
        }
        out += this.buf.slice(0, start);
        const headerEnd = this.buf.indexOf('\n', start + 3);
        if (headerEnd === -1 && !final) {
          this.buf = this.buf.slice(start);
          break;
        }
        this.inFence = true;
        this.buf = headerEnd !== -1 ? this.buf.slice(headerEnd + 1) : '';
      } else {
        const end = this.buf.indexOf('```');
        if (end === -1) {
          this.buf = final ? '' : this.buf.slice(Math.max(0, this.buf.length - 2));
          break;
        }
        this.inFence = false;
        const closingEnd = this.buf.indexOf('\n', end + 3);
        this.buf = closingEnd !== -1 ? this.buf.slice(closingEnd + 1) : this.buf.slice(end + 3);
      }
    }
    // Collapse blank lines left where code blocks were removed
    return out.replace(/\n{3,}/g, '\n\n');
  }
}
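// Example: streaming 'intro ```ts\ncode\n``` outro' through write()/end()
// yields 'intro ' and ' outro' — the fence header and its contents are dropped.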

export interface SpawnInput {
  agentName: string;
  instruction: string;
@@ -138,13 +191,15 @@ export class SpawnService {
);

let finalText = '';
const fenceFilter = new FenceFilter();

for await (const [mode, data] of await stream) {
if (signal?.aborted) break;

if (mode === 'messages') {
for (const chunk of SpawnService.extractStreamChunks(data, agentName)) {
yield chunk;
const filtered = fenceFilter.write(chunk.content);
if (filtered) yield { ...chunk, content: filtered };
}
} else if (mode === 'updates') {
if ((data as any).agent?.messages) {
@@ -167,6 +222,9 @@ export class SpawnService {
}
}

const tail = fenceFilter.end();
if (tail) yield { type: 'subagent_chunk', agentName, content: tail };

this.logger.info(
{ agentName, textLength: finalText.length, imageCount: imageCapture.length },
'SpawnService: finished'
@@ -182,12 +240,16 @@ export class SpawnService {

/**
 * Extract subagent_chunk / subagent_thinking events from a LangGraph `messages` stream datum.
 * Only processes AIMessageChunks — ToolMessages (identified by tool_call_id) are skipped
 * because their content is raw tool result data, not agent narrative text.
 */
static extractStreamChunks(
data: unknown,
agentName: string,
): Array<SubagentChunkEvent | SubagentThinkingEvent> {
const msg = Array.isArray(data) ? (data as unknown[])[0] : data;
// ToolMessages have tool_call_id; AIMessageChunks don't — skip tool results
if ((msg as any)?.tool_call_id != null) return [];
const content = (msg as any)?.content;
if (typeof content === 'string') {
return content ? [{ type: 'subagent_chunk', agentName, content }] : [];

@@ -14,6 +14,4 @@ approvalNodes:
maxPositionPercent: 0.05 # 5% of portfolio max
minRiskRewardRatio: 2.0 # Minimum 2:1 risk/reward

# Model override (optional)
model: claude-sonnet-4-6
temperature: 0.2

gateway/src/llm/model-tags.ts (new file, +30)
@@ -0,0 +1,30 @@
import { LLMProvider } from './provider.js';

export interface ModelTag {
  tag: string;
  modelId: string;
  provider?: LLMProvider;
}

export const MODEL_TAGS: ModelTag[] = [
  { tag: 'DeepSeek-Flash', modelId: 'deepseek-ai/DeepSeek-V4-Flash' },
  { tag: 'DeepSeek-Pro', modelId: 'deepseek-ai/DeepSeek-V4-Pro' },
  { tag: 'Kimi', modelId: 'moonshotai/Kimi-K2.6' },
  { tag: 'GLM', modelId: 'zai-org/GLM-5' },
  { tag: 'Qwen', modelId: 'Qwen/Qwen3.5-27B' },
  { tag: 'MiniMax', modelId: 'MiniMaxAI/MiniMax-M2.5' },
  { tag: 'Sonnet', modelId: 'claude-sonnet-4-6', provider: LLMProvider.ANTHROPIC },
  { tag: 'Haiku', modelId: 'claude-haiku-4-5-20251001', provider: LLMProvider.ANTHROPIC },
  { tag: 'Opus', modelId: 'claude-opus-4-7', provider: LLMProvider.ANTHROPIC },
];

/** Parse a leading @Tag from message content. Case-insensitive. Returns null if not a known tag. */
export function parseModelTag(content: string): (ModelTag & { rest: string }) | null {
  const trimmed = content.trimStart();
  if (!trimmed.startsWith('@')) return null;
  const spaceIdx = trimmed.indexOf(' ');
  const tagName = spaceIdx === -1 ? trimmed.slice(1) : trimmed.slice(1, spaceIdx);
  const rest = spaceIdx === -1 ? '' : trimmed.slice(spaceIdx + 1).trim();
  const found = MODEL_TAGS.find(m => m.tag.toLowerCase() === tagName.toLowerCase());
  return found ? { ...found, rest } : null;
}
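
// Examples (illustrative): tags match case-insensitively and are stripped from `rest`:
//   parseModelTag('@haiku what moved BTC today?')
//     // → { tag: 'Haiku', modelId: 'claude-haiku-4-5-20251001', provider: LLMProvider.ANTHROPIC, rest: 'what moved BTC today?' }
//   parseModelTag('no tag here') // → null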
@@ -1,5 +1,6 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { ChatOpenAI } from '@langchain/openai';
import { ChatAnthropic } from '@langchain/anthropic';
import type { FastifyBaseLogger } from 'fastify';
import { type ModelMiddleware, NoopMiddleware, AnthropicCachingMiddleware } from './middleware.js';

@@ -11,6 +12,8 @@ export { NoopMiddleware, AnthropicCachingMiddleware };
 */
export enum LLMProvider {
  DEEP_INFRA = 'deepinfra',
  DEEP_INFRA_ANTHROPIC = 'deepinfra_anthropic',
  ANTHROPIC = 'anthropic',
}

/**
@@ -21,15 +24,16 @@ export interface ModelConfig {
  model: string;
  temperature?: number;
  maxTokens?: number;
  thinking?: { budgetTokens: number };
}

/**
 * License tier model configuration
 */
export interface LicenseTierModels {
  default: string;
  cost_optimized: string;
  complex: string;
  default: string | null;
  cost_optimized: string | null;
  complex: string | null;
  allowed_models?: string[];
  blocked_models?: string[];
}
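
// A tier whose model entries are null (e.g. { default: null, cost_optimized: null, complex: null })
// makes the router fall back to its configured default model for that slot.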

@@ -48,11 +52,13 @@
 */
export interface ProviderConfig {
  deepinfraApiKey?: string;
  anthropicApiKey?: string;
  defaultModel?: ModelConfig;
  licenseModels?: LicenseModelsConfig;
}

const DEEP_INFRA_BASE_URL = 'https://api.deepinfra.com/v1/openai';
const DEEP_INFRA_ANTHROPIC_BASE_URL = 'https://api.deepinfra.com/anthropic';

/**
 * LLM Provider factory
@@ -80,6 +86,12 @@ export class LLMProviderFactory {
case LLMProvider.DEEP_INFRA:
return this.createDeepInfraModel(modelConfig);

case LLMProvider.DEEP_INFRA_ANTHROPIC:
return this.createDeepInfraAnthropicModel(modelConfig);

case LLMProvider.ANTHROPIC:
return this.createAnthropicModel(modelConfig);

default:
throw new Error(`Unsupported provider: ${modelConfig.provider}`);
}
@@ -106,6 +118,49 @@ export class LLMProviderFactory {
return { model, middleware: new NoopMiddleware() };
}

/**
 * Create Deep Infra model via Anthropic-compatible API (supports thinking)
 */
private createDeepInfraAnthropicModel(config: ModelConfig): { model: ChatAnthropic; middleware: AnthropicCachingMiddleware } {
  if (!this.config.deepinfraApiKey) {
    throw new Error('Deep Infra API key not configured');
  }

  const model = new ChatAnthropic({
    model: config.model,
    ...(!config.thinking && { temperature: config.temperature ?? 0.7 }),
    maxTokens: config.maxTokens ?? 8192,
    anthropicApiKey: this.config.deepinfraApiKey,
    clientOptions: { baseURL: DEEP_INFRA_ANTHROPIC_BASE_URL },
    ...(config.thinking && {
      thinking: { type: 'enabled' as const, budget_tokens: config.thinking.budgetTokens },
    }),
  });

  return { model, middleware: new AnthropicCachingMiddleware() };
}

/**
 * Create model via native Anthropic API (not Deep Infra)
 */
private createAnthropicModel(config: ModelConfig): { model: ChatAnthropic; middleware: AnthropicCachingMiddleware } {
  if (!this.config.anthropicApiKey) {
    throw new Error('Anthropic API key not configured');
  }

  const model = new ChatAnthropic({
    model: config.model,
    ...(!config.thinking && { temperature: config.temperature ?? 0.7 }),
    maxTokens: config.maxTokens ?? 8192,
    anthropicApiKey: this.config.anthropicApiKey,
    ...(config.thinking && {
      thinking: { type: 'enabled' as const, budget_tokens: config.thinking.budgetTokens },
    }),
  });

  return { model, middleware: new AnthropicCachingMiddleware() };
}

/**
 * Get default model based on environment
 */
@@ -118,10 +173,7 @@
throw new Error('Deep Infra API key not configured');
}

return {
provider: LLMProvider.DEEP_INFRA,
model: 'zai-org/GLM-5',
};
throw new Error('Default model not configured — set defaults.model in gateway config');
}

/**
@@ -132,16 +184,3 @@ export class LLMProviderFactory {
}
}

/**
 * Predefined model configurations
 */
export const MODELS = {
  GLM_5: {
    provider: LLMProvider.DEEP_INFRA,
    model: 'zai-org/GLM-5',
  },
  QWEN_235B: {
    provider: LLMProvider.DEEP_INFRA,
    model: 'Qwen/Qwen3-235B-A22B-Instruct-2507',
  },
} as const satisfies Record<string, ModelConfig>;

@@ -1,6 +1,6 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { FastifyBaseLogger } from 'fastify';
import { LLMProviderFactory, type ModelConfig, LLMProvider, type LicenseModelsConfig } from './provider.js';
import { LLMProviderFactory, type ModelConfig, type LicenseModelsConfig } from './provider.js';
import type { ModelMiddleware } from './middleware.js';
import type { License } from '../types/user.js';

@@ -35,6 +35,17 @@ export class ModelRouter {
this.licenseModels = factory.getLicenseModelsConfig();
}

/**
 * Create a model directly from a config, bypassing routing logic.
 */
createModel(config: ModelConfig): { model: BaseChatModel; middleware: ModelMiddleware } {
  return this.factory.createModel(config);
}

getDefaultModelConfig(): ModelConfig {
  return this.defaultModel;
}

/**
 * Route to appropriate model based on context
 */
@@ -107,73 +118,45 @@
private routeByComplexity(message: string, license: License): ModelConfig {
const isComplex = this.isComplexQuery(message);

// Use configuration if available
if (this.licenseModels) {
const tierConfig = this.licenseModels[license.licenseType];
if (tierConfig) {
const model = isComplex ? tierConfig.complex : tierConfig.default;
return { provider: this.defaultModel.provider as LLMProvider, model };
return model ? { ...this.defaultModel, model } : this.defaultModel;
}
}

// Fallback to hardcoded defaults
if (license.licenseType === 'enterprise') {
return isComplex
? { provider: LLMProvider.DEEP_INFRA, model: 'Qwen/Qwen3-235B-A22B-Instruct-2507' }
: { provider: LLMProvider.DEEP_INFRA, model: 'zai-org/GLM-5' };
}

if (license.licenseType === 'pro') {
return isComplex
? { provider: LLMProvider.DEEP_INFRA, model: 'zai-org/GLM-5' }
: { provider: LLMProvider.DEEP_INFRA, model: 'zai-org/GLM-5' };
}

return { provider: LLMProvider.DEEP_INFRA, model: 'zai-org/GLM-5' };
return this.defaultModel;
}

/**
 * Route based on license tier
 */
private routeByLicenseTier(license: License): ModelConfig {
// Use configuration if available
if (this.licenseModels) {
const tierConfig = this.licenseModels[license.licenseType];
if (tierConfig) {
return { provider: this.defaultModel.provider as LLMProvider, model: tierConfig.default };
const model = tierConfig.default;
return model ? { ...this.defaultModel, model } : this.defaultModel;
}
}

// Fallback to hardcoded defaults
switch (license.licenseType) {
case 'enterprise':
return { provider: LLMProvider.DEEP_INFRA, model: 'zai-org/GLM-5' };

case 'pro':
return { provider: LLMProvider.DEEP_INFRA, model: 'zai-org/GLM-5' };

case 'free':
return { provider: LLMProvider.DEEP_INFRA, model: 'zai-org/GLM-5' };

default:
return this.defaultModel;
}
return this.defaultModel;
}

/**
 * Route to cheapest available model
 */
private routeByCost(license: License): ModelConfig {
// Use configuration if available
if (this.licenseModels) {
const tierConfig = this.licenseModels[license.licenseType];
if (tierConfig) {
return { provider: this.defaultModel.provider as LLMProvider, model: tierConfig.cost_optimized };
const model = tierConfig.cost_optimized;
return model ? { ...this.defaultModel, model } : this.defaultModel;
}
}

// Fallback: use GLM-5
return { provider: LLMProvider.DEEP_INFRA, model: 'zai-org/GLM-5' };
return this.defaultModel;
}

/**
@@ -199,18 +182,7 @@ export class ModelRouter {
}
}

// Fallback to hardcoded defaults
if (license.licenseType === 'free') {
const allowedModels = ['zai-org/GLM-5'];
return allowedModels.includes(model.model);
}

if (license.licenseType === 'pro') {
const blockedModels = ['Qwen/Qwen3-235B-A22B-Instruct-2507'];
return !blockedModels.includes(model.model);
}

// Enterprise: all models allowed
// Without tier config, all models allowed
return true;
}

@@ -86,27 +86,31 @@ function loadConfig() {
// LLM provider API keys and model configuration
providerConfig: {
deepinfraApiKey: secretsData.llm_providers?.deepinfra_api_key || process.env.DEEPINFRA_API_KEY,
defaultModel: {
provider: configData.defaults?.model_provider || 'deepinfra',
model: configData.defaults?.model || 'zai-org/GLM-5',
},
anthropicApiKey: secretsData.llm_providers?.anthropic_api_key || process.env.ANTHROPIC_API_KEY,
defaultModel: configData.defaults?.model ? {
provider: configData.defaults.model_provider,
model: configData.defaults.model,
...(configData.defaults.thinking_budget_tokens && {
thinking: { budgetTokens: configData.defaults.thinking_budget_tokens },
}),
} : undefined,
licenseModels: {
free: {
default: configData.license_models?.free?.default || 'zai-org/GLM-5',
cost_optimized: configData.license_models?.free?.cost_optimized || 'zai-org/GLM-5',
complex: configData.license_models?.free?.complex || 'zai-org/GLM-5',
allowed_models: configData.license_models?.free?.allowed_models || ['zai-org/GLM-5'],
default: configData.license_models?.free?.default || null,
cost_optimized: configData.license_models?.free?.cost_optimized || null,
complex: configData.license_models?.free?.complex || null,
allowed_models: configData.license_models?.free?.allowed_models,
},
pro: {
default: configData.license_models?.pro?.default || 'zai-org/GLM-5',
cost_optimized: configData.license_models?.pro?.cost_optimized || 'zai-org/GLM-5',
complex: configData.license_models?.pro?.complex || 'zai-org/GLM-5',
blocked_models: configData.license_models?.pro?.blocked_models || ['Qwen/Qwen3-235B-A22B-Instruct-2507'],
default: configData.license_models?.pro?.default || null,
cost_optimized: configData.license_models?.pro?.cost_optimized || null,
complex: configData.license_models?.pro?.complex || null,
blocked_models: configData.license_models?.pro?.blocked_models,
},
enterprise: {
default: configData.license_models?.enterprise?.default || 'zai-org/GLM-5',
cost_optimized: configData.license_models?.enterprise?.cost_optimized || 'zai-org/GLM-5',
complex: configData.license_models?.enterprise?.complex || 'Qwen/Qwen3-235B-A22B-Instruct-2507',
default: configData.license_models?.enterprise?.default || null,
cost_optimized: configData.license_models?.enterprise?.cost_optimized || null,
complex: configData.license_models?.enterprise?.complex || null,
},
},
},
@@ -354,6 +358,7 @@ try {
icebergClient,
relayClient: zmqRelayClient,
logger: app.log,
getSymbolIndex: () => symbolIndexService,
});
app.log.info('OHLC service initialized');
} catch (error) {

@@ -28,12 +28,14 @@ import {
backendToTradingView,
DEFAULT_SUPPORTED_RESOLUTIONS,
} from '../types/ohlc.js';
import type { SymbolIndexService } from './symbol-index-service.js';

export interface OHLCServiceConfig {
  icebergClient: IcebergClient;
  relayClient: ZMQRelayClient;
  logger: FastifyBaseLogger;
  requestTimeout?: number; // Request timeout in ms (default: 30000)
  getSymbolIndex?: () => SymbolIndexService | undefined;
}

/**
@@ -45,11 +47,13 @@ export class OHLCService {
private icebergClient: IcebergClient;
private relayClient: ZMQRelayClient;
private logger: FastifyBaseLogger;
private getSymbolIndex?: () => SymbolIndexService | undefined;

constructor(config: OHLCServiceConfig) {
  this.icebergClient = config.icebergClient;
  this.relayClient = config.relayClient;
  this.logger = config.logger;
  this.getSymbolIndex = config.getSymbolIndex;
}

/**
@@ -129,7 +133,7 @@ export class OHLCService {
if (missingRanges.length === 0 && data.length > 0) {
// All data exists in Iceberg
this.logger.info({ ticker, period_seconds, cached: true }, 'OHLC data found in cache, returning immediately');
return this.formatHistoryResult(data, start_time, end_time, period_seconds, countback);
return this.formatHistoryResult(ticker, data, start_time, end_time, period_seconds, countback);
}

// Step 3: Request each missing range from the relay individually so we
@@ -160,7 +164,7 @@ export class OHLCService {
data = await this.icebergClient.queryOHLC(ticker, period_seconds, start_time, end_time);
this.logger.info({ ticker, period_seconds, dataCount: data.length }, 'Final Iceberg query complete, returning result');

return this.formatHistoryResult(data, start_time, end_time, period_seconds, countback);
return this.formatHistoryResult(ticker, data, start_time, end_time, period_seconds, countback);

} catch (error: any) {
this.logger.error({
@@ -179,8 +183,12 @@ export class OHLCService {
 * Interior gaps (confirmed trading periods with no trades) arrive as null-OHLC
 * rows from Iceberg. Edge gaps (data not yet ingested, in-progress candles) are
 * simply absent rows. Both are returned as-is; clients fill as appropriate.
 *
 * Applies decimal correction: Nautilus stores prices/volumes as integers;
 * divide by 10^price_precision and 10^size_precision to recover float values.
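 * e.g. a raw close of 4512345 with price_precision 2 becomes 45123.45.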
 */
private formatHistoryResult(
ticker: string,
data: any[],
// @ts-ignore
start_time: bigint,
@@ -197,9 +205,33 @@ export class OHLCService {
};
}

// Convert to TradingView format without null-filling missing slots.
// Convert to TradingView format (also converts BigInt fields to Number).
let bars: TradingViewBar[] = data.map(backendToTradingView);

// Apply decimal correction using symbol metadata.
const symbolMeta = this.getSymbolIndex?.()?.getSymbolByTicker(ticker);
if (symbolMeta) {
const pricePrecision = symbolMeta.price_precision;
const sizePrecision = symbolMeta.size_precision;
if (pricePrecision != null && pricePrecision > 0) {
const priceDivisor = Math.pow(10, pricePrecision);
bars = bars.map(bar => ({
...bar,
open: bar.open / priceDivisor,
high: bar.high / priceDivisor,
low: bar.low / priceDivisor,
close: bar.close / priceDivisor,
}));
}
if (sizePrecision != null && sizePrecision > 0) {
const sizeDivisor = Math.pow(10, sizePrecision);
bars = bars.map(bar => ({
...bar,
volume: bar.volume != null ? bar.volume / sizeDivisor : bar.volume,
}));
}
}

bars.sort((a, b) => a.time - b.time);

if (countback && bars.length > countback) {

@@ -91,6 +91,13 @@ export class SymbolIndexService {
await this.initPromise;
}

/**
 * Look up symbol metadata by Nautilus ticker (e.g. "BTC/USDT.BINANCE")
 */
getSymbolByTicker(ticker: string): SymbolMetadata | undefined {
  return this.symbols.get(ticker);
}

/**
 * Update or add a symbol to the index
 */

@@ -170,11 +170,11 @@ export function nanosToSeconds(nanos: bigint | number): number {
export function backendToTradingView(backend: BackendOHLC): TradingViewBar {
return {
time: nanosToSeconds(backend.timestamp),
open: backend.open,
high: backend.high,
low: backend.low,
close: backend.close,
volume: backend.volume ?? undefined,
open: Number(backend.open),
high: Number(backend.high),
low: Number(backend.low),
close: Number(backend.close),
volume: backend.volume != null ? Number(backend.volume) : undefined,
};
}