redesign fully scaffolded and web login works
This commit is contained in:
175
.agents/skills/better-auth-best-practices/SKILL.md
Normal file
175
.agents/skills/better-auth-best-practices/SKILL.md
Normal file
@@ -0,0 +1,175 @@
|
||||
---
|
||||
name: better-auth-best-practices
|
||||
description: Configure Better Auth server and client, set up database adapters, manage sessions, add plugins, and handle environment variables. Use when users mention Better Auth, betterauth, auth.ts, or need to set up TypeScript authentication with email/password, OAuth, or plugin configuration.
|
||||
---
|
||||
|
||||
# Better Auth Integration Guide
|
||||
|
||||
**Always consult [better-auth.com/docs](https://better-auth.com/docs) for code examples and latest API.**
|
||||
|
||||
---
|
||||
|
||||
## Setup Workflow
|
||||
|
||||
1. Install: `npm install better-auth`
|
||||
2. Set env vars: `BETTER_AUTH_SECRET` and `BETTER_AUTH_URL`
|
||||
3. Create `auth.ts` with database + config
|
||||
4. Create route handler for your framework
|
||||
5. Run `npx @better-auth/cli@latest migrate`
|
||||
6. Verify: call `GET /api/auth/ok` — should return `{ "ok": true }`
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### Environment Variables
|
||||
- `BETTER_AUTH_SECRET` - Encryption secret (min 32 chars). Generate: `openssl rand -base64 32`
|
||||
- `BETTER_AUTH_URL` - Base URL (e.g., `https://example.com`)
|
||||
|
||||
Only define `baseURL`/`secret` in config if env vars are NOT set.
|
||||
|
||||
### File Location
|
||||
CLI looks for `auth.ts` in: `./`, `./lib`, `./utils`, or under `./src`. Use `--config` for custom path.
|
||||
|
||||
### CLI Commands
|
||||
- `npx @better-auth/cli@latest migrate` - Apply schema (built-in adapter)
|
||||
- `npx @better-auth/cli@latest generate` - Generate schema for Prisma/Drizzle
|
||||
- `npx @better-auth/cli mcp --cursor` - Add MCP to AI tools
|
||||
|
||||
**Re-run after adding/changing plugins.**
|
||||
|
||||
---
|
||||
|
||||
## Core Config Options
|
||||
|
||||
| Option | Notes |
|
||||
|--------|-------|
|
||||
| `appName` | Optional display name |
|
||||
| `baseURL` | Only if `BETTER_AUTH_URL` not set |
|
||||
| `basePath` | Default `/api/auth`. Set `/` for root. |
|
||||
| `secret` | Only if `BETTER_AUTH_SECRET` not set |
|
||||
| `database` | Required for most features. See adapters docs. |
|
||||
| `secondaryStorage` | Redis/KV for sessions & rate limits |
|
||||
| `emailAndPassword` | `{ enabled: true }` to activate |
|
||||
| `socialProviders` | `{ google: { clientId, clientSecret }, ... }` |
|
||||
| `plugins` | Array of plugins |
|
||||
| `trustedOrigins` | CSRF whitelist |
|
||||
|
||||
---
|
||||
|
||||
## Database
|
||||
|
||||
**Direct connections:** Pass `pg.Pool`, `mysql2` pool, `better-sqlite3`, or `bun:sqlite` instance.
|
||||
|
||||
**ORM adapters:** Import from `better-auth/adapters/drizzle`, `better-auth/adapters/prisma`, `better-auth/adapters/mongodb`.
|
||||
|
||||
**Critical:** Better Auth uses adapter model names, NOT underlying table names. If Prisma model is `User` mapping to table `users`, use `modelName: "user"` (Prisma reference), not `"users"`.
|
||||
|
||||
---
|
||||
|
||||
## Session Management
|
||||
|
||||
**Storage priority:**
|
||||
1. If `secondaryStorage` defined → sessions go there (not DB)
|
||||
2. Set `session.storeSessionInDatabase: true` to also persist to DB
|
||||
3. No database + `cookieCache` → fully stateless mode
|
||||
|
||||
**Cookie cache strategies:**
|
||||
- `compact` (default) - Base64url + HMAC. Smallest.
|
||||
- `jwt` - Standard JWT. Readable but signed.
|
||||
- `jwe` - Encrypted. Maximum security.
|
||||
|
||||
**Key options:** `session.expiresIn` (default 7 days), `session.updateAge` (refresh interval), `session.cookieCache.maxAge`, `session.cookieCache.version` (bump to invalidate all cached session cookies and force a re-fetch from storage).
|
||||
|
||||
---
|
||||
|
||||
## User & Account Config
|
||||
|
||||
**User:** `user.modelName`, `user.fields` (column mapping), `user.additionalFields`, `user.changeEmail.enabled` (disabled by default), `user.deleteUser.enabled` (disabled by default).
|
||||
|
||||
**Account:** `account.modelName`, `account.accountLinking.enabled`, `account.storeAccountCookie` (for stateless OAuth).
|
||||
|
||||
**Required for registration:** `email` and `name` fields.
|
||||
|
||||
---
|
||||
|
||||
## Email Flows
|
||||
|
||||
- `emailVerification.sendVerificationEmail` - Must be defined for verification to work
|
||||
- `emailVerification.sendOnSignUp` / `sendOnSignIn` - Auto-send triggers
|
||||
- `emailAndPassword.sendResetPassword` - Password reset email handler
|
||||
|
||||
---
|
||||
|
||||
## Security
|
||||
|
||||
**In `advanced`:**
|
||||
- `useSecureCookies` - Force HTTPS cookies
|
||||
- `disableCSRFCheck` - ⚠️ Security risk
|
||||
- `disableOriginCheck` - ⚠️ Security risk
|
||||
- `crossSubDomainCookies.enabled` - Share cookies across subdomains
|
||||
- `ipAddress.ipAddressHeaders` - Custom IP headers for proxies
|
||||
- `database.generateId` - Custom ID generation or `"serial"`/`"uuid"`/`false`
|
||||
|
||||
**Rate limiting:** `rateLimit.enabled`, `rateLimit.window`, `rateLimit.max`, `rateLimit.storage` ("memory" | "database" | "secondary-storage").
|
||||
|
||||
---
|
||||
|
||||
## Hooks
|
||||
|
||||
**Endpoint hooks:** `hooks.before` / `hooks.after` - Array of `{ matcher, handler }`. Use `createAuthMiddleware`. Access `ctx.path`, `ctx.context.returned` (after), `ctx.context.session`.
|
||||
|
||||
**Database hooks:** `databaseHooks.user.create.before/after`, same for `session`, `account`. Useful for adding default values or post-creation actions.
|
||||
|
||||
**Hook context (`ctx.context`):** `session`, `secret`, `authCookies`, `password.hash()`/`verify()`, `adapter`, `internalAdapter`, `generateId()`, `tables`, `baseURL`.
|
||||
|
||||
---
|
||||
|
||||
## Plugins
|
||||
|
||||
**Import from dedicated paths for tree-shaking:**
|
||||
```
|
||||
import { twoFactor } from "better-auth/plugins/two-factor"
|
||||
```
|
||||
NOT `from "better-auth/plugins"`.
|
||||
|
||||
**Popular plugins:** `twoFactor`, `organization`, `passkey`, `magicLink`, `emailOtp`, `username`, `phoneNumber`, `admin`, `apiKey`, `bearer`, `jwt`, `multiSession`, `sso`, `oauthProvider`, `oidcProvider`, `openAPI`, `genericOAuth`.
|
||||
|
||||
Client plugins go in `createAuthClient({ plugins: [...] })`.
|
||||
|
||||
---
|
||||
|
||||
## Client
|
||||
|
||||
Import from: `better-auth/client` (vanilla), `better-auth/react`, `better-auth/vue`, `better-auth/svelte`, `better-auth/solid`.
|
||||
|
||||
Key methods: `signUp.email()`, `signIn.email()`, `signIn.social()`, `signOut()`, `useSession()`, `getSession()`, `revokeSession()`, `revokeSessions()`.
|
||||
|
||||
---
|
||||
|
||||
## Type Safety
|
||||
|
||||
Infer types: `typeof auth.$Infer.Session`, `typeof auth.$Infer.Session.user`.
|
||||
|
||||
For separate client/server projects: `createAuthClient<typeof auth>()`.
|
||||
|
||||
---
|
||||
|
||||
## Common Gotchas
|
||||
|
||||
1. **Model vs table name** - Config uses ORM model name, not DB table name
|
||||
2. **Plugin schema** - Re-run CLI after adding plugins
|
||||
3. **Secondary storage** - Sessions go there by default, not DB
|
||||
4. **Cookie cache** - Custom session fields NOT cached, always re-fetched
|
||||
5. **Stateless mode** - No DB = session in cookie only, logout on cache expiry
|
||||
6. **Change email flow** - Sends to current email first, then new email
|
||||
|
||||
---
|
||||
|
||||
## Resources
|
||||
|
||||
- [Docs](https://better-auth.com/docs)
|
||||
- [Options Reference](https://better-auth.com/docs/reference/options)
|
||||
- [LLMs.txt](https://better-auth.com/llms.txt)
|
||||
- [GitHub](https://github.com/better-auth/better-auth)
|
||||
- [Init Options Source](https://github.com/better-auth/better-auth/blob/main/packages/core/src/types/init-options.ts)
|
||||
321
.agents/skills/create-auth-skill/SKILL.md
Normal file
321
.agents/skills/create-auth-skill/SKILL.md
Normal file
@@ -0,0 +1,321 @@
|
||||
---
|
||||
name: create-auth-skill
|
||||
description: Scaffold and implement authentication in TypeScript/JavaScript apps using Better Auth. Detect frameworks, configure database adapters, set up route handlers, add OAuth providers, and create auth UI pages. Use when users want to add login, sign-up, or authentication to a new or existing project with Better Auth.
|
||||
---
|
||||
|
||||
# Create Auth Skill
|
||||
|
||||
Guide for adding authentication to TypeScript/JavaScript applications using Better Auth.
|
||||
|
||||
**For code examples and syntax, see [better-auth.com/docs](https://better-auth.com/docs).**
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Planning (REQUIRED before implementation)
|
||||
|
||||
Before writing any code, gather requirements by scanning the project and asking the user structured questions. This ensures the implementation matches their needs.
|
||||
|
||||
### Step 1: Scan the project
|
||||
|
||||
Analyze the codebase to auto-detect:
|
||||
- **Framework** — Look for `next.config`, `svelte.config`, `nuxt.config`, `astro.config`, `vite.config`, or Express/Hono entry files.
|
||||
- **Database/ORM** — Look for `prisma/schema.prisma`, `drizzle.config`, `package.json` deps (`pg`, `mysql2`, `better-sqlite3`, `mongoose`, `mongodb`).
|
||||
- **Existing auth** — Look for existing auth libraries (`next-auth`, `lucia`, `clerk`, `supabase/auth`, `firebase/auth`) in `package.json` or imports.
|
||||
- **Package manager** — Check for `pnpm-lock.yaml`, `yarn.lock`, `bun.lockb`, or `package-lock.json`.
|
||||
|
||||
Use what you find to pre-fill defaults and skip questions you can already answer.
|
||||
|
||||
### Step 2: Ask planning questions
|
||||
|
||||
Use the `AskQuestion` tool to ask the user **all applicable questions in a single call**. Skip any question you already have a confident answer for from the scan. Group them under a title like "Auth Setup Planning".
|
||||
|
||||
**Questions to ask:**
|
||||
|
||||
1. **Project type** (skip if detected)
|
||||
- Prompt: "What type of project is this?"
|
||||
- Options: New project from scratch | Adding auth to existing project | Migrating from another auth library
|
||||
|
||||
2. **Framework** (skip if detected)
|
||||
- Prompt: "Which framework are you using?"
|
||||
- Options: Next.js (App Router) | Next.js (Pages Router) | SvelteKit | Nuxt | Astro | Express | Hono | SolidStart | Other
|
||||
|
||||
3. **Database & ORM** (skip if detected)
|
||||
- Prompt: "Which database setup will you use?"
|
||||
- Options: PostgreSQL (Prisma) | PostgreSQL (Drizzle) | PostgreSQL (pg driver) | MySQL (Prisma) | MySQL (Drizzle) | MySQL (mysql2 driver) | SQLite (Prisma) | SQLite (Drizzle) | SQLite (better-sqlite3 driver) | MongoDB (Mongoose) | MongoDB (native driver)
|
||||
|
||||
4. **Authentication methods** (always ask, allow multiple)
|
||||
- Prompt: "Which sign-in methods do you need?"
|
||||
- Options: Email & password | Social OAuth (Google, GitHub, etc.) | Magic link (passwordless email) | Passkey (WebAuthn) | Phone number
|
||||
- `allow_multiple: true`
|
||||
|
||||
5. **Social providers** (only if they selected Social OAuth above — ask in a follow-up call)
|
||||
- Prompt: "Which social providers do you need?"
|
||||
- Options: Google | GitHub | Apple | Microsoft | Discord | Twitter/X
|
||||
- `allow_multiple: true`
|
||||
|
||||
6. **Email verification** (only if Email & password was selected above — ask in a follow-up call)
|
||||
- Prompt: "Do you want to require email verification?"
|
||||
- Options: Yes | No
|
||||
|
||||
7. **Email provider** (only if email verification is Yes, or if Password reset is selected in features — ask in a follow-up call)
|
||||
- Prompt: "How do you want to send emails?"
|
||||
- Options: Resend | Mock it for now (console.log)
|
||||
|
||||
8. **Features & plugins** (always ask, allow multiple)
|
||||
- Prompt: "Which additional features do you need?"
|
||||
- Options: Two-factor authentication (2FA) | Organizations / teams | Admin dashboard | API bearer tokens | Password reset | None of these
|
||||
- `allow_multiple: true`
|
||||
|
||||
9. **Auth pages** (always ask, allow multiple — pre-select based on earlier answers)
|
||||
- Prompt: "Which auth pages do you need?"
|
||||
- Options vary based on previous answers:
|
||||
- Always available: Sign in | Sign up
|
||||
- If Email & password selected: Forgot password | Reset password
|
||||
- If email verification enabled: Email verification
|
||||
- `allow_multiple: true`
|
||||
|
||||
10. **Auth UI style** (always ask)
|
||||
- Prompt: "What style do you want for the auth pages? Pick one or describe your own."
|
||||
- Options: Minimal & clean | Centered card with background | Split layout (form + hero image) | Floating / glassmorphism | Other (I'll describe)
|
||||
|
||||
### Step 3: Summarize the plan
|
||||
|
||||
After collecting answers, present a concise implementation plan as a markdown checklist. Example:
|
||||
|
||||
```
|
||||
## Auth Implementation Plan
|
||||
|
||||
- **Framework:** Next.js (App Router)
|
||||
- **Database:** PostgreSQL via Prisma
|
||||
- **Auth methods:** Email/password, Google OAuth, GitHub OAuth
|
||||
- **Plugins:** 2FA, Organizations, Email verification
|
||||
- **UI:** Custom forms
|
||||
|
||||
### Steps
|
||||
1. Install `better-auth` and `@better-auth/cli`
|
||||
2. Create `lib/auth.ts` with server config
|
||||
3. Create `lib/auth-client.ts` with React client
|
||||
4. Set up route handler at `app/api/auth/[...all]/route.ts`
|
||||
5. Configure Prisma adapter and generate schema
|
||||
6. Add Google & GitHub OAuth providers
|
||||
7. Enable `twoFactor` and `organization` plugins
|
||||
8. Set up email verification handler
|
||||
9. Run migrations
|
||||
10. Create sign-in / sign-up pages
|
||||
```
|
||||
|
||||
Ask the user to confirm the plan before proceeding to Phase 2.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Implementation
|
||||
|
||||
Only proceed here after the user confirms the plan from Phase 1.
|
||||
|
||||
Follow the decision tree below, guided by the answers collected above.
|
||||
|
||||
```
|
||||
Is this a new/empty project?
|
||||
├─ YES → New project setup
|
||||
│ 1. Install better-auth (+ scoped packages per plan)
|
||||
│ 2. Create auth.ts with all planned config
|
||||
│ 3. Create auth-client.ts with framework client
|
||||
│ 4. Set up route handler
|
||||
│ 5. Set up environment variables
|
||||
│ 6. Run CLI migrate/generate
|
||||
│ 7. Add plugins from plan
|
||||
│ 8. Create auth UI pages
|
||||
│
|
||||
├─ MIGRATING → Migration from existing auth
|
||||
│ 1. Audit current auth for gaps
|
||||
│ 2. Plan incremental migration
|
||||
│ 3. Install better-auth alongside existing auth
|
||||
│ 4. Migrate routes, then session logic, then UI
|
||||
│ 5. Remove old auth library
|
||||
│ 6. See migration guides in docs
|
||||
│
|
||||
└─ ADDING → Add auth to existing project
|
||||
1. Analyze project structure
|
||||
2. Install better-auth
|
||||
3. Create auth config matching plan
|
||||
4. Add route handler
|
||||
5. Run schema migrations
|
||||
6. Integrate into existing pages
|
||||
7. Add planned plugins and features
|
||||
```
|
||||
|
||||
At the end of implementation, guide users thoroughly on remaining next steps (e.g., setting up OAuth app credentials, deploying env vars, testing flows).
|
||||
|
||||
---
|
||||
|
||||
## Installation
|
||||
|
||||
**Core:** `npm install better-auth`
|
||||
|
||||
**Scoped packages (as needed):**
|
||||
| Package | Use case |
|
||||
|---------|----------|
|
||||
| `@better-auth/passkey` | WebAuthn/Passkey auth |
|
||||
| `@better-auth/sso` | SAML/OIDC enterprise SSO |
|
||||
| `@better-auth/stripe` | Stripe payments |
|
||||
| `@better-auth/scim` | SCIM user provisioning |
|
||||
| `@better-auth/expo` | React Native/Expo |
|
||||
|
||||
---
|
||||
|
||||
## Environment Variables
|
||||
|
||||
```env
|
||||
BETTER_AUTH_SECRET=<32+ chars, generate with: openssl rand -base64 32>
|
||||
BETTER_AUTH_URL=http://localhost:3000
|
||||
DATABASE_URL=<your database connection string>
|
||||
```
|
||||
|
||||
Add OAuth secrets as needed: `GITHUB_CLIENT_ID`, `GITHUB_CLIENT_SECRET`, `GOOGLE_CLIENT_ID`, etc.
|
||||
|
||||
---
|
||||
|
||||
## Server Config (auth.ts)
|
||||
|
||||
**Location:** `lib/auth.ts` or `src/lib/auth.ts`
|
||||
|
||||
**Minimal config needs:**
|
||||
- `database` - Connection or adapter
|
||||
- `emailAndPassword: { enabled: true }` - For email/password auth
|
||||
|
||||
**Standard config adds:**
|
||||
- `socialProviders` - OAuth providers (google, github, etc.)
|
||||
- `emailVerification.sendVerificationEmail` - Email verification handler
|
||||
- `emailAndPassword.sendResetPassword` - Password reset handler
|
||||
|
||||
**Full config adds:**
|
||||
- `plugins` - Array of feature plugins
|
||||
- `session` - Expiry, cookie cache settings
|
||||
- `account.accountLinking` - Multi-provider linking
|
||||
- `rateLimit` - Rate limiting config
|
||||
|
||||
**Export types:** `export type Session = typeof auth.$Infer.Session`
|
||||
|
||||
---
|
||||
|
||||
## Client Config (auth-client.ts)
|
||||
|
||||
**Import by framework:**
|
||||
| Framework | Import |
|
||||
|-----------|--------|
|
||||
| React/Next.js | `better-auth/react` |
|
||||
| Vue | `better-auth/vue` |
|
||||
| Svelte | `better-auth/svelte` |
|
||||
| Solid | `better-auth/solid` |
|
||||
| Vanilla JS | `better-auth/client` |
|
||||
|
||||
**Client plugins** go in `createAuthClient({ plugins: [...] })`.
|
||||
|
||||
**Common exports:** `signIn`, `signUp`, `signOut`, `useSession`, `getSession`
|
||||
|
||||
---
|
||||
|
||||
## Route Handler Setup
|
||||
|
||||
| Framework | File | Handler |
|
||||
|-----------|------|---------|
|
||||
| Next.js App Router | `app/api/auth/[...all]/route.ts` | `toNextJsHandler(auth)` → export `{ GET, POST }` |
|
||||
| Next.js Pages | `pages/api/auth/[...all].ts` | `toNodeHandler(auth)` → default export (disable `bodyParser` in route config) |
|
||||
| Express | Any file | `app.all("/api/auth/*", toNodeHandler(auth))` |
|
||||
| SvelteKit | `src/hooks.server.ts` | `svelteKitHandler(auth)` |
|
||||
| SolidStart | Route file | `solidStartHandler(auth)` |
|
||||
| Hono | Route file | `auth.handler(c.req.raw)` |
|
||||
|
||||
**Next.js Server Components:** Add `nextCookies()` plugin to auth config.
|
||||
|
||||
---
|
||||
|
||||
## Database Migrations
|
||||
|
||||
| Adapter | Command |
|
||||
|---------|---------|
|
||||
| Built-in Kysely | `npx @better-auth/cli@latest migrate` (applies directly) |
|
||||
| Prisma | `npx @better-auth/cli@latest generate --output prisma/schema.prisma` then `npx prisma migrate dev` |
|
||||
| Drizzle | `npx @better-auth/cli@latest generate --output src/db/auth-schema.ts` then `npx drizzle-kit push` |
|
||||
|
||||
**Re-run after adding plugins.**
|
||||
|
||||
---
|
||||
|
||||
## Database Adapters
|
||||
|
||||
| Database | Setup |
|
||||
|----------|-------|
|
||||
| SQLite | Pass `better-sqlite3` or `bun:sqlite` instance directly |
|
||||
| PostgreSQL | Pass `pg.Pool` instance directly |
|
||||
| MySQL | Pass `mysql2` pool directly |
|
||||
| Prisma | `prismaAdapter(prisma, { provider: "postgresql" })` from `better-auth/adapters/prisma` |
|
||||
| Drizzle | `drizzleAdapter(db, { provider: "pg" })` from `better-auth/adapters/drizzle` |
|
||||
| MongoDB | `mongodbAdapter(db)` from `better-auth/adapters/mongodb` |
|
||||
|
||||
---
|
||||
|
||||
## Common Plugins
|
||||
|
||||
| Plugin | Server Import | Client Import | Purpose |
|
||||
|--------|---------------|---------------|---------|
|
||||
| `twoFactor` | `better-auth/plugins` | `twoFactorClient` | 2FA with TOTP/OTP |
|
||||
| `organization` | `better-auth/plugins` | `organizationClient` | Teams/orgs |
|
||||
| `admin` | `better-auth/plugins` | `adminClient` | User management |
|
||||
| `bearer` | `better-auth/plugins` | - | API token auth |
|
||||
| `openAPI` | `better-auth/plugins` | - | API docs |
|
||||
| `passkey` | `@better-auth/passkey` | `passkeyClient` | WebAuthn |
|
||||
| `sso` | `@better-auth/sso` | - | Enterprise SSO |
|
||||
|
||||
**Plugin pattern:** Server plugin + client plugin + run migrations.
|
||||
|
||||
---
|
||||
|
||||
## Auth UI Implementation
|
||||
|
||||
**Sign in flow:**
|
||||
1. `signIn.email({ email, password })` or `signIn.social({ provider, callbackURL })`
|
||||
2. Handle `error` in response
|
||||
3. Redirect on success
|
||||
|
||||
**Session check (client):** `useSession()` hook returns `{ data: session, isPending }`
|
||||
|
||||
**Session check (server):** `auth.api.getSession({ headers: await headers() })`
|
||||
|
||||
**Protected routes:** Check session, redirect to `/sign-in` if null.
|
||||
|
||||
---
|
||||
|
||||
## Security Checklist
|
||||
|
||||
- [ ] `BETTER_AUTH_SECRET` set (32+ chars)
|
||||
- [ ] `advanced.useSecureCookies: true` in production
|
||||
- [ ] `trustedOrigins` configured
|
||||
- [ ] Rate limits enabled
|
||||
- [ ] Email verification enabled
|
||||
- [ ] Password reset implemented
|
||||
- [ ] 2FA for sensitive apps
|
||||
- [ ] CSRF protection NOT disabled
|
||||
- [ ] `account.accountLinking` reviewed
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Issue | Fix |
|
||||
|-------|-----|
|
||||
| "Secret not set" | Add `BETTER_AUTH_SECRET` env var |
|
||||
| "Invalid Origin" | Add domain to `trustedOrigins` |
|
||||
| Cookies not setting | Check `baseURL` matches domain; enable secure cookies in prod |
|
||||
| OAuth callback errors | Verify redirect URIs in provider dashboard |
|
||||
| Type errors after adding plugin | Re-run CLI generate/migrate |
|
||||
|
||||
---
|
||||
|
||||
## Resources
|
||||
|
||||
- [Docs](https://better-auth.com/docs)
|
||||
- [Examples](https://github.com/better-auth/examples)
|
||||
- [Plugins](https://better-auth.com/docs/concepts/plugins)
|
||||
- [CLI](https://better-auth.com/docs/concepts/cli)
|
||||
- [Migration Guides](https://better-auth.com/docs/guides)
|
||||
212
.agents/skills/email-and-password-best-practices/SKILL.md
Normal file
212
.agents/skills/email-and-password-best-practices/SKILL.md
Normal file
@@ -0,0 +1,212 @@
|
||||
---
|
||||
name: email-and-password-best-practices
|
||||
description: Configure email verification, implement password reset flows, set password policies, and customise hashing algorithms for Better Auth email/password authentication. Use when users need to set up login, sign-in, sign-up, credential authentication, or password security with Better Auth.
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. Enable email/password: `emailAndPassword: { enabled: true }`
|
||||
2. Configure `emailVerification.sendVerificationEmail`
|
||||
3. Add `sendResetPassword` for password reset flows
|
||||
4. Run `npx @better-auth/cli@latest migrate`
|
||||
5. Verify: attempt sign-up and confirm verification email triggers
|
||||
|
||||
---
|
||||
|
||||
## Email Verification Setup
|
||||
|
||||
Configure `emailVerification.sendVerificationEmail` to verify user email addresses.
|
||||
|
||||
```ts
|
||||
import { betterAuth } from "better-auth";
|
||||
import { sendEmail } from "./email"; // your email sending function
|
||||
|
||||
export const auth = betterAuth({
|
||||
emailVerification: {
|
||||
sendVerificationEmail: async ({ user, url, token }, request) => {
|
||||
await sendEmail({
|
||||
to: user.email,
|
||||
subject: "Verify your email address",
|
||||
text: `Click the link to verify your email: ${url}`,
|
||||
});
|
||||
},
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
**Note**: The `url` parameter contains the full verification link. The `token` is available if you need to build a custom verification URL.
|
||||
|
||||
### Requiring Email Verification
|
||||
|
||||
For stricter security, enable `emailAndPassword.requireEmailVerification` to block sign-in until the user verifies their email. When enabled, unverified users will receive a new verification email on each sign-in attempt.
|
||||
|
||||
```ts
|
||||
export const auth = betterAuth({
|
||||
emailAndPassword: {
|
||||
requireEmailVerification: true,
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
**Note**: This requires `sendVerificationEmail` to be configured and only applies to email/password sign-ins.
|
||||
|
||||
## Client Side Validation
|
||||
|
||||
Implement client-side validation for immediate user feedback and reduced server load.
|
||||
|
||||
## Callback URLs
|
||||
|
||||
Always use absolute URLs (including the origin) for callback URLs in sign-up and sign-in requests. This prevents Better Auth from needing to infer the origin, which can cause issues when your backend and frontend are on different domains.
|
||||
|
||||
```ts
|
||||
const { data, error } = await authClient.signUp.email({
|
||||
callbackURL: "https://example.com/callback", // absolute URL with origin
|
||||
});
|
||||
```
|
||||
|
||||
## Password Reset Flows
|
||||
|
||||
Provide `sendResetPassword` in the email and password config to enable password resets.
|
||||
|
||||
```ts
|
||||
import { betterAuth } from "better-auth";
|
||||
import { sendEmail } from "./email"; // your email sending function
|
||||
|
||||
export const auth = betterAuth({
|
||||
emailAndPassword: {
|
||||
enabled: true,
|
||||
// Custom email sending function to send reset-password email
|
||||
sendResetPassword: async ({ user, url, token }, request) => {
|
||||
void sendEmail({
|
||||
to: user.email,
|
||||
subject: "Reset your password",
|
||||
text: `Click the link to reset your password: ${url}`,
|
||||
});
|
||||
},
|
||||
// Optional event hook
|
||||
onPasswordReset: async ({ user }, request) => {
|
||||
// your logic here
|
||||
console.log(`Password for user ${user.email} has been reset.`);
|
||||
},
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
### Security Considerations
|
||||
|
||||
Built-in protections include background email sending (to prevent timing attacks), dummy operations on invalid requests, and constant response messages regardless of whether the user exists.
|
||||
|
||||
On serverless platforms, configure a background task handler:
|
||||
|
||||
```ts
|
||||
export const auth = betterAuth({
|
||||
advanced: {
|
||||
backgroundTasks: {
|
||||
handler: (promise) => {
|
||||
// Use platform-specific methods like waitUntil
|
||||
waitUntil(promise);
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
#### Token Security
|
||||
|
||||
Tokens expire after 1 hour by default. Configure with `resetPasswordTokenExpiresIn` (in seconds):
|
||||
|
||||
```ts
|
||||
export const auth = betterAuth({
|
||||
emailAndPassword: {
|
||||
enabled: true,
|
||||
resetPasswordTokenExpiresIn: 60 * 30, // 30 minutes
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
Tokens are single-use — deleted immediately after successful reset.
|
||||
|
||||
#### Session Revocation
|
||||
|
||||
Enable `revokeSessionsOnPasswordReset` to invalidate all existing sessions on password reset:
|
||||
|
||||
```ts
|
||||
export const auth = betterAuth({
|
||||
emailAndPassword: {
|
||||
enabled: true,
|
||||
revokeSessionsOnPasswordReset: true,
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
#### Password Requirements
|
||||
|
||||
Password length limits (configurable):
|
||||
|
||||
```ts
|
||||
export const auth = betterAuth({
|
||||
emailAndPassword: {
|
||||
enabled: true,
|
||||
minPasswordLength: 12,
|
||||
maxPasswordLength: 256,
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
### Sending the Password Reset
|
||||
|
||||
Call `requestPasswordReset` to send the reset link. Triggers the `sendResetPassword` function from your config.
|
||||
|
||||
```ts
|
||||
const data = await auth.api.requestPasswordReset({
|
||||
body: {
|
||||
email: "john.doe@example.com", // required
|
||||
redirectTo: "https://example.com/reset-password",
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
Or authClient:
|
||||
|
||||
```ts
|
||||
const { data, error } = await authClient.requestPasswordReset({
|
||||
email: "john.doe@example.com", // required
|
||||
redirectTo: "https://example.com/reset-password",
|
||||
});
|
||||
```
|
||||
|
||||
**Note**: The `email` field is required; we also recommend setting `redirectTo` for a smoother user experience.
|
||||
|
||||
## Password Hashing
|
||||
|
||||
Default: `scrypt` (Node.js native, no external dependencies).
|
||||
|
||||
### Custom Hashing Algorithm
|
||||
|
||||
To use Argon2id or another algorithm, provide custom `hash` and `verify` functions:
|
||||
|
||||
```ts
|
||||
import { betterAuth } from "better-auth";
|
||||
import { hash, verify, type Options } from "@node-rs/argon2";
|
||||
|
||||
const argon2Options: Options = {
|
||||
memoryCost: 65536, // 64 MiB
|
||||
timeCost: 3, // 3 iterations
|
||||
parallelism: 4, // 4 parallel lanes
|
||||
outputLen: 32, // 32 byte output
|
||||
algorithm: 2, // Argon2id variant
|
||||
};
|
||||
|
||||
export const auth = betterAuth({
|
||||
emailAndPassword: {
|
||||
enabled: true,
|
||||
password: {
|
||||
hash: (password) => hash(password, argon2Options),
|
||||
verify: ({ password, hash: storedHash }) =>
|
||||
verify(storedHash, password, argon2Options),
|
||||
},
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
**Note**: If you switch hashing algorithms on an existing system, users with passwords hashed using the old algorithm won't be able to sign in. Plan a migration strategy if needed.
|
||||
331
.agents/skills/two-factor-authentication-best-practices/SKILL.md
Normal file
331
.agents/skills/two-factor-authentication-best-practices/SKILL.md
Normal file
@@ -0,0 +1,331 @@
|
||||
---
|
||||
name: two-factor-authentication-best-practices
|
||||
description: Configure TOTP authenticator apps, send OTP codes via email/SMS, manage backup codes, handle trusted devices, and implement 2FA sign-in flows using Better Auth's twoFactor plugin. Use when users need MFA, multi-factor authentication, authenticator setup, or login security with Better Auth.
|
||||
---
|
||||
|
||||
## Setup
|
||||
|
||||
1. Add `twoFactor()` plugin to server config with `issuer`
|
||||
2. Add `twoFactorClient()` plugin to client config
|
||||
3. Run `npx @better-auth/cli migrate`
|
||||
4. Verify: check that the `twoFactor` table exists and the `twoFactorEnabled` column was added to the user table
|
||||
|
||||
```ts
|
||||
import { betterAuth } from "better-auth";
|
||||
import { twoFactor } from "better-auth/plugins";
|
||||
|
||||
export const auth = betterAuth({
|
||||
appName: "My App",
|
||||
plugins: [
|
||||
twoFactor({
|
||||
issuer: "My App",
|
||||
}),
|
||||
],
|
||||
});
|
||||
```
|
||||
|
||||
### Client-Side Setup
|
||||
|
||||
```ts
|
||||
import { createAuthClient } from "better-auth/client";
|
||||
import { twoFactorClient } from "better-auth/client/plugins";
|
||||
|
||||
export const authClient = createAuthClient({
|
||||
plugins: [
|
||||
twoFactorClient({
|
||||
onTwoFactorRedirect() {
|
||||
window.location.href = "/2fa";
|
||||
},
|
||||
}),
|
||||
],
|
||||
});
|
||||
```
|
||||
|
||||
## Enabling 2FA for Users
|
||||
|
||||
Requires password verification. Returns TOTP URI (for QR code) and backup codes.
|
||||
|
||||
```ts
|
||||
const enable2FA = async (password: string) => {
|
||||
const { data, error } = await authClient.twoFactor.enable({
|
||||
password,
|
||||
});
|
||||
|
||||
if (data) {
|
||||
// data.totpURI — generate a QR code from this
|
||||
// data.backupCodes — display to user
|
||||
}
|
||||
};
|
||||
```
|
||||
|
||||
`twoFactorEnabled` is not set to `true` until first TOTP verification succeeds. Override with `skipVerificationOnEnable: true` (not recommended).
|
||||
|
||||
## TOTP (Authenticator App)
|
||||
|
||||
### Displaying the QR Code
|
||||
|
||||
```tsx
|
||||
import QRCode from "react-qr-code";
|
||||
|
||||
const TotpSetup = ({ totpURI }: { totpURI: string }) => {
|
||||
return <QRCode value={totpURI} />;
|
||||
};
|
||||
```
|
||||
|
||||
### Verifying TOTP Codes
|
||||
|
||||
Accepts codes from one period before/after current time:
|
||||
|
||||
```ts
|
||||
const verifyTotp = async (code: string) => {
|
||||
const { data, error } = await authClient.twoFactor.verifyTotp({
|
||||
code,
|
||||
trustDevice: true,
|
||||
});
|
||||
};
|
||||
```
|
||||
|
||||
### TOTP Configuration Options
|
||||
|
||||
```ts
|
||||
twoFactor({
|
||||
totpOptions: {
|
||||
digits: 6, // 6 or 8 digits (default: 6)
|
||||
period: 30, // Code validity period in seconds (default: 30)
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
## OTP (Email/SMS)
|
||||
|
||||
### Configuring OTP Delivery
|
||||
|
||||
```ts
|
||||
import { betterAuth } from "better-auth";
|
||||
import { twoFactor } from "better-auth/plugins";
|
||||
import { sendEmail } from "./email";
|
||||
|
||||
export const auth = betterAuth({
|
||||
plugins: [
|
||||
twoFactor({
|
||||
otpOptions: {
|
||||
sendOTP: async ({ user, otp }, ctx) => {
|
||||
await sendEmail({
|
||||
to: user.email,
|
||||
subject: "Your verification code",
|
||||
text: `Your code is: ${otp}`,
|
||||
});
|
||||
},
|
||||
period: 5, // Code validity in minutes (default: 3)
|
||||
digits: 6, // Number of digits (default: 6)
|
||||
allowedAttempts: 5, // Max verification attempts (default: 5)
|
||||
},
|
||||
}),
|
||||
],
|
||||
});
|
||||
```
|
||||
|
||||
### Sending and Verifying OTP
|
||||
|
||||
Send: `authClient.twoFactor.sendOtp()`. Verify: `authClient.twoFactor.verifyOtp({ code, trustDevice: true })`.
|
||||
|
||||
### OTP Storage Security
|
||||
|
||||
Configure how OTP codes are stored in the database:
|
||||
|
||||
```ts
|
||||
twoFactor({
|
||||
otpOptions: {
|
||||
storeOTP: "encrypted", // Options: "plain", "encrypted", "hashed"
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
For custom encryption:
|
||||
|
||||
```ts
|
||||
twoFactor({
|
||||
otpOptions: {
|
||||
storeOTP: {
|
||||
encrypt: async (token) => myEncrypt(token),
|
||||
decrypt: async (token) => myDecrypt(token),
|
||||
},
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
## Backup Codes
|
||||
|
||||
Generated automatically when 2FA is enabled. Each code is single-use.
|
||||
|
||||
### Displaying Backup Codes
|
||||
|
||||
```tsx
|
||||
const BackupCodes = ({ codes }: { codes: string[] }) => {
|
||||
return (
|
||||
<div>
|
||||
<p>Save these codes in a secure location:</p>
|
||||
<ul>
|
||||
{codes.map((code, i) => (
|
||||
<li key={i}>{code}</li>
|
||||
))}
|
||||
</ul>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
```
|
||||
|
||||
### Regenerating Backup Codes
|
||||
|
||||
Invalidates all previous codes:
|
||||
|
||||
```ts
|
||||
const regenerateBackupCodes = async (password: string) => {
|
||||
const { data, error } = await authClient.twoFactor.generateBackupCodes({
|
||||
password,
|
||||
});
|
||||
// data.backupCodes contains the new codes
|
||||
};
|
||||
```
|
||||
|
||||
### Using Backup Codes for Recovery
|
||||
|
||||
```ts
|
||||
const verifyBackupCode = async (code: string) => {
|
||||
const { data, error } = await authClient.twoFactor.verifyBackupCode({
|
||||
code,
|
||||
trustDevice: true,
|
||||
});
|
||||
};
|
||||
```
|
||||
|
||||
### Backup Code Configuration
|
||||
|
||||
```ts
|
||||
twoFactor({
|
||||
backupCodeOptions: {
|
||||
amount: 10, // Number of codes to generate (default: 10)
|
||||
length: 10, // Length of each code (default: 10)
|
||||
storeBackupCodes: "encrypted", // Options: "plain", "encrypted"
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
## Handling 2FA During Sign-In
|
||||
|
||||
Response includes `twoFactorRedirect: true` when 2FA is required:
|
||||
|
||||
### Sign-In Flow
|
||||
|
||||
1. Call `signIn.email({ email, password })`
|
||||
2. Check `context.data.twoFactorRedirect` in `onSuccess`
|
||||
3. If `true`, redirect to `/2fa` verification page
|
||||
4. Verify via TOTP, OTP, or backup code
|
||||
5. Session cookie is created on successful verification
|
||||
|
||||
```ts
|
||||
const signIn = async (email: string, password: string) => {
|
||||
const { data, error } = await authClient.signIn.email(
|
||||
{ email, password },
|
||||
{
|
||||
onSuccess(context) {
|
||||
if (context.data.twoFactorRedirect) {
|
||||
window.location.href = "/2fa";
|
||||
}
|
||||
},
|
||||
}
|
||||
);
|
||||
};
|
||||
```
|
||||
|
||||
Server-side: check `"twoFactorRedirect" in response` when using `auth.api.signInEmail`.
|
||||
|
||||
## Trusted Devices
|
||||
|
||||
Pass `trustDevice: true` when verifying. Default trust duration: 30 days (`trustDeviceMaxAge`). Refreshes on each sign-in.
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### Session Management
|
||||
|
||||
Flow: credentials → session removed → temporary 2FA cookie (10 min default) → verify → session created.
|
||||
|
||||
```ts
|
||||
twoFactor({
|
||||
twoFactorCookieMaxAge: 600, // 10 minutes in seconds (default)
|
||||
});
|
||||
```
|
||||
|
||||
### Rate Limiting
|
||||
|
||||
Built-in: 3 requests per 10 seconds for all 2FA endpoints. OTP has additional attempt limiting:
|
||||
|
||||
```ts
|
||||
twoFactor({
|
||||
otpOptions: {
|
||||
allowedAttempts: 5, // Max attempts per OTP code (default: 5)
|
||||
},
|
||||
});
|
||||
```
|
||||
|
||||
### Encryption at Rest
|
||||
|
||||
TOTP secrets: encrypted with auth secret. Backup codes: encrypted by default. OTP: configurable (`"plain"`, `"encrypted"`, `"hashed"`). Uses constant-time comparison for verification.
|
||||
|
||||
2FA can only be enabled for credential (email/password) accounts.
|
||||
|
||||
## Disabling 2FA
|
||||
|
||||
Requires password confirmation. Revokes trusted device records:
|
||||
|
||||
```ts
|
||||
const disable2FA = async (password: string) => {
|
||||
const { data, error } = await authClient.twoFactor.disable({
|
||||
password,
|
||||
});
|
||||
};
|
||||
```
|
||||
|
||||
## Complete Configuration Example
|
||||
|
||||
```ts
|
||||
import { betterAuth } from "better-auth";
|
||||
import { twoFactor } from "better-auth/plugins";
|
||||
import { sendEmail } from "./email";
|
||||
|
||||
export const auth = betterAuth({
|
||||
appName: "My App",
|
||||
plugins: [
|
||||
twoFactor({
|
||||
// TOTP settings
|
||||
issuer: "My App",
|
||||
totpOptions: {
|
||||
digits: 6,
|
||||
period: 30,
|
||||
},
|
||||
// OTP settings
|
||||
otpOptions: {
|
||||
sendOTP: async ({ user, otp }) => {
|
||||
await sendEmail({
|
||||
to: user.email,
|
||||
subject: "Your verification code",
|
||||
text: `Your code is: ${otp}`,
|
||||
});
|
||||
},
|
||||
period: 5,
|
||||
allowedAttempts: 5,
|
||||
storeOTP: "encrypted",
|
||||
},
|
||||
// Backup code settings
|
||||
backupCodeOptions: {
|
||||
amount: 10,
|
||||
length: 10,
|
||||
storeBackupCodes: "encrypted",
|
||||
},
|
||||
// Session settings
|
||||
twoFactorCookieMaxAge: 600, // 10 minutes
|
||||
trustDeviceMaxAge: 30 * 24 * 60 * 60, // 30 days
|
||||
}),
|
||||
],
|
||||
});
|
||||
```
|
||||
1
.claude/skills/better-auth-best-practices
Symbolic link
1
.claude/skills/better-auth-best-practices
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.agents/skills/better-auth-best-practices
|
||||
1
.claude/skills/create-auth-skill
Symbolic link
1
.claude/skills/create-auth-skill
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.agents/skills/create-auth-skill
|
||||
1
.claude/skills/email-and-password-best-practices
Symbolic link
1
.claude/skills/email-and-password-best-practices
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.agents/skills/email-and-password-best-practices
|
||||
1
.claude/skills/two-factor-authentication-best-practices
Symbolic link
1
.claude/skills/two-factor-authentication-best-practices
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.agents/skills/two-factor-authentication-best-practices
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,5 +1,6 @@
|
||||
/backend.old/data
|
||||
/backend.old/uploads/
|
||||
chat/
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
|
||||
7
.idea/ai.iml
generated
7
.idea/ai.iml
generated
@@ -9,6 +9,13 @@
|
||||
<excludeFolder url="file://$MODULE_DIR$/backend.old/data" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/doc.old" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/backend.old" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/client-py/dexorder_client.egg-info" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/flink/protobuf" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/flink/target" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/ingestor/protobuf" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/ingestor/src/proto" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/relay/protobuf" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/relay/target" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.12 (ai)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
|
||||
12
.idea/runConfigurations/dev.xml
generated
12
.idea/runConfigurations/dev.xml
generated
@@ -1,12 +0,0 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="dev" type="js.build_tools.npm" nameIsGenerated="true">
|
||||
<package-json value="$PROJECT_DIR$/web/package.json" />
|
||||
<command value="run" />
|
||||
<scripts>
|
||||
<script value="dev" />
|
||||
</scripts>
|
||||
<node-interpreter value="project" />
|
||||
<envs />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
||||
1
.junie/skills/better-auth-best-practices
Symbolic link
1
.junie/skills/better-auth-best-practices
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.agents/skills/better-auth-best-practices
|
||||
1
.junie/skills/create-auth-skill
Symbolic link
1
.junie/skills/create-auth-skill
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.agents/skills/create-auth-skill
|
||||
1
.junie/skills/email-and-password-best-practices
Symbolic link
1
.junie/skills/email-and-password-best-practices
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.agents/skills/email-and-password-best-practices
|
||||
1
.junie/skills/two-factor-authentication-best-practices
Symbolic link
1
.junie/skills/two-factor-authentication-best-practices
Symbolic link
@@ -0,0 +1 @@
|
||||
../../.agents/skills/two-factor-authentication-best-practices
|
||||
197
bin/client-test
Executable file
197
bin/client-test
Executable file
@@ -0,0 +1,197 @@
|
||||
#!/usr/bin/env bash
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [COMMAND]"
|
||||
echo ""
|
||||
echo "Test client-py against the development environment"
|
||||
echo ""
|
||||
echo "Commands:"
|
||||
echo " ohlc Test OHLCClient API (default)"
|
||||
echo " history Test low-level HistoryClient"
|
||||
echo " shell Open Python shell with client installed"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 # Run OHLC client test"
|
||||
echo " $0 ohlc # Run OHLC client test"
|
||||
echo " $0 history # Run history client test"
|
||||
echo " $0 shell # Interactive Python shell"
|
||||
exit 1
|
||||
}
|
||||
|
||||
COMMAND="${1:-ohlc}"
|
||||
|
||||
check_kubectl() {
|
||||
if ! command -v kubectl &> /dev/null; then
|
||||
echo -e "${RED}Error: kubectl not found. Please install kubectl first.${NC}"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
check_services() {
|
||||
echo -e "${BLUE}Checking if services are running...${NC}"
|
||||
|
||||
# Check if required pods are running
|
||||
local services=("relay" "flink-jobmanager" "iceberg-catalog")
|
||||
local missing=()
|
||||
|
||||
for service in "${services[@]}"; do
|
||||
if ! kubectl get pods -l app="$service" 2>/dev/null | grep -q "Running"; then
|
||||
missing+=("$service")
|
||||
fi
|
||||
done
|
||||
|
||||
if [ ${#missing[@]} -gt 0 ]; then
|
||||
echo -e "${RED}Error: Required services not running: ${missing[*]}${NC}"
|
||||
echo -e "${YELLOW}Run 'bin/dev start' first to start the environment${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}✓ All required services are running${NC}"
|
||||
}
|
||||
|
||||
setup_port_forwards() {
|
||||
echo -e "${BLUE}Setting up port forwards...${NC}"
|
||||
|
||||
# Kill any existing port forwards
|
||||
pkill -f "kubectl port-forward.*relay" 2>/dev/null || true
|
||||
pkill -f "kubectl port-forward.*iceberg-catalog" 2>/dev/null || true
|
||||
pkill -f "kubectl port-forward.*minio" 2>/dev/null || true
|
||||
|
||||
# Port forward relay (5558=market-data pub, 5559=client requests)
|
||||
kubectl port-forward svc/relay 5558:5558 5559:5559 >/dev/null 2>&1 &
|
||||
local relay_pid=$!
|
||||
|
||||
# Port forward iceberg-catalog (8181)
|
||||
kubectl port-forward svc/iceberg-catalog 8181:8181 >/dev/null 2>&1 &
|
||||
local iceberg_pid=$!
|
||||
|
||||
# Port forward MinIO (9000) - needed for PyIceberg to read data files
|
||||
kubectl port-forward svc/minio 9000:9000 >/dev/null 2>&1 &
|
||||
local minio_pid=$!
|
||||
|
||||
# Wait for port forwards to establish
|
||||
sleep 2
|
||||
|
||||
echo -e "${GREEN}✓ Port forwards established${NC}"
|
||||
echo -e "${YELLOW} Relay: localhost:5558 (market-data), 5559 (requests)${NC}"
|
||||
echo -e "${YELLOW} Iceberg Catalog: localhost:8181${NC}"
|
||||
echo -e "${YELLOW} MinIO: localhost:9000${NC}"
|
||||
|
||||
# Store PIDs for cleanup
|
||||
export PORT_FORWARD_PIDS="$relay_pid $iceberg_pid $minio_pid"
|
||||
}
|
||||
|
||||
cleanup_port_forwards() {
|
||||
if [ -n "$PORT_FORWARD_PIDS" ]; then
|
||||
echo -e "\n${BLUE}Cleaning up port forwards...${NC}"
|
||||
for pid in $PORT_FORWARD_PIDS; do
|
||||
kill $pid 2>/dev/null || true
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
run_ohlc_test() {
|
||||
echo -e "${BLUE}Running OHLCClient test...${NC}"
|
||||
echo ""
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
|
||||
# Install client-py in development mode
|
||||
pip install -e client-py >/dev/null 2>&1 || {
|
||||
echo -e "${YELLOW}Installing client-py dependencies...${NC}"
|
||||
pip install -e client-py
|
||||
}
|
||||
|
||||
# Run the test
|
||||
python3 test/history_client/client_ohlc_api.py
|
||||
}
|
||||
|
||||
run_history_test() {
|
||||
echo -e "${BLUE}Running HistoryClient test...${NC}"
|
||||
echo ""
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
|
||||
# Install client-py in development mode
|
||||
pip install -e client-py >/dev/null 2>&1 || {
|
||||
echo -e "${YELLOW}Installing client-py dependencies...${NC}"
|
||||
pip install -e client-py
|
||||
}
|
||||
|
||||
# Run the low-level test
|
||||
python3 test/history_client/client.py
|
||||
}
|
||||
|
||||
open_shell() {
|
||||
echo -e "${BLUE}Opening Python shell with dexorder client...${NC}"
|
||||
echo ""
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
|
||||
# Install client-py in development mode
|
||||
pip install -e client-py >/dev/null 2>&1 || {
|
||||
echo -e "${YELLOW}Installing client-py dependencies...${NC}"
|
||||
pip install -e client-py
|
||||
}
|
||||
|
||||
echo -e "${BLUE}Example usage:${NC}"
|
||||
echo -e " from dexorder import OHLCClient"
|
||||
echo -e " import asyncio"
|
||||
echo -e " client = OHLCClient('http://localhost:8181', 'tcp://localhost:5559', 'tcp://localhost:5558',"
|
||||
echo -e " s3_endpoint='http://localhost:9000', s3_access_key='minio', s3_secret_key='minio123')"
|
||||
echo -e " # Use asyncio.run(client.fetch_ohlc(...)) to fetch data"
|
||||
echo ""
|
||||
|
||||
python3 -i -c "
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.getcwd(), 'client-py'))
|
||||
from dexorder import OHLCClient, HistoryClient, IcebergClient
|
||||
import asyncio
|
||||
print('✓ dexorder package imported')
|
||||
print('Available: OHLCClient, HistoryClient, IcebergClient, asyncio')
|
||||
"
|
||||
}
|
||||
|
||||
# Set up cleanup trap
|
||||
trap cleanup_port_forwards EXIT
|
||||
|
||||
# Main command routing
|
||||
check_kubectl
|
||||
|
||||
case "$COMMAND" in
|
||||
ohlc)
|
||||
check_services
|
||||
setup_port_forwards
|
||||
run_ohlc_test
|
||||
;;
|
||||
history)
|
||||
check_services
|
||||
setup_port_forwards
|
||||
run_history_test
|
||||
;;
|
||||
shell)
|
||||
check_services
|
||||
setup_port_forwards
|
||||
open_shell
|
||||
;;
|
||||
-h|--help|help)
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
echo -e "${RED}Unknown command: $COMMAND${NC}"
|
||||
echo ""
|
||||
usage
|
||||
;;
|
||||
esac
|
||||
360
bin/dev
360
bin/dev
@@ -17,19 +17,25 @@ usage() {
|
||||
echo "Manage the minikube development environment"
|
||||
echo ""
|
||||
echo "Commands:"
|
||||
echo " start Start minikube and deploy all services"
|
||||
echo " stop Stop minikube"
|
||||
echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink|sidecar)"
|
||||
echo " rebuild [svc] Rebuild all custom images, or just one"
|
||||
echo " deploy [svc] Deploy/update all services, or just one"
|
||||
echo " status Show status of all services"
|
||||
echo " logs Tail logs for a service"
|
||||
echo " shell Open a shell in a service pod"
|
||||
echo " clean Delete all resources and volumes"
|
||||
echo " tunnel Start minikube tunnel (for LoadBalancer access)"
|
||||
echo " start Start minikube and deploy all services"
|
||||
echo " stop [--keep-data] Stop minikube (deletes PVCs by default)"
|
||||
echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink|gateway|sidecar|web)"
|
||||
echo " deep-restart [svc] Restart StatefulSet(s) and delete their PVCs (kafka|postgres|minio|qdrant|all)"
|
||||
echo " rebuild [svc] Rebuild all custom images, or just one"
|
||||
echo " deploy [svc] Deploy/update all services, or just one"
|
||||
echo " delete-pvcs [svc] Delete PVCs for specific service or all (kafka|postgres|minio|qdrant|all)"
|
||||
echo " status Show status of all services"
|
||||
echo " logs Tail logs for a service"
|
||||
echo " shell Open a shell in a service pod"
|
||||
echo " clean Delete all resources and volumes"
|
||||
echo " tunnel Start minikube tunnel (for LoadBalancer access)"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 start # Start minikube and deploy everything"
|
||||
echo " $0 stop # Stop minikube and delete PVCs"
|
||||
echo " $0 stop --keep-data # Stop minikube but keep PVCs"
|
||||
echo " $0 deep-restart postgres # Restart postgres with fresh storage"
|
||||
echo " $0 delete-pvcs kafka # Delete kafka PVCs only"
|
||||
echo " $0 rebuild # Rebuild all custom images"
|
||||
echo " $0 logs relay # Tail logs for relay service"
|
||||
echo " $0 shell ingestor # Open shell in ingestor pod"
|
||||
@@ -66,6 +72,16 @@ start_minikube() {
|
||||
# Enable ingress addon
|
||||
echo -e "${BLUE}Enabling ingress addon...${NC}"
|
||||
minikube addons enable ingress
|
||||
|
||||
# Wait for ingress webhook to be ready
|
||||
echo -e "${BLUE}Waiting for ingress webhook to be ready...${NC}"
|
||||
kubectl wait --namespace ingress-nginx \
|
||||
--for=condition=ready pod \
|
||||
--selector=app.kubernetes.io/component=controller \
|
||||
--timeout=120s 2>/dev/null || echo -e "${YELLOW}⚠️ Ingress controller not ready yet${NC}"
|
||||
|
||||
# Give webhook endpoint a moment to start listening
|
||||
sleep 5
|
||||
echo -e "${GREEN}✓ Ingress enabled${NC}"
|
||||
|
||||
# Set docker environment
|
||||
@@ -127,6 +143,16 @@ rebuild_images() {
|
||||
docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG"
|
||||
fi
|
||||
|
||||
# Build gateway (Node.js application)
|
||||
if [ "$service" == "all" ] || [ "$service" == "gateway" ]; then
|
||||
echo -e "${GREEN}→${NC} Building gateway..."
|
||||
cd "$ROOT_DIR/gateway"
|
||||
GATEWAY_TAG="dev$(date +%Y%m%d%H%M%S)"
|
||||
docker build -t dexorder/gateway:latest -t dexorder/gateway:$GATEWAY_TAG . || exit 1
|
||||
echo -e "${GREEN}✓ Built dexorder/gateway:$GATEWAY_TAG${NC}"
|
||||
cd "$ROOT_DIR"
|
||||
fi
|
||||
|
||||
# Build lifecycle-sidecar (Go binary)
|
||||
if [ "$service" == "all" ] || [ "$service" == "lifecycle-sidecar" ] || [ "$service" == "sidecar" ]; then
|
||||
echo -e "${GREEN}→${NC} Building lifecycle-sidecar..."
|
||||
@@ -137,19 +163,31 @@ rebuild_images() {
|
||||
cd "$ROOT_DIR"
|
||||
fi
|
||||
|
||||
# Build web (Vue.js application)
|
||||
if [ "$service" == "all" ] || [ "$service" == "web" ]; then
|
||||
echo -e "${GREEN}→${NC} Building web..."
|
||||
cd "$ROOT_DIR/web"
|
||||
WEB_TAG="dev$(date +%Y%m%d%H%M%S)"
|
||||
docker build -t dexorder/ai-web:latest -t dexorder/ai-web:$WEB_TAG . || exit 1
|
||||
echo -e "${GREEN}✓ Built dexorder/ai-web:$WEB_TAG${NC}"
|
||||
cd "$ROOT_DIR"
|
||||
fi
|
||||
|
||||
# Save the tags for deployment (all services, preserving any we didn't rebuild)
|
||||
echo "RELAY_TAG=$RELAY_TAG" > "$ROOT_DIR/.dev-image-tag"
|
||||
echo "INGEST_TAG=$INGEST_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
||||
echo "FLINK_TAG=$FLINK_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
||||
echo "GATEWAY_TAG=$GATEWAY_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
||||
echo "SIDECAR_TAG=$SIDECAR_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
||||
echo "WEB_TAG=$WEB_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
||||
|
||||
echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG, sidecar=$SIDECAR_TAG${NC}"
|
||||
echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG, gateway=$GATEWAY_TAG, sidecar=$SIDECAR_TAG, web=$WEB_TAG${NC}"
|
||||
}
|
||||
|
||||
deploy_services() {
|
||||
echo -e "${BLUE}Deploying services to minikube...${NC}"
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
cd "$ROOT_DIR/deploy/k8s/dev"
|
||||
|
||||
# Get the dev image tags
|
||||
if [ -f "$ROOT_DIR/.dev-image-tag" ]; then
|
||||
@@ -158,16 +196,20 @@ deploy_services() {
|
||||
echo -e " Relay: $RELAY_TAG"
|
||||
echo -e " Ingestor: $INGEST_TAG"
|
||||
echo -e " Flink: $FLINK_TAG"
|
||||
echo -e " Gateway: $GATEWAY_TAG"
|
||||
echo -e " Web: $WEB_TAG"
|
||||
else
|
||||
echo -e "${YELLOW}⚠️ No dev tags found. Using 'latest'. Run rebuild first.${NC}"
|
||||
RELAY_TAG="latest"
|
||||
INGEST_TAG="latest"
|
||||
FLINK_TAG="latest"
|
||||
GATEWAY_TAG="latest"
|
||||
WEB_TAG="latest"
|
||||
fi
|
||||
|
||||
# Create secrets first (if they exist)
|
||||
echo -e "${GREEN}→${NC} Checking secrets..."
|
||||
if ls deploy/k8s/dev/secrets/*.yaml &> /dev/null; then
|
||||
if ls secrets/*.yaml &> /dev/null; then
|
||||
"$SCRIPT_DIR/secret-update" dev || echo -e "${YELLOW} (Some secrets missing - copy from .example files)${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}⚠️ No secrets found. Copy from .example files:${NC}"
|
||||
@@ -180,13 +222,30 @@ deploy_services() {
|
||||
echo -e "${GREEN}→${NC} Updating configs..."
|
||||
"$SCRIPT_DIR/config-update" dev
|
||||
|
||||
# Apply kustomize with image tag substitution
|
||||
# Create a temporary kustomization overlay with image tags
|
||||
echo -e "${GREEN}→${NC} Setting image tags in kustomization..."
|
||||
cat >> kustomization.yaml <<EOF
|
||||
|
||||
# Image tags (added by bin/dev)
|
||||
images:
|
||||
- name: dexorder/relay
|
||||
newTag: $RELAY_TAG
|
||||
- name: dexorder/ingestor
|
||||
newTag: $INGEST_TAG
|
||||
- name: dexorder/flink
|
||||
newTag: $FLINK_TAG
|
||||
- name: dexorder/gateway
|
||||
newTag: $GATEWAY_TAG
|
||||
- name: dexorder/ai-web
|
||||
newTag: $WEB_TAG
|
||||
EOF
|
||||
|
||||
# Apply kustomize
|
||||
echo -e "${GREEN}→${NC} Applying Kubernetes manifests..."
|
||||
kubectl kustomize deploy/k8s/dev/ | \
|
||||
sed "s|image: dexorder/flink:latest|image: dexorder/flink:$FLINK_TAG|g" | \
|
||||
sed "s|image: dexorder/relay:latest|image: dexorder/relay:$RELAY_TAG|g" | \
|
||||
sed "s|image: dexorder/ingestor:latest|image: dexorder/ingestor:$INGEST_TAG|g" | \
|
||||
kubectl apply -f -
|
||||
kubectl apply -k .
|
||||
|
||||
# Clean up the appended image tags from kustomization.yaml
|
||||
sed -i '/# Image tags (added by bin\/dev)/,$d' kustomization.yaml
|
||||
|
||||
echo -e "${GREEN}✓ Services deployed${NC}"
|
||||
|
||||
@@ -200,11 +259,105 @@ deploy_services() {
|
||||
deployment/flink-taskmanager \
|
||||
2>/dev/null || echo -e "${YELLOW}(Some deployments not ready yet)${NC}"
|
||||
|
||||
# Initialize gateway database schema
|
||||
echo -e "${BLUE}Initializing gateway database schema...${NC}"
|
||||
echo -e "${GREEN}→${NC} Waiting for postgres..."
|
||||
kubectl wait --for=condition=ready --timeout=120s pod -l app=postgres 2>/dev/null || {
|
||||
echo -e "${YELLOW}⚠️ Postgres not ready yet${NC}"
|
||||
}
|
||||
|
||||
pg_pod=$(kubectl get pods -l app=postgres -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
|
||||
if [ -n "$pg_pod" ]; then
|
||||
table_count=$(kubectl exec "$pg_pod" -- psql -U postgres -d iceberg -t -c "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'user';" 2>/dev/null | tr -d ' ')
|
||||
if [ "$table_count" = "1" ]; then
|
||||
echo -e "${GREEN}✓ Gateway schema already exists${NC}"
|
||||
else
|
||||
echo -e "${GREEN}→${NC} Applying gateway schema..."
|
||||
kubectl exec -i "$pg_pod" -- psql -U postgres -d iceberg < "$ROOT_DIR/gateway/schema.sql" > /dev/null 2>&1
|
||||
if [ $? -eq 0 ]; then
|
||||
echo -e "${GREEN}✓ Gateway schema initialized${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}⚠️ Failed to initialize gateway schema${NC}"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Create dev user via Better Auth API (skip if already exists)
|
||||
echo -e "${GREEN}→${NC} Checking for dev user..."
|
||||
user_id=$(kubectl exec "$pg_pod" -- psql -U postgres -d iceberg -t -c "SELECT id FROM \"user\" WHERE email = 'cryptochimp@dexorder.ai';" 2>/dev/null | tr -d ' ')
|
||||
|
||||
if [ -n "$user_id" ]; then
|
||||
echo -e "${GREEN}✓ Dev user already exists (cryptochimp@dexorder.ai)${NC}"
|
||||
else
|
||||
echo -e "${GREEN}→${NC} Creating dev user via Better Auth API..."
|
||||
echo -e "${BLUE}Waiting for gateway to be ready...${NC}"
|
||||
kubectl wait --for=condition=available --timeout=120s deployment/gateway 2>/dev/null || {
|
||||
echo -e "${YELLOW}⚠️ Gateway not ready after 120s${NC}"
|
||||
}
|
||||
|
||||
# Give gateway a few seconds to start accepting requests
|
||||
sleep 5
|
||||
|
||||
# Create user via custom auth endpoint
|
||||
response=$(curl -s -w "\n%{http_code}" -X POST "http://dexorder.local/api/auth/register" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"email": "cryptochimp@dexorder.ai",
|
||||
"password": "moon2the",
|
||||
"name": "Crypto Chimp"
|
||||
}' 2>&1)
|
||||
|
||||
http_code=$(echo "$response" | tail -n1)
|
||||
if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
|
||||
echo -e "${GREEN}✓ User created via auth API${NC}"
|
||||
elif [ "$http_code" = "400" ]; then
|
||||
echo -e "${YELLOW}⚠️ User may already exist (status 400)${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}⚠️ API call returned status $http_code${NC}"
|
||||
fi
|
||||
|
||||
# Wait a moment for database to be updated
|
||||
sleep 2
|
||||
|
||||
# Check again if user exists now
|
||||
user_id=$(kubectl exec "$pg_pod" -- psql -U postgres -d iceberg -t -c "SELECT id FROM \"user\" WHERE email = 'cryptochimp@dexorder.ai';" 2>/dev/null | tr -d ' ')
|
||||
|
||||
if [ -n "$user_id" ]; then
|
||||
echo -e "${GREEN}✓ Dev user confirmed in database${NC}"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$user_id" ]; then
|
||||
# Create/update license for the user
|
||||
echo -e "${GREEN}→${NC} Creating pro license for dev user..."
|
||||
kubectl exec "$pg_pod" -- psql -U postgres -d iceberg -c "
|
||||
INSERT INTO user_licenses (user_id, email, license_type, mcp_server_url, features, resource_limits, preferred_model)
|
||||
VALUES (
|
||||
'$user_id',
|
||||
'cryptochimp@dexorder.ai',
|
||||
'pro',
|
||||
'http://localhost:8080/mcp',
|
||||
'{\"maxIndicators\":50,\"maxStrategies\":20,\"maxBacktestDays\":365,\"realtimeData\":true,\"customExecutors\":true,\"apiAccess\":true}',
|
||||
'{\"maxConcurrentSessions\":5,\"maxMessagesPerDay\":1000,\"maxTokensPerMessage\":8192,\"rateLimitPerMinute\":60}',
|
||||
'{\"provider\":\"anthropic\",\"model\":\"claude-3-5-sonnet-20241022\",\"temperature\":0.7}'
|
||||
)
|
||||
ON CONFLICT (user_id) DO UPDATE SET
|
||||
license_type = EXCLUDED.license_type,
|
||||
features = EXCLUDED.features,
|
||||
resource_limits = EXCLUDED.resource_limits,
|
||||
preferred_model = EXCLUDED.preferred_model,
|
||||
updated_at = NOW();
|
||||
" > /dev/null 2>&1
|
||||
echo -e "${GREEN}✓ Dev user ready (cryptochimp@dexorder.ai / moon2the)${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}⚠️ Could not create dev user (gateway may not be ready)${NC}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}✓ Dev environment ready!${NC}"
|
||||
echo ""
|
||||
echo -e "${BLUE}Access the application:${NC}"
|
||||
echo -e " Web UI: http://dexorder.local/cryptochimp/"
|
||||
echo -e " Web UI: http://dexorder.local/"
|
||||
echo -e " Backend WS: ws://dexorder.local/ws"
|
||||
echo ""
|
||||
echo -e "${BLUE}Admin UIs (use port-forward):${NC}"
|
||||
@@ -266,6 +419,91 @@ open_shell() {
|
||||
kubectl exec -it "$pod" -- /bin/sh || kubectl exec -it "$pod" -- /bin/bash
|
||||
}
|
||||
|
||||
delete_pvcs() {
|
||||
local service="${1:-all}"
|
||||
|
||||
echo -e "${BLUE}Deleting PVCs for: $service${NC}"
|
||||
|
||||
case "$service" in
|
||||
kafka)
|
||||
kubectl delete pvc -l app=kafka || true
|
||||
;;
|
||||
postgres)
|
||||
kubectl delete pvc -l app=postgres || true
|
||||
;;
|
||||
minio)
|
||||
kubectl delete pvc -l app=minio || true
|
||||
;;
|
||||
qdrant)
|
||||
kubectl delete pvc -l app=qdrant || true
|
||||
;;
|
||||
all)
|
||||
echo -e "${YELLOW}Deleting all StatefulSet PVCs...${NC}"
|
||||
kubectl delete pvc -l app=kafka 2>/dev/null || true
|
||||
kubectl delete pvc -l app=postgres 2>/dev/null || true
|
||||
kubectl delete pvc -l app=minio 2>/dev/null || true
|
||||
kubectl delete pvc -l app=qdrant 2>/dev/null || true
|
||||
;;
|
||||
*)
|
||||
echo -e "${RED}Error: Unknown service '$service'${NC}"
|
||||
echo "Valid services: kafka, postgres, minio, qdrant, all"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo -e "${GREEN}✓ PVCs deleted${NC}"
|
||||
}
|
||||
|
||||
deep_restart() {
|
||||
local service="${1:-all}"
|
||||
|
||||
echo -e "${BLUE}Deep restart for: $service${NC}"
|
||||
echo -e "${YELLOW}This will delete the StatefulSet(s) and their PVCs, then redeploy.${NC}"
|
||||
|
||||
case "$service" in
|
||||
kafka)
|
||||
echo -e "${GREEN}→${NC} Deleting kafka StatefulSet..."
|
||||
kubectl delete statefulset kafka || true
|
||||
sleep 2
|
||||
delete_pvcs kafka
|
||||
;;
|
||||
postgres)
|
||||
echo -e "${GREEN}→${NC} Deleting postgres StatefulSet..."
|
||||
kubectl delete statefulset postgres || true
|
||||
sleep 2
|
||||
delete_pvcs postgres
|
||||
;;
|
||||
minio)
|
||||
echo -e "${GREEN}→${NC} Deleting minio StatefulSet..."
|
||||
kubectl delete statefulset minio || true
|
||||
sleep 2
|
||||
delete_pvcs minio
|
||||
;;
|
||||
qdrant)
|
||||
echo -e "${GREEN}→${NC} Deleting qdrant StatefulSet..."
|
||||
kubectl delete statefulset qdrant || true
|
||||
sleep 2
|
||||
delete_pvcs qdrant
|
||||
;;
|
||||
all)
|
||||
echo -e "${GREEN}→${NC} Deleting all StatefulSets..."
|
||||
kubectl delete statefulset kafka postgres minio qdrant || true
|
||||
sleep 2
|
||||
delete_pvcs all
|
||||
;;
|
||||
*)
|
||||
echo -e "${RED}Error: Unknown service '$service'${NC}"
|
||||
echo "Valid services: kafka, postgres, minio, qdrant, all"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo -e "${GREEN}→${NC} Redeploying services..."
|
||||
deploy_services
|
||||
|
||||
echo -e "${GREEN}✓ Deep restart complete${NC}"
|
||||
}
|
||||
|
||||
clean_all() {
|
||||
echo -e "${RED}⚠️ WARNING: This will delete all resources and volumes!${NC}"
|
||||
read -p "Are you sure? (yes/no): " confirm
|
||||
@@ -288,7 +526,7 @@ start_tunnel() {
|
||||
minikube tunnel
|
||||
}
|
||||
|
||||
# Deploy a single service using kubectl set image with the dev tag (never uses 'latest')
|
||||
# Deploy a single service by re-applying full kustomize (ensures patches are applied)
|
||||
deploy_service() {
|
||||
local service="$1"
|
||||
|
||||
@@ -296,28 +534,35 @@ deploy_service() {
|
||||
source "$ROOT_DIR/.dev-image-tag"
|
||||
fi
|
||||
|
||||
local image
|
||||
case "$service" in
|
||||
relay) image="dexorder/relay:$RELAY_TAG" ;;
|
||||
ingestor) image="dexorder/ingestor:$INGEST_TAG" ;;
|
||||
flink) image="dexorder/flink:$FLINK_TAG" ;;
|
||||
*)
|
||||
echo -e "${RED}Unknown service: $service. Use relay, ingestor, or flink.${NC}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
echo -e "${GREEN}→${NC} Deploying $service (via kustomize)..."
|
||||
|
||||
echo -e "${GREEN}→${NC} Deploying $service with image $image..."
|
||||
case "$service" in
|
||||
flink)
|
||||
kubectl set image deployment/flink-jobmanager flink-jobmanager=$image
|
||||
kubectl set image deployment/flink-taskmanager flink-taskmanager=$image
|
||||
;;
|
||||
*)
|
||||
kubectl set image deployment/$service $service=$image
|
||||
;;
|
||||
esac
|
||||
echo -e "${GREEN}✓ $service updated to $image${NC}"
|
||||
# Re-apply full kustomize with image tags properly set
|
||||
# This ensures all patches (including imagePullPolicy) are properly applied
|
||||
cd "$ROOT_DIR/deploy/k8s/dev"
|
||||
|
||||
# Create a temporary kustomization overlay with image tags
|
||||
cat >> kustomization.yaml <<EOF
|
||||
|
||||
# Image tags (added by bin/dev)
|
||||
images:
|
||||
- name: dexorder/relay
|
||||
newTag: $RELAY_TAG
|
||||
- name: dexorder/ingestor
|
||||
newTag: $INGEST_TAG
|
||||
- name: dexorder/flink
|
||||
newTag: $FLINK_TAG
|
||||
- name: dexorder/gateway
|
||||
newTag: $GATEWAY_TAG
|
||||
- name: dexorder/ai-web
|
||||
newTag: $WEB_TAG
|
||||
EOF
|
||||
|
||||
kubectl apply -k .
|
||||
|
||||
# Clean up the appended image tags from kustomization.yaml
|
||||
sed -i '/# Image tags (added by bin\/dev)/,$d' kustomization.yaml
|
||||
|
||||
echo -e "${GREEN}✓ $service deployed${NC}"
|
||||
}
|
||||
|
||||
# Main command routing
|
||||
@@ -331,9 +576,28 @@ case "$COMMAND" in
|
||||
deploy_services
|
||||
;;
|
||||
stop)
|
||||
echo -e "${BLUE}Stopping minikube...${NC}"
|
||||
minikube stop
|
||||
echo -e "${GREEN}✓ Minikube stopped${NC}"
|
||||
# Check for --keep-data flag
|
||||
if [[ "$2" == "--keep-data" ]]; then
|
||||
echo -e "${BLUE}Stopping minikube (keeping data)...${NC}"
|
||||
minikube stop
|
||||
echo -e "${GREEN}✓ Minikube stopped (PVCs preserved)${NC}"
|
||||
else
|
||||
echo -e "${BLUE}Stopping minikube and deleting PVCs...${NC}"
|
||||
# Scale down StatefulSets first to release PVCs
|
||||
echo -e "${GREEN}→${NC} Scaling down StatefulSets..."
|
||||
kubectl scale statefulset kafka postgres minio qdrant --replicas=0 2>/dev/null || true
|
||||
# Wait for pods to terminate
|
||||
echo -e "${GREEN}→${NC} Waiting for pods to terminate..."
|
||||
kubectl wait --for=delete pod -l app=kafka --timeout=60s 2>/dev/null || true
|
||||
kubectl wait --for=delete pod -l app=postgres --timeout=60s 2>/dev/null || true
|
||||
kubectl wait --for=delete pod -l app=minio --timeout=60s 2>/dev/null || true
|
||||
kubectl wait --for=delete pod -l app=qdrant --timeout=60s 2>/dev/null || true
|
||||
# Now delete PVCs
|
||||
delete_pvcs all
|
||||
minikube stop
|
||||
echo -e "${GREEN}✓ Minikube stopped and PVCs deleted${NC}"
|
||||
echo -e "${YELLOW}Tip: Use 'bin/dev stop --keep-data' to preserve PVCs${NC}"
|
||||
fi
|
||||
;;
|
||||
restart)
|
||||
if [ -n "$2" ]; then
|
||||
@@ -366,6 +630,12 @@ case "$COMMAND" in
|
||||
clean)
|
||||
clean_all
|
||||
;;
|
||||
deep-restart)
|
||||
deep_restart "${2:-all}"
|
||||
;;
|
||||
delete-pvcs)
|
||||
delete_pvcs "${2:-all}"
|
||||
;;
|
||||
tunnel)
|
||||
start_tunnel
|
||||
;;
|
||||
|
||||
@@ -93,6 +93,7 @@ else
|
||||
"minio-secret"
|
||||
"ingestor-secrets"
|
||||
"flink-secrets"
|
||||
"gateway-secrets"
|
||||
)
|
||||
|
||||
FAILED=0
|
||||
|
||||
57
client-py/.dockerignore
Normal file
57
client-py/.dockerignore
Normal file
@@ -0,0 +1,57 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Testing
|
||||
.pytest_cache/
|
||||
.coverage
|
||||
htmlcov/
|
||||
|
||||
# Config and secrets (should come from k8s mounts)
|
||||
config.yaml
|
||||
secrets.yaml
|
||||
*.local.yaml
|
||||
|
||||
# Data directories
|
||||
data/
|
||||
|
||||
# Git
|
||||
.git/
|
||||
.gitignore
|
||||
|
||||
# Documentation
|
||||
*.md
|
||||
docs/
|
||||
|
||||
# Example files
|
||||
*.example.*
|
||||
67
client-py/Dockerfile
Normal file
67
client-py/Dockerfile
Normal file
@@ -0,0 +1,67 @@
|
||||
# Multi-stage build for DexOrder user container
|
||||
FROM python:3.11-slim as builder
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
gcc \
|
||||
g++ \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy dependency specifications
|
||||
COPY setup.py .
|
||||
COPY dexorder/ dexorder/
|
||||
|
||||
# Install dependencies to a target directory
|
||||
RUN pip install --no-cache-dir --target=/build/deps .
|
||||
|
||||
# =============================================================================
|
||||
# Runtime stage
|
||||
# =============================================================================
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install runtime dependencies only
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libzmq5 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create non-root user
|
||||
RUN groupadd -r dexorder && useradd -r -g dexorder -u 1000 dexorder
|
||||
|
||||
# Copy installed Python packages from builder
|
||||
COPY --from=builder /build/deps /usr/local/lib/python3.11/site-packages/
|
||||
|
||||
# Copy application code
|
||||
COPY dexorder/ /app/dexorder/
|
||||
COPY main.py /app/
|
||||
|
||||
# Create directories for config, secrets, and data
|
||||
RUN mkdir -p /app/config /app/secrets /app/data && \
|
||||
chown -R dexorder:dexorder /app
|
||||
|
||||
# Create writable tmp directory (read-only rootfs requirement)
|
||||
RUN mkdir -p /tmp && chmod 1777 /tmp
|
||||
|
||||
# Switch to non-root user
|
||||
USER dexorder
|
||||
|
||||
# Environment variables (can be overridden in k8s)
|
||||
ENV PYTHONUNBUFFERED=1 \
|
||||
LOG_LEVEL=INFO \
|
||||
CONFIG_PATH=/app/config/config.yaml \
|
||||
SECRETS_PATH=/app/config/secrets.yaml \
|
||||
ZMQ_XPUB_PORT=5570 \
|
||||
ZMQ_GATEWAY_ENDPOINT=tcp://gateway:5571 \
|
||||
MCP_SERVER_NAME=dexorder-user \
|
||||
IDLE_TIMEOUT_MINUTES=15 \
|
||||
ENABLE_IDLE_SHUTDOWN=true
|
||||
|
||||
# Health check endpoint (simple check if process is running)
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
||||
CMD python -c "import sys; sys.exit(0)"
|
||||
|
||||
# Run the main application
|
||||
ENTRYPOINT ["python", "/app/main.py"]
|
||||
30
client-py/config.example.yaml
Normal file
30
client-py/config.example.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
# Example configuration file for DexOrder user container
|
||||
# Mount this at /app/config/config.yaml in k8s
|
||||
|
||||
# User-specific settings
|
||||
user:
|
||||
timezone: "UTC"
|
||||
|
||||
# Data sources
|
||||
data:
|
||||
iceberg:
|
||||
catalog_name: "dexorder"
|
||||
# Catalog properties loaded from secrets
|
||||
|
||||
relay:
|
||||
endpoint: "tcp://relay:5560"
|
||||
timeout_ms: 5000
|
||||
|
||||
# Strategy settings
|
||||
strategies:
|
||||
max_concurrent: 5
|
||||
default_timeout_minutes: 60
|
||||
|
||||
# Alert settings
|
||||
alerts:
|
||||
max_active: 100
|
||||
|
||||
# Logging
|
||||
logging:
|
||||
level: "INFO"
|
||||
include_timestamps: true
|
||||
@@ -5,6 +5,8 @@ Provides high-level APIs for:
|
||||
- Historical OHLC data retrieval with smart caching
|
||||
- Async request/response via relay
|
||||
- Iceberg data warehouse queries
|
||||
- User container event publishing
|
||||
- Container lifecycle management
|
||||
"""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
@@ -12,5 +14,36 @@ __version__ = "0.1.0"
|
||||
from .ohlc_client import OHLCClient
|
||||
from .iceberg_client import IcebergClient
|
||||
from .history_client import HistoryClient
|
||||
from .lifecycle_manager import (
|
||||
LifecycleManager,
|
||||
get_lifecycle_manager,
|
||||
start_lifecycle_manager,
|
||||
)
|
||||
|
||||
__all__ = ['OHLCClient', 'IcebergClient', 'HistoryClient']
|
||||
# Event system
|
||||
from .events import (
|
||||
EventPublisher,
|
||||
EventType,
|
||||
Priority,
|
||||
ChannelType,
|
||||
DeliverySpec,
|
||||
UserEvent,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Data clients
|
||||
'OHLCClient',
|
||||
'IcebergClient',
|
||||
'HistoryClient',
|
||||
# Lifecycle management
|
||||
'LifecycleManager',
|
||||
'get_lifecycle_manager',
|
||||
'start_lifecycle_manager',
|
||||
# Event system
|
||||
'EventPublisher',
|
||||
'EventType',
|
||||
'Priority',
|
||||
'ChannelType',
|
||||
'DeliverySpec',
|
||||
'UserEvent',
|
||||
]
|
||||
|
||||
57
client-py/dexorder/events/__init__.py
Normal file
57
client-py/dexorder/events/__init__.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""
|
||||
User Container Event System
|
||||
|
||||
Publishes events to the gateway via dual ZMQ patterns:
|
||||
- XPUB for informational events (fire-and-forget to active sessions)
|
||||
- DEALER for critical events (guaranteed delivery with ack)
|
||||
|
||||
See doc/protocol.md and doc/user_container_events.md for details.
|
||||
"""
|
||||
|
||||
from .types import (
|
||||
# Enums
|
||||
EventType,
|
||||
Priority,
|
||||
ChannelType,
|
||||
AckStatus,
|
||||
# Message types
|
||||
ChannelPreference,
|
||||
DeliverySpec,
|
||||
UserEvent,
|
||||
EventAck,
|
||||
# Serialization
|
||||
MSG_TYPE_USER_EVENT,
|
||||
MSG_TYPE_EVENT_ACK,
|
||||
serialize_user_event,
|
||||
deserialize_user_event,
|
||||
serialize_event_ack,
|
||||
deserialize_event_ack,
|
||||
)
|
||||
|
||||
from .publisher import EventPublisher
|
||||
|
||||
from .pending_store import PendingStore
|
||||
|
||||
__all__ = [
|
||||
# Enums
|
||||
"EventType",
|
||||
"Priority",
|
||||
"ChannelType",
|
||||
"AckStatus",
|
||||
# Message types
|
||||
"ChannelPreference",
|
||||
"DeliverySpec",
|
||||
"UserEvent",
|
||||
"EventAck",
|
||||
# Serialization
|
||||
"MSG_TYPE_USER_EVENT",
|
||||
"MSG_TYPE_EVENT_ACK",
|
||||
"serialize_user_event",
|
||||
"deserialize_user_event",
|
||||
"serialize_event_ack",
|
||||
"deserialize_event_ack",
|
||||
# Publisher
|
||||
"EventPublisher",
|
||||
# Storage
|
||||
"PendingStore",
|
||||
]
|
||||
120
client-py/dexorder/events/pending_store.py
Normal file
120
client-py/dexorder/events/pending_store.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""
|
||||
Pending event store for crash recovery.
|
||||
|
||||
Persists critical events that haven't been acknowledged to disk,
|
||||
so they can be replayed after a container restart.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
import asyncio
|
||||
import aiofiles
|
||||
|
||||
from .types import UserEvent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PendingStore:
|
||||
"""
|
||||
Persists pending critical events to disk for crash recovery.
|
||||
|
||||
Events are written to a JSON file when the publisher shuts down,
|
||||
and loaded back when it starts up.
|
||||
"""
|
||||
|
||||
def __init__(self, path: Optional[str] = None):
|
||||
"""
|
||||
Initialize pending store.
|
||||
|
||||
Args:
|
||||
path: Path to the pending events file.
|
||||
If None, persistence is disabled.
|
||||
"""
|
||||
self.path = Path(path) if path else None
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
async def save_pending(self, events: List[UserEvent]) -> None:
|
||||
"""
|
||||
Save pending events to disk.
|
||||
|
||||
Args:
|
||||
events: List of events to persist
|
||||
"""
|
||||
if not self.path:
|
||||
return
|
||||
|
||||
if not events:
|
||||
# No events to save, remove file if exists
|
||||
await self._remove_file()
|
||||
return
|
||||
|
||||
async with self._lock:
|
||||
try:
|
||||
# Ensure parent directory exists
|
||||
self.path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Serialize events
|
||||
data = [event.to_dict() for event in events]
|
||||
json_str = json.dumps(data, indent=2)
|
||||
|
||||
# Write atomically (write to temp, then rename)
|
||||
temp_path = self.path.with_suffix(".tmp")
|
||||
async with aiofiles.open(temp_path, "w") as f:
|
||||
await f.write(json_str)
|
||||
|
||||
# Atomic rename
|
||||
temp_path.rename(self.path)
|
||||
|
||||
logger.info("Saved %d pending events to %s", len(events), self.path)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to save pending events: %s", e, exc_info=True)
|
||||
|
||||
async def load_pending(self) -> List[UserEvent]:
|
||||
"""
|
||||
Load pending events from disk.
|
||||
|
||||
Returns:
|
||||
List of pending events, or empty list if none/error
|
||||
"""
|
||||
if not self.path or not self.path.exists():
|
||||
return []
|
||||
|
||||
async with self._lock:
|
||||
try:
|
||||
async with aiofiles.open(self.path, "r") as f:
|
||||
content = await f.read()
|
||||
|
||||
data = json.loads(content)
|
||||
events = [UserEvent.from_dict(d) for d in data]
|
||||
|
||||
# Remove file after successful load
|
||||
await self._remove_file()
|
||||
|
||||
logger.info("Loaded %d pending events from %s", len(events), self.path)
|
||||
return events
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error("Failed to parse pending events file: %s", e)
|
||||
await self._remove_file()
|
||||
return []
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to load pending events: %s", e, exc_info=True)
|
||||
return []
|
||||
|
||||
async def _remove_file(self) -> None:
|
||||
"""Remove the pending events file."""
|
||||
if self.path and self.path.exists():
|
||||
try:
|
||||
self.path.unlink()
|
||||
logger.debug("Removed pending events file: %s", self.path)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to remove pending events file: %s", e)
|
||||
|
||||
def has_pending(self) -> bool:
|
||||
"""Check if there are pending events on disk."""
|
||||
return self.path is not None and self.path.exists()
|
||||
441
client-py/dexorder/events/publisher.py
Normal file
441
client-py/dexorder/events/publisher.py
Normal file
@@ -0,0 +1,441 @@
|
||||
"""
|
||||
Event publisher for user containers.
|
||||
|
||||
Publishes events via dual ZMQ patterns:
|
||||
- XPUB for informational events (fire-and-forget)
|
||||
- DEALER for critical events (guaranteed delivery with ack)
|
||||
|
||||
The publisher automatically routes events based on:
|
||||
1. Event priority (INFORMATIONAL always uses XPUB if subscribed)
|
||||
2. Whether an active gateway session is subscribed (tracked via XPUB)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional, Set
|
||||
|
||||
import zmq
|
||||
import zmq.asyncio
|
||||
|
||||
from .types import (
|
||||
UserEvent,
|
||||
EventAck,
|
||||
Priority,
|
||||
AckStatus,
|
||||
serialize_user_event,
|
||||
deserialize_event_ack,
|
||||
get_event_type_name,
|
||||
)
|
||||
from .pending_store import PendingStore
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PendingEvent:
|
||||
"""Tracks a pending critical event awaiting acknowledgment."""
|
||||
|
||||
event: UserEvent
|
||||
sent_at: float
|
||||
retries: int = 0
|
||||
|
||||
|
||||
class EventPublisher:
|
||||
"""
|
||||
Publishes user events via dual ZMQ channels.
|
||||
|
||||
- XPUB socket (bind): For informational events to active sessions
|
||||
- DEALER socket (connect): For critical events with guaranteed delivery
|
||||
|
||||
The publisher tracks XPUB subscriptions to know when a gateway has an
|
||||
active session for this user. If subscribed, events go via XPUB (fast).
|
||||
If not subscribed and priority > INFORMATIONAL, events go via DEALER.
|
||||
|
||||
Usage:
|
||||
publisher = EventPublisher(user_id="user-123")
|
||||
await publisher.start()
|
||||
|
||||
# Informational event (dropped if no active session)
|
||||
await publisher.publish(UserEvent(
|
||||
event_type=EventType.INDICATOR_UPDATED,
|
||||
payload={"indicator": "RSI", "value": 65.5},
|
||||
delivery=DeliverySpec.informational(),
|
||||
))
|
||||
|
||||
# Critical event (guaranteed delivery)
|
||||
await publisher.publish(UserEvent(
|
||||
event_type=EventType.ORDER_FILLED,
|
||||
payload={"order_id": "123", "symbol": "BTC/USDT", ...},
|
||||
delivery=DeliverySpec.critical(),
|
||||
))
|
||||
|
||||
await publisher.stop()
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
user_id: str,
|
||||
xpub_port: int = 5570,
|
||||
gateway_router_endpoint: str = "tcp://gateway:5571",
|
||||
ack_timeout: float = 30.0,
|
||||
max_retries: int = 3,
|
||||
pending_store_path: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Initialize event publisher.
|
||||
|
||||
Args:
|
||||
user_id: User ID for this container
|
||||
xpub_port: Port to bind XPUB socket on (gateway connects here)
|
||||
gateway_router_endpoint: Gateway ROUTER socket endpoint (we connect)
|
||||
ack_timeout: Seconds to wait for ack before retrying
|
||||
max_retries: Maximum retries for critical events
|
||||
pending_store_path: Path to persist pending events (for crash recovery)
|
||||
"""
|
||||
self.user_id = user_id
|
||||
self.xpub_port = xpub_port
|
||||
self.gateway_router_endpoint = gateway_router_endpoint
|
||||
self.ack_timeout = ack_timeout
|
||||
self.max_retries = max_retries
|
||||
|
||||
# ZMQ context and sockets
|
||||
self.ctx: Optional[zmq.asyncio.Context] = None
|
||||
self.xpub_socket: Optional[zmq.asyncio.Socket] = None
|
||||
self.dealer_socket: Optional[zmq.asyncio.Socket] = None
|
||||
|
||||
# Track active subscriptions (set of topic strings)
|
||||
self.active_subscriptions: Set[str] = set()
|
||||
|
||||
# Track pending critical events awaiting ack
|
||||
self.pending_events: Dict[str, PendingEvent] = {}
|
||||
|
||||
# Persistent store for crash recovery
|
||||
self.pending_store = PendingStore(pending_store_path)
|
||||
|
||||
# Background tasks
|
||||
self._subscription_task: Optional[asyncio.Task] = None
|
||||
self._ack_task: Optional[asyncio.Task] = None
|
||||
self._retry_task: Optional[asyncio.Task] = None
|
||||
self._running = False
|
||||
|
||||
# Statistics
|
||||
self.stats = {
|
||||
"events_published": 0,
|
||||
"events_via_xpub": 0,
|
||||
"events_via_dealer": 0,
|
||||
"events_dropped": 0,
|
||||
"events_delivered": 0,
|
||||
"events_failed": 0,
|
||||
"retries": 0,
|
||||
}
|
||||
|
||||
async def start(self) -> None:
|
||||
"""Start the event publisher."""
|
||||
if self._running:
|
||||
logger.warning("Event publisher already running")
|
||||
return
|
||||
|
||||
logger.info(
|
||||
"Starting event publisher: user_id=%s, xpub_port=%d, gateway=%s",
|
||||
self.user_id,
|
||||
self.xpub_port,
|
||||
self.gateway_router_endpoint,
|
||||
)
|
||||
|
||||
# Create ZMQ context
|
||||
self.ctx = zmq.asyncio.Context()
|
||||
|
||||
# Create XPUB socket for informational events
|
||||
self.xpub_socket = self.ctx.socket(zmq.XPUB)
|
||||
self.xpub_socket.setsockopt(zmq.XPUB_VERBOSE, 1) # Receive all sub/unsub
|
||||
self.xpub_socket.bind(f"tcp://*:{self.xpub_port}")
|
||||
logger.info("XPUB socket bound on port %d", self.xpub_port)
|
||||
|
||||
# Create DEALER socket for critical events
|
||||
self.dealer_socket = self.ctx.socket(zmq.DEALER)
|
||||
self.dealer_socket.setsockopt_string(zmq.IDENTITY, f"container-{self.user_id}")
|
||||
self.dealer_socket.connect(self.gateway_router_endpoint)
|
||||
logger.info("DEALER socket connected to %s", self.gateway_router_endpoint)
|
||||
|
||||
# Load any persisted pending events
|
||||
persisted = await self.pending_store.load_pending()
|
||||
for event in persisted:
|
||||
self.pending_events[event.event_id] = PendingEvent(
|
||||
event=event,
|
||||
sent_at=time.time(),
|
||||
retries=0,
|
||||
)
|
||||
if persisted:
|
||||
logger.info("Loaded %d pending events from disk", len(persisted))
|
||||
|
||||
# Start background tasks
|
||||
self._running = True
|
||||
self._subscription_task = asyncio.create_task(self._subscription_loop())
|
||||
self._ack_task = asyncio.create_task(self._ack_loop())
|
||||
self._retry_task = asyncio.create_task(self._retry_loop())
|
||||
|
||||
# Resend any loaded pending events
|
||||
for pending in list(self.pending_events.values()):
|
||||
await self._send_via_dealer(pending.event)
|
||||
|
||||
logger.info("Event publisher started")
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Stop the event publisher and persist pending events."""
|
||||
if not self._running:
|
||||
return
|
||||
|
||||
logger.info("Stopping event publisher")
|
||||
self._running = False
|
||||
|
||||
# Cancel background tasks
|
||||
for task in [self._subscription_task, self._ack_task, self._retry_task]:
|
||||
if task:
|
||||
task.cancel()
|
||||
try:
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# Persist pending critical events for crash recovery
|
||||
if self.pending_events:
|
||||
events = [pe.event for pe in self.pending_events.values()]
|
||||
await self.pending_store.save_pending(events)
|
||||
logger.info("Persisted %d pending events", len(events))
|
||||
|
||||
# Close sockets
|
||||
if self.xpub_socket:
|
||||
self.xpub_socket.close()
|
||||
if self.dealer_socket:
|
||||
self.dealer_socket.close()
|
||||
if self.ctx:
|
||||
self.ctx.term()
|
||||
|
||||
logger.info(
|
||||
"Event publisher stopped. Stats: %s",
|
||||
self.stats,
|
||||
)
|
||||
|
||||
def has_active_subscriber(self) -> bool:
|
||||
"""Check if any gateway is subscribed to this user's events."""
|
||||
topic = f"USER:{self.user_id}"
|
||||
return topic in self.active_subscriptions
|
||||
|
||||
async def publish(self, event: UserEvent) -> None:
|
||||
"""
|
||||
Publish an event via the appropriate channel.
|
||||
|
||||
Routing logic:
|
||||
- INFORMATIONAL: XPUB only if subscribed, else drop
|
||||
- NORMAL/CRITICAL + subscribed: XPUB (fast path)
|
||||
- NORMAL/CRITICAL + not subscribed: DEALER (guaranteed)
|
||||
|
||||
Args:
|
||||
event: Event to publish
|
||||
"""
|
||||
# Ensure event has required fields
|
||||
if not event.event_id:
|
||||
event.event_id = str(uuid.uuid4())
|
||||
if not event.user_id:
|
||||
event.user_id = self.user_id
|
||||
if not event.timestamp:
|
||||
event.timestamp = int(time.time() * 1000)
|
||||
|
||||
priority = event.delivery.priority
|
||||
has_subscriber = self.has_active_subscriber()
|
||||
|
||||
logger.debug(
|
||||
"Publishing event: id=%s, type=%s, priority=%s, has_subscriber=%s",
|
||||
event.event_id,
|
||||
get_event_type_name(event.event_type),
|
||||
Priority(priority).name,
|
||||
has_subscriber,
|
||||
)
|
||||
|
||||
self.stats["events_published"] += 1
|
||||
|
||||
if priority == Priority.INFORMATIONAL:
|
||||
# Fire and forget - only send if someone's listening
|
||||
if has_subscriber:
|
||||
await self._send_via_xpub(event)
|
||||
self.stats["events_via_xpub"] += 1
|
||||
else:
|
||||
logger.debug(
|
||||
"Dropping informational event (no subscriber): %s",
|
||||
event.event_id,
|
||||
)
|
||||
self.stats["events_dropped"] += 1
|
||||
|
||||
elif has_subscriber:
|
||||
# Active session exists - use fast path
|
||||
await self._send_via_xpub(event)
|
||||
self.stats["events_via_xpub"] += 1
|
||||
|
||||
else:
|
||||
# No active session - use guaranteed delivery
|
||||
await self._send_via_dealer(event)
|
||||
self.stats["events_via_dealer"] += 1
|
||||
|
||||
# Track for ack
|
||||
self.pending_events[event.event_id] = PendingEvent(
|
||||
event=event,
|
||||
sent_at=time.time(),
|
||||
retries=0,
|
||||
)
|
||||
|
||||
async def _send_via_xpub(self, event: UserEvent) -> None:
|
||||
"""Send event via XPUB socket (fire-and-forget)."""
|
||||
topic = f"USER:{self.user_id}"
|
||||
payload = serialize_user_event(event)
|
||||
|
||||
await self.xpub_socket.send_multipart([topic.encode(), payload])
|
||||
|
||||
logger.debug(
|
||||
"Sent event via XPUB: id=%s, type=%s",
|
||||
event.event_id,
|
||||
get_event_type_name(event.event_type),
|
||||
)
|
||||
|
||||
async def _send_via_dealer(self, event: UserEvent) -> None:
|
||||
"""Send event via DEALER socket (with ack tracking)."""
|
||||
payload = serialize_user_event(event)
|
||||
await self.dealer_socket.send(payload)
|
||||
|
||||
logger.debug(
|
||||
"Sent event via DEALER: id=%s, type=%s",
|
||||
event.event_id,
|
||||
get_event_type_name(event.event_type),
|
||||
)
|
||||
|
||||
async def _subscription_loop(self) -> None:
|
||||
"""Process XPUB subscription/unsubscription messages."""
|
||||
logger.debug("Starting subscription loop")
|
||||
|
||||
while self._running:
|
||||
try:
|
||||
# Poll with timeout to allow shutdown
|
||||
if await self.xpub_socket.poll(100):
|
||||
msg = await self.xpub_socket.recv()
|
||||
|
||||
# First byte: 1 = subscribe, 0 = unsubscribe
|
||||
# Remaining bytes: topic
|
||||
is_subscribe = msg[0] == 1
|
||||
topic = msg[1:].decode()
|
||||
|
||||
if is_subscribe:
|
||||
self.active_subscriptions.add(topic)
|
||||
logger.info("Gateway subscribed to topic: %s", topic)
|
||||
else:
|
||||
self.active_subscriptions.discard(topic)
|
||||
logger.info("Gateway unsubscribed from topic: %s", topic)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error("Error in subscription loop: %s", e, exc_info=True)
|
||||
|
||||
logger.debug("Subscription loop ended")
|
||||
|
||||
async def _ack_loop(self) -> None:
|
||||
"""Process EventAck messages from gateway."""
|
||||
logger.debug("Starting ack loop")
|
||||
|
||||
while self._running:
|
||||
try:
|
||||
# Poll with timeout
|
||||
if await self.dealer_socket.poll(100):
|
||||
payload = await self.dealer_socket.recv()
|
||||
ack = deserialize_event_ack(payload)
|
||||
|
||||
logger.debug(
|
||||
"Received ack: event_id=%s, status=%s",
|
||||
ack.event_id,
|
||||
AckStatus(ack.status).name,
|
||||
)
|
||||
|
||||
if ack.event_id in self.pending_events:
|
||||
pending = self.pending_events.pop(ack.event_id)
|
||||
|
||||
if ack.status == AckStatus.DELIVERED:
|
||||
logger.info(
|
||||
"Event delivered: id=%s, type=%s, via=%s",
|
||||
ack.event_id,
|
||||
get_event_type_name(pending.event.event_type),
|
||||
ack.delivered_via.name if ack.delivered_via else "unknown",
|
||||
)
|
||||
self.stats["events_delivered"] += 1
|
||||
|
||||
elif ack.status == AckStatus.QUEUED:
|
||||
logger.info(
|
||||
"Event queued for delivery: id=%s",
|
||||
ack.event_id,
|
||||
)
|
||||
self.stats["events_delivered"] += 1
|
||||
|
||||
elif ack.status == AckStatus.ACK_ERROR:
|
||||
logger.error(
|
||||
"Event delivery failed: id=%s, error=%s",
|
||||
ack.event_id,
|
||||
ack.error_message,
|
||||
)
|
||||
self.stats["events_failed"] += 1
|
||||
else:
|
||||
logger.debug("Received ack for unknown event: %s", ack.event_id)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error("Error in ack loop: %s", e, exc_info=True)
|
||||
|
||||
logger.debug("Ack loop ended")
|
||||
|
||||
async def _retry_loop(self) -> None:
|
||||
"""Retry pending events that haven't been acked."""
|
||||
logger.debug("Starting retry loop")
|
||||
|
||||
while self._running:
|
||||
try:
|
||||
await asyncio.sleep(5) # Check every 5 seconds
|
||||
|
||||
now = time.time()
|
||||
for event_id, pending in list(self.pending_events.items()):
|
||||
if now - pending.sent_at > self.ack_timeout:
|
||||
if pending.retries >= self.max_retries:
|
||||
# Give up
|
||||
logger.error(
|
||||
"Event exceeded max retries, dropping: id=%s, type=%s",
|
||||
event_id,
|
||||
get_event_type_name(pending.event.event_type),
|
||||
)
|
||||
del self.pending_events[event_id]
|
||||
self.stats["events_failed"] += 1
|
||||
else:
|
||||
# Retry
|
||||
pending.retries += 1
|
||||
pending.sent_at = now
|
||||
await self._send_via_dealer(pending.event)
|
||||
logger.info(
|
||||
"Retrying event: id=%s, attempt=%d/%d",
|
||||
event_id,
|
||||
pending.retries,
|
||||
self.max_retries,
|
||||
)
|
||||
self.stats["retries"] += 1
|
||||
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error("Error in retry loop: %s", e, exc_info=True)
|
||||
|
||||
logger.debug("Retry loop ended")
|
||||
|
||||
def get_stats(self) -> Dict[str, int]:
|
||||
"""Get publisher statistics."""
|
||||
return {
|
||||
**self.stats,
|
||||
"pending_events": len(self.pending_events),
|
||||
"active_subscriptions": len(self.active_subscriptions),
|
||||
}
|
||||
384
client-py/dexorder/events/types.py
Normal file
384
client-py/dexorder/events/types.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""
|
||||
User Event Types for Container → Gateway communication.
|
||||
|
||||
These types mirror the protobuf definitions in protobuf/user_events.proto
|
||||
and the TypeScript types in gateway/src/events/types.ts.
|
||||
|
||||
Message Type IDs (must match protocol.md):
|
||||
- UserEvent: 0x20
|
||||
- EventAck: 0x21
|
||||
"""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from enum import IntEnum
|
||||
from typing import List, Optional, Any
|
||||
|
||||
# =============================================================================
|
||||
# Message Type IDs
|
||||
# =============================================================================
|
||||
|
||||
MSG_TYPE_USER_EVENT = 0x20
|
||||
MSG_TYPE_EVENT_ACK = 0x21
|
||||
|
||||
# =============================================================================
|
||||
# Enums
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class EventType(IntEnum):
|
||||
"""Types of events that containers can emit."""
|
||||
|
||||
# Trading events
|
||||
ORDER_PLACED = 0
|
||||
ORDER_FILLED = 1
|
||||
ORDER_CANCELLED = 2
|
||||
ORDER_REJECTED = 3
|
||||
ORDER_EXPIRED = 4
|
||||
|
||||
# Alert events
|
||||
ALERT_TRIGGERED = 10
|
||||
ALERT_CREATED = 11
|
||||
ALERT_DELETED = 12
|
||||
|
||||
# Position events
|
||||
POSITION_OPENED = 20
|
||||
POSITION_CLOSED = 21
|
||||
POSITION_UPDATED = 22
|
||||
POSITION_LIQUIDATED = 23
|
||||
|
||||
# Workspace/chart events
|
||||
WORKSPACE_CHANGED = 30
|
||||
CHART_ANNOTATION_ADDED = 31
|
||||
CHART_ANNOTATION_REMOVED = 32
|
||||
INDICATOR_UPDATED = 33
|
||||
|
||||
# Strategy events
|
||||
STRATEGY_STARTED = 40
|
||||
STRATEGY_STOPPED = 41
|
||||
STRATEGY_LOG = 42
|
||||
STRATEGY_ERROR = 43
|
||||
BACKTEST_COMPLETED = 44
|
||||
|
||||
# System events
|
||||
CONTAINER_STARTING = 50
|
||||
CONTAINER_READY = 51
|
||||
CONTAINER_SHUTTING_DOWN = 52
|
||||
EVENT_ERROR = 53
|
||||
|
||||
|
||||
class Priority(IntEnum):
|
||||
"""
|
||||
Event delivery priority.
|
||||
|
||||
INFORMATIONAL: Drop if no active session (fire-and-forget via XPUB)
|
||||
NORMAL: Best effort - XPUB if subscribed, else DEALER
|
||||
CRITICAL: Must deliver - always uses DEALER with ack
|
||||
"""
|
||||
|
||||
INFORMATIONAL = 0
|
||||
NORMAL = 1
|
||||
CRITICAL = 2
|
||||
|
||||
|
||||
class ChannelType(IntEnum):
|
||||
"""Delivery channel types."""
|
||||
|
||||
ACTIVE_SESSION = 0 # Whatever's currently connected
|
||||
WEB = 1 # WebSocket to web UI
|
||||
TELEGRAM = 2 # Telegram bot message
|
||||
EMAIL = 3 # Email notification
|
||||
PUSH = 4 # Mobile push notification
|
||||
DISCORD = 5 # Discord webhook
|
||||
SLACK = 6 # Slack webhook
|
||||
|
||||
|
||||
class AckStatus(IntEnum):
    """Event acknowledgment status returned by the gateway."""

    DELIVERED = 0  # Successfully delivered to at least one channel
    QUEUED = 1     # Accepted and queued for delivery
    ACK_ERROR = 2  # Permanent failure - all channels failed


# =============================================================================
# Message Types
# =============================================================================
||||
@dataclass
class ChannelPreference:
    """
    A single channel delivery preference.

    Attributes:
        channel: Which channel to deliver to.
        only_if_active: When True, skip this channel if the user is not
            currently connected.
    """

    channel: ChannelType
    only_if_active: bool = False

    def to_dict(self) -> dict:
        """Encode as a JSON-compatible dict (enum lowered to its int value)."""
        encoded = {
            "channel": int(self.channel),
            "only_if_active": self.only_if_active,
        }
        return encoded

    @classmethod
    def from_dict(cls, data: dict) -> "ChannelPreference":
        """Rebuild a preference from its dict form; the flag defaults to False."""
        raw_channel = data["channel"]
        active_flag = data.get("only_if_active", False)
        return cls(channel=ChannelType(raw_channel), only_if_active=active_flag)
||||
@dataclass
class DeliverySpec:
    """
    Delivery specification for an event.

    Attributes:
        priority: Delivery priority (INFORMATIONAL, NORMAL, CRITICAL).
        channels: Ordered list of channel preferences, tried in order.
    """

    priority: Priority = Priority.NORMAL
    channels: List[ChannelPreference] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Encode as a JSON-compatible dict, preferences included."""
        encoded_channels = [pref.to_dict() for pref in self.channels]
        return {"priority": int(self.priority), "channels": encoded_channels}

    @classmethod
    def from_dict(cls, data: dict) -> "DeliverySpec":
        """Rebuild a spec from its dict form; missing keys fall back to defaults."""
        raw_priority = data.get("priority", Priority.NORMAL)
        raw_channels = data.get("channels", [])
        return cls(
            priority=Priority(raw_priority),
            channels=[ChannelPreference.from_dict(entry) for entry in raw_channels],
        )

    # -------------------------------------------------------------------------
    # Convenience constructors
    # -------------------------------------------------------------------------

    @staticmethod
    def informational() -> "DeliverySpec":
        """
        Drop if no active session.
        Use for: indicator updates, chart syncs, strategy logs when watching.
        """
        session_only = ChannelPreference(ChannelType.ACTIVE_SESSION, only_if_active=True)
        return DeliverySpec(priority=Priority.INFORMATIONAL, channels=[session_only])

    @staticmethod
    def active_or_telegram() -> "DeliverySpec":
        """
        Active session preferred, fallback to Telegram.
        Use for: alerts, position updates.
        """
        fallback_chain = [
            ChannelPreference(ChannelType.ACTIVE_SESSION, only_if_active=True),
            ChannelPreference(ChannelType.TELEGRAM, only_if_active=False),
        ]
        return DeliverySpec(priority=Priority.NORMAL, channels=fallback_chain)

    @staticmethod
    def active_or_push() -> "DeliverySpec":
        """
        Active session preferred, fallback to push notification.
        Use for: alerts, position updates on mobile.
        """
        fallback_chain = [
            ChannelPreference(ChannelType.ACTIVE_SESSION, only_if_active=True),
            ChannelPreference(ChannelType.PUSH, only_if_active=False),
        ]
        return DeliverySpec(priority=Priority.NORMAL, channels=fallback_chain)

    @staticmethod
    def critical() -> "DeliverySpec":
        """
        Must deliver through any available channel.
        Use for: order fills, liquidations, critical errors.
        """
        escalation_chain = [
            ChannelPreference(ChannelType.ACTIVE_SESSION, only_if_active=True),
            ChannelPreference(ChannelType.TELEGRAM, only_if_active=False),
            ChannelPreference(ChannelType.PUSH, only_if_active=False),
            ChannelPreference(ChannelType.EMAIL, only_if_active=False),
        ]
        return DeliverySpec(priority=Priority.CRITICAL, channels=escalation_chain)

    @staticmethod
    def telegram_only() -> "DeliverySpec":
        """
        Deliver only to Telegram, regardless of active session.
        Use for: scheduled reports, digest notifications.
        """
        telegram = ChannelPreference(ChannelType.TELEGRAM, only_if_active=False)
        return DeliverySpec(priority=Priority.NORMAL, channels=[telegram])
||||
@dataclass
class UserEvent:
    """
    Event emitted by a user container to the gateway.

    Attributes:
        event_type: Type of event.
        payload: Event data (will be JSON encoded).
        delivery: Delivery specification.
        user_id: User ID this event belongs to.
        event_id: Unique event ID (UUID) for deduplication and ack;
            auto-generated in __post_init__ when left empty.
        timestamp: Unix milliseconds when the event was generated;
            auto-filled in __post_init__ when left at 0.
    """

    event_type: EventType
    payload: Any = None
    delivery: DeliverySpec = field(default_factory=DeliverySpec)
    user_id: str = ""
    event_id: str = ""
    timestamp: int = 0

    def __post_init__(self):
        # Fill in identity/clock fields the caller did not supply.
        self.event_id = self.event_id or str(uuid.uuid4())
        self.timestamp = self.timestamp or int(time.time() * 1000)

    def to_dict(self) -> dict:
        """Encode as a JSON-compatible dict (enum lowered to its int value)."""
        return dict(
            user_id=self.user_id,
            event_id=self.event_id,
            timestamp=self.timestamp,
            event_type=int(self.event_type),
            payload=self.payload,
            delivery=self.delivery.to_dict(),
        )

    @classmethod
    def from_dict(cls, data: dict) -> "UserEvent":
        """Rebuild an event from its dict form; envelope fields keep defaults."""
        spec = DeliverySpec.from_dict(data.get("delivery", {}))
        return cls(
            user_id=data.get("user_id", ""),
            event_id=data.get("event_id", ""),
            timestamp=data.get("timestamp", 0),
            event_type=EventType(data["event_type"]),
            payload=data.get("payload"),
            delivery=spec,
        )
||||
@dataclass
class EventAck:
    """
    Acknowledgment from gateway for a critical event.

    Attributes:
        event_id: Event ID being acknowledged.
        status: Delivery status.
        error_message: Error message if status is ERROR.
        delivered_via: Which channel successfully delivered (optional).
    """

    event_id: str
    status: AckStatus
    error_message: str = ""
    delivered_via: Optional[ChannelType] = None

    def to_dict(self) -> dict:
        """Encode as a JSON-compatible dict; delivered_via may be None."""
        via = self.delivered_via
        return {
            "event_id": self.event_id,
            "status": int(self.status),
            "error_message": self.error_message,
            "delivered_via": None if via is None else int(via),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "EventAck":
        """Rebuild an ack from its dict form; event_id and status are required."""
        via = data.get("delivered_via")
        return cls(
            event_id=data["event_id"],
            status=AckStatus(data["status"]),
            error_message=data.get("error_message", ""),
            delivered_via=None if via is None else ChannelType(via),
        )
||||
# =============================================================================
# Serialization
# =============================================================================


def serialize_user_event(event: UserEvent) -> bytes:
    """
    Serialize UserEvent to wire format.
    Format: [1 byte msg type][JSON payload]

    Note: In production, replace with proper protobuf serialization.
    """
    header = bytes([MSG_TYPE_USER_EVENT])
    body = json.dumps(event.to_dict()).encode("utf-8")
    return header + body
||||
def deserialize_user_event(data: bytes) -> UserEvent:
    """Deserialize UserEvent from wire format (1 tag byte + UTF-8 JSON)."""
    # A valid frame needs the tag byte plus at least one payload byte.
    if len(data) < 2:
        raise ValueError("Data too short")

    msg_type = data[0]
    if msg_type != MSG_TYPE_USER_EVENT:
        raise ValueError(f"Invalid message type: expected {MSG_TYPE_USER_EVENT}, got {msg_type}")

    decoded = json.loads(data[1:].decode("utf-8"))
    return UserEvent.from_dict(decoded)
||||
def serialize_event_ack(ack: EventAck) -> bytes:
    """Serialize EventAck to wire format (1 tag byte + UTF-8 JSON)."""
    body = json.dumps(ack.to_dict()).encode("utf-8")
    return bytes([MSG_TYPE_EVENT_ACK]) + body
||||
def deserialize_event_ack(data: bytes) -> EventAck:
    """Deserialize EventAck from wire format (1 tag byte + UTF-8 JSON)."""
    # A valid frame needs the tag byte plus at least one payload byte.
    if len(data) < 2:
        raise ValueError("Data too short")

    msg_type = data[0]
    if msg_type != MSG_TYPE_EVENT_ACK:
        raise ValueError(f"Invalid message type: expected {MSG_TYPE_EVENT_ACK}, got {msg_type}")

    decoded = json.loads(data[1:].decode("utf-8"))
    return EventAck.from_dict(decoded)
||||
# =============================================================================
# Helpers
# =============================================================================


def get_event_type_name(event_type: EventType) -> str:
    """Get human-readable event type name (the enum member name, e.g. 'ORDER_FILLED')."""
    return event_type.name
|
||||
def get_channel_type_name(channel_type: ChannelType) -> str:
    """Get human-readable channel type name (the enum member name, e.g. 'TELEGRAM')."""
    return channel_type.name
275
client-py/main.py
Normal file
275
client-py/main.py
Normal file
@@ -0,0 +1,275 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DexOrder User Container Main Entry Point
|
||||
|
||||
Brings together:
|
||||
- Config and secrets loading from k8s mounted YAML files
|
||||
- ZeroMQ event publisher for user events
|
||||
- MCP server with minimal "hello world" resource
|
||||
- Lifecycle management integration
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
from mcp.server import Server
|
||||
from mcp.server.stdio import stdio_server
|
||||
|
||||
from dexorder import EventPublisher, start_lifecycle_manager, get_lifecycle_manager
|
||||
from dexorder.events import EventType, UserEvent, DeliverySpec
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
class Config:
    """Application configuration loaded from config.yaml and secrets.yaml.

    Environment variables control the user identity, mounted file paths,
    ZMQ endpoints, MCP server name, and lifecycle behavior; the YAML file
    contents are read separately via load().
    """

    def __init__(self):
        # User ID (required) - refuse to start without one.
        user_id = os.getenv("USER_ID", "")
        if not user_id:
            raise ValueError("USER_ID environment variable required")
        self.user_id: str = user_id

        # Config and secrets paths (k8s mounted)
        self.config_path = Path(os.getenv("CONFIG_PATH", "/app/config/config.yaml"))
        self.secrets_path = Path(os.getenv("SECRETS_PATH", "/app/config/secrets.yaml"))

        # ZMQ ports for event system
        self.zmq_xpub_port: int = int(os.getenv("ZMQ_XPUB_PORT", "5570"))
        self.zmq_gateway_endpoint: str = os.getenv("ZMQ_GATEWAY_ENDPOINT", "tcp://gateway:5571")

        # MCP server settings
        self.mcp_server_name: str = os.getenv("MCP_SERVER_NAME", "dexorder-user")

        # Lifecycle settings
        self.idle_timeout_minutes: int = int(os.getenv("IDLE_TIMEOUT_MINUTES", "15"))
        self.enable_idle_shutdown: bool = os.getenv("ENABLE_IDLE_SHUTDOWN", "true").lower() == "true"

        # Populated by load(); empty until then.
        self.config_data: dict = {}
        self.secrets_data: dict = {}

    def load(self) -> None:
        """Load configuration and secrets from YAML files"""
        self.config_data = self._read_yaml(self.config_path, "config")
        self.secrets_data = self._read_yaml(self.secrets_path, "secrets")

    @staticmethod
    def _read_yaml(path: Path, kind: str) -> dict:
        """Read one mounted YAML file; returns {} when absent or empty."""
        if path.exists():
            with open(path) as f:
                loaded = yaml.safe_load(f) or {}
            logging.info(f"Loaded {kind} from {path}")
            return loaded
        logging.warning(f"{kind.capitalize()} file not found: {path}")
        return {}
|
||||
# =============================================================================
|
||||
# MCP Server Setup
|
||||
# =============================================================================
|
||||
|
||||
def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server:
    """Create MCP server exposing a minimal hello-world resource.

    Reading the resource also publishes an informational STRATEGY_LOG
    event through the supplied publisher.
    """
    server = Server(config.mcp_server_name)
    hello_uri = f"dexorder://user/{config.user_id}/hello"

    @server.list_resources()
    async def list_resources():
        """List available resources"""
        hello_resource = {
            "uri": hello_uri,
            "name": "Hello World",
            "description": "A simple hello world resource",
            "mimeType": "text/plain",
        }
        return [hello_resource]

    @server.read_resource()
    async def read_resource(uri: str):
        """Read a resource by URI"""
        # Guard clause: only the hello resource is known.
        if uri != hello_uri:
            raise ValueError(f"Unknown resource: {uri}")

        # Publish an event when resource is accessed (dropped if no session).
        access_event = UserEvent(
            event_type=EventType.STRATEGY_LOG,
            payload={
                "message": "Hello world resource accessed",
                "uri": uri,
            },
            delivery=DeliverySpec.informational(),
        )
        await event_publisher.publish(access_event)

        return {
            "uri": uri,
            "mimeType": "text/plain",
            "text": f"Hello from DexOrder user container!\nUser ID: {config.user_id}\n",
        }

    logging.info(f"MCP server '{config.mcp_server_name}' created")
    return server
||||
|
||||
# =============================================================================
|
||||
# Main Application
|
||||
# =============================================================================
|
||||
|
||||
class UserContainer:
    """Main user container application.

    Owns and orders the three subsystems: lifecycle manager first (so idle
    tracking covers startup), then the event publisher (so lifecycle events
    can be emitted), then the MCP server.
    """

    def __init__(self):
        self.config = Config()
        # Created in start(); None until then.
        self.event_publisher: Optional[EventPublisher] = None
        self.mcp_server: Optional[Server] = None
        self.running = False

    async def start(self) -> None:
        """Start all subsystems"""
        logging.info("Starting user container")

        # Load configuration
        self.config.load()

        # Start lifecycle manager
        await start_lifecycle_manager(
            user_id=self.config.user_id,
            idle_timeout_minutes=self.config.idle_timeout_minutes,
            enable_idle_shutdown=self.config.enable_idle_shutdown,
        )
        logging.info("Lifecycle manager started")

        # Start event publisher
        self.event_publisher = EventPublisher(
            user_id=self.config.user_id,
            xpub_port=self.config.zmq_xpub_port,
            gateway_router_endpoint=self.config.zmq_gateway_endpoint,
        )
        await self.event_publisher.start()
        logging.info("Event publisher started")

        # Publish CONTAINER_STARTING event
        await self.event_publisher.publish(UserEvent(
            event_type=EventType.CONTAINER_STARTING,
            payload={
                "user_id": self.config.user_id,
                # NOTE(review): this payload timestamp stays None — it is the
                # UserEvent envelope timestamp that is auto-filled, not this
                # key. Confirm consumers do not read payload["timestamp"].
                "timestamp": None,
            },
            delivery=DeliverySpec.active_or_telegram(),
        ))

        # Create MCP server
        self.mcp_server = create_mcp_server(self.config, self.event_publisher)

        # Publish CONTAINER_READY event
        await self.event_publisher.publish(UserEvent(
            event_type=EventType.CONTAINER_READY,
            payload={
                "user_id": self.config.user_id,
            },
            delivery=DeliverySpec.active_or_telegram(),
        ))

        self.running = True
        logging.info("User container ready")

    async def stop(self) -> None:
        """Stop all subsystems (idempotent: safe to call more than once)."""
        if not self.running:
            return

        logging.info("Stopping user container")
        # Flip the flag first so a re-entrant call returns immediately.
        self.running = False

        # Publish CONTAINER_SHUTTING_DOWN event while the publisher is alive
        if self.event_publisher:
            await self.event_publisher.publish(UserEvent(
                event_type=EventType.CONTAINER_SHUTTING_DOWN,
                payload={
                    "user_id": self.config.user_id,
                },
                delivery=DeliverySpec.active_or_telegram(),
            ))

        # Stop subsystems in reverse of startup order
        if self.event_publisher:
            await self.event_publisher.stop()
            logging.info("Event publisher stopped")

        lifecycle = get_lifecycle_manager()
        if lifecycle:
            await lifecycle.stop()
            logging.info("Lifecycle manager stopped")

        logging.info("User container stopped")

    async def run(self) -> None:
        """Run the MCP server via stdio"""
        await self.start()

        try:
            # Run MCP server on stdio; blocks until the stream closes
            # or the surrounding task is cancelled.
            async with stdio_server() as (read_stream, write_stream):
                await self.mcp_server.run(
                    read_stream,
                    write_stream,
                    self.mcp_server.create_initialization_options()
                )
        finally:
            # Always attempt a clean shutdown, even on error/cancellation.
            await self.stop()
||||
|
||||
# =============================================================================
|
||||
# Entry Point
|
||||
# =============================================================================
|
||||
|
||||
async def main():
    """Main entry point.

    Sets up logging, installs signal handlers, and runs the container
    until the stdio stream closes or a termination signal arrives.
    """
    # Setup logging
    log_level = os.getenv("LOG_LEVEL", "INFO").upper()
    logging.basicConfig(
        level=getattr(logging, log_level),
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        stream=sys.stderr,  # MCP uses stdout for protocol
    )

    # Create and run container
    container = UserContainer()

    # Handle shutdown signals.
    # Bug fix: the previous version used the deprecated asyncio.get_event_loop()
    # inside a running loop and called loop.stop() immediately after scheduling
    # container.stop() as a task — the loop halted before the stop task could
    # run, so graceful shutdown never completed. Cancelling the main task
    # instead lets container.run()'s finally-block perform the clean shutdown.
    loop = asyncio.get_running_loop()
    main_task = asyncio.current_task()

    def handle_signal(sig):
        logging.info(f"Received signal {sig}, shutting down...")
        if main_task is not None:
            main_task.cancel()

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, handle_signal, sig)

    try:
        await container.run()
    except asyncio.CancelledError:
        # Raised after handle_signal cancels us; shutdown already ran in
        # container.run()'s finally-block.
        logging.info("Shutdown complete")
    except KeyboardInterrupt:
        logging.info("Keyboard interrupt received")
    except Exception as e:
        logging.error(f"Fatal error: {e}", exc_info=True)
        sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: drive the async main() on a fresh event loop.
    asyncio.run(main())
25
client-py/secrets.example.yaml
Normal file
25
client-py/secrets.example.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Example secrets file for DexOrder user container
# Mount this at /app/config/secrets.yaml in k8s
# This file should be created from k8s secrets
# NOTE: placeholder values only — never commit real credentials.

# Iceberg catalog credentials
iceberg:
  s3:
    endpoint: "http://minio:9000"
    access_key_id: "minioadmin"      # default MinIO dev credential
    secret_access_key: "minioadmin"  # default MinIO dev credential
    region: "us-east-1"

  catalog:
    uri: "http://iceberg-rest:8181"
    warehouse: "s3://warehouse/"

# API keys for external services (if needed)
api_keys:
  telegram_bot_token: ""

# Internal auth (for mode A - platform harness)
auth:
  mode: "mtls" # or "platform_token" or "api_key"
  # API key hash if using api_key mode
  api_key_hash: ""
||||
@@ -10,9 +10,11 @@ setup(
|
||||
"pyiceberg>=0.6.0",
|
||||
"pyarrow>=14.0.0",
|
||||
"pandas>=2.0.0",
|
||||
"zmq>=0.0.0",
|
||||
"pyzmq>=25.0.0",
|
||||
"protobuf>=4.25.0",
|
||||
"pyyaml>=6.0",
|
||||
"aiofiles>=23.0.0",
|
||||
"mcp>=0.9.0",
|
||||
],
|
||||
extras_require={
|
||||
"dev": [
|
||||
|
||||
@@ -47,8 +47,8 @@ deploy/k8s/
|
||||
bin/dev start
|
||||
|
||||
# Access the application
|
||||
# Web UI: http://dexorder.local/cryptochimp/
|
||||
# Backend: ws://dexorder.local/ws
|
||||
# Web UI: http://dexorder.local/
|
||||
# Gateway: http://dexorder.local/api
|
||||
|
||||
# In another terminal, start tunnel for ingress
|
||||
bin/dev tunnel
|
||||
|
||||
26
deploy/k8s/base/gateway-ingress.yaml
Normal file
26
deploy/k8s/base/gateway-ingress.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
---
# Production ingress for the gateway API (TLS via cert-manager).
# /api/<rest> is rewritten to /<rest> before reaching the gateway service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gateway-ingress
  annotations:
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    # $1 is the first capture group from the regex path below.
    nginx.ingress.kubernetes.io/rewrite-target: /$1
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - dexorder.ai
      secretName: gateway-tls
  rules:
    - host: dexorder.ai
      http:
        paths:
          # Gateway API routes - strip /api prefix
          - path: /api/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: gateway
                port:
                  number: 3000
||||
@@ -7,7 +7,6 @@ apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: gateway
|
||||
namespace: dexorder-system
|
||||
---
|
||||
# Role scoped to dexorder-agents namespace only
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
@@ -20,27 +19,27 @@ rules:
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["deployments"]
|
||||
verbs: ["create", "get", "list", "watch", "patch", "update"]
|
||||
|
||||
|
||||
# PVCs: create and read (deletion handled by sidecar)
|
||||
- apiGroups: [""]
|
||||
resources: ["persistentvolumeclaims"]
|
||||
verbs: ["create", "get", "list", "watch"]
|
||||
|
||||
|
||||
# Services: create and manage agent MCP endpoints
|
||||
- apiGroups: [""]
|
||||
resources: ["services"]
|
||||
verbs: ["create", "get", "list", "watch", "patch", "update"]
|
||||
|
||||
|
||||
# Read-only pod access for status checks (no exec!)
|
||||
- apiGroups: [""]
|
||||
resources: ["pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
|
||||
|
||||
# Pod logs for debugging (read-only)
|
||||
- apiGroups: [""]
|
||||
resources: ["pods/log"]
|
||||
verbs: ["get"]
|
||||
|
||||
|
||||
# Explicitly NOT included:
|
||||
# - deployments/delete - handled by lifecycle sidecar
|
||||
# - pvc/delete - handled by lifecycle sidecar
|
||||
@@ -58,7 +57,7 @@ metadata:
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: gateway
|
||||
namespace: dexorder-system
|
||||
namespace: default
|
||||
roleRef:
|
||||
kind: Role
|
||||
name: agent-creator
|
||||
|
||||
101
deploy/k8s/base/gateway.yaml
Normal file
101
deploy/k8s/base/gateway.yaml
Normal file
@@ -0,0 +1,101 @@
|
||||
# Gateway deployment
# Multi-channel gateway with automatic container provisioning
---
apiVersion: v1
kind: Service
metadata:
  name: gateway
spec:
  selector:
    app: gateway
  ports:
    - name: http
      protocol: TCP
      port: 3000
      targetPort: http
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gateway
  labels:
    app: gateway
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gateway
  template:
    metadata:
      labels:
        app: gateway
    spec:
      # Service account bound to the agent-creator role (see RBAC manifest).
      serviceAccountName: gateway

      # Block startup until backing stores accept TCP connections.
      initContainers:
        - name: wait-for-postgres
          image: busybox:1.36
          command: ['sh', '-c', 'until nc -z postgres 5432; do echo waiting for postgres; sleep 2; done;']
        - name: wait-for-dragonfly
          image: busybox:1.36
          command: ['sh', '-c', 'until nc -z dragonfly 6379; do echo waiting for dragonfly; sleep 2; done;']
        - name: wait-for-qdrant
          image: busybox:1.36
          command: ['sh', '-c', 'until nc -z qdrant 6333; do echo waiting for qdrant; sleep 2; done;']

      volumes:
        - name: config
          configMap:
            name: gateway-config
        - name: secrets
          secret:
            secretName: gateway-secrets

      containers:
        - name: gateway
          image: ghcr.io/dexorder/gateway:latest
          imagePullPolicy: Always

          ports:
            - name: http
              containerPort: 3000
              protocol: TCP

          # Mount config and secrets as single read-only files.
          volumeMounts:
            - name: config
              mountPath: /config/config.yaml
              subPath: config.yaml
              readOnly: true
            - name: secrets
              mountPath: /config/secrets.yaml
              subPath: secrets.yaml
              readOnly: true

          env:
            - name: CONFIG_PATH
              value: "/config/config.yaml"
            - name: SECRETS_PATH
              value: "/config/secrets.yaml"

          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"

          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 10
            periodSeconds: 30

          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
||||
@@ -14,4 +14,12 @@ spec:
|
||||
rules:
|
||||
- host: dexorder.ai
|
||||
http:
|
||||
paths: []
|
||||
paths:
|
||||
# Web application at root
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: ai-web
|
||||
port:
|
||||
number: 5173
|
||||
|
||||
@@ -16,11 +16,11 @@ resources:
|
||||
- agent-quotas.yaml
|
||||
# Network isolation policies
|
||||
- network-policies.yaml
|
||||
# Gateway service (uncomment when ready)
|
||||
# - gateway.yaml
|
||||
# Gateway service
|
||||
- gateway.yaml
|
||||
- gateway-ingress.yaml
|
||||
# Example agent deployment (for reference, not applied by default)
|
||||
# - agent-deployment-example.yaml
|
||||
# Services (uncomment as needed)
|
||||
# - backend.yaml
|
||||
# - web.yaml
|
||||
# - ingress.yaml
|
||||
# Services
|
||||
- web.yaml
|
||||
- ingress.yaml
|
||||
|
||||
@@ -1,17 +1,9 @@
|
||||
# Namespace definitions for dexorder AI platform
|
||||
# - dexorder-system: gateway, flink, kafka, and other infrastructure
|
||||
# - default: gateway, web, and infrastructure services
|
||||
# - dexorder-agents: user agent containers (isolated, restricted)
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: dexorder-system
|
||||
labels:
|
||||
app.kubernetes.io/part-of: dexorder
|
||||
dexorder.io/type: system
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: dexorder-agents
|
||||
labels:
|
||||
|
||||
@@ -28,10 +28,7 @@ spec:
|
||||
- Ingress
|
||||
ingress:
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
dexorder.io/type: system
|
||||
podSelector:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app: gateway
|
||||
ports:
|
||||
@@ -64,17 +61,14 @@ spec:
|
||||
port: 53
|
||||
- protocol: TCP
|
||||
port: 53
|
||||
# Gateway in system namespace (for callbacks)
|
||||
# Gateway (for callbacks)
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
dexorder.io/type: system
|
||||
podSelector:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app: gateway
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8080
|
||||
port: 3000
|
||||
# Kafka/Redpanda for data subscriptions
|
||||
- to:
|
||||
- namespaceSelector:
|
||||
@@ -99,12 +93,11 @@ spec:
|
||||
- protocol: TCP
|
||||
port: 443
|
||||
---
|
||||
# System namespace: allow ingress from agents
|
||||
# Default namespace: allow ingress from agents to gateway
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-agent-callbacks
|
||||
namespace: dexorder-system
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
@@ -118,4 +111,4 @@ spec:
|
||||
dexorder.io/type: agents
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8080
|
||||
port: 3000
|
||||
|
||||
@@ -32,7 +32,7 @@ spec:
|
||||
ports:
|
||||
- containerPort: 5173
|
||||
env:
|
||||
- name: VITE_BASE_PATH
|
||||
value: "/cryptochimp/"
|
||||
- name: VITE_GATEWAY_URL
|
||||
value: "https://dexorder.ai/api"
|
||||
- name: VITE_WS_URL
|
||||
value: "wss://dexorder.ai/ws"
|
||||
|
||||
66
deploy/k8s/dev/configs/gateway-config.yaml
Normal file
66
deploy/k8s/dev/configs/gateway-config.yaml
Normal file
@@ -0,0 +1,66 @@
|
||||
apiVersion: v1
kind: ConfigMap
metadata:
  name: gateway-config
# NOTE: everything under data."config.yaml" is opaque file content served to
# the gateway at /config/config.yaml — edit it as a whole file, not per-key.
data:
  config.yaml: |
    # Gateway Configuration

    # Server configuration
    server:
      port: 3000
      host: 0.0.0.0
      log_level: debug
      cors_origin: "*"
      base_url: http://dexorder.local
      trusted_origins:
        - http://dexorder.local
        - http://localhost:5173
        - ws://dexorder.local

    # Database
    database:
      url: postgresql://postgres:password@postgres:5432/iceberg

    # Default model (if user has no preference)
    defaults:
      model_provider: anthropic
      model: claude-3-5-sonnet-20241022

    # Kubernetes configuration
    kubernetes:
      namespace: dexorder-agents
      in_cluster: true
      agent_image: ghcr.io/dexorder/agent:latest
      sidecar_image: lifecycle-sidecar:latest
      storage_class: standard

    # DragonflyDB (Redis-compatible, for hot storage and session management)
    redis:
      url: redis://dragonfly:6379

    # Qdrant (for RAG vector search)
    qdrant:
      url: http://qdrant:6333
      collection: gateway_memory

    # Iceberg (for durable storage via REST catalog)
    iceberg:
      catalog_uri: http://iceberg-catalog:8181
      namespace: gateway
      s3_endpoint: http://minio:9000

    # Event router (ZeroMQ)
    events:
      router_bind: tcp://*:5571

    # Embeddings (for RAG vector search)
    # Ollama runs in the same container as the gateway (see gateway/Dockerfile)
    embedding:
      provider: ollama
      model: all-minilm
      ollama_url: http://localhost:11434

    # Email service configuration
    email:
      from_address: noreply@dexorder.com
15
deploy/k8s/dev/gateway-dev-patch.yaml
Normal file
15
deploy/k8s/dev/gateway-dev-patch.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
# Gateway dev overrides - use local image
# Strategic-merge patch applied on top of the base gateway Deployment.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gateway
spec:
  template:
    spec:
      containers:
        - name: gateway
          image: dexorder/gateway:latest
          # Never pull: the image must be pre-loaded into the local cluster.
          imagePullPolicy: Never
          env:
            - name: NODE_OPTIONS
              value: "--trace-deprecation"
||||
19
deploy/k8s/dev/gateway-health-ingress.yaml
Normal file
19
deploy/k8s/dev/gateway-health-ingress.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
# Separate ingress for health endpoint without rewrite
# (the main dev ingress rewrites /api/<rest>; /health must pass through as-is).
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gateway-health-ingress
spec:
  ingressClassName: nginx
  rules:
    - host: dexorder.local
      http:
        paths:
          - path: /health
            pathType: Exact
            backend:
              service:
                name: gateway
                port:
                  number: 3000
||||
27
deploy/k8s/dev/gateway-ingress-patch.yaml
Normal file
27
deploy/k8s/dev/gateway-ingress-patch.yaml
Normal file
@@ -0,0 +1,27 @@
|
||||
---
# Dev override of gateway-ingress: plain HTTP on dexorder.local, no TLS,
# and long proxy timeouts so idle WebSocket connections are not dropped.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gateway-ingress
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /$1
    # Enable WebSocket support
    nginx.ingress.kubernetes.io/websocket-services: gateway
    nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
spec:
  ingressClassName: nginx
  # Remove TLS for dev
  tls: []
  rules:
    - host: dexorder.local
      http:
        paths:
          # Gateway API routes - strip /api prefix
          - path: /api/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: gateway
                port:
                  number: 3000
||||
@@ -1,4 +1,112 @@
|
||||
---
|
||||
# DragonflyDB (Redis-compatible in-memory datastore)
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: dragonfly
|
||||
spec:
|
||||
selector:
|
||||
app: dragonfly
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 6379
|
||||
targetPort: 6379
|
||||
type: ClusterIP
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: dragonfly
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: dragonfly
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: dragonfly
|
||||
spec:
|
||||
containers:
|
||||
- name: dragonfly
|
||||
image: docker.dragonflydb.io/dragonflydb/dragonfly:latest
|
||||
ports:
|
||||
- containerPort: 6379
|
||||
name: dragonfly
|
||||
args:
|
||||
- --logtostderr
|
||||
- --alsologtostderr=false
|
||||
- --cache_mode=true
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
---
|
||||
# Qdrant (Vector database for RAG)
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: qdrant
|
||||
spec:
|
||||
selector:
|
||||
app: qdrant
|
||||
ports:
|
||||
- name: http
|
||||
protocol: TCP
|
||||
port: 6333
|
||||
targetPort: 6333
|
||||
- name: grpc
|
||||
protocol: TCP
|
||||
port: 6334
|
||||
targetPort: 6334
|
||||
type: ClusterIP
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: qdrant
|
||||
spec:
|
||||
serviceName: qdrant
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: qdrant
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: qdrant
|
||||
spec:
|
||||
containers:
|
||||
- name: qdrant
|
||||
image: qdrant/qdrant:latest
|
||||
ports:
|
||||
- containerPort: 6333
|
||||
name: http
|
||||
- containerPort: 6334
|
||||
name: grpc
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "200m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
volumeMounts:
|
||||
- name: qdrant-data
|
||||
mountPath: /qdrant/storage
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: qdrant-data
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
storageClassName: dev-ephemeral
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
---
|
||||
# Kafka (KRaft mode - no Zookeeper needed)
|
||||
# Using apache/kafka:3.9.0 instead of confluentinc/cp-kafka because:
|
||||
# - cp-kafka's entrypoint script has issues with KRaft configuration
|
||||
@@ -74,6 +182,7 @@ spec:
|
||||
name: kafka-data
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
storageClassName: dev-ephemeral
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
@@ -130,6 +239,7 @@ spec:
|
||||
name: postgres-data
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
storageClassName: dev-ephemeral
|
||||
resources:
|
||||
requests:
|
||||
storage: 2Gi
|
||||
@@ -200,6 +310,7 @@ spec:
|
||||
name: minio-data
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
storageClassName: dev-ephemeral
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
|
||||
@@ -8,4 +8,12 @@ spec:
|
||||
rules:
|
||||
- host: dexorder.local
|
||||
http:
|
||||
paths: []
|
||||
paths:
|
||||
# Web application at root
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: ai-web
|
||||
port:
|
||||
number: 5173
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
# Note: namespaces are defined in base; workloads go to dexorder-system
|
||||
namespace: dexorder-system
|
||||
|
||||
# Base resources (includes security policies)
|
||||
resources:
|
||||
- ../base
|
||||
- infrastructure.yaml
|
||||
- storage-class.yaml
|
||||
- configs/gateway-config.yaml
|
||||
- gateway-health-ingress.yaml
|
||||
|
||||
# Dev-specific patches
|
||||
patches:
|
||||
@@ -15,6 +15,14 @@ patches:
|
||||
- path: agent-quotas-patch.yaml
|
||||
# Allow local registry images
|
||||
- path: admission-policy-patch.yaml
|
||||
# Web environment variables for dev
|
||||
- path: web-dev-patch.yaml
|
||||
# Web ingress for dev (no TLS, dexorder.local)
|
||||
- path: web-ingress-patch.yaml
|
||||
# Gateway dev overrides (use local image)
|
||||
- path: gateway-dev-patch.yaml
|
||||
# Gateway ingress for dev (no TLS, dexorder.local)
|
||||
- path: gateway-ingress-patch.yaml
|
||||
|
||||
# ConfigMaps for service configs
|
||||
configMapGenerator:
|
||||
@@ -34,3 +42,24 @@ secretGenerator: []
|
||||
|
||||
generatorOptions:
|
||||
disableNameSuffixHash: true
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
13
deploy/k8s/dev/storage-class.yaml
Normal file
13
deploy/k8s/dev/storage-class.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
# Development-specific StorageClass with auto-deletion
|
||||
# This ensures PVCs and PVs are automatically cleaned up when released
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: dev-ephemeral
|
||||
labels:
|
||||
environment: development
|
||||
provisioner: k8s.io/minikube-hostpath
|
||||
reclaimPolicy: Delete
|
||||
volumeBindingMode: Immediate
|
||||
allowVolumeExpansion: false
|
||||
17
deploy/k8s/dev/web-dev-patch.yaml
Normal file
17
deploy/k8s/dev/web-dev-patch.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ai-web
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: ai-web
|
||||
image: dexorder/ai-web:latest
|
||||
imagePullPolicy: Never
|
||||
env:
|
||||
- name: VITE_GATEWAY_URL
|
||||
value: "/api"
|
||||
- name: VITE_WS_URL
|
||||
value: "ws://dexorder.local/api/ws/chat"
|
||||
21
deploy/k8s/dev/web-ingress-patch.yaml
Normal file
21
deploy/k8s/dev/web-ingress-patch.yaml
Normal file
@@ -0,0 +1,21 @@
|
||||
---
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: ai-ingress
|
||||
spec:
|
||||
ingressClassName: nginx
|
||||
# Remove TLS for dev
|
||||
tls: []
|
||||
rules:
|
||||
- host: dexorder.local
|
||||
http:
|
||||
paths:
|
||||
# Web application at root
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: ai-web
|
||||
port:
|
||||
number: 5173
|
||||
64
deploy/k8s/prod/configs/gateway-config.yaml
Normal file
64
deploy/k8s/prod/configs/gateway-config.yaml
Normal file
@@ -0,0 +1,64 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: gateway-config
|
||||
data:
|
||||
config.yaml: |
|
||||
# Gateway Configuration
|
||||
|
||||
# Server configuration
|
||||
server:
|
||||
port: 3000
|
||||
host: 0.0.0.0
|
||||
log_level: info
|
||||
cors_origin: "https://app.dexorder.com"
|
||||
base_url: https://api.dexorder.com
|
||||
trusted_origins:
|
||||
- https://app.dexorder.com
|
||||
- https://api.dexorder.com
|
||||
|
||||
# Database
|
||||
database:
|
||||
url: postgresql://postgres:postgres@postgres:5432/iceberg
|
||||
|
||||
# Default model (if user has no preference)
|
||||
defaults:
|
||||
model_provider: anthropic
|
||||
model: claude-3-5-sonnet-20241022
|
||||
|
||||
# Kubernetes configuration
|
||||
kubernetes:
|
||||
namespace: dexorder-agents
|
||||
in_cluster: true
|
||||
agent_image: ghcr.io/dexorder/agent:latest
|
||||
sidecar_image: ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||
storage_class: standard
|
||||
|
||||
# DragonflyDB (Redis-compatible, for hot storage and session management)
|
||||
redis:
|
||||
url: redis://dragonfly:6379
|
||||
|
||||
# Qdrant (for RAG vector search)
|
||||
qdrant:
|
||||
url: http://qdrant:6333
|
||||
collection: gateway_memory
|
||||
|
||||
# Iceberg (for durable storage via REST catalog)
|
||||
iceberg:
|
||||
catalog_uri: http://iceberg-catalog:8181
|
||||
namespace: gateway
|
||||
s3_endpoint: http://minio:9000
|
||||
|
||||
# Event router (ZeroMQ)
|
||||
events:
|
||||
router_bind: tcp://*:5571
|
||||
|
||||
# Embeddings (for RAG vector search)
|
||||
embedding:
|
||||
provider: ollama
|
||||
model: all-minilm
|
||||
ollama_url: http://ollama:11434
|
||||
|
||||
# Email service configuration
|
||||
email:
|
||||
from_address: noreply@dexorder.com
|
||||
@@ -1,12 +1,10 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
# Note: namespaces are defined in base; workloads go to dexorder-system
|
||||
namespace: dexorder-system
|
||||
|
||||
# Base resources (includes all security policies)
|
||||
resources:
|
||||
- ../base
|
||||
- configs/gateway-config.yaml
|
||||
|
||||
# Production patches
|
||||
patches:
|
||||
@@ -18,13 +16,13 @@ patches:
|
||||
configMapGenerator:
|
||||
- name: relay-config
|
||||
files:
|
||||
- config.yaml=../../configmaps/relay-config.yaml
|
||||
- config.yaml=configs/relay-config.yaml
|
||||
- name: ingestor-config
|
||||
files:
|
||||
- config.yaml=../../configmaps/ingestor-config.yaml
|
||||
- config.yaml=configs/ingestor-config.yaml
|
||||
- name: flink-config
|
||||
files:
|
||||
- config.yaml=../../configmaps/flink-config.yaml
|
||||
- config.yaml=configs/flink-config.yaml
|
||||
|
||||
# Secrets (managed via kubectl, not committed)
|
||||
# These are created by bin/secret-update prod
|
||||
|
||||
@@ -39,8 +39,8 @@ spec:
|
||||
image: dexorder/ai-web:latest
|
||||
imagePullPolicy: Always
|
||||
env:
|
||||
- name: VITE_BASE_PATH
|
||||
value: "/cryptochimp/"
|
||||
- name: VITE_GATEWAY_URL
|
||||
value: "https://dexorder.ai/api"
|
||||
- name: VITE_WS_URL
|
||||
value: "wss://dexorder.ai/ws"
|
||||
resources:
|
||||
|
||||
392
doc/agent_harness.md
Normal file
392
doc/agent_harness.md
Normal file
@@ -0,0 +1,392 @@
|
||||
# Agent Harness Architecture
|
||||
|
||||
The Agent Harness is the core orchestration layer for the Dexorder AI platform, built on LangChain.js and LangGraph.js.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Gateway (Fastify) │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ WebSocket │ │ Telegram │ │ Event │ │
|
||||
│ │ Handler │ │ Handler │ │ Router │ │
|
||||
│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │
|
||||
│ │ │ │ │
|
||||
│ └──────────────────┴──────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌───────▼────────┐ │
|
||||
│ │ Agent Harness │ │
|
||||
│ │ (Stateless) │ │
|
||||
│ └───────┬────────┘ │
|
||||
│ │ │
|
||||
│ ┌──────────────────┼──────────────────┐ │
|
||||
│ │ │ │ │
|
||||
│ ┌────▼─────┐ ┌────▼─────┐ ┌────▼─────┐ │
|
||||
│ │ MCP │ │ LLM │ │ RAG │ │
|
||||
│ │ Connector│ │ Router │ │ Retriever│ │
|
||||
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ │
|
||||
│ │ │ │ │
|
||||
└─────────┼──────────────────┼──────────────────┼─────────────┘
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌────────────┐ ┌───────────┐ ┌───────────┐
|
||||
│ User's │ │ LLM │ │ Qdrant │
|
||||
│ MCP │ │ Providers │ │ (Vectors) │
|
||||
│ Container │ │(Anthropic,│ │ │
|
||||
│ (k8s pod) │ │ OpenAI, │ │ Global + │
|
||||
│ │ │ etc) │ │ User │
|
||||
└────────────┘ └───────────┘ └───────────┘
|
||||
```
|
||||
|
||||
## Message Processing Flow
|
||||
|
||||
When a user sends a message:
|
||||
|
||||
```
|
||||
1. Gateway receives message via channel (WebSocket/Telegram)
|
||||
↓
|
||||
2. Authenticator validates user and gets license info
|
||||
↓
|
||||
3. Container Manager ensures user's MCP container is running
|
||||
↓
|
||||
4. Agent Harness processes message:
|
||||
│
|
||||
├─→ a. MCPClientConnector fetches context resources:
|
||||
│ - context://user-profile
|
||||
│ - context://conversation-summary
|
||||
│ - context://workspace-state
|
||||
│ - context://system-prompt
|
||||
│
|
||||
├─→ b. RAGRetriever searches for relevant memories:
|
||||
│ - Embeds user query
|
||||
│ - Searches Qdrant: user_id = current_user OR user_id = "0"
|
||||
│ - Returns user-specific + global platform knowledge
|
||||
│
|
||||
├─→ c. Build system prompt:
|
||||
│ - Base platform prompt
|
||||
│ - User profile context
|
||||
│ - Workspace state
|
||||
│ - Custom user instructions
|
||||
│ - Relevant RAG memories
|
||||
│
|
||||
├─→ d. ModelRouter selects LLM:
|
||||
│ - Based on license tier
|
||||
│ - Query complexity
|
||||
│ - Configured routing strategy
|
||||
│
|
||||
├─→ e. LLM invocation with tool support:
|
||||
│ - Send messages to LLM
|
||||
│ - If tool calls requested:
|
||||
│ • Platform tools → handled by gateway
|
||||
│ • User tools → proxied to MCP container
|
||||
│ - Loop until no more tool calls
|
||||
│
|
||||
├─→ f. Save conversation to MCP:
|
||||
│ - mcp.callTool('save_message', user_message)
|
||||
│ - mcp.callTool('save_message', assistant_message)
|
||||
│
|
||||
└─→ g. Return response to user via channel
|
||||
```
|
||||
|
||||
## Core Components
|
||||
|
||||
### 1. Agent Harness (`gateway/src/harness/agent-harness.ts`)
|
||||
|
||||
**Stateless orchestrator** - all state lives in user's MCP container or RAG.
|
||||
|
||||
**Responsibilities:**
|
||||
- Fetch context from user's MCP resources
|
||||
- Query RAG for relevant memories
|
||||
- Build prompts with full context
|
||||
- Route to appropriate LLM
|
||||
- Handle tool calls (platform vs user)
|
||||
- Save conversation back to MCP
|
||||
- Stream responses to user
|
||||
|
||||
**Key Methods:**
|
||||
- `handleMessage()`: Process single message (non-streaming)
|
||||
- `streamMessage()`: Process with streaming response
|
||||
- `initialize()`: Connect to user's MCP server
|
||||
|
||||
### 2. MCP Client Connector (`gateway/src/harness/mcp-client.ts`)
|
||||
|
||||
Connects to user's MCP container using Model Context Protocol.
|
||||
|
||||
**Features:**
|
||||
- Resource reading (context://, indicators://, strategies://)
|
||||
- Tool execution (save_message, run_backtest, etc.)
|
||||
- Automatic reconnection on container restarts
|
||||
- Error handling and fallbacks
|
||||
|
||||
### 3. Model Router (`gateway/src/llm/router.ts`)
|
||||
|
||||
Routes queries to appropriate LLM based on:
|
||||
- **License tier**: Free users → smaller models, paid → better models
|
||||
- **Complexity**: Simple queries → fast models, complex → powerful models
|
||||
- **Cost optimization**: Balance performance vs cost
|
||||
|
||||
**Routing Strategies:**
|
||||
- `COST`: Minimize cost
|
||||
- `COMPLEXITY`: Match model to query complexity
|
||||
- `SPEED`: Prioritize fast responses
|
||||
- `QUALITY`: Best available model
|
||||
|
||||
### 4. Memory Layer
|
||||
|
||||
#### Three-Tier Storage:
|
||||
|
||||
**Redis** (Hot Storage)
|
||||
- Active session state
|
||||
- Recent conversation history (last 50 messages)
|
||||
- LangGraph checkpoints (1 hour TTL)
|
||||
- Fast reads for active conversations
|
||||
|
||||
**Qdrant** (Vector Search)
|
||||
- Conversation embeddings
|
||||
- User-specific memories (user_id = actual user ID)
|
||||
- **Global platform knowledge** (user_id = "0")
|
||||
- RAG retrieval with cosine similarity
|
||||
- GDPR-compliant (indexed by user_id for fast deletion)
|
||||
|
||||
**Iceberg** (Cold Storage)
|
||||
- Full conversation history (partitioned by user_id, session_id)
|
||||
- Checkpoint snapshots for replay
|
||||
- Analytics and time-travel queries
|
||||
- GDPR-compliant with compaction
|
||||
|
||||
#### RAG System:
|
||||
|
||||
**Global Knowledge** (user_id="0"):
|
||||
- Platform capabilities and architecture
|
||||
- Trading concepts and fundamentals
|
||||
- Indicator development guides
|
||||
- Strategy patterns and examples
|
||||
- Loaded from `gateway/knowledge/` markdown files
|
||||
|
||||
**User Knowledge** (user_id=specific user):
|
||||
- Personal conversation history
|
||||
- Trading preferences and style
|
||||
- Custom indicators and strategies
|
||||
- Workspace state and context
|
||||
|
||||
**Query Flow:**
|
||||
1. User query is embedded using EmbeddingService
|
||||
2. Qdrant searches: `user_id IN (current_user, "0")`
|
||||
3. Top-K relevant chunks returned
|
||||
4. Added to LLM context automatically
|
||||
|
||||
### 5. Skills vs Subagents
|
||||
|
||||
#### Skills (`gateway/src/harness/skills/`)
|
||||
|
||||
**Use for**: Well-defined, specific tasks
|
||||
- Market analysis
|
||||
- Strategy validation
|
||||
- Single-purpose capabilities
|
||||
- Defined in markdown + TypeScript
|
||||
|
||||
**Structure:**
|
||||
```typescript
|
||||
class MarketAnalysisSkill extends BaseSkill {
|
||||
async execute(context, parameters) {
|
||||
// Implementation
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Subagents (`gateway/src/harness/subagents/`)
|
||||
|
||||
**Use for**: Complex domain expertise with context
|
||||
- Code reviewer with review guidelines
|
||||
- Risk analyzer with risk models
|
||||
- Multi-file knowledge base in memory/ directory
|
||||
- Custom system prompts
|
||||
|
||||
**Structure:**
|
||||
```
|
||||
subagents/
|
||||
code-reviewer/
|
||||
config.yaml # Model, memory files, capabilities
|
||||
system-prompt.md # Specialized instructions
|
||||
memory/
|
||||
review-guidelines.md
|
||||
common-patterns.md
|
||||
best-practices.md
|
||||
index.ts
|
||||
```
|
||||
|
||||
**Recommendation**: Prefer skills for most tasks. Use subagents when you need:
|
||||
- Substantial domain-specific knowledge
|
||||
- Multi-file context management
|
||||
- Specialized system prompts
|
||||
|
||||
### 6. Workflows (`gateway/src/harness/workflows/`)
|
||||
|
||||
LangGraph state machines for multi-step orchestration:
|
||||
|
||||
**Features:**
|
||||
- Validation loops (retry with fixes)
|
||||
- Human-in-the-loop (approval gates)
|
||||
- Error recovery
|
||||
- State persistence via checkpoints
|
||||
|
||||
**Example Workflows:**
|
||||
- Strategy validation: review → backtest → risk → approval
|
||||
- Trading request: analysis → risk → approval → execute
|
||||
|
||||
## User Context Structure
|
||||
|
||||
Every interaction includes rich context:
|
||||
|
||||
```typescript
|
||||
interface UserContext {
|
||||
userId: string;
|
||||
sessionId: string;
|
||||
license: UserLicense;
|
||||
|
||||
// Multi-channel support
|
||||
activeChannel: {
|
||||
type: 'websocket' | 'telegram' | 'slack' | 'discord';
|
||||
channelUserId: string;
|
||||
capabilities: {
|
||||
supportsMarkdown: boolean;
|
||||
supportsImages: boolean;
|
||||
supportsButtons: boolean;
|
||||
maxMessageLength: number;
|
||||
};
|
||||
};
|
||||
|
||||
// Retrieved from MCP + RAG
|
||||
conversationHistory: BaseMessage[];
|
||||
relevantMemories: MemoryChunk[];
|
||||
workspaceState: WorkspaceContext;
|
||||
}
|
||||
```
|
||||
|
||||
## User-Specific Files and Tools
|
||||
|
||||
User's MCP container provides access to:
|
||||
|
||||
**Indicators** (`indicators/*.py`)
|
||||
- Custom technical indicators
|
||||
- Pure functions: DataFrame → Series/DataFrame
|
||||
- Version controlled in user's git repo
|
||||
|
||||
**Strategies** (`strategies/*.py`)
|
||||
- Trading strategies with entry/exit rules
|
||||
- Position sizing and risk management
|
||||
- Backtestable and deployable
|
||||
|
||||
**Watchlists**
|
||||
- Saved ticker lists
|
||||
- Market monitoring
|
||||
|
||||
**Preferences**
|
||||
- Trading style and risk tolerance
|
||||
- Chart settings and colors
|
||||
- Notification preferences
|
||||
|
||||
**Executors** (sub-strategies)
|
||||
- Tactical order generators (TWAP, iceberg, etc.)
|
||||
- Smart order routing
|
||||
|
||||
## Global Knowledge Management
|
||||
|
||||
### Document Loading
|
||||
|
||||
At gateway startup:
|
||||
1. DocumentLoader scans `gateway/knowledge/` directory
|
||||
2. Markdown files chunked by headers (~1000 tokens/chunk)
|
||||
3. Embeddings generated via EmbeddingService
|
||||
4. Stored in Qdrant with user_id="0"
|
||||
5. Content hashing enables incremental updates
|
||||
|
||||
### Directory Structure
|
||||
|
||||
```
|
||||
gateway/knowledge/
|
||||
├── platform/ # Platform capabilities
|
||||
├── trading/ # Trading fundamentals
|
||||
├── indicators/ # Indicator development
|
||||
└── strategies/ # Strategy patterns
|
||||
```
|
||||
|
||||
### Updating Knowledge
|
||||
|
||||
**Development:**
|
||||
```bash
|
||||
curl -X POST http://localhost:3000/admin/reload-knowledge
|
||||
```
|
||||
|
||||
**Production:**
|
||||
- Update markdown files
|
||||
- Deploy new version
|
||||
- Auto-loaded on startup
|
||||
|
||||
**Monitoring:**
|
||||
```bash
|
||||
curl http://localhost:3000/admin/knowledge-stats
|
||||
```
|
||||
|
||||
## Container Lifecycle
|
||||
|
||||
### User Container Creation
|
||||
|
||||
When user connects:
|
||||
1. Gateway checks if container exists (ContainerManager)
|
||||
2. If not, creates Kubernetes pod with:
|
||||
- Agent container (Python + conda)
|
||||
- Lifecycle sidecar (container management)
|
||||
- Persistent volume (git repo)
|
||||
3. Waits for MCP server ready (~5-10s cold start)
|
||||
4. Establishes MCP connection
|
||||
5. Begins message processing
|
||||
|
||||
### Container Shutdown
|
||||
|
||||
**Free users:** 15 minutes idle timeout
|
||||
**Paid users:** Longer timeout based on license
|
||||
**On shutdown:**
|
||||
- Graceful save of all state
|
||||
- Persistent storage retained
|
||||
- Fast restart on next connection
|
||||
|
||||
### MCP Authentication Modes
|
||||
|
||||
1. **Public Mode** (Free tier): No auth, read-only, anonymous session
|
||||
2. **Gateway Auth** (Standard): Gateway authenticates, container trusts gateway
|
||||
3. **Direct Auth** (Enterprise): User authenticates directly with container
|
||||
|
||||
## Implementation Status
|
||||
|
||||
### ✅ Completed
|
||||
- Agent Harness with MCP integration
|
||||
- Model routing with license tiers
|
||||
- RAG retriever with Qdrant
|
||||
- Document loader for global knowledge
|
||||
- EmbeddingService (Ollama/OpenAI)
|
||||
- Skills and subagents framework
|
||||
- Multi-channel support (WebSocket, Telegram)
|
||||
- Container lifecycle management
|
||||
- Event system with ZeroMQ
|
||||
|
||||
### 🚧 In Progress
|
||||
- Iceberg integration (checkpoint-saver, conversation-store)
|
||||
- More subagents (risk-analyzer, market-analyst)
|
||||
- LangGraph workflows with interrupts
|
||||
- Platform tools (market data, charting)
|
||||
|
||||
### 📋 Planned
|
||||
- File watcher for hot-reload in development
|
||||
- Advanced RAG strategies (hybrid search, re-ranking)
|
||||
- Caching layer for expensive operations
|
||||
- Performance monitoring and metrics
|
||||
|
||||
## References
|
||||
|
||||
- Implementation: `gateway/src/harness/`
|
||||
- Documentation: `gateway/src/harness/README.md`
|
||||
- Knowledge base: `gateway/knowledge/`
|
||||
- LangGraph: https://langchain-ai.github.io/langgraphjs/
|
||||
- Qdrant: https://qdrant.tech/documentation/
|
||||
- MCP Spec: https://modelcontextprotocol.io/
|
||||
@@ -1,21 +0,0 @@
|
||||
┌─────────────────────────────────────────────────┐
|
||||
│ Agent Harness (your servers) │
|
||||
│ │
|
||||
│ on_message(user_id, message): │
|
||||
│ 1. Look up user's MCP endpoint from Postgres │
|
||||
│ 2. mcp.call("get_context_summary") │
|
||||
│ 3. mcp.call("get_conversation_history", 20) │
|
||||
│ 4. Build prompt: │
|
||||
│ system = BASE_PROMPT │
|
||||
│ + context_summary │
|
||||
│ + user_agent_prompt (from MCP) │
|
||||
│ messages = history + new message │
|
||||
│ 5. LLM call (your API key) │
|
||||
│ 6. While LLM wants tool calls: │
|
||||
│ - Platform tools → handle locally │
|
||||
│ - User tools → proxy to MCP │
|
||||
│ - LLM call again with results │
|
||||
│ 7. mcp.call("save_message", ...) │
|
||||
│ 8. Return response to user │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────┘
|
||||
@@ -1,11 +0,0 @@
|
||||
Generally use skills instead of subagents, except for the analysis subagent.
|
||||
|
||||
## User-specific files and tools
|
||||
* Indicators
|
||||
* Strategies
|
||||
* Watchlists
|
||||
* Preferences
|
||||
* Trading style
|
||||
* Charting / colors
|
||||
* Executors (really just sub-strategies)
|
||||
* tactical-level order generators e.g. TWAP, iceberg, etc.
|
||||
656
doc/architecture.md
Normal file
656
doc/architecture.md
Normal file
@@ -0,0 +1,656 @@
|
||||
# DexOrder AI Platform Architecture
|
||||
|
||||
## Overview
|
||||
|
||||
DexOrder is an AI-powered trading platform that combines real-time market data processing, user-specific AI agents, and a flexible data pipeline. The system is designed for scalability, isolation, and extensibility.
|
||||
|
||||
## High-Level Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ User Clients │
|
||||
│ (Web, Mobile, Telegram, External MCP) │
|
||||
└────────────────────────────┬────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Gateway │
|
||||
│ • WebSocket/HTTP/Telegram handlers │
|
||||
│ • Authentication & session management │
|
||||
│ • Agent Harness (LangChain/LangGraph orchestration) │
|
||||
│ - MCP client connector to user containers │
|
||||
│ - RAG retriever (Qdrant) │
|
||||
│ - Model router (LLM selection) │
|
||||
│ - Skills & subagents framework │
|
||||
│ • Dynamic user container provisioning │
|
||||
│ • Event routing (informational & critical) │
|
||||
└────────┬──────────────────┬────────────────────┬────────────────┘
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌──────────────────┐ ┌──────────────┐ ┌──────────────────────┐
|
||||
│ User Containers │ │ Relay │ │ Infrastructure │
|
||||
│ (per-user pods) │ │ (ZMQ Router) │ │ • DragonflyDB (cache)│
|
||||
│ │ │ │ │ • Qdrant (vectors) │
|
||||
│ • MCP Server │ │ • Market data│ │ • PostgreSQL (meta) │
|
||||
│ • User files: │ │ fanout │ │ • MinIO (S3) │
|
||||
│ - Indicators │ │ • Work queue │ │ │
|
||||
│ - Strategies │ │ • Stateless │ │ │
|
||||
│ - Preferences │ │ │ │ │
|
||||
│ • Event Publisher│ │ │ │ │
|
||||
│ • Lifecycle Mgmt │ │ │ │ │
|
||||
└──────────────────┘ └──────┬───────┘ └──────────────────────┘
|
||||
│
|
||||
┌──────────────┴──────────────┐
|
||||
│ │
|
||||
▼ ▼
|
||||
┌──────────────────┐ ┌──────────────────────┐
|
||||
│ Ingestors │ │ Flink Cluster │
|
||||
│ • CCXT adapters │ │ • Deduplication │
|
||||
│ • Exchange APIs │ │ • OHLC aggregation │
|
||||
│ • Push to Kafka │ │ • CEP engine │
|
||||
└────────┬─────────┘ │ • Writes to Iceberg │
|
||||
│ │ • Market data PUB │
|
||||
│ └──────────┬───────────┘
|
||||
▼ │
|
||||
┌─────────────────────────────────────▼────────────┐
|
||||
│ Kafka │
|
||||
│ • Durable append log │
|
||||
│ • Topic-based streams │
|
||||
│ • Event sourcing │
|
||||
└──────────────────────┬───────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────┐
|
||||
│ Iceberg Catalog │
|
||||
│ • Historical │
|
||||
│ OHLC storage │
|
||||
│ • Query API │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
||||
## Core Components
|
||||
|
||||
### 1. Gateway
|
||||
|
||||
**Location:** `gateway/`
|
||||
**Language:** TypeScript (Node.js)
|
||||
**Purpose:** Entry point for all user interactions
|
||||
|
||||
**Responsibilities:**
|
||||
- **Authentication:** JWT tokens, Telegram OAuth, multi-tier licensing
|
||||
- **Session Management:** WebSocket connections, Telegram webhooks, multi-channel support
|
||||
- **Container Orchestration:** Dynamic provisioning of user agent pods ([[gateway_container_creation]])
|
||||
- **Event Handling:**
|
||||
- Subscribe to user container events (XPUB/SUB for informational)
|
||||
- Route critical events (ROUTER/DEALER with ack) ([[user_container_events]])
|
||||
- **Agent Harness (LangChain/LangGraph):** ([[agent_harness]])
|
||||
- Stateless LLM orchestration
|
||||
- MCP client connector to user containers
|
||||
- RAG retrieval from Qdrant (global + user-specific knowledge)
|
||||
- Model routing based on license tier and complexity
|
||||
- Skills and subagents framework
|
||||
- Workflow state machines with validation loops
|
||||
|
||||
**Key Features:**
|
||||
- **Stateless design:** All conversation state lives in user containers or Qdrant
|
||||
- **Multi-channel support:** WebSocket, Telegram (future: mobile, Discord, Slack)
|
||||
- **Kubernetes-native:** Uses k8s API for container management
|
||||
- **Three-tier memory:**
|
||||
- Redis: Hot storage, active sessions, LangGraph checkpoints (1 hour TTL)
|
||||
- Qdrant: Vector search, RAG, global + user knowledge, GDPR-compliant
|
||||
- Iceberg: Cold storage, full history, analytics, time-travel queries
|
||||
|
||||
**Infrastructure:**
|
||||
- Deployed in `dexorder-system` namespace
|
||||
- RBAC: Can create but not delete user containers
|
||||
- Network policies: Access to k8s API, user containers, infrastructure
|
||||
|
||||
---
|
||||
|
||||
### 2. User Containers
|
||||
|
||||
**Location:** `client-py/`
|
||||
**Language:** Python
|
||||
**Purpose:** Per-user isolated workspace and data storage
|
||||
|
||||
**Architecture:**
|
||||
- One pod per user (auto-provisioned by gateway)
|
||||
- Persistent storage (PVC) for user data
|
||||
- Multi-container pod:
|
||||
- **Agent container:** MCP server + event publisher + user files
|
||||
- **Lifecycle sidecar:** Auto-shutdown and cleanup
|
||||
|
||||
**Components:**
|
||||
|
||||
#### MCP Server
|
||||
Exposes user-specific resources and tools via Model Context Protocol.
|
||||
|
||||
**Resources (Context for LLM):**
|
||||
Gateway fetches these before each LLM call:
|
||||
- `context://user-profile` - Trading preferences, style, risk tolerance
|
||||
- `context://conversation-summary` - Recent conversation with semantic context
|
||||
- `context://workspace-state` - Current chart, watchlist, positions, alerts
|
||||
- `context://system-prompt` - User's custom AI instructions
|
||||
|
||||
**Tools (Actions with side effects):**
|
||||
Gateway proxies these to user's MCP server:
|
||||
- `save_message(role, content)` - Save to conversation history
|
||||
- `search_conversation(query)` - Semantic search over past conversations
|
||||
- `list_strategies()`, `read_strategy(name)`, `write_strategy(name, code)`
|
||||
- `list_indicators()`, `read_indicator(name)`, `write_indicator(name, code)`
|
||||
- `run_backtest(strategy, params)` - Execute backtest
|
||||
- `get_watchlist()`, `execute_trade(params)`, `get_positions()`
|
||||
- `run_python(code)` - Execute Python with data science libraries
|
||||
|
||||
**User Files:**
|
||||
- `indicators/*.py` - Custom technical indicators
|
||||
- `strategies/*.py` - Trading strategies with entry/exit rules
|
||||
- Watchlists and preferences
|
||||
- Git-versioned in persistent volume
|
||||
|
||||
#### Event Publisher ([[user_container_events]])
|
||||
Publishes user events (order fills, alerts, workspace changes) via dual-channel ZMQ:
|
||||
- **XPUB:** Informational events (fire-and-forget to active sessions)
|
||||
- **DEALER:** Critical events (guaranteed delivery with ack)
|
||||
|
||||
#### Lifecycle Manager ([[container_lifecycle_management]])
|
||||
Tracks activity and triggers; auto-shuts down when idle:
|
||||
- Configurable idle timeouts by license tier
|
||||
- Exit code 42 signals intentional shutdown
|
||||
- Sidecar deletes deployment and optionally PVC
|
||||
|
||||
**Isolation:**
|
||||
- Network policies: Cannot access k8s API, other users, or system services
|
||||
- PodSecurity: Non-root, read-only rootfs, dropped capabilities
|
||||
- Resource limits enforced by license tier
|
||||
|
||||
---
|
||||
|
||||
### 3. Data Pipeline
|
||||
|
||||
#### Relay (ZMQ Router)
|
||||
|
||||
**Location:** `relay/`
|
||||
**Language:** Rust
|
||||
**Purpose:** Stateless message router for market data and requests
|
||||
|
||||
**Architecture:**
|
||||
- Well-known bind point (all components connect to it)
|
||||
- No request tracking or state
|
||||
- Topic-based routing
|
||||
|
||||
**Channels:**
|
||||
1. **Client Requests (ROUTER):** Port 5559 - Historical data requests
|
||||
2. **Ingestor Work Queue (PUB):** Port 5555 - Work distribution with exchange prefix
|
||||
3. **Market Data Fanout (XPUB/XSUB):** Port 5558 - Realtime data + notifications
|
||||
4. **Responses (SUB → PUB proxy):** Notifications from Flink to clients
|
||||
|
||||
See [[protocol]] for detailed ZMQ patterns and message formats.
|
||||
|
||||
---
|
||||
|
||||
#### Ingestors
|
||||
|
||||
**Location:** `ingestor/`
|
||||
**Language:** Python
|
||||
**Purpose:** Fetch market data from exchanges
|
||||
|
||||
**Features:**
|
||||
- CCXT-based exchange adapters
|
||||
- Subscribes to work queue via exchange prefix (e.g., `BINANCE:`)
|
||||
- Writes raw data to Kafka only (no direct client responses)
|
||||
- Supports realtime ticks and historical OHLC
|
||||
|
||||
**Data Flow:**
|
||||
```
|
||||
Exchange API → Ingestor → Kafka → Flink → Iceberg
|
||||
↓
|
||||
Notification → Relay → Clients
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
#### Kafka
|
||||
|
||||
**Deployment:** KRaft mode (no Zookeeper)
|
||||
**Purpose:** Durable event log and stream processing backbone
|
||||
|
||||
**Topics:**
|
||||
- Raw market data streams (per exchange/symbol)
|
||||
- Processed OHLC data
|
||||
- Notification events
|
||||
- User events (orders, alerts)
|
||||
|
||||
**Retention:**
|
||||
- Configurable per topic (default: 7 days for raw data)
|
||||
- Longer retention for aggregated data
|
||||
|
||||
---
|
||||
|
||||
#### Flink
|
||||
|
||||
**Deployment:** JobManager + TaskManager(s)
|
||||
**Purpose:** Stream processing and aggregation
|
||||
|
||||
**Jobs:**
|
||||
1. **Deduplication:** Remove duplicate ticks from multiple ingestors
|
||||
2. **OHLC Aggregation:** Build candles from tick streams
|
||||
3. **CEP (Complex Event Processing):** Pattern detection and alerts
|
||||
4. **Iceberg Writer:** Batch write to long-term storage
|
||||
5. **Notification Publisher:** ZMQ PUB for async client notifications
|
||||
|
||||
**State:**
|
||||
- Checkpointing to MinIO (S3-compatible)
|
||||
- Exactly-once processing semantics
|
||||
|
||||
**Scaling:**
|
||||
- Multiple TaskManagers for parallelism
|
||||
- Headless service for ZMQ discovery (see [[protocol#TODO: Flink-to-Relay ZMQ Discovery]])
|
||||
|
||||
---
|
||||
|
||||
#### Apache Iceberg
|
||||
|
||||
**Deployment:** REST catalog with PostgreSQL backend
|
||||
**Purpose:** Historical data lake for OHLC and analytics
|
||||
|
||||
**Features:**
|
||||
- Schema evolution
|
||||
- Time travel queries
|
||||
- Partitioning by date/symbol
|
||||
- Efficient columnar storage (Parquet)
|
||||
|
||||
**Storage:** MinIO (S3-compatible object storage)
|
||||
|
||||
---
|
||||
|
||||
### 4. Infrastructure Services
|
||||
|
||||
#### DragonflyDB
|
||||
- Redis-compatible in-memory cache
|
||||
- Session state, rate limiting, hot data
|
||||
|
||||
#### Qdrant
|
||||
- Vector database for RAG
|
||||
- **Global knowledge** (user_id="0"): Platform capabilities, trading concepts, strategy patterns
|
||||
- **User knowledge** (user_id=specific): Personal conversations, preferences, strategies
|
||||
- GDPR-compliant (indexed by user_id for fast deletion)
|
||||
|
||||
#### PostgreSQL
|
||||
- Iceberg catalog metadata
|
||||
- User accounts and license info (gateway)
|
||||
- Per-user data lives in user containers
|
||||
|
||||
#### MinIO
|
||||
- S3-compatible object storage
|
||||
- Iceberg table data
|
||||
- Flink checkpoints
|
||||
- User file uploads
|
||||
|
||||
---
|
||||
|
||||
## Data Flow Patterns
|
||||
|
||||
### Historical Data Query (Async)
|
||||
|
||||
```
|
||||
1. Client → Gateway → User Container MCP: User requests data
|
||||
2. Gateway → Relay (REQ/REP): Submit historical request
|
||||
3. Relay → Ingestors (PUB/SUB): Broadcast work with exchange prefix
|
||||
4. Ingestor → Exchange API: Fetch data
|
||||
5. Ingestor → Kafka: Write OHLC batch with metadata
|
||||
6. Flink → Kafka: Read, process, dedupe
|
||||
7. Flink → Iceberg: Write to table
|
||||
8. Flink → Relay (PUB): Publish HistoryReadyNotification
|
||||
9. Relay → Client (SUB): Notification delivered
|
||||
10. Client → Iceberg: Query data directly
|
||||
```
|
||||
|
||||
**Key Design:**
|
||||
- Client subscribes to notification topic BEFORE submitting request (prevents race)
|
||||
- Notification topics are deterministic: `RESPONSE:{client_id}` or `HISTORY_READY:{request_id}`
|
||||
- No state in Relay (fully topic-based routing)
|
||||
|
||||
See [[protocol#Historical Data Query Flow]] for details.
|
||||
|
||||
---
|
||||
|
||||
### Realtime Market Data
|
||||
|
||||
```
|
||||
1. Ingestor → Kafka: Write realtime ticks
|
||||
2. Flink → Kafka: Read and aggregate OHLC
|
||||
3. Flink → Relay (PUB): Publish market data
|
||||
4. Relay → Clients (XPUB/SUB): Fanout to subscribers
|
||||
```
|
||||
|
||||
**Topic Format:** `{ticker}|{data_type}` (e.g., `BINANCE:BTC/USDT|tick`)
|
||||
|
||||
---
|
||||
|
||||
### User Events
|
||||
|
||||
User containers emit events (order fills, alerts) that must reach users reliably.
|
||||
|
||||
**Dual-Channel Design:**
|
||||
|
||||
1. **Informational Events (XPUB/SUB):**
|
||||
- Container tracks active subscriptions via XPUB
|
||||
- Publishes only if someone is listening
|
||||
- Zero latency, fire-and-forget
|
||||
|
||||
2. **Critical Events (DEALER/ROUTER):**
|
||||
- Container sends to gateway ROUTER with event ID
|
||||
- Gateway delivers via Telegram/email/push
|
||||
- Gateway sends EventAck back to container
|
||||
- Container retries on timeout
|
||||
- Persisted to disk on shutdown
|
||||
|
||||
See [[user_container_events]] for implementation.
|
||||
|
||||
---
|
||||
|
||||
## Container Lifecycle
|
||||
|
||||
### Creation ([[gateway_container_creation]])
|
||||
|
||||
```
|
||||
User authenticates → Gateway checks if deployment exists
|
||||
→ If missing, create from template (based on license tier)
|
||||
→ Wait for ready (2min timeout)
|
||||
→ Return MCP endpoint
|
||||
```
|
||||
|
||||
**Templates by Tier:**
|
||||
| Tier | Memory | CPU | Storage | Idle Timeout |
|
||||
|------|--------|-----|---------|--------------|
|
||||
| Free | 512Mi | 500m | 1Gi | 15min |
|
||||
| Pro | 2Gi | 2000m | 10Gi | 60min |
|
||||
| Enterprise | 4Gi | 4000m | 50Gi | Never |
|
||||
|
||||
---
|
||||
|
||||
### Lifecycle Management ([[container_lifecycle_management]])
|
||||
|
||||
**Idle Detection:**
|
||||
- Container is idle when: no active triggers + no recent MCP activity
|
||||
- Lifecycle manager tracks:
|
||||
- MCP tool/resource calls (reset idle timer)
|
||||
- Active triggers (data subscriptions, CEP patterns)
|
||||
|
||||
**Shutdown:**
|
||||
- On idle timeout: exit with code 42
|
||||
- Lifecycle sidecar detects exit code 42
|
||||
- Sidecar calls k8s API to delete deployment
|
||||
- Optionally deletes PVC (anonymous users only)
|
||||
|
||||
**Security:**
|
||||
- Sidecar has RBAC to delete its own deployment only
|
||||
- Cannot delete other deployments or access other namespaces
|
||||
- Gateway cannot delete deployments (separation of concerns)
|
||||
|
||||
---
|
||||
|
||||
## Security Architecture
|
||||
|
||||
### Network Isolation
|
||||
|
||||
**NetworkPolicies:**
|
||||
- User containers:
|
||||
- ✅ Connect to gateway (MCP)
|
||||
- ✅ Connect to relay (market data)
|
||||
- ✅ Outbound HTTPS (exchanges, LLM APIs)
|
||||
- ❌ No k8s API access
|
||||
- ❌ No system namespace access
|
||||
- ❌ No inter-user communication
|
||||
|
||||
- Gateway:
|
||||
- ✅ k8s API (create containers)
|
||||
- ✅ User containers (MCP client)
|
||||
- ✅ Infrastructure (Postgres, Redis)
|
||||
- ✅ Outbound (Anthropic API)
|
||||
|
||||
---
|
||||
|
||||
### RBAC
|
||||
|
||||
**Gateway ServiceAccount:**
|
||||
- Create deployments/services/PVCs in `dexorder-agents` namespace
|
||||
- Read pod status and logs
|
||||
- Cannot delete, exec, or access secrets
|
||||
|
||||
**Lifecycle Sidecar ServiceAccount:**
|
||||
- Delete deployments in `dexorder-agents` namespace
|
||||
- Delete PVCs (conditional on user type)
|
||||
- Cannot access other resources
|
||||
|
||||
---
|
||||
|
||||
### Admission Control
|
||||
|
||||
All pods in `dexorder-agents` namespace must:
|
||||
- Use approved images only (allowlist)
|
||||
- Run as non-root
|
||||
- Drop all capabilities
|
||||
- Use read-only root filesystem
|
||||
- Have resource limits
|
||||
|
||||
See `deploy/k8s/base/admission-policy.yaml`
|
||||
|
||||
---
|
||||
|
||||
## Agent Harness Flow
|
||||
|
||||
The gateway's agent harness (LangChain/LangGraph) orchestrates LLM interactions with full context.
|
||||
|
||||
```
|
||||
1. User sends message → Gateway (WebSocket/Telegram)
|
||||
↓
|
||||
2. Authenticator validates user and gets license info
|
||||
↓
|
||||
3. Container Manager ensures user's MCP container is running
|
||||
↓
|
||||
4. Agent Harness processes message:
|
||||
│
|
||||
├─→ a. MCPClientConnector fetches context resources from user's MCP:
|
||||
│ - context://user-profile
|
||||
│ - context://conversation-summary
|
||||
│ - context://workspace-state
|
||||
│ - context://system-prompt
|
||||
│
|
||||
├─→ b. RAGRetriever searches Qdrant for relevant memories:
|
||||
│ - Embeds user query
|
||||
│ - Searches: user_id IN (current_user, "0")
|
||||
│ - Returns user-specific + global platform knowledge
|
||||
│
|
||||
├─→ c. Build system prompt:
|
||||
│ - Base platform prompt
|
||||
│ - User profile context
|
||||
│ - Workspace state
|
||||
│ - Custom user instructions
|
||||
│ - Relevant RAG memories
|
||||
│
|
||||
├─→ d. ModelRouter selects LLM:
|
||||
│ - Based on license tier
|
||||
│ - Query complexity
|
||||
│ - Routing strategy (cost/speed/quality)
|
||||
│
|
||||
├─→ e. LLM invocation with tool support:
|
||||
│ - Send messages to LLM
|
||||
│ - If tool calls requested:
|
||||
│ • Platform tools → handled by gateway
|
||||
│ • User tools → proxied to MCP container
|
||||
│ - Loop until no more tool calls
|
||||
│
|
||||
├─→ f. Save conversation to MCP:
|
||||
│ - mcp.callTool('save_message', user_message)
|
||||
│ - mcp.callTool('save_message', assistant_message)
|
||||
│
|
||||
└─→ g. Return response to user via channel
|
||||
```
|
||||
|
||||
**Key Architecture:**
|
||||
- **Gateway is stateless:** No conversation history stored in gateway
|
||||
- **User context in MCP:** All user-specific data lives in user's container
|
||||
- **Global knowledge in Qdrant:** Platform documentation loaded from `gateway/knowledge/`
|
||||
- **RAG at gateway level:** Semantic search combines global + user knowledge
|
||||
- **Skills vs Subagents:**
|
||||
- Skills: Well-defined, single-purpose tasks
|
||||
- Subagents: Complex domain expertise with multi-file context
|
||||
- **Workflows:** LangGraph state machines for multi-step processes
|
||||
|
||||
See [[agent_harness]] for detailed implementation.
|
||||
|
||||
---
|
||||
|
||||
## Configuration Management
|
||||
|
||||
All services use dual YAML files:
|
||||
- `config.yaml` - Non-sensitive configuration (mounted from ConfigMap)
|
||||
- `secrets.yaml` - Credentials and tokens (mounted from Secret)
|
||||
|
||||
**Environment Variables:**
|
||||
- K8s downward API for pod metadata
|
||||
- Service discovery via DNS (e.g., `kafka:9092`)
|
||||
|
||||
---
|
||||
|
||||
## Deployment
|
||||
|
||||
### Development
|
||||
|
||||
```bash
|
||||
# Start local k8s
|
||||
minikube start
|
||||
|
||||
# Apply infrastructure
|
||||
kubectl apply -k deploy/k8s/dev
|
||||
|
||||
# Build and load images
|
||||
docker build -t dexorder/gateway:latest gateway/
|
||||
minikube image load dexorder/gateway:latest
|
||||
|
||||
# Port-forward for access
|
||||
kubectl port-forward -n dexorder-system svc/gateway 3000:3000
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Production
|
||||
|
||||
```bash
|
||||
# Apply production configs
|
||||
kubectl apply -k deploy/k8s/prod
|
||||
|
||||
# Push images to registry
|
||||
docker push ghcr.io/dexorder/gateway:latest
|
||||
docker push ghcr.io/dexorder/agent:latest
|
||||
docker push ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||
```
|
||||
|
||||
**Namespaces:**
|
||||
- `dexorder-system` - Platform services (gateway, infrastructure)
|
||||
- `dexorder-agents` - User containers (isolated)
|
||||
|
||||
---
|
||||
|
||||
## Observability
|
||||
|
||||
### Metrics (Prometheus)
|
||||
- Container creation/deletion rates
|
||||
- Idle shutdown counts
|
||||
- MCP call latency and errors
|
||||
- Event delivery rates and retries
|
||||
- Kafka lag and throughput
|
||||
- Flink checkpoint duration
|
||||
|
||||
### Logging
|
||||
- Structured JSON logs
|
||||
- User ID in all agent logs
|
||||
- Aggregated via Loki or CloudWatch
|
||||
|
||||
### Tracing
|
||||
- OpenTelemetry spans across gateway → MCP → LLM
|
||||
- User-scoped traces for debugging
|
||||
|
||||
---
|
||||
|
||||
## Scalability
|
||||
|
||||
### Horizontal Scaling
|
||||
|
||||
**Stateless Components:**
|
||||
- Gateway: Add replicas behind load balancer
|
||||
- Relay: Single instance (stateless router)
|
||||
- Ingestors: Scale by exchange workload
|
||||
|
||||
**Stateful Components:**
|
||||
- Flink: Scale TaskManagers
|
||||
- User containers: One per user (1000s of pods)
|
||||
|
||||
**Bottlenecks:**
|
||||
- Flink → Relay ZMQ: Requires discovery protocol (see [[protocol#TODO: Flink-to-Relay ZMQ Discovery]])
|
||||
- Kafka: Partition by symbol for parallelism
|
||||
- Iceberg: Partition by date/symbol
|
||||
|
||||
---
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Tiered Resources:**
|
||||
- Free users: Aggressive idle shutdown (15min)
|
||||
- Pro users: Longer timeout (60min)
|
||||
- Enterprise: Always-on containers
|
||||
|
||||
**Storage:**
|
||||
- PVC deletion for anonymous users
|
||||
- Tiered storage classes (fast SSD → cheap HDD)
|
||||
|
||||
**LLM Costs:**
|
||||
- Rate limiting per license tier
|
||||
- Caching of MCP resources (1-5min TTL)
|
||||
- Conversation summarization to reduce context size
|
||||
|
||||
---
|
||||
|
||||
## Development Roadmap
|
||||
|
||||
See [[backend_redesign]] for detailed notes.
|
||||
|
||||
**Phase 1: Foundation (Complete)**
|
||||
- Gateway with k8s integration
|
||||
- User container provisioning
|
||||
- MCP protocol implementation
|
||||
- Basic market data pipeline
|
||||
|
||||
**Phase 2: Data Pipeline (In Progress)**
|
||||
- Kafka topic schemas
|
||||
- Flink jobs for aggregation
|
||||
- Iceberg integration
|
||||
- Historical backfill service
|
||||
|
||||
**Phase 3: Agent Features**
|
||||
- RAG integration (Qdrant)
|
||||
- Strategy backtesting
|
||||
- Risk management tools
|
||||
- Portfolio analytics
|
||||
|
||||
**Phase 4: Production Hardening**
|
||||
- Multi-region deployment
|
||||
- HA for infrastructure
|
||||
- Comprehensive monitoring
|
||||
- Performance optimization
|
||||
|
||||
---
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [[protocol]] - ZMQ message protocols and data flow
|
||||
- [[gateway_container_creation]] - Dynamic container provisioning
|
||||
- [[container_lifecycle_management]] - Idle shutdown and cleanup
|
||||
- [[user_container_events]] - Event system implementation
|
||||
- [[agent_harness]] - LLM orchestration flow
|
||||
- [[m_c_p_tools_architecture]] - User MCP tools specification
|
||||
- [[user_mcp_resources]] - Context resources and RAG
|
||||
- [[m_c_p_client_authentication_modes]] - MCP authentication patterns
|
||||
- [[backend_redesign]] - Design notes and TODO items
|
||||
468
doc/auth.md
Normal file
468
doc/auth.md
Normal file
@@ -0,0 +1,468 @@
|
||||
# Authentication System Setup
|
||||
|
||||
This document describes the multi-channel authentication system for the DexOrder AI Gateway.
|
||||
|
||||
## Overview
|
||||
|
||||
The gateway implements a comprehensive authentication system using **Better Auth** with support for:
|
||||
|
||||
- ✅ Email/Password authentication
|
||||
- ✅ Passkey/WebAuthn (passwordless biometric auth)
|
||||
- ✅ JWT token-based sessions
|
||||
- ✅ Multi-channel support (WebSocket, Telegram, REST API)
|
||||
- ✅ PostgreSQL-based user management
|
||||
- ✅ Secure password hashing with Argon2
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Client Apps │
|
||||
│ (Web, Mobile, CLI, Telegram, etc.) │
|
||||
└────────────┬────────────────────────────────┬───────────────┘
|
||||
│ │
|
||||
│ HTTP/REST │ WebSocket
|
||||
│ │
|
||||
┌────────────▼────────────────────────────────▼───────────────┐
|
||||
│ Gateway (Fastify) │
|
||||
│ │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌─────────────────┐ │
|
||||
│ │ Auth Routes │ │ WebSocket │ │ Telegram │ │
|
||||
│ │ /auth/* │ │ Handler │ │ Handler │ │
|
||||
│ └──────┬───────┘ └──────┬───────┘ └────────┬────────┘ │
|
||||
│ │ │ │ │
|
||||
│ └─────────────────┴────────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────────▼──────────┐ │
|
||||
│ │ Auth Service │ │
|
||||
│ │ (Better Auth) │ │
|
||||
│ └────────┬──────────┘ │
|
||||
│ │ │
|
||||
└───────────────────────────┼────────────────────────────────┘
|
||||
│
|
||||
┌────────▼────────┐
|
||||
│ PostgreSQL │
|
||||
│ - users │
|
||||
│ - sessions │
|
||||
│ - passkeys │
|
||||
│ - credentials │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
||||
## Database Schema
|
||||
|
||||
The authentication system uses the following PostgreSQL tables:
|
||||
|
||||
### Core Tables
|
||||
|
||||
1. **users** - Core user accounts
|
||||
- `id` (PRIMARY KEY)
|
||||
- `email` (UNIQUE)
|
||||
- `email_verified`
|
||||
- `name`
|
||||
- `created_at`, `updated_at`
|
||||
|
||||
2. **user_credentials** - Password hashes
|
||||
- `user_id` (FOREIGN KEY → users.id)
|
||||
- `password_hash` (Argon2)
|
||||
|
||||
3. **sessions** - JWT sessions
|
||||
- `id` (PRIMARY KEY)
|
||||
- `user_id` (FOREIGN KEY → users.id)
|
||||
- `expires_at`
|
||||
- `ip_address`, `user_agent`
|
||||
|
||||
4. **passkeys** - WebAuthn credentials
|
||||
- `id` (PRIMARY KEY)
|
||||
- `user_id` (FOREIGN KEY → users.id)
|
||||
- `credential_id` (UNIQUE)
|
||||
- `credential_public_key`
|
||||
- `counter`, `transports`
|
||||
|
||||
5. **verification_tokens** - Email verification, password reset
|
||||
- `identifier`, `token`, `expires_at`
|
||||
|
||||
### Integration Tables
|
||||
|
||||
6. **user_licenses** - User authorization & feature flags
|
||||
- `user_id` (FOREIGN KEY → users.id)
|
||||
- `license_type` (free, pro, enterprise)
|
||||
- `features` (JSONB)
|
||||
- `resource_limits` (JSONB)
|
||||
|
||||
7. **user_channel_links** - Multi-channel support
|
||||
- `user_id` (FOREIGN KEY → users.id)
|
||||
- `channel_type` (telegram, slack, discord, websocket)
|
||||
- `channel_user_id`
|
||||
|
||||
## Installation
|
||||
|
||||
1. **Install dependencies:**
|
||||
|
||||
```bash
|
||||
cd gateway
|
||||
npm install
|
||||
```
|
||||
|
||||
The following packages are added:
|
||||
- `better-auth` - Main authentication framework
|
||||
- `@simplewebauthn/server` - WebAuthn/passkey support
|
||||
- `@simplewebauthn/browser` - Client-side passkey helpers
|
||||
- `@fastify/jwt` - JWT utilities
|
||||
- `argon2` - Secure password hashing
|
||||
|
||||
2. **Apply database schema:**
|
||||
|
||||
```bash
|
||||
psql $DATABASE_URL -f schema.sql
|
||||
```
|
||||
|
||||
3. **Configure secrets:**
|
||||
|
||||
Copy `secrets.example.yaml` to your actual secrets file and update:
|
||||
|
||||
```yaml
|
||||
auth:
|
||||
secret: "YOUR-SUPER-SECRET-KEY-HERE" # Generate with: openssl rand -base64 32
|
||||
```
|
||||
|
||||
4. **Configure server:**
|
||||
|
||||
Update `config.yaml`:
|
||||
|
||||
```yaml
|
||||
server:
|
||||
base_url: http://localhost:3000 # Or your production URL
|
||||
trusted_origins:
|
||||
- http://localhost:3000
|
||||
- http://localhost:5173 # Your web app
|
||||
- https://yourdomain.com
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Authentication Routes
|
||||
|
||||
All of Better Auth's automatic routes are available under `/api/auth/*`:
|
||||
|
||||
- `POST /api/auth/sign-up/email` - Register with email/password
|
||||
- `POST /api/auth/sign-in/email` - Sign in with email/password
|
||||
- `POST /api/auth/sign-out` - Sign out
|
||||
- `GET /api/auth/session` - Get current session
|
||||
- `POST /api/auth/passkey/register` - Register passkey
|
||||
- `POST /api/auth/passkey/authenticate` - Authenticate with passkey
|
||||
|
||||
### Custom Routes (Simplified)
|
||||
|
||||
- `POST /auth/register` - Register and auto sign-in
|
||||
- `POST /auth/login` - Sign in
|
||||
- `POST /auth/logout` - Sign out
|
||||
- `GET /auth/session` - Get session
|
||||
- `GET /auth/health` - Auth system health check
|
||||
|
||||
### Example Usage
|
||||
|
||||
#### Register a new user
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:3000/auth/register \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"email": "user@example.com",
|
||||
"password": "SecurePassword123!",
|
||||
"name": "John Doe"
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"userId": "user_1234567890_abc123",
|
||||
"token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..."
|
||||
}
|
||||
```
|
||||
|
||||
#### Sign in
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:3000/auth/login \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"email": "user@example.com",
|
||||
"password": "SecurePassword123!"
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"userId": "user_1234567890_abc123",
|
||||
"token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..."
|
||||
}
|
||||
```
|
||||
|
||||
#### Connect to WebSocket with JWT
|
||||
|
||||
```javascript
|
||||
const ws = new WebSocket('ws://localhost:3000/ws/chat');
|
||||
ws.addEventListener('open', () => {
|
||||
// Send auth token in initial message
|
||||
ws.send(JSON.stringify({
|
||||
type: 'auth',
|
||||
token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...'
|
||||
}));
|
||||
});
|
||||
```
|
||||
|
||||
Or use Authorization header:
|
||||
```javascript
|
||||
const ws = new WebSocket('ws://localhost:3000/ws/chat', {
|
||||
headers: {
|
||||
'Authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...'
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
#### Get current session
|
||||
|
||||
```bash
|
||||
curl http://localhost:3000/auth/session \
|
||||
-H "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..."
|
||||
```
|
||||
|
||||
## Passkey (WebAuthn) Support
|
||||
|
||||
### Server Setup
|
||||
|
||||
Passkeys are automatically configured in `better-auth-config.ts`:
|
||||
|
||||
```typescript
|
||||
passkey({
|
||||
rpName: 'Dexorder AI',
|
||||
rpID: new URL(config.baseUrl).hostname,
|
||||
origin: config.baseUrl,
|
||||
})
|
||||
```
|
||||
|
||||
### Client-Side Integration
|
||||
|
||||
```typescript
|
||||
import { startRegistration, startAuthentication } from '@simplewebauthn/browser';
|
||||
|
||||
// 1. Register a passkey (user must be logged in)
|
||||
async function registerPasskey(token: string) {
|
||||
// Get options from server
|
||||
const optionsResponse = await fetch('/auth/passkey/register/options', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Authorization': `Bearer ${token}`
|
||||
}
|
||||
});
|
||||
const options = await optionsResponse.json();
|
||||
|
||||
// Start WebAuthn registration
|
||||
const credential = await startRegistration(options);
|
||||
|
||||
// Send credential to server
|
||||
const response = await fetch('/api/auth/passkey/register', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': `Bearer ${token}`
|
||||
},
|
||||
body: JSON.stringify({ credential })
|
||||
});
|
||||
|
||||
return response.json();
|
||||
}
|
||||
|
||||
// 2. Authenticate with passkey
|
||||
async function authenticateWithPasskey() {
|
||||
// Get challenge from server
|
||||
const optionsResponse = await fetch('/api/auth/passkey/authenticate/options');
|
||||
const options = await optionsResponse.json();
|
||||
|
||||
// Start WebAuthn authentication
|
||||
const credential = await startAuthentication(options);
|
||||
|
||||
// Verify with server
|
||||
const response = await fetch('/auth/passkey/authenticate', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({ credential })
|
||||
});
|
||||
|
||||
const { token, userId } = await response.json();
|
||||
return { token, userId };
|
||||
}
|
||||
```
|
||||
|
||||
## Multi-Channel Support
|
||||
|
||||
### WebSocket Authentication
|
||||
|
||||
The WebSocket handler (`websocket-handler.ts`) verifies JWT tokens before accepting a connection:
|
||||
|
||||
```typescript
|
||||
// User connects with JWT token in Authorization header
|
||||
const authContext = await authenticator.authenticateWebSocket(request);
|
||||
// Returns: { userId, sessionId, license, ... }
|
||||
```
|
||||
|
||||
### Telegram Bot Authentication
|
||||
|
||||
Users link their Telegram account via the `user_channel_links` table:
|
||||
|
||||
```sql
|
||||
INSERT INTO user_channel_links (user_id, channel_type, channel_user_id)
|
||||
VALUES ('user_1234567890_abc123', 'telegram', '987654321');
|
||||
```
|
||||
|
||||
The `authenticator.authenticateTelegram()` method resolves the user from their Telegram ID.
|
||||
|
||||
### API Authentication
|
||||
|
||||
All REST API calls use the `Authorization: Bearer <token>` header.
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### Production Checklist
|
||||
|
||||
- [ ] Generate a strong random secret: `openssl rand -base64 32`
|
||||
- [ ] Enable email verification: Set `requireEmailVerification: true`
|
||||
- [ ] Enforce HTTPS-only connections in production
|
||||
- [ ] Set proper `trusted_origins` for CORS
|
||||
- [ ] Implement rate limiting (consider adding `@fastify/rate-limit`)
|
||||
- [ ] Set up email service for password reset
|
||||
- [ ] Configure session expiry based on security requirements
|
||||
- [ ] Enable 2FA for sensitive operations
|
||||
- [ ] Implement audit logging for auth events
|
||||
- [ ] Set up monitoring for failed login attempts
|
||||
|
||||
### Password Security
|
||||
|
||||
- Uses **Argon2** (winner of Password Hashing Competition)
|
||||
- Automatically salted and hashed by Better Auth
|
||||
- Never stored or logged in plain text
|
||||
|
||||
### JWT Security
|
||||
|
||||
- Tokens expire after 7 days (configurable)
|
||||
- Sessions update every 24 hours
|
||||
- Tokens signed with HMAC-SHA256
|
||||
- Store secret in k8s secrets, never in code
|
||||
|
||||
### Passkey Security
|
||||
|
||||
- Uses FIDO2/WebAuthn standards
|
||||
- Hardware-backed authentication
|
||||
- Phishing-resistant
|
||||
- No passwords to leak or forget
|
||||
|
||||
## Migration Guide
|
||||
|
||||
If you have existing users with a different auth system:
|
||||
|
||||
1. **Create users in new schema:**
|
||||
```sql
|
||||
INSERT INTO users (id, email, email_verified, name)
|
||||
SELECT user_id, email, true, name FROM old_users_table;
|
||||
```
|
||||
|
||||
2. **Migrate licenses:**
|
||||
```sql
|
||||
-- Ensure user_licenses references users.id
|
||||
UPDATE user_licenses SET user_id = users.id WHERE ...;
|
||||
```
|
||||
|
||||
3. **Users must reset their password** or register a passkey on first login with the new system
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "Authentication failed" on WebSocket
|
||||
|
||||
- Check that the JWT token is valid and not expired
|
||||
- Verify the Authorization header format: `Bearer <token>`
|
||||
- Check server logs for detailed error messages
|
||||
|
||||
### "Invalid credentials" on login
|
||||
|
||||
- Verify the user exists in the `users` table
|
||||
- Check that `user_credentials` has a password_hash for the user
|
||||
- Passwords are case-sensitive
|
||||
|
||||
### Passkey registration fails
|
||||
|
||||
- Check browser support for WebAuthn
|
||||
- Verify HTTPS is enabled (required for WebAuthn in production)
|
||||
- Check `rpID` matches your domain
|
||||
- Ensure user is authenticated before registering passkey
|
||||
|
||||
## Development Tips
|
||||
|
||||
### Testing with dev user
|
||||
|
||||
A development user is created automatically:
|
||||
|
||||
```javascript
|
||||
// Email: dev@example.com
|
||||
// User ID: dev-user-001
|
||||
// License: pro
|
||||
```
|
||||
|
||||
Generate a token for testing:
|
||||
```bash
|
||||
curl -X POST http://localhost:3000/auth/login \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"email":"dev@example.com","password":"<set-in-db>"}'
|
||||
```
|
||||
|
||||
### Inspecting tokens
|
||||
|
||||
```bash
|
||||
# Decode JWT header/payload for inspection only — this does NOT verify the signature
|
||||
echo "eyJhbGc..." | cut -d. -f2 | tr '_-' '/+' | base64 -d 2>/dev/null | jq
|
||||
```
|
||||
|
||||
### Database queries
|
||||
|
||||
```sql
|
||||
-- List all users
|
||||
SELECT id, email, name, created_at FROM users;
|
||||
|
||||
-- List active sessions
|
||||
SELECT s.id, s.user_id, u.email, s.expires_at
|
||||
FROM sessions s
|
||||
JOIN users u ON s.user_id = u.id
|
||||
WHERE s.expires_at > NOW();
|
||||
|
||||
-- List passkeys
|
||||
SELECT p.id, p.name, u.email, p.created_at
|
||||
FROM passkeys p
|
||||
JOIN users u ON p.user_id = u.id;
|
||||
```
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Potential additions to consider:
|
||||
|
||||
- [ ] OAuth providers (Google, GitHub, etc.)
|
||||
- [ ] Magic link authentication
|
||||
- [ ] Two-factor authentication (TOTP)
|
||||
- [ ] Session management dashboard
|
||||
- [ ] Audit log for security events
|
||||
- [ ] IP-based restrictions
|
||||
- [ ] Device management (trusted devices)
|
||||
- [ ] Anonymous authentication for trials
|
||||
|
||||
## References
|
||||
|
||||
- [Better Auth Documentation](https://better-auth.com/)
|
||||
- [SimpleWebAuthn Guide](https://simplewebauthn.dev/)
|
||||
- [WebAuthn Guide](https://webauthn.guide/)
|
||||
- [FIDO Alliance](https://fidoalliance.org/)
|
||||
- [Fastify Authentication](https://fastify.dev/docs/latest/Guides/Getting-Started/#your-first-plugin)
|
||||
@@ -100,7 +100,7 @@ Ingestion API
|
||||
* RAG namespace
|
||||
* Agents
|
||||
* Top-level coordinator
|
||||
* TradingView agent
|
||||
* TradingView skill
|
||||
* Indicators, Drawings, Annotations
|
||||
* Research Agent
|
||||
* Pandas/Polars analysis
|
||||
|
||||
348
doc/protocol.md
348
doc/protocol.md
@@ -151,6 +151,310 @@ The two-frame envelope is the **logical protocol format**, but physical transmis
|
||||
| 0x11 | SubmitResponse | Immediate ack with notification topic |
|
||||
| 0x12 | HistoryReadyNotification | Notification that data is ready in Iceberg |
|
||||
|
||||
## User Container Event System
|
||||
|
||||
User containers emit events (order executions, alerts, workspace changes) that must be delivered to users via their active session or external channels (Telegram, email, push). This requires two ZMQ patterns with different delivery guarantees.
|
||||
|
||||
### Event Flow Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ User Container │
|
||||
│ │
|
||||
│ Strategy/Indicator Engine │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────┐ │
|
||||
│ │ Event Publisher │ │
|
||||
│ │ │ │
|
||||
│ │ 1. Check delivery spec │ │
|
||||
│ │ 2. If INFORMATIONAL or has_active_subscriber(): │ │
|
||||
│ │ → XPUB (fast path) │ │
|
||||
│ │ 3. Else (CRITICAL or no active session): │ │
|
||||
│ │ → DEALER (guaranteed delivery) │ │
|
||||
│ └─────────────────────────────────────────────────────┘ │
|
||||
│ │ │ │
|
||||
│ XPUB socket DEALER socket │
|
||||
│ (port 5570) (port 5571) │
|
||||
└─────────┼───────────────────────────┼───────────────────────┘
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Gateway Pool │
|
||||
│ │
|
||||
│ ┌──────────────────┐ ┌──────────────────────────┐ │
|
||||
│ │ SUB socket │ │ ROUTER socket │ │
|
||||
│ │ (per-session) │ │ (shared, any gateway) │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ Subscribe to │ │ Pull event, deliver, │ │
|
||||
│ │ USER:{user_id} │ │ send EventAck back │ │
|
||||
│ │ on connect │ │ │ │
|
||||
│ └────────┬─────────┘ └─────────────┬────────────┘ │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ ┌─────────────┐ ┌─────────────────────────┐ │
|
||||
│ │ Active WS/ │ │ Telegram API / Email / │ │
|
||||
│ │ Telegram │ │ Push Notification │ │
|
||||
│ └─────────────┘ └─────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 5. User Event Channel - Informational (Container → Gateway)
|
||||
**Pattern**: XPUB/SUB with subscription tracking
|
||||
- **Socket Type**: Container uses XPUB (bind), Gateway uses SUB (connect)
|
||||
- **Endpoint**: `tcp://*:5570` (Container binds)
|
||||
- **Message Types**: `UserEvent`
|
||||
- **Topic Format**: `USER:{user_id}` (e.g., `USER:user-abc123`)
|
||||
- **Behavior**:
|
||||
- Gateway subscribes to `USER:{user_id}` when user's WebSocket/Telegram session connects
|
||||
- Gateway unsubscribes when session disconnects
|
||||
- Container uses XPUB with `ZMQ_XPUB_VERBOSE` to track active subscriptions
|
||||
- Container checks subscription set before publishing
|
||||
- If no subscriber, message is either dropped (INFORMATIONAL) or routed to critical channel
|
||||
- Zero coordination, fire-and-forget for active sessions
|
||||
|
||||
### 6. User Event Channel - Critical (Container → Gateway)
|
||||
**Pattern**: DEALER/ROUTER with acknowledgment
|
||||
- **Socket Type**: Container uses DEALER (connect), Gateway uses ROUTER (bind)
|
||||
- **Endpoint**: `tcp://gateway:5571` (Gateway binds, containers connect)
|
||||
- **Message Types**: `UserEvent` → `EventAck`
|
||||
- **Behavior**:
|
||||
- Container sends `UserEvent` with `event_id` via DEALER
|
||||
- DEALER round-robins to available gateway ROUTER sockets
|
||||
- Gateway processes event (sends to Telegram, email, etc.)
|
||||
- Gateway sends `EventAck` back to container
|
||||
- Container tracks pending events with timeout (30s default)
|
||||
- On timeout without ack: resend (DEALER routes to next gateway)
|
||||
- On container shutdown: persist pending to disk, reload on startup
|
||||
- Provides at-least-once delivery guarantee
|
||||
|
||||
### Subscription Tracking (Container Side)
|
||||
|
||||
Container uses XPUB to detect active sessions:
|
||||
|
||||
```python
|
||||
# Container event publisher initialization
|
||||
xpub_socket = ctx.socket(zmq.XPUB)
|
||||
xpub_socket.setsockopt(zmq.XPUB_VERBOSE, 1) # Receive all sub/unsub
|
||||
xpub_socket.bind("tcp://*:5570")
|
||||
|
||||
active_subscriptions: set[str] = set()
|
||||
|
||||
# In event loop, handle subscription messages
|
||||
def process_subscriptions():
|
||||
while xpub_socket.poll(0):
|
||||
msg = xpub_socket.recv()
|
||||
topic = msg[1:].decode() # Skip first byte (sub/unsub flag)
|
||||
if msg[0] == 1: # Subscribe
|
||||
active_subscriptions.add(topic)
|
||||
elif msg[0] == 0: # Unsubscribe
|
||||
active_subscriptions.discard(topic)
|
||||
|
||||
def has_active_subscriber(user_id: str) -> bool:
|
||||
return f"USER:{user_id}" in active_subscriptions
|
||||
```
|
||||
|
||||
### Event Routing Logic (Container Side)
|
||||
|
||||
```python
|
||||
def publish_event(event: UserEvent):
|
||||
topic = f"USER:{event.user_id}"
|
||||
|
||||
if event.delivery.priority == Priority.INFORMATIONAL:
|
||||
# Fire and forget - drop if nobody's listening
|
||||
if has_active_subscriber(event.user_id):
|
||||
xpub_socket.send_multipart([topic.encode(), serialize(event)])
|
||||
# else: silently drop
|
||||
|
||||
elif has_active_subscriber(event.user_id):
|
||||
# Active session exists - use fast path
|
||||
xpub_socket.send_multipart([topic.encode(), serialize(event)])
|
||||
|
||||
else:
|
||||
# No active session - use guaranteed delivery
|
||||
send_via_dealer(event)
|
||||
|
||||
def send_via_dealer(event: UserEvent):
|
||||
pending_events[event.event_id] = PendingEvent(
|
||||
event=event,
|
||||
sent_at=time.time(),
|
||||
retries=0
|
||||
)
|
||||
dealer_socket.send(serialize(event))
|
||||
```
|
||||
|
||||
### Message Type IDs (User Events)
|
||||
|
||||
| Type ID | Message Type | Description |
|
||||
|---------|-----------------|------------------------------------------------|
|
||||
| 0x20 | UserEvent | Container → Gateway event |
|
||||
| 0x21 | EventAck | Gateway → Container acknowledgment |
|
||||
|
||||
### UserEvent Message
|
||||
|
||||
```protobuf
|
||||
message UserEvent {
|
||||
string user_id = 1;
|
||||
string event_id = 2; // UUID for dedup/ack
|
||||
int64 timestamp = 3; // Unix millis
|
||||
|
||||
EventType event_type = 4;
|
||||
bytes payload = 5; // JSON or nested protobuf
|
||||
|
||||
DeliverySpec delivery = 6;
|
||||
}
|
||||
|
||||
enum EventType {
|
||||
ORDER_PLACED = 0;
|
||||
ORDER_FILLED = 1;
|
||||
ORDER_CANCELLED = 2;
|
||||
ALERT_TRIGGERED = 3;
|
||||
POSITION_UPDATED = 4;
|
||||
WORKSPACE_CHANGED = 5;
|
||||
STRATEGY_LOG = 6;
|
||||
}
|
||||
|
||||
message DeliverySpec {
|
||||
Priority priority = 1;
|
||||
repeated ChannelPreference channels = 2; // Ordered preference list
|
||||
}
|
||||
|
||||
enum Priority {
|
||||
INFORMATIONAL = 0; // Drop if no active session
|
||||
NORMAL = 1; // Best effort, short queue
|
||||
CRITICAL = 2; // Must deliver, retry, escalate
|
||||
}
|
||||
|
||||
message ChannelPreference {
|
||||
ChannelType channel = 1;
|
||||
bool only_if_active = 2; // true = skip if not connected
|
||||
}
|
||||
|
||||
enum ChannelType {
|
||||
ACTIVE_SESSION = 0; // Whatever's currently connected
|
||||
WEB = 1;
|
||||
TELEGRAM = 2;
|
||||
EMAIL = 3;
|
||||
PUSH = 4; // Mobile push notification
|
||||
}
|
||||
```
|
||||
|
||||
### EventAck Message
|
||||
|
||||
```protobuf
|
||||
message EventAck {
|
||||
string event_id = 1;
|
||||
AckStatus status = 2;
|
||||
string error_message = 3; // If status is ERROR
|
||||
}
|
||||
|
||||
enum AckStatus {
|
||||
DELIVERED = 0; // Successfully sent to at least one channel
|
||||
QUEUED = 1; // Accepted, will retry (e.g., Telegram rate limit)
|
||||
ERROR = 2; // Permanent failure
|
||||
}
|
||||
```
|
||||
|
||||
### Delivery Examples
|
||||
|
||||
```python
|
||||
# "Show on screen if they're watching, otherwise don't bother"
|
||||
# → Uses XPUB path only, dropped if no subscriber
|
||||
UserEvent(
|
||||
delivery=DeliverySpec(
|
||||
priority=Priority.INFORMATIONAL,
|
||||
channels=[ChannelPreference(ChannelType.ACTIVE_SESSION, only_if_active=True)]
|
||||
)
|
||||
)
|
||||
|
||||
# "Active session preferred, fallback to Telegram"
|
||||
# → Tries XPUB first (if subscribed), else DEALER for Telegram delivery
|
||||
UserEvent(
|
||||
delivery=DeliverySpec(
|
||||
priority=Priority.NORMAL,
|
||||
channels=[
|
||||
ChannelPreference(ChannelType.ACTIVE_SESSION, only_if_active=True),
|
||||
ChannelPreference(ChannelType.TELEGRAM, only_if_active=False),
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
# "Order executed - MUST get through"
|
||||
# → Always uses DEALER path for guaranteed delivery
|
||||
UserEvent(
|
||||
delivery=DeliverySpec(
|
||||
priority=Priority.CRITICAL,
|
||||
channels=[
|
||||
ChannelPreference(ChannelType.ACTIVE_SESSION, only_if_active=True),
|
||||
ChannelPreference(ChannelType.TELEGRAM, only_if_active=False),
|
||||
ChannelPreference(ChannelType.PUSH, only_if_active=False),
|
||||
ChannelPreference(ChannelType.EMAIL, only_if_active=False),
|
||||
]
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Gateway Event Processing
|
||||
|
||||
Gateway maintains:
|
||||
1. **Session registry**: Maps user_id → active WebSocket/channel connections
|
||||
2. **Channel credentials**: Telegram bot token, email service keys, push certificates
|
||||
3. **SUB socket per user session**: Subscribes to `USER:{user_id}` on container's XPUB
|
||||
4. **Shared ROUTER socket**: Receives critical events from any container
|
||||
|
||||
```typescript
|
||||
// On user WebSocket connect
|
||||
async onSessionConnect(userId: string, ws: WebSocket) {
|
||||
// Subscribe to user's informational events
|
||||
subSocket.subscribe(`USER:${userId}`);
|
||||
sessions.set(userId, ws);
|
||||
}
|
||||
|
||||
// On user WebSocket disconnect
|
||||
async onSessionDisconnect(userId: string) {
|
||||
subSocket.unsubscribe(`USER:${userId}`);
|
||||
sessions.delete(userId);
|
||||
}
|
||||
|
||||
// Handle informational events (from SUB socket)
|
||||
subSocket.on('message', (topic, payload) => {
|
||||
const event = deserialize(payload);
|
||||
const ws = sessions.get(event.userId);
|
||||
if (ws) {
|
||||
ws.send(JSON.stringify({ type: 'event', ...event }));
|
||||
}
|
||||
});
|
||||
|
||||
// Handle critical events (from ROUTER socket)
|
||||
routerSocket.on('message', (identity, payload) => {
|
||||
const event = deserialize(payload);
|
||||
deliverEvent(event).then(status => {
|
||||
routerSocket.send([identity, serialize(EventAck(event.eventId, status))]);
|
||||
});
|
||||
});
|
||||
|
||||
async function deliverEvent(event: UserEvent): Promise<AckStatus> {
|
||||
for (const pref of event.delivery.channels) {
|
||||
if (pref.onlyIfActive && !sessions.has(event.userId)) continue;
|
||||
|
||||
switch (pref.channel) {
|
||||
case ChannelType.ACTIVE_SESSION:
|
||||
const ws = sessions.get(event.userId);
|
||||
if (ws) { ws.send(...); return AckStatus.DELIVERED; }
|
||||
break;
|
||||
case ChannelType.TELEGRAM:
|
||||
await telegramBot.sendMessage(event.userId, formatEvent(event));
|
||||
return AckStatus.DELIVERED;
|
||||
case ChannelType.EMAIL:
|
||||
await emailService.send(event.userId, formatEvent(event));
|
||||
return AckStatus.DELIVERED;
|
||||
// ... etc
|
||||
}
|
||||
}
|
||||
return AckStatus.ERROR;
|
||||
}
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
**Async Architecture Error Handling**:
|
||||
@@ -162,7 +466,51 @@ The two-frame envelope is the **logical protocol format**, but physical transmis
|
||||
- PUB/SUB has no delivery guarantees (Kafka provides durability)
|
||||
- No response routing needed - all notifications via topic-based pub/sub
|
||||
|
||||
**User Event Error Handling**:
|
||||
- Informational events: dropped silently if no active session (by design)
|
||||
- Critical events: container retries on ack timeout (30s default)
|
||||
- Gateway tracks event_id for deduplication (5 minute window)
|
||||
- If all channels fail: return ERROR ack, container may escalate or log
|
||||
- Container persists pending critical events to disk on shutdown
|
||||
|
||||
**Durability**:
|
||||
- All data flows through Kafka for durability
|
||||
- Flink checkpointing ensures exactly-once processing
|
||||
- Client can retry request with new request_id if notification not received
|
||||
- Critical user events use DEALER/ROUTER with ack for at-least-once delivery
|
||||
|
||||
## Scaling
|
||||
|
||||
### TODO: Flink-to-Relay ZMQ Discovery
|
||||
|
||||
Currently Relay connects to Flink via XSUB on a single endpoint. With multiple Flink instances behind a K8s service, we need many-to-many connectivity.
|
||||
|
||||
**Problem**: K8s service load balancing doesn't help ZMQ since connections are persistent. Relay needs to connect to ALL Flink instances to receive all published messages.
|
||||
|
||||
**Proposed Solution**: Use a K8s headless service for Flink workers:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: flink-workers
|
||||
spec:
|
||||
clusterIP: None
|
||||
selector:
|
||||
app: flink
|
||||
```
|
||||
|
||||
Relay implementation:
|
||||
1. On startup and periodically (every N seconds), resolve `flink-workers.namespace.svc.cluster.local`
|
||||
2. DNS returns A records for all Flink pod IPs
|
||||
3. Diff against current XSUB connections
|
||||
4. Connect to new pods, disconnect from removed pods
|
||||
|
||||
**Alternative approaches considered**:
|
||||
- XPUB/XSUB broker: Adds single point of failure and latency
|
||||
- Service discovery (etcd/Redis): More complex, requires additional infrastructure
|
||||
|
||||
**Open questions**:
|
||||
- Appropriate polling interval for DNS resolution (5-10 seconds?)
|
||||
- Handling of brief disconnection during pod replacement
|
||||
- Whether to use K8s Endpoints API watch instead of DNS polling for faster reaction
|
||||
|
||||
1304
doc/user_container_events.md
Normal file
1304
doc/user_container_events.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -35,5 +35,27 @@ AGENT_IMAGE=ghcr.io/dexorder/agent:latest
|
||||
SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||
AGENT_STORAGE_CLASS=standard
|
||||
|
||||
# Redis (for session management - future)
|
||||
# REDIS_URL=redis://localhost:6379
|
||||
# Redis (for hot storage and session management)
|
||||
REDIS_URL=redis://localhost:6379
|
||||
|
||||
# Qdrant (for RAG vector search)
|
||||
QDRANT_URL=http://localhost:6333
|
||||
QDRANT_API_KEY= # optional, leave empty for local dev
|
||||
|
||||
# Iceberg (for durable storage via REST catalog)
|
||||
ICEBERG_CATALOG_URI=http://iceberg-catalog:8181
|
||||
ICEBERG_NAMESPACE=gateway
|
||||
S3_ENDPOINT=http://minio:9000
|
||||
S3_ACCESS_KEY=minioadmin
|
||||
S3_SECRET_KEY=minioadmin
|
||||
|
||||
# Event router (ZeroMQ)
|
||||
EVENT_ROUTER_BIND=tcp://*:5571
|
||||
|
||||
# Embeddings (for RAG vector search)
|
||||
# Recommended: ollama with all-minilm (90MB model, CPU-friendly, ~100MB RAM)
|
||||
EMBEDDING_PROVIDER=ollama
|
||||
EMBEDDING_MODEL=all-minilm
|
||||
OLLAMA_URL=http://localhost:11434
|
||||
# Alternative models: nomic-embed-text (8K context), mxbai-embed-large (higher accuracy)
|
||||
# For OpenAI embeddings, set: EMBEDDING_PROVIDER=openai, EMBEDDING_MODEL=text-embedding-3-small
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM node:22-alpine AS builder
|
||||
FROM node:22-slim AS builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -7,7 +7,7 @@ COPY package*.json ./
|
||||
COPY tsconfig.json ./
|
||||
|
||||
# Install dependencies
|
||||
RUN npm ci
|
||||
RUN npm install
|
||||
|
||||
# Copy source
|
||||
COPY src ./src
|
||||
@@ -16,25 +16,52 @@ COPY src ./src
|
||||
RUN npm run build
|
||||
|
||||
# Production image
|
||||
FROM node:22-alpine
|
||||
FROM node:22-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install dependencies for Ollama (early in the build for caching)
|
||||
RUN apt-get update && apt-get install -y curl bash zstd ca-certificates && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Ollama (before npm dependencies for better caching)
|
||||
RUN curl -fsSL https://ollama.com/install.sh | sh
|
||||
|
||||
# Create non-root user early (before pulling model)
|
||||
RUN groupadd --gid 1001 nodejs && \
|
||||
useradd --uid 1001 --gid nodejs --shell /bin/bash --create-home nodejs && \
|
||||
chown -R nodejs:nodejs /app
|
||||
|
||||
# Pull embedding model (all-minilm: 90MB, CPU-friendly) as nodejs user
|
||||
# This is the most expensive operation, so do it early
|
||||
USER nodejs
|
||||
RUN ollama serve & \
|
||||
OLLAMA_PID=$! && \
|
||||
sleep 10 && \
|
||||
ollama pull all-minilm && \
|
||||
kill $OLLAMA_PID && \
|
||||
wait $OLLAMA_PID || true
|
||||
|
||||
# Switch back to root for remaining setup
|
||||
USER root
|
||||
|
||||
# Copy package files
|
||||
COPY package*.json ./
|
||||
|
||||
# Install production dependencies only
|
||||
RUN npm ci --omit=dev
|
||||
RUN npm install --omit=dev
|
||||
|
||||
# Copy built application
|
||||
COPY --from=builder /app/dist ./dist
|
||||
|
||||
# Create non-root user
|
||||
RUN addgroup -g 1001 -S nodejs && \
|
||||
adduser -S nodejs -u 1001
|
||||
# Copy entrypoint script
|
||||
COPY entrypoint.sh ./
|
||||
RUN chmod +x entrypoint.sh
|
||||
|
||||
# Ensure nodejs user owns everything
|
||||
RUN chown -R nodejs:nodejs /app
|
||||
|
||||
USER nodejs
|
||||
|
||||
EXPOSE 3000
|
||||
|
||||
CMD ["node", "dist/main.js"]
|
||||
ENTRYPOINT ["./entrypoint.sh"]
|
||||
@@ -91,6 +91,10 @@ Containers self-manage their lifecycle using the lifecycle sidecar (see `../life
|
||||
- OpenAI GPT
|
||||
- Google Gemini
|
||||
- OpenRouter (one key for 300+ models)
|
||||
- Ollama (for embeddings): https://ollama.com/download
|
||||
- Redis (for session/hot storage)
|
||||
- Qdrant (for RAG vector search)
|
||||
- Kafka + Flink + Iceberg (for durable storage)
|
||||
|
||||
### Development
|
||||
|
||||
@@ -119,7 +123,20 @@ DEFAULT_MODEL_PROVIDER=anthropic
|
||||
DEFAULT_MODEL=claude-3-5-sonnet-20241022
|
||||
```
|
||||
|
||||
4. Run development server:
|
||||
4. Start Ollama and pull embedding model:
|
||||
```bash
|
||||
# Install Ollama (one-time): https://ollama.com/download
|
||||
# Or with Docker: docker run -d -p 11434:11434 ollama/ollama
|
||||
|
||||
# Pull the all-minilm embedding model (90MB, CPU-friendly)
|
||||
ollama pull all-minilm
|
||||
|
||||
# Alternative models:
|
||||
# ollama pull nomic-embed-text # 8K context length
|
||||
# ollama pull mxbai-embed-large # Higher accuracy, slower
|
||||
```
|
||||
|
||||
5. Run development server:
|
||||
```bash
|
||||
npm run dev
|
||||
```
|
||||
@@ -200,11 +217,143 @@ ws.send(JSON.stringify({
|
||||
**`GET /health`**
|
||||
- Returns server health status
|
||||
|
||||
## Ollama Deployment Options
|
||||
|
||||
The gateway requires Ollama for embedding generation in RAG queries. You have two deployment options:
|
||||
|
||||
### Option 1: Ollama in Gateway Container (Recommended for simplicity)
|
||||
|
||||
Install Ollama directly in the gateway container. This keeps all dependencies local and simplifies networking.
|
||||
|
||||
**Dockerfile additions:**
|
||||
```dockerfile
|
||||
FROM node:22-slim
|
||||
|
||||
# Install Ollama
|
||||
RUN curl -fsSL https://ollama.com/install.sh | sh
|
||||
|
||||
# Pull embedding model at build time
|
||||
RUN ollama serve & \
|
||||
sleep 5 && \
|
||||
ollama pull all-minilm && \
|
||||
pkill ollama
|
||||
|
||||
# ... rest of your gateway Dockerfile
|
||||
```
|
||||
|
||||
**Start script (entrypoint.sh):**
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Start Ollama in background
|
||||
ollama serve &
|
||||
|
||||
# Start gateway
|
||||
node dist/main.js
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Simple networking (localhost:11434)
|
||||
- No extra K8s resources
|
||||
- Self-contained deployment
|
||||
|
||||
**Cons:**
|
||||
- Larger container image (~200MB extra)
|
||||
- CPU/memory shared with gateway process
|
||||
|
||||
**Resource requirements:**
|
||||
- Add +200MB memory
|
||||
- Add +0.2 CPU cores for embedding inference
|
||||
|
||||
### Option 2: Ollama as Separate Pod/Sidecar
|
||||
|
||||
Deploy Ollama as a separate container in the same pod (sidecar) or as its own deployment.
|
||||
|
||||
**K8s Deployment (sidecar pattern):**
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: gateway
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: gateway
|
||||
image: ghcr.io/dexorder/gateway:latest
|
||||
env:
|
||||
- name: OLLAMA_URL
|
||||
value: http://localhost:11434
|
||||
|
||||
- name: ollama
|
||||
image: ollama/ollama:latest
|
||||
command: ["/bin/sh", "-c"]
|
||||
args:
|
||||
- |
|
||||
ollama serve &
|
||||
sleep 5
|
||||
ollama pull all-minilm
|
||||
wait
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "1000m"
|
||||
```
|
||||
|
||||
**K8s Deployment (separate service):**
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ollama
|
||||
spec:
|
||||
replicas: 1
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: ollama
|
||||
image: ollama/ollama:latest
|
||||
# ... same as above
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ollama
|
||||
spec:
|
||||
selector:
|
||||
app: ollama
|
||||
ports:
|
||||
- port: 11434
|
||||
```
|
||||
|
||||
Gateway `.env`:
|
||||
```bash
|
||||
OLLAMA_URL=http://ollama:11434
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Isolated resource limits
|
||||
- Can scale separately
|
||||
- Easier to monitor/debug
|
||||
|
||||
**Cons:**
|
||||
- More K8s resources
|
||||
- Network hop (minimal latency)
|
||||
- More complex deployment
|
||||
|
||||
### Recommendation
|
||||
|
||||
For most deployments: **Use Option 1 (in-container)** for simplicity, unless you need to:
|
||||
- Share Ollama across multiple services
|
||||
- Scale embedding inference independently
|
||||
- Run Ollama on GPU nodes (gateway on CPU nodes)
|
||||
|
||||
## TODO
|
||||
|
||||
- [ ] Implement JWT verification with JWKS
|
||||
- [ ] Implement MCP HTTP/SSE transport
|
||||
- [ ] Add Redis for session persistence
|
||||
- [ ] Add rate limiting per user license
|
||||
- [ ] Add message usage tracking
|
||||
- [ ] Add streaming responses for WebSocket
|
||||
|
||||
61
gateway/config.example.yaml
Normal file
61
gateway/config.example.yaml
Normal file
@@ -0,0 +1,61 @@
|
||||
# Gateway Configuration
|
||||
|
||||
# Server configuration
|
||||
server:
|
||||
port: 3000
|
||||
host: 0.0.0.0
|
||||
log_level: info
|
||||
cors_origin: "*"
|
||||
base_url: http://localhost:3000
|
||||
trusted_origins:
|
||||
- http://localhost:3000
|
||||
- http://localhost:5173
|
||||
- http://localhost:8080
|
||||
|
||||
# Database
|
||||
database:
|
||||
url: postgresql://postgres:postgres@localhost:5432/dexorder
|
||||
|
||||
# Default model (if user has no preference)
|
||||
defaults:
|
||||
model_provider: anthropic
|
||||
model: claude-3-5-sonnet-20241022
|
||||
|
||||
# Kubernetes configuration
|
||||
kubernetes:
|
||||
namespace: dexorder-agents
|
||||
in_cluster: false
|
||||
context: minikube
|
||||
agent_image: ghcr.io/dexorder/agent:latest
|
||||
sidecar_image: ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||
storage_class: standard
|
||||
|
||||
# DragonflyDB (Redis-compatible, for hot storage and session management)
|
||||
redis:
|
||||
url: redis://localhost:6379
|
||||
|
||||
# Qdrant (for RAG vector search)
|
||||
qdrant:
|
||||
url: http://localhost:6333
|
||||
collection: gateway_memory
|
||||
|
||||
# Iceberg (for durable storage via REST catalog)
|
||||
iceberg:
|
||||
catalog_uri: http://iceberg-catalog:8181
|
||||
namespace: gateway
|
||||
s3_endpoint: http://minio:9000
|
||||
|
||||
# Event router (ZeroMQ)
|
||||
events:
|
||||
router_bind: tcp://*:5571
|
||||
|
||||
# Embeddings (for RAG vector search)
|
||||
# Recommended: ollama with all-minilm (90MB model, CPU-friendly, ~100MB RAM)
|
||||
embedding:
|
||||
provider: ollama
|
||||
model: all-minilm
|
||||
ollama_url: http://localhost:11434
|
||||
|
||||
# Email service configuration
|
||||
email:
|
||||
from_address: noreply@dexorder.com
|
||||
8
gateway/db-dev.sql
Normal file
8
gateway/db-dev.sql
Normal file
@@ -0,0 +1,8 @@
|
||||
-- Development seed data
|
||||
-- This file contains sample data for local development and testing
|
||||
--
|
||||
-- Dev user: cryptochimp@dexorder.ai / moon2the
|
||||
-- User is created via Better Auth API in bin/dev script
|
||||
-- License is also created in bin/dev script
|
||||
--
|
||||
-- This file is kept for future dev seed data that may be needed
|
||||
25
gateway/entrypoint.sh
Normal file
25
gateway/entrypoint.sh
Normal file
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Start Ollama server in background
|
||||
echo "Starting Ollama server..."
|
||||
ollama serve &
|
||||
OLLAMA_PID=$!
|
||||
|
||||
# Wait for Ollama to be ready
|
||||
echo "Waiting for Ollama to be ready..."
|
||||
for i in {1..30}; do
|
||||
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
||||
echo "Ollama is ready!"
|
||||
break
|
||||
fi
|
||||
if [ $i -eq 30 ]; then
|
||||
echo "Ollama failed to start within 30 seconds"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
# Start the Node.js gateway application
|
||||
echo "Starting gateway..."
|
||||
exec node dist/main.js
|
||||
94
gateway/knowledge/README.md
Normal file
94
gateway/knowledge/README.md
Normal file
@@ -0,0 +1,94 @@
|
||||
# Dexorder Knowledge Base
|
||||
|
||||
This directory contains global knowledge documents that are automatically loaded into the RAG system as platform-wide knowledge (user_id="0").
|
||||
|
||||
## Structure
|
||||
|
||||
- **platform/**: Platform architecture and capabilities
|
||||
- **trading/**: Trading concepts and fundamentals
|
||||
- **indicators/**: Indicator development and usage
|
||||
- **strategies/**: Strategy development and patterns
|
||||
|
||||
## Document Format
|
||||
|
||||
Documents should be in Markdown format with:
|
||||
- Clear headings for chunking
|
||||
- Optional YAML frontmatter for tags
|
||||
- Code examples where relevant
|
||||
- Cross-references to other docs
|
||||
|
||||
### Example with Frontmatter
|
||||
|
||||
```markdown
|
||||
---
|
||||
tags: [trading, risk-management, position-sizing]
|
||||
---
|
||||
|
||||
# Risk Management
|
||||
|
||||
Content here...
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. At gateway startup, the DocumentLoader scans this directory
|
||||
2. Each markdown file is chunked by headers (max ~1000 tokens per chunk)
|
||||
3. Chunks are embedded using the configured embedding service
|
||||
4. Embeddings are stored in Qdrant with user_id="0" (global namespace)
|
||||
5. Content hash tracking enables incremental updates
|
||||
|
||||
## Updating Documents
|
||||
|
||||
### During Development
|
||||
- Edit markdown files
|
||||
- Restart gateway or call reload endpoint: `POST /admin/reload-knowledge`
|
||||
|
||||
### In Production
|
||||
- Update markdown files in git
|
||||
- Deploy new version
|
||||
- Gateway will detect changes and update vectors automatically
|
||||
|
||||
## RAG Integration
|
||||
|
||||
When users query the agent:
|
||||
1. Their query is embedded
|
||||
2. Qdrant searches both global (user_id="0") and user-specific vectors
|
||||
3. Relevant chunks from these docs are included in context
|
||||
4. LLM generates response with platform knowledge
|
||||
|
||||
## Adding New Documents
|
||||
|
||||
1. Create markdown file in appropriate subdirectory
|
||||
2. Use clear section headers (##, ###) for automatic chunking
|
||||
3. Include practical examples and code samples
|
||||
4. Add tags in frontmatter if using complex categorization
|
||||
5. Restart gateway or reload knowledge
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Keep chunks focused**: Each section should cover one topic
|
||||
- **Use examples**: Code samples and practical examples help
|
||||
- **Link concepts**: Reference other docs for deeper dives
|
||||
- **Update regularly**: Keep knowledge current with platform changes
|
||||
- **Test queries**: Verify RAG retrieves relevant chunks
|
||||
|
||||
## Maintenance
|
||||
|
||||
The DocumentLoader tracks:
|
||||
- Content hashes for change detection
|
||||
- Number of chunks per document
|
||||
- Last update timestamps
|
||||
|
||||
Check logs for load statistics:
|
||||
```
|
||||
Knowledge documents loaded: { loaded: 5, updated: 2, skipped: 3 }
|
||||
```
|
||||
|
||||
Monitor Qdrant collection stats:
|
||||
```
|
||||
GET /health
|
||||
{
|
||||
"qdrantVectors": 1234,
|
||||
"qdrantIndexed": 1234
|
||||
}
|
||||
```
|
||||
142
gateway/knowledge/indicators/indicator-development.md
Normal file
142
gateway/knowledge/indicators/indicator-development.md
Normal file
@@ -0,0 +1,142 @@
|
||||
# Indicator Development Guide
|
||||
|
||||
Custom indicators in Dexorder are Python functions that process OHLCV data and return signals or values.
|
||||
|
||||
## Indicator Structure
|
||||
|
||||
```python
|
||||
def my_indicator(df, **params):
|
||||
"""
|
||||
Calculate custom indicator
|
||||
|
||||
Args:
|
||||
df: DataFrame with columns [open, high, low, close, volume]
|
||||
**params: Indicator parameters
|
||||
|
||||
Returns:
|
||||
Series or DataFrame with indicator values
|
||||
"""
|
||||
# Implementation
|
||||
return result
|
||||
```
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Simple Moving Average
|
||||
```python
|
||||
def sma(df, period=20):
|
||||
return df['close'].rolling(window=period).mean()
|
||||
```
|
||||
|
||||
### Exponential Moving Average
|
||||
```python
|
||||
def ema(df, period=20):
|
||||
return df['close'].ewm(span=period, adjust=False).mean()
|
||||
```
|
||||
|
||||
### RSI (Relative Strength Index)
|
||||
```python
|
||||
def rsi(df, period=14):
|
||||
delta = df['close'].diff()
|
||||
gain = delta.where(delta > 0, 0).rolling(window=period).mean()
|
||||
loss = -delta.where(delta < 0, 0).rolling(window=period).mean()
|
||||
rs = gain / loss
|
||||
return 100 - (100 / (1 + rs))
|
||||
```
|
||||
|
||||
### MACD
|
||||
```python
|
||||
def macd(df, fast=12, slow=26, signal=9):
|
||||
ema_fast = df['close'].ewm(span=fast).mean()
|
||||
ema_slow = df['close'].ewm(span=slow).mean()
|
||||
macd_line = ema_fast - ema_slow
|
||||
signal_line = macd_line.ewm(span=signal).mean()
|
||||
histogram = macd_line - signal_line
|
||||
|
||||
return pd.DataFrame({
|
||||
'macd': macd_line,
|
||||
'signal': signal_line,
|
||||
'histogram': histogram
|
||||
})
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Data Handling
|
||||
- Always validate input DataFrame has required columns
|
||||
- Handle NaN values appropriately
|
||||
- Use `.copy()` to avoid modifying original data
|
||||
- Consider edge cases (not enough data, etc.)
|
||||
|
||||
### Performance
|
||||
- Vectorize operations when possible (avoid loops)
|
||||
- Use pandas/numpy built-in functions
|
||||
- Cache expensive calculations
|
||||
- Test on large datasets
|
||||
|
||||
### Parameters
|
||||
- Provide sensible defaults
|
||||
- Document parameter ranges
|
||||
- Validate parameter values
|
||||
- Consider optimization bounds
|
||||
|
||||
### Testing
|
||||
```python
|
||||
def test_indicator():
|
||||
# Create sample data
|
||||
df = pd.DataFrame({
|
||||
'close': [100, 102, 101, 103, 105]
|
||||
})
|
||||
|
||||
# Test calculation
|
||||
result = my_indicator(df, param=10)
|
||||
|
||||
# Validate output
|
||||
assert not result.isna().all()
|
||||
assert len(result) == len(df)
|
||||
```
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
### Look-Ahead Bias
|
||||
Never use future data:
|
||||
```python
|
||||
# WRONG - uses future data
|
||||
df['signal'] = df['close'].shift(-1) > df['close']
|
||||
|
||||
# CORRECT - only past data
|
||||
df['signal'] = df['close'] > df['close'].shift(1)
|
||||
```
|
||||
|
||||
### Repainting
|
||||
Indicator values should not change for closed bars:
|
||||
```python
|
||||
# Ensure calculations are based on closed candles
|
||||
# Avoid using unstable data sources
|
||||
```
|
||||
|
||||
### Overfitting
|
||||
- Don't optimize on same data you test on
|
||||
- Use separate train/validation/test sets
|
||||
- Walk-forward analysis for robustness
|
||||
- Simple is often better than complex
|
||||
|
||||
## Integration with Strategies
|
||||
|
||||
Indicators are used in strategy signals:
|
||||
```python
|
||||
def my_strategy(df):
|
||||
# Calculate indicators
|
||||
df['rsi'] = rsi(df, period=14)
|
||||
df['sma_fast'] = sma(df, period=20)
|
||||
df['sma_slow'] = sma(df, period=50)
|
||||
|
||||
# Generate signals
|
||||
df['signal'] = 0
|
||||
df.loc[(df['rsi'] < 30) & (df['sma_fast'] > df['sma_slow']), 'signal'] = 1
|
||||
df.loc[(df['rsi'] > 70) & (df['sma_fast'] < df['sma_slow']), 'signal'] = -1
|
||||
|
||||
return df
|
||||
```
|
||||
|
||||
Store indicators in your git repository under `indicators/` directory.
|
||||
71
gateway/knowledge/platform/agent-system.md
Normal file
71
gateway/knowledge/platform/agent-system.md
Normal file
@@ -0,0 +1,71 @@
|
||||
# Agent System Architecture
|
||||
|
||||
The Dexorder AI platform uses a sophisticated agent harness that orchestrates between user interactions, LLM models, and user-specific tools.
|
||||
|
||||
## Core Components
|
||||
|
||||
### Gateway
|
||||
Multi-channel gateway supporting:
|
||||
- WebSocket connections for web/mobile
|
||||
- Telegram integration
|
||||
- Real-time event streaming
|
||||
|
||||
### Agent Harness
|
||||
Stateless orchestrator that:
|
||||
1. Fetches context from user's MCP server
|
||||
2. Routes to appropriate LLM model based on license
|
||||
3. Calls LLM with embedded context
|
||||
4. Routes tool calls to user's MCP or platform tools
|
||||
5. Saves conversation history back to MCP
|
||||
|
||||
### Memory Architecture
|
||||
Three-tier storage system:
|
||||
- **Redis**: Hot state for active sessions and checkpoints
|
||||
- **Qdrant**: Vector search for RAG and semantic memory
|
||||
- **Iceberg**: Cold storage for durable conversations and analytics
|
||||
|
||||
### User Context
|
||||
Every interaction includes:
|
||||
- User ID and license information
|
||||
- Active channel (websocket, telegram, etc.)
|
||||
- Channel capabilities (markdown, images, buttons)
|
||||
- Conversation history
|
||||
- Relevant memories from RAG
|
||||
- Workspace state
|
||||
|
||||
## Skills vs Subagents
|
||||
|
||||
### Skills
|
||||
Self-contained capabilities for specific tasks:
|
||||
- Market analysis
|
||||
- Strategy validation
|
||||
- Indicator development
|
||||
- Defined in markdown + TypeScript
|
||||
- Use when task is well-defined and scoped
|
||||
|
||||
### Subagents
|
||||
Specialized agents with dedicated memory:
|
||||
- Code reviewer with review guidelines
|
||||
- Risk analyzer with risk models
|
||||
- Multi-file knowledge base
|
||||
- Custom system prompts
|
||||
- Use when domain expertise is needed
|
||||
|
||||
## Global vs User Memory
|
||||
|
||||
### Global Memory (user_id="0")
|
||||
Platform-wide knowledge available to all users:
|
||||
- Trading concepts and terminology
|
||||
- Platform capabilities
|
||||
- Indicator documentation
|
||||
- Strategy patterns
|
||||
- Best practices
|
||||
|
||||
### User Memory
|
||||
Personal context specific to each user:
|
||||
- Conversation history
|
||||
- Preferences and trading style
|
||||
- Custom indicators and strategies
|
||||
- Workspace state
|
||||
|
||||
All RAG queries automatically search both global and user-specific memories.
|
||||
88
gateway/knowledge/platform/mcp-integration.md
Normal file
88
gateway/knowledge/platform/mcp-integration.md
Normal file
@@ -0,0 +1,88 @@
|
||||
# Model Context Protocol (MCP) Integration
|
||||
|
||||
Dexorder uses the Model Context Protocol for user-specific tool execution and state management.
|
||||
|
||||
## Container Architecture
|
||||
|
||||
Each user has a dedicated Kubernetes pod running:
|
||||
- **Agent Container**: Python environment with conda packages
|
||||
- **Lifecycle Sidecar**: Manages container lifecycle and communication
|
||||
- **Persistent Storage**: User's git repository with indicators/strategies
|
||||
|
||||
## Authentication Modes
|
||||
|
||||
Three MCP authentication modes:
|
||||
|
||||
### 1. Public Mode (Free Tier)
|
||||
- No authentication required
|
||||
- Container creates anonymous session
|
||||
- Limited to read-only resources
|
||||
- Session expires after timeout
|
||||
|
||||
### 2. Gateway Auth Mode (Standard)
|
||||
- Gateway authenticates user
|
||||
- Passes verified user ID to container
|
||||
- Container trusts gateway's authentication
|
||||
- Full access to user's tools and data
|
||||
|
||||
### 3. Direct Auth Mode (Enterprise)
|
||||
- User authenticates directly with container
|
||||
- Gateway forwards encrypted credentials
|
||||
- Container validates credentials independently
|
||||
- Highest security for sensitive operations
|
||||
|
||||
## MCP Resources
|
||||
|
||||
The container exposes standard resources:
|
||||
|
||||
### context://user-profile
|
||||
User preferences and trading style
|
||||
|
||||
### context://conversation-summary
|
||||
Recent conversation context and history
|
||||
|
||||
### context://workspace-state
|
||||
Current chart, indicators, and analysis state
|
||||
|
||||
### context://system-prompt
|
||||
User's custom agent instructions
|
||||
|
||||
### indicators://list
|
||||
Available indicators with signatures
|
||||
|
||||
### strategies://list
|
||||
User's trading strategies
|
||||
|
||||
## Tool Execution Flow
|
||||
|
||||
1. User sends message to gateway
|
||||
2. Gateway queries user's MCP resources for context
|
||||
3. LLM generates response with tool calls
|
||||
4. Gateway routes tool calls:
|
||||
- Platform tools → handled by gateway
|
||||
- User tools → proxied to MCP container
|
||||
5. Tool results returned to LLM
|
||||
6. Final response sent to user
|
||||
7. Conversation saved to MCP container
|
||||
|
||||
## Container Lifecycle
|
||||
|
||||
### Startup
|
||||
1. Gateway receives user connection
|
||||
2. Checks if container exists
|
||||
3. Creates pod if needed (cold start ~5-10s)
|
||||
4. Waits for container ready
|
||||
5. Establishes MCP connection
|
||||
|
||||
### Active
|
||||
- Container stays alive during active session
|
||||
- Receives tool calls via MCP
|
||||
- Maintains workspace state
|
||||
- Saves files to persistent storage
|
||||
|
||||
### Shutdown
|
||||
- Free users: timeout after 15 minutes idle
|
||||
- Paid users: longer timeout based on license
|
||||
- Graceful shutdown saves state
|
||||
- Persistent storage retained
|
||||
- Fast restart on next connection
|
||||
188
gateway/knowledge/strategies/strategy-development.md
Normal file
188
gateway/knowledge/strategies/strategy-development.md
Normal file
@@ -0,0 +1,188 @@
|
||||
# Strategy Development Guide
|
||||
|
||||
Trading strategies in Dexorder define entry/exit rules and position management logic.
|
||||
|
||||
## Strategy Structure
|
||||
|
||||
```python
|
||||
class Strategy:
|
||||
def __init__(self, **params):
|
||||
"""Initialize strategy with parameters"""
|
||||
self.params = params
|
||||
|
||||
def generate_signals(self, df):
|
||||
"""
|
||||
Generate trading signals
|
||||
|
||||
Args:
|
||||
df: DataFrame with OHLCV + indicator columns
|
||||
|
||||
Returns:
|
||||
DataFrame with 'signal' column:
|
||||
1 = long entry
|
||||
-1 = short entry
|
||||
0 = no action
|
||||
"""
|
||||
pass
|
||||
|
||||
def calculate_position_size(self, capital, price, risk_pct):
|
||||
"""Calculate position size based on risk"""
|
||||
pass
|
||||
|
||||
def get_stop_loss(self, entry_price, direction):
|
||||
"""Calculate stop loss level"""
|
||||
pass
|
||||
|
||||
def get_take_profit(self, entry_price, direction):
|
||||
"""Calculate take profit level"""
|
||||
pass
|
||||
```
|
||||
|
||||
## Example: Simple Moving Average Crossover
|
||||
|
||||
```python
|
||||
class SMACrossoverStrategy:
|
||||
def __init__(self, fast_period=20, slow_period=50, risk_pct=0.02):
|
||||
self.fast_period = fast_period
|
||||
self.slow_period = slow_period
|
||||
self.risk_pct = risk_pct
|
||||
|
||||
def generate_signals(self, df):
|
||||
# Calculate moving averages
|
||||
df['sma_fast'] = df['close'].rolling(self.fast_period).mean()
|
||||
df['sma_slow'] = df['close'].rolling(self.slow_period).mean()
|
||||
|
||||
# Generate signals
|
||||
df['signal'] = 0
|
||||
|
||||
# Long when fast crosses above slow
|
||||
df.loc[
|
||||
(df['sma_fast'] > df['sma_slow']) &
|
||||
(df['sma_fast'].shift(1) <= df['sma_slow'].shift(1)),
|
||||
'signal'
|
||||
] = 1
|
||||
|
||||
# Short when fast crosses below slow
|
||||
df.loc[
|
||||
(df['sma_fast'] < df['sma_slow']) &
|
||||
(df['sma_fast'].shift(1) >= df['sma_slow'].shift(1)),
|
||||
'signal'
|
||||
] = -1
|
||||
|
||||
return df
|
||||
|
||||
def calculate_position_size(self, capital, price, atr):
|
||||
# Risk-based position sizing
|
||||
risk_amount = capital * self.risk_pct
|
||||
stop_distance = 2 * atr
|
||||
position_size = risk_amount / stop_distance
|
||||
return position_size
|
||||
|
||||
def get_stop_loss(self, entry_price, direction, atr):
|
||||
if direction == 1: # Long
|
||||
return entry_price - (2 * atr)
|
||||
else: # Short
|
||||
return entry_price + (2 * atr)
|
||||
|
||||
def get_take_profit(self, entry_price, direction, atr):
|
||||
if direction == 1: # Long
|
||||
return entry_price + (4 * atr) # 2:1 risk/reward
|
||||
else: # Short
|
||||
return entry_price - (4 * atr)
|
||||
```
|
||||
|
||||
## Strategy Components
|
||||
|
||||
### Signal Generation
|
||||
Entry conditions based on:
|
||||
- Indicator crossovers
|
||||
- Price patterns
|
||||
- Volume confirmation
|
||||
- Multiple timeframe confluence
|
||||
|
||||
### Risk Management
|
||||
Essential elements:
|
||||
- **Position Sizing**: Based on account risk percentage
|
||||
- **Stop Losses**: ATR-based or support/resistance
|
||||
- **Take Profits**: Multiple targets or trailing stops
|
||||
- **Max Positions**: Limit concurrent trades
|
||||
|
||||
### Filters
|
||||
Reduce false signals:
|
||||
- **Trend Filter**: Only trade with the trend
|
||||
- **Volatility Filter**: Avoid low volatility periods
|
||||
- **Time Filter**: Specific trading hours
|
||||
- **Volume Filter**: Minimum volume requirements
|
||||
|
||||
### Exit Rules
|
||||
Multiple exit types:
|
||||
- **Stop Loss**: Protect capital
|
||||
- **Take Profit**: Lock in gains
|
||||
- **Trailing Stop**: Follow profitable moves
|
||||
- **Time Exit**: Close at end of period
|
||||
- **Signal Exit**: Opposite signal
|
||||
|
||||
## Backtesting Considerations
|
||||
|
||||
### Data Quality
|
||||
- Use clean, validated data
|
||||
- Handle missing data appropriately
|
||||
- Account for survivorship bias
|
||||
- Include realistic spreads and slippage
|
||||
|
||||
### Performance Metrics
|
||||
Track key metrics:
|
||||
- **Total Return**: Cumulative profit/loss
|
||||
- **Sharpe Ratio**: Risk-adjusted returns
|
||||
- **Max Drawdown**: Largest peak-to-trough decline
|
||||
- **Win Rate**: Percentage of profitable trades
|
||||
- **Profit Factor**: Gross profit / gross loss
|
||||
- **Expectancy**: Average $ per trade
|
||||
|
||||
### Validation
|
||||
Prevent overfitting:
|
||||
- **Train/Test Split**: 70/30 or 60/40
|
||||
- **Walk-Forward**: Rolling windows
|
||||
- **Out-of-Sample**: Test on recent unseen data
|
||||
- **Monte Carlo**: Randomize trade order
|
||||
- **Paper Trading**: Live validation
|
||||
|
||||
## Common Strategy Types
|
||||
|
||||
### Trend Following
|
||||
Follow sustained price movements:
|
||||
- Moving average crossovers
|
||||
- Breakout strategies
|
||||
- Trend channels
|
||||
- Works best in trending markets
|
||||
|
||||
### Mean Reversion
|
||||
Profit from price returning to average:
|
||||
- Bollinger Band reversals
|
||||
- RSI extremes
|
||||
- Statistical arbitrage
|
||||
- Works best in ranging markets
|
||||
|
||||
### Momentum
|
||||
Trade in direction of strong moves:
|
||||
- Relative strength
|
||||
- Price acceleration
|
||||
- Volume surges
|
||||
- Breakout confirmation
|
||||
|
||||
### Arbitrage
|
||||
Exploit price discrepancies:
|
||||
- Cross-exchange spreads
|
||||
- Funding rate arbitrage
|
||||
- Statistical pairs trading
|
||||
- Requires low latency
|
||||
|
||||
## Integration with Platform
|
||||
|
||||
Store strategies in your git repository under `strategies/` directory.
|
||||
|
||||
Test using the backtesting tools provided by the platform.
|
||||
|
||||
Deploy live strategies through the execution engine with proper risk controls.
|
||||
|
||||
Monitor performance and adjust parameters as market conditions change.
|
||||
72
gateway/knowledge/trading/technical-analysis.md
Normal file
72
gateway/knowledge/trading/technical-analysis.md
Normal file
@@ -0,0 +1,72 @@
|
||||
# Technical Analysis Fundamentals
|
||||
|
||||
Technical analysis is the study of historical price and volume data to identify patterns and predict future market movements.
|
||||
|
||||
## Key Concepts
|
||||
|
||||
### Price Action
|
||||
Raw price movement without indicators:
|
||||
- Support and resistance levels
|
||||
- Trend lines and channels
|
||||
- Chart patterns (head and shoulders, double tops, etc.)
|
||||
- Candlestick patterns
|
||||
|
||||
### Trends
|
||||
Markets move in trends:
|
||||
- **Uptrend**: Higher highs and higher lows
|
||||
- **Downtrend**: Lower highs and lower lows
|
||||
- **Sideways**: Range-bound consolidation
|
||||
- Trend strength measured by consistency
|
||||
|
||||
### Volume
|
||||
Trading volume confirms price movements:
|
||||
- High volume confirms trends
|
||||
- Low volume suggests weak moves
|
||||
- Volume precedes price
|
||||
- Divergences signal reversals
|
||||
|
||||
## Common Indicators
|
||||
|
||||
### Trend Indicators
|
||||
- **Moving Averages**: SMA, EMA, WMA
|
||||
- **MACD**: Trend and momentum
|
||||
- **ADX**: Trend strength
|
||||
- **Parabolic SAR**: Trend direction
|
||||
|
||||
### Momentum Indicators
|
||||
- **RSI**: Overbought/oversold conditions (0-100)
|
||||
- **Stochastic**: Fast vs slow momentum
|
||||
- **CCI**: Cyclical trends
|
||||
- **Williams %R**: Momentum oscillator
|
||||
|
||||
### Volatility Indicators
|
||||
- **Bollinger Bands**: Price envelope around MA
|
||||
- **ATR**: Average True Range for volatility
|
||||
- **Keltner Channels**: ATR-based bands
|
||||
- **Donchian Channels**: High/low breakouts
|
||||
|
||||
### Volume Indicators
|
||||
- **OBV**: On Balance Volume
|
||||
- **VWAP**: Volume Weighted Average Price
|
||||
- **Volume Profile**: Price levels by volume
|
||||
- **Chaikin Money Flow**: Volume pressure
|
||||
|
||||
## Timeframes
|
||||
|
||||
Different timeframes for different strategies:
|
||||
- **Scalping**: 1m, 5m charts
|
||||
- **Day Trading**: 5m, 15m, 1h charts
|
||||
- **Swing Trading**: 4h, 1D charts
|
||||
- **Position Trading**: 1D, 1W charts
|
||||
|
||||
Always analyze multiple timeframes for context.
|
||||
|
||||
## Risk Management
|
||||
|
||||
Essential principles:
|
||||
- **Position Sizing**: Risk 1-2% per trade
|
||||
- **Stop Losses**: Define exit before entry
|
||||
- **Risk/Reward**: Minimum 1:2 ratio
|
||||
- **Diversification**: Multiple uncorrelated positions
|
||||
|
||||
Never trade without a plan and defined risk parameters.
|
||||
@@ -12,28 +12,33 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@fastify/cors": "^10.0.1",
|
||||
"@fastify/jwt": "^9.0.1",
|
||||
"@fastify/websocket": "^11.0.1",
|
||||
"@kubernetes/client-node": "^0.21.0",
|
||||
"@langchain/anthropic": "^0.3.8",
|
||||
"@langchain/core": "^0.3.24",
|
||||
"@langchain/google-genai": "^0.1.6",
|
||||
"@langchain/langgraph": "^0.2.26",
|
||||
"@langchain/openai": "^0.3.21",
|
||||
"@langchain/openrouter": "^0.1.2",
|
||||
"@kubernetes/client-node": "^1.0.0",
|
||||
"@langchain/anthropic": "latest",
|
||||
"@langchain/core": "latest",
|
||||
"@langchain/langgraph": "latest",
|
||||
"@modelcontextprotocol/sdk": "^1.0.4",
|
||||
"@qdrant/js-client-rest": "^1.17.0",
|
||||
"argon2": "^0.41.1",
|
||||
"better-auth": "^1.5.3",
|
||||
"fastify": "^5.2.0",
|
||||
"iceberg-js": "latest",
|
||||
"ioredis": "^5.4.2",
|
||||
"js-yaml": "^4.1.0",
|
||||
"kysely": "^0.27.3",
|
||||
"ollama": "^0.5.10",
|
||||
"pg": "^8.13.1",
|
||||
"pino": "^9.6.0",
|
||||
"pino-pretty": "^13.0.0",
|
||||
"zeromq": "^6.0.0-beta.20",
|
||||
"zod": "^3.24.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
"@types/node": "^22.10.2",
|
||||
"@types/pg": "^8.11.10",
|
||||
"tsx": "^4.19.2",
|
||||
"tsx": "^4.21.0",
|
||||
"typescript": "^5.7.2"
|
||||
},
|
||||
"engines": {
|
||||
|
||||
258
gateway/protobuf/user_events.proto
Normal file
258
gateway/protobuf/user_events.proto
Normal file
@@ -0,0 +1,258 @@
|
||||
syntax = "proto3";
|
||||
|
||||
option java_multiple_files = true;
|
||||
option java_package = "com.dexorder.proto";
|
||||
|
||||
// User container event system for delivering notifications to users
|
||||
// via active sessions or external channels (Telegram, email, push).
|
||||
//
|
||||
// Two ZMQ patterns:
|
||||
// - XPUB/SUB (port 5570): Fast path for informational events to active sessions
|
||||
// - DEALER/ROUTER (port 5571): Guaranteed delivery for critical events with ack
|
||||
//
|
||||
// See doc/protocol.md and doc/user_container_events.md for details.
|
||||
|
||||
// =============================================================================
|
||||
// User Event (Container → Gateway)
|
||||
// Message Type ID: 0x20
|
||||
// =============================================================================
|
||||
|
||||
message UserEvent {
|
||||
// User ID this event belongs to
|
||||
string user_id = 1;
|
||||
|
||||
// Unique event ID for deduplication and ack tracking (UUID)
|
||||
string event_id = 2;
|
||||
|
||||
// Timestamp when event was generated (Unix milliseconds)
|
||||
int64 timestamp = 3;
|
||||
|
||||
// Type of event
|
||||
EventType event_type = 4;
|
||||
|
||||
// Event payload (JSON or nested protobuf, depending on event_type)
|
||||
bytes payload = 5;
|
||||
|
||||
// Delivery specification (priority and channel preferences)
|
||||
DeliverySpec delivery = 6;
|
||||
}
|
||||
|
||||
enum EventType {
|
||||
// Trading events
|
||||
ORDER_PLACED = 0;
|
||||
ORDER_FILLED = 1;
|
||||
ORDER_CANCELLED = 2;
|
||||
ORDER_REJECTED = 3;
|
||||
ORDER_EXPIRED = 4;
|
||||
|
||||
// Alert events
|
||||
ALERT_TRIGGERED = 10;
|
||||
ALERT_CREATED = 11;
|
||||
ALERT_DELETED = 12;
|
||||
|
||||
// Position events
|
||||
POSITION_OPENED = 20;
|
||||
POSITION_CLOSED = 21;
|
||||
POSITION_UPDATED = 22;
|
||||
POSITION_LIQUIDATED = 23;
|
||||
|
||||
// Workspace/chart events
|
||||
WORKSPACE_CHANGED = 30;
|
||||
CHART_ANNOTATION_ADDED = 31;
|
||||
CHART_ANNOTATION_REMOVED = 32;
|
||||
INDICATOR_UPDATED = 33;
|
||||
|
||||
// Strategy events
|
||||
STRATEGY_STARTED = 40;
|
||||
STRATEGY_STOPPED = 41;
|
||||
STRATEGY_LOG = 42;
|
||||
STRATEGY_ERROR = 43;
|
||||
BACKTEST_COMPLETED = 44;
|
||||
|
||||
// System events
|
||||
CONTAINER_STARTING = 50;
|
||||
CONTAINER_READY = 51;
|
||||
CONTAINER_SHUTTING_DOWN = 52;
|
||||
ERROR = 53;
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Delivery Specification
|
||||
// =============================================================================
|
||||
|
||||
message DeliverySpec {
|
||||
// Priority determines routing behavior
|
||||
Priority priority = 1;
|
||||
|
||||
// Ordered list of channel preferences (try first, then second, etc.)
|
||||
repeated ChannelPreference channels = 2;
|
||||
}
|
||||
|
||||
enum Priority {
|
||||
// Drop if no active session (fire-and-forget via XPUB)
|
||||
// Use for: indicator updates, chart syncs, strategy logs when watching
|
||||
INFORMATIONAL = 0;
|
||||
|
||||
// Best effort delivery - queue briefly, deliver when possible
|
||||
// Uses XPUB if subscribed, otherwise DEALER
|
||||
// Use for: alerts, position updates
|
||||
NORMAL = 1;
|
||||
|
||||
// Must deliver - retry until acked, escalate channels
|
||||
// Always uses DEALER for guaranteed delivery
|
||||
// Use for: order fills, liquidations, critical errors
|
||||
CRITICAL = 2;
|
||||
}
|
||||
|
||||
message ChannelPreference {
|
||||
// Channel to deliver to
|
||||
ChannelType channel = 1;
|
||||
|
||||
// If true, skip this channel if user is not connected to it
|
||||
// If false, deliver even if user is not actively connected
|
||||
// (e.g., send Telegram message even if user isn't in Telegram chat)
|
||||
bool only_if_active = 2;
|
||||
}
|
||||
|
||||
enum ChannelType {
|
||||
// Whatever channel the user currently has open (WebSocket, Telegram session)
|
||||
ACTIVE_SESSION = 0;
|
||||
|
||||
// Specific channels
|
||||
WEB = 1; // WebSocket to web UI
|
||||
TELEGRAM = 2; // Telegram bot message
|
||||
EMAIL = 3; // Email notification
|
||||
PUSH = 4; // Mobile push notification (iOS/Android)
|
||||
DISCORD = 5; // Discord webhook (future)
|
||||
SLACK = 6; // Slack webhook (future)
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Event Acknowledgment (Gateway → Container)
|
||||
// Message Type ID: 0x21
|
||||
// =============================================================================
|
||||
|
||||
message EventAck {
|
||||
// Event ID being acknowledged
|
||||
string event_id = 1;
|
||||
|
||||
// Delivery status
|
||||
AckStatus status = 2;
|
||||
|
||||
// Error message if status is ERROR
|
||||
string error_message = 3;
|
||||
|
||||
// Channel that successfully delivered (for logging/debugging)
|
||||
ChannelType delivered_via = 4;
|
||||
}
|
||||
|
||||
enum AckStatus {
|
||||
// Successfully delivered to at least one channel
|
||||
DELIVERED = 0;
|
||||
|
||||
// Accepted and queued for delivery (e.g., rate limited, will retry)
|
||||
QUEUED = 1;
|
||||
|
||||
// Permanent failure - all channels failed
|
||||
ERROR = 2;
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Event Payloads
|
||||
// These are JSON-encoded in the UserEvent.payload field.
|
||||
// Defined here for documentation; actual encoding is JSON for flexibility.
|
||||
// =============================================================================
|
||||
|
||||
// Payload for ORDER_PLACED, ORDER_FILLED, ORDER_CANCELLED, etc.
|
||||
message OrderEventPayload {
|
||||
string order_id = 1;
|
||||
string symbol = 2;
|
||||
string side = 3; // "buy" or "sell"
|
||||
string order_type = 4; // "market", "limit", "stop_limit", etc.
|
||||
string quantity = 5; // Decimal string
|
||||
string price = 6; // Decimal string (for limit orders)
|
||||
string fill_price = 7; // Decimal string (for fills)
|
||||
string fill_quantity = 8; // Decimal string (for partial fills)
|
||||
string status = 9; // "open", "filled", "cancelled", etc.
|
||||
string exchange = 10;
|
||||
int64 timestamp = 11; // Unix milliseconds
|
||||
string strategy_id = 12; // If order was placed by a strategy
|
||||
string error_message = 13; // If rejected/failed
|
||||
}
|
||||
|
||||
// Payload for ALERT_TRIGGERED
|
||||
message AlertEventPayload {
|
||||
string alert_id = 1;
|
||||
string symbol = 2;
|
||||
string condition = 3; // Human-readable condition (e.g., "BTC > 50000")
|
||||
string triggered_price = 4; // Decimal string
|
||||
int64 timestamp = 5;
|
||||
}
|
||||
|
||||
// Payload for POSITION_OPENED, POSITION_CLOSED, POSITION_UPDATED
|
||||
message PositionEventPayload {
|
||||
string position_id = 1;
|
||||
string symbol = 2;
|
||||
string side = 3; // "long" or "short"
|
||||
string size = 4; // Decimal string
|
||||
string entry_price = 5; // Decimal string
|
||||
string current_price = 6; // Decimal string
|
||||
string unrealized_pnl = 7; // Decimal string
|
||||
string realized_pnl = 8; // Decimal string (for closed positions)
|
||||
string leverage = 9; // Decimal string (for margin)
|
||||
string liquidation_price = 10;
|
||||
string exchange = 11;
|
||||
int64 timestamp = 12;
|
||||
}
|
||||
|
||||
// Payload for WORKSPACE_CHANGED, CHART_ANNOTATION_*, INDICATOR_UPDATED
|
||||
message WorkspaceEventPayload {
|
||||
string workspace_id = 1;
|
||||
string change_type = 2; // "symbol_changed", "timeframe_changed", "annotation_added", etc.
|
||||
string symbol = 3;
|
||||
string timeframe = 4;
|
||||
|
||||
// For annotations
|
||||
string annotation_id = 5;
|
||||
string annotation_type = 6; // "trendline", "horizontal", "rectangle", "text", etc.
|
||||
string annotation_data = 7; // JSON string with coordinates, style, etc.
|
||||
|
||||
// For indicators
|
||||
string indicator_name = 8;
|
||||
string indicator_params = 9; // JSON string with indicator parameters
|
||||
|
||||
int64 timestamp = 10;
|
||||
}
|
||||
|
||||
// Payload for STRATEGY_LOG, STRATEGY_ERROR
|
||||
message StrategyEventPayload {
|
||||
string strategy_id = 1;
|
||||
string strategy_name = 2;
|
||||
string log_level = 3; // "debug", "info", "warn", "error"
|
||||
string message = 4;
|
||||
string details = 5; // JSON string with additional context
|
||||
int64 timestamp = 6;
|
||||
}
|
||||
|
||||
// Payload for BACKTEST_COMPLETED
// Numeric results are decimal strings to preserve precision; ratio fields
// (win_rate, max_drawdown) are documented as 0-1 fractions.
message BacktestEventPayload {
  string backtest_id = 1;
  string strategy_id = 2;
  string strategy_name = 3;
  string symbol = 4;
  string timeframe = 5;
  int64 start_time = 6; // Backtest window start
  int64 end_time = 7;   // Backtest window end

  // Results summary
  int32 total_trades = 8;
  int32 winning_trades = 9;
  int32 losing_trades = 10;
  string total_pnl = 11; // Decimal string
  string win_rate = 12; // Decimal string (0-1)
  string sharpe_ratio = 13; // Decimal string
  string max_drawdown = 14; // Decimal string (0-1)

  string results_path = 15; // Path to full results file
  int64 completed_at = 16; // Epoch time; unit not specified here — confirm with producer
}
|
||||
@@ -1,7 +1,70 @@
|
||||
-- User license and authorization schema
-- Better Auth Core Schema
-- See: https://better-auth.com/docs/concepts/database
-- Note: Using quoted "user" to avoid SQL keyword issues while keeping Better Auth's expected table name
--
-- Every statement is idempotent (IF NOT EXISTS). Previously the CREATE INDEX
-- statements lacked IF NOT EXISTS while the tables had it, so a re-run of the
-- migration aborted on the first duplicate index.

-- User table (better-auth core)
CREATE TABLE IF NOT EXISTS "user" (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    email TEXT UNIQUE NOT NULL,
    "emailVerified" BOOLEAN NOT NULL DEFAULT FALSE,
    image TEXT,
    "createdAt" TIMESTAMP NOT NULL DEFAULT NOW(),
    "updatedAt" TIMESTAMP NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_user_email ON "user"(email);

-- Session table (better-auth core)
CREATE TABLE IF NOT EXISTS session (
    id TEXT PRIMARY KEY,
    "expiresAt" TIMESTAMP NOT NULL,
    token TEXT UNIQUE NOT NULL,
    "createdAt" TIMESTAMP NOT NULL DEFAULT NOW(),
    "updatedAt" TIMESTAMP NOT NULL DEFAULT NOW(),
    "ipAddress" TEXT,
    "userAgent" TEXT,
    "userId" TEXT NOT NULL REFERENCES "user"(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_session_userId ON session("userId");
CREATE INDEX IF NOT EXISTS idx_session_token ON session(token);

-- Account table (better-auth core, for OAuth providers)
CREATE TABLE IF NOT EXISTS account (
    id TEXT PRIMARY KEY,
    "accountId" TEXT NOT NULL,
    "providerId" TEXT NOT NULL,
    "userId" TEXT NOT NULL REFERENCES "user"(id) ON DELETE CASCADE,
    "accessToken" TEXT,
    "refreshToken" TEXT,
    "idToken" TEXT,
    "accessTokenExpiresAt" TIMESTAMP,
    "refreshTokenExpiresAt" TIMESTAMP,
    scope TEXT,
    password TEXT, -- presumably the credential hash for email/password accounts — see Better Auth docs
    "createdAt" TIMESTAMP NOT NULL DEFAULT NOW(),
    "updatedAt" TIMESTAMP NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_account_userId ON account("userId");

-- Verification table (better-auth core)
CREATE TABLE IF NOT EXISTS verification (
    id TEXT PRIMARY KEY,
    identifier TEXT NOT NULL,
    value TEXT NOT NULL,
    "expiresAt" TIMESTAMP NOT NULL,
    "createdAt" TIMESTAMP,
    "updatedAt" TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_verification_identifier ON verification(identifier);
|
||||
|
||||
-- User license and authorization schema (custom tables)
|
||||
|
||||
CREATE TABLE IF NOT EXISTS user_licenses (
|
||||
user_id TEXT PRIMARY KEY,
|
||||
user_id TEXT PRIMARY KEY REFERENCES "user"(id) ON DELETE CASCADE,
|
||||
email TEXT,
|
||||
license_type TEXT NOT NULL CHECK (license_type IN ('free', 'pro', 'enterprise')),
|
||||
features JSONB NOT NULL DEFAULT '{
|
||||
@@ -43,37 +106,3 @@ CREATE TABLE IF NOT EXISTS user_channel_links (
|
||||
|
||||
-- IF NOT EXISTS keeps these idempotent, matching the CREATE TABLE statements;
-- without it a re-run of the script fails on duplicate index names.
CREATE INDEX IF NOT EXISTS idx_user_channel_links_user_id ON user_channel_links(user_id);
CREATE INDEX IF NOT EXISTS idx_user_channel_links_channel ON user_channel_links(channel_type, channel_user_id);

-- Example data for development
-- Idempotent via ON CONFLICT ... DO NOTHING, so re-running the script is safe.
INSERT INTO user_licenses (user_id, email, license_type, mcp_server_url, features, resource_limits, preferred_model)
VALUES (
    'dev-user-001',
    'dev@example.com',
    'pro',
    'http://localhost:8080/mcp',
    '{
      "maxIndicators": 50,
      "maxStrategies": 20,
      "maxBacktestDays": 365,
      "realtimeData": true,
      "customExecutors": true,
      "apiAccess": true
    }',
    '{
      "maxConcurrentSessions": 5,
      "maxMessagesPerDay": 1000,
      "maxTokensPerMessage": 8192,
      "rateLimitPerMinute": 60
    }',
    '{
      "provider": "anthropic",
      "model": "claude-3-5-sonnet-20241022",
      "temperature": 0.7
    }'
)
ON CONFLICT (user_id) DO NOTHING;

-- Example Telegram link
INSERT INTO user_channel_links (user_id, channel_type, channel_user_id)
VALUES ('dev-user-001', 'telegram', '123456789')
ON CONFLICT (channel_type, channel_user_id) DO NOTHING;
|
||||
|
||||
40
gateway/secrets.example.yaml
Normal file
40
gateway/secrets.example.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
# Gateway Secrets
|
||||
# These should be mounted from k8s secrets
|
||||
|
||||
# Authentication
|
||||
# IMPORTANT: Generate a strong random secret for production
|
||||
# Example: openssl rand -base64 32
|
||||
auth:
|
||||
secret: "change-me-in-production-use-openssl-rand-base64-32"
|
||||
|
||||
# LLM Provider API Keys (configure at least one)
|
||||
llm_providers:
|
||||
anthropic_api_key: sk-ant-xxxxx
|
||||
openai_api_key: sk-xxxxx
|
||||
google_api_key: xxxxx
|
||||
openrouter_api_key: sk-or-xxxxx
|
||||
|
||||
# Telegram (optional)
|
||||
telegram:
|
||||
bot_token: ""
|
||||
|
||||
# Email service (optional)
|
||||
email:
|
||||
service_key: ""
|
||||
|
||||
# Push notification service (optional)
|
||||
push:
|
||||
service_key: ""
|
||||
|
||||
# Qdrant API key (optional, for hosted Qdrant)
|
||||
qdrant:
|
||||
api_key: ""
|
||||
|
||||
# Iceberg S3 credentials
|
||||
iceberg:
|
||||
s3_access_key: minioadmin
|
||||
s3_secret_key: minioadmin
|
||||
|
||||
# Embedding API key (if using external provider)
|
||||
embedding:
|
||||
api_key: ""
|
||||
173
gateway/src/auth/auth-service.ts
Normal file
173
gateway/src/auth/auth-service.ts
Normal file
@@ -0,0 +1,173 @@
|
||||
import type { BetterAuthInstance } from './better-auth-config.js';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { Pool } from 'pg';
|
||||
|
||||
export interface AuthServiceConfig {
  auth: BetterAuthInstance; // Configured Better Auth instance (from better-auth-config)
  pool: Pool; // Postgres pool used for the custom user_licenses table
  logger: FastifyBaseLogger; // Structured logger (error for failures, debug for routine misses)
}
|
||||
|
||||
/**
|
||||
* Authentication service that integrates Better Auth with existing user system
|
||||
*/
|
||||
export class AuthService {
|
||||
private config: AuthServiceConfig;
|
||||
|
||||
constructor(config: AuthServiceConfig) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify JWT token and return user ID
|
||||
* Replaces the placeholder implementation in UserService
|
||||
*/
|
||||
async verifyToken(token: string): Promise<string | null> {
|
||||
try {
|
||||
// Better Auth's session verification
|
||||
const session = await this.config.auth.api.getSession({
|
||||
headers: {
|
||||
authorization: `Bearer ${token}`,
|
||||
},
|
||||
});
|
||||
|
||||
if (!session || !session.user) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return session.user.id;
|
||||
} catch (error) {
|
||||
this.config.logger.debug({ error }, 'Token verification failed');
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create user with email and password
|
||||
*/
|
||||
async createUser(email: string, password: string, name?: string): Promise<{ userId: string; error?: string }> {
|
||||
try {
|
||||
const result = await this.config.auth.api.signUpEmail({
|
||||
body: {
|
||||
email,
|
||||
password,
|
||||
name: name || email.split('@')[0],
|
||||
},
|
||||
});
|
||||
|
||||
if (!result.user) {
|
||||
return {
|
||||
userId: '',
|
||||
error: 'Failed to create user',
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
userId: result.user.id,
|
||||
};
|
||||
} catch (error: any) {
|
||||
this.config.logger.error({ error }, 'User creation failed');
|
||||
return {
|
||||
userId: '',
|
||||
error: error.message || 'User creation failed',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sign in with email and password
|
||||
*/
|
||||
async signIn(email: string, password: string): Promise<{ token: string; userId: string; error?: string }> {
|
||||
try {
|
||||
const result = await this.config.auth.api.signInEmail({
|
||||
body: {
|
||||
email,
|
||||
password,
|
||||
},
|
||||
});
|
||||
|
||||
if (!result.token || !result.user) {
|
||||
return {
|
||||
token: '',
|
||||
userId: '',
|
||||
error: 'Invalid credentials',
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
token: result.token,
|
||||
userId: result.user.id,
|
||||
};
|
||||
} catch (error: any) {
|
||||
this.config.logger.error({ error }, 'Sign in failed');
|
||||
return {
|
||||
token: '',
|
||||
userId: '',
|
||||
error: error.message || 'Sign in failed',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sign out and invalidate session
|
||||
*/
|
||||
async signOut(token: string): Promise<{ success: boolean }> {
|
||||
try {
|
||||
await this.config.auth.api.signOut({
|
||||
headers: {
|
||||
authorization: `Bearer ${token}`,
|
||||
},
|
||||
});
|
||||
|
||||
return { success: true };
|
||||
} catch (error) {
|
||||
this.config.logger.error({ error }, 'Sign out failed');
|
||||
return { success: false };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current session from token
|
||||
*/
|
||||
async getSession(token: string) {
|
||||
try {
|
||||
const session = await this.config.auth.api.getSession({
|
||||
headers: {
|
||||
authorization: `Bearer ${token}`,
|
||||
},
|
||||
});
|
||||
|
||||
return session;
|
||||
} catch (error) {
|
||||
this.config.logger.debug({ error }, 'Get session failed');
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure user has a license (create default license if needed)
|
||||
*/
|
||||
async ensureUserLicense(userId: string, email: string): Promise<void> {
|
||||
const client = await this.config.pool.connect();
|
||||
try {
|
||||
// Check if license exists
|
||||
const licenseCheck = await client.query(
|
||||
'SELECT user_id FROM user_licenses WHERE user_id = $1',
|
||||
[userId]
|
||||
);
|
||||
|
||||
if (licenseCheck.rows.length === 0) {
|
||||
// Create default free license
|
||||
await client.query(
|
||||
`INSERT INTO user_licenses (user_id, email, license_type, mcp_server_url)
|
||||
VALUES ($1, $2, 'free', 'pending')`,
|
||||
[userId, email]
|
||||
);
|
||||
|
||||
this.config.logger.info({ userId }, 'Created default free license for new user');
|
||||
}
|
||||
} finally {
|
||||
client.release();
|
||||
}
|
||||
}
|
||||
}
|
||||
106
gateway/src/auth/better-auth-config.ts
Normal file
106
gateway/src/auth/better-auth-config.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
import { betterAuth } from 'better-auth';
|
||||
import { Pool } from 'pg';
|
||||
import { Kysely, PostgresDialect } from 'kysely';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
|
||||
export interface BetterAuthConfig {
  databaseUrl: string; // Postgres connection string (password portion is masked before logging)
  pool?: Pool; // Optional pre-existing pool; a new one is created from databaseUrl when omitted
  secret: string; // Secret used by Better Auth for JWT signing
  baseUrl: string; // Public base URL for callbacks and redirects
  trustedOrigins: string[]; // Origins allowed for CORS
  logger: FastifyBaseLogger;
}
|
||||
|
||||
/**
|
||||
* Create Better Auth instance with PostgreSQL adapter and passkey support
|
||||
*/
|
||||
export async function createBetterAuth(config: BetterAuthConfig) {
|
||||
try {
|
||||
config.logger.debug({
|
||||
databaseUrl: config.databaseUrl.replace(/:[^:@]+@/, ':***@'),
|
||||
baseUrl: config.baseUrl,
|
||||
}, 'Creating Better Auth instance');
|
||||
|
||||
// Use existing pool if provided, otherwise create new one
|
||||
const pool = config.pool || new Pool({
|
||||
connectionString: config.databaseUrl,
|
||||
});
|
||||
|
||||
config.logger.debug('PostgreSQL pool created');
|
||||
|
||||
// Test database connection first
|
||||
try {
|
||||
config.logger.debug('Testing database connection...');
|
||||
const testClient = await pool.connect();
|
||||
await testClient.query('SELECT 1');
|
||||
testClient.release();
|
||||
config.logger.debug('Database connection test successful');
|
||||
} catch (dbError: any) {
|
||||
config.logger.error({
|
||||
error: dbError,
|
||||
message: dbError.message,
|
||||
stack: dbError.stack,
|
||||
}, 'Database connection test failed');
|
||||
throw new Error(`Database connection failed: ${dbError.message}`);
|
||||
}
|
||||
|
||||
// Create Kysely instance for Better Auth
|
||||
config.logger.debug('Creating Kysely database instance...');
|
||||
const db = new Kysely({
|
||||
dialect: new PostgresDialect({ pool }),
|
||||
});
|
||||
config.logger.debug('Kysely instance created');
|
||||
|
||||
// Better Auth v1.5.3 postgres configuration
|
||||
const auth = betterAuth({
|
||||
database: {
|
||||
db,
|
||||
type: 'postgres',
|
||||
},
|
||||
|
||||
// Secret for JWT signing
|
||||
secret: config.secret,
|
||||
|
||||
// Base URL for callbacks and redirects
|
||||
baseURL: config.baseUrl,
|
||||
|
||||
// Trusted origins for CORS
|
||||
trustedOrigins: config.trustedOrigins,
|
||||
|
||||
// Email/password authentication
|
||||
emailAndPassword: {
|
||||
enabled: true,
|
||||
requireEmailVerification: false, // Set to true in production
|
||||
sendResetPassword: async ({ user, url }) => {
|
||||
// TODO: Implement email sending
|
||||
config.logger.info({ userId: user.id, resetUrl: url }, 'Password reset requested');
|
||||
},
|
||||
},
|
||||
|
||||
// Session configuration
|
||||
session: {
|
||||
expiresIn: 60 * 60 * 24 * 7, // 7 days
|
||||
updateAge: 60 * 60 * 24, // Update session every 24 hours
|
||||
cookieCache: {
|
||||
enabled: true,
|
||||
maxAge: 5 * 60, // 5 minutes
|
||||
},
|
||||
},
|
||||
|
||||
});
|
||||
|
||||
config.logger.debug('Better Auth instance created');
|
||||
return auth;
|
||||
} catch (error: any) {
|
||||
config.logger.error({
|
||||
error,
|
||||
message: error.message,
|
||||
stack: error.stack,
|
||||
cause: error.cause,
|
||||
}, 'Error creating Better Auth instance');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
export type BetterAuthInstance = Awaited<ReturnType<typeof createBetterAuth>>;
|
||||
@@ -156,7 +156,7 @@ export class TelegramHandler {
|
||||
/**
|
||||
* Cleanup old sessions (call periodically)
|
||||
*/
|
||||
async cleanupSessions(maxAgeMs = 30 * 60 * 1000): Promise<void> {
|
||||
async cleanupSessions(_maxAgeMs = 30 * 60 * 1000): Promise<void> {
|
||||
// TODO: Track session last activity and cleanup
|
||||
// For now, sessions persist until server restart
|
||||
}
|
||||
|
||||
@@ -6,18 +6,24 @@ import type { InboundMessage } from '../types/messages.js';
|
||||
import { randomUUID } from 'crypto';
|
||||
|
||||
import type { ProviderConfig } from '../llm/provider.js';
|
||||
import type { SessionRegistry, EventSubscriber, Session } from '../events/index.js';
|
||||
|
||||
export interface WebSocketHandlerConfig {
|
||||
authenticator: Authenticator;
|
||||
providerConfig: ProviderConfig;
|
||||
sessionRegistry: SessionRegistry;
|
||||
eventSubscriber: EventSubscriber;
|
||||
}
|
||||
|
||||
/**
|
||||
* WebSocket channel handler
|
||||
*
|
||||
* Handles WebSocket connections for chat and integrates with the event system
|
||||
* for container-to-client notifications.
|
||||
*/
|
||||
export class WebSocketHandler {
|
||||
private config: WebSocketHandlerConfig;
|
||||
private sessions = new Map<string, AgentHarness>();
|
||||
private harnesses = new Map<string, AgentHarness>();
|
||||
|
||||
constructor(config: WebSocketHandlerConfig) {
|
||||
this.config = config;
|
||||
@@ -94,7 +100,30 @@ export class WebSocketHandler {
|
||||
|
||||
try {
|
||||
await harness.initialize();
|
||||
this.sessions.set(authContext.sessionId, harness);
|
||||
this.harnesses.set(authContext.sessionId, harness);
|
||||
|
||||
// Register session for event system
|
||||
// Container endpoint is derived from the MCP server URL (same container, different port)
|
||||
const containerEventEndpoint = this.getContainerEventEndpoint(authContext.license.mcpServerUrl);
|
||||
|
||||
const session: Session = {
|
||||
userId: authContext.userId,
|
||||
sessionId: authContext.sessionId,
|
||||
socket,
|
||||
channelType: 'websocket',
|
||||
containerEndpoint: containerEventEndpoint,
|
||||
connectedAt: new Date(),
|
||||
};
|
||||
|
||||
this.config.sessionRegistry.register(session);
|
||||
|
||||
// Subscribe to informational events from user's container
|
||||
await this.config.eventSubscriber.onSessionConnect(session);
|
||||
|
||||
logger.info(
|
||||
{ userId: authContext.userId, containerEndpoint: containerEventEndpoint },
|
||||
'Session registered for events'
|
||||
);
|
||||
|
||||
// Send connected message
|
||||
socket.send(
|
||||
@@ -145,11 +174,19 @@ export class WebSocketHandler {
|
||||
// Handle disconnection
|
||||
socket.on('close', async () => {
|
||||
logger.info({ sessionId: authContext.sessionId }, 'WebSocket disconnected');
|
||||
|
||||
// Unregister from event system
|
||||
const removedSession = this.config.sessionRegistry.unregister(authContext.sessionId);
|
||||
if (removedSession) {
|
||||
await this.config.eventSubscriber.onSessionDisconnect(removedSession);
|
||||
}
|
||||
|
||||
// Cleanup harness
|
||||
await harness.cleanup();
|
||||
this.sessions.delete(authContext.sessionId);
|
||||
this.harnesses.delete(authContext.sessionId);
|
||||
});
|
||||
|
||||
socket.on('error', (error) => {
|
||||
socket.on('error', (error: any) => {
|
||||
logger.error({ error, sessionId: authContext.sessionId }, 'WebSocket error');
|
||||
});
|
||||
} catch (error) {
|
||||
@@ -158,4 +195,21 @@ export class WebSocketHandler {
|
||||
await harness.cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Derive the container's XPUB event endpoint from the MCP server URL.
|
||||
*
|
||||
* MCP URL format: http://agent-user-abc123.dexorder-agents.svc.cluster.local:3000
|
||||
* Event endpoint: tcp://agent-user-abc123.dexorder-agents.svc.cluster.local:5570
|
||||
*/
|
||||
private getContainerEventEndpoint(mcpServerUrl: string): string {
|
||||
try {
|
||||
const url = new URL(mcpServerUrl);
|
||||
// Replace protocol and port
|
||||
return `tcp://${url.hostname}:5570`;
|
||||
} catch {
|
||||
// Fallback if URL parsing fails
|
||||
return mcpServerUrl.replace('http://', 'tcp://').replace(':3000', ':5570');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
209
gateway/src/clients/iceberg-client.ts
Normal file
209
gateway/src/clients/iceberg-client.ts
Normal file
@@ -0,0 +1,209 @@
|
||||
import { IcebergRestCatalog } from 'iceberg-js';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
|
||||
/**
 * Iceberg client configuration
 */
export interface IcebergConfig {
  catalogUri: string; // Iceberg REST catalog endpoint
  namespace: string; // Table namespace; tables are addressed as `<namespace>.<table>`
  s3Endpoint?: string; // Optional S3/MinIO endpoint for warehouse storage
  s3AccessKey?: string; // S3 credentials — only used when s3Endpoint is set
  s3SecretKey?: string;
}

/**
 * Message record for Iceberg storage
 */
export interface IcebergMessage {
  id: string;
  user_id: string;
  session_id: string;
  role: 'user' | 'assistant' | 'system';
  content: string;
  metadata: string; // JSON string
  timestamp: number; // microseconds
}

/**
 * Checkpoint record for Iceberg storage
 */
export interface IcebergCheckpoint {
  user_id: string;
  session_id: string;
  checkpoint_id: string;
  checkpoint_data: string; // JSON string
  metadata: string; // JSON string
  timestamp: number; // microseconds
}
|
||||
|
||||
/**
|
||||
* Iceberg REST client wrapper for durable storage
|
||||
*
|
||||
* Uses Iceberg REST Catalog API to:
|
||||
* - Query conversation history from gateway.conversations
|
||||
* - Query checkpoints from gateway.checkpoints
|
||||
* - Note: Writes are handled by Flink; this is read-only
|
||||
*
|
||||
* For writes, we'll send to a Kafka topic that Flink consumes
|
||||
* (or implement direct REST catalog write if needed)
|
||||
*/
|
||||
export class IcebergClient {
|
||||
private namespace: string;
|
||||
private logger: FastifyBaseLogger;
|
||||
|
||||
constructor(config: IcebergConfig, logger: FastifyBaseLogger) {
|
||||
this.logger = logger;
|
||||
this.namespace = config.namespace;
|
||||
|
||||
// Initialize Iceberg REST client
|
||||
const clientConfig: any = {
|
||||
uri: config.catalogUri,
|
||||
};
|
||||
|
||||
if (config.s3Endpoint) {
|
||||
clientConfig.s3 = {
|
||||
endpoint: config.s3Endpoint,
|
||||
'access-key-id': config.s3AccessKey,
|
||||
'secret-access-key': config.s3SecretKey,
|
||||
'path-style-access': 'true',
|
||||
};
|
||||
}
|
||||
|
||||
// TODO: Store client for queries when needed
|
||||
new IcebergRestCatalog(clientConfig);
|
||||
|
||||
this.logger.info({
|
||||
catalogUri: config.catalogUri,
|
||||
namespace: this.namespace,
|
||||
}, 'Iceberg client initialized');
|
||||
}
|
||||
|
||||
/**
|
||||
* Query messages from gateway.conversations table
|
||||
*
|
||||
* Note: This is a simplified interface. The actual Iceberg REST API
|
||||
* returns table metadata, and you'd need to query the underlying
|
||||
* Parquet files via S3 or use a query engine like DuckDB/Trino.
|
||||
*
|
||||
* For now, we'll document the expected schema and leave actual
|
||||
* implementation as TODO since Flink handles writes.
|
||||
*/
|
||||
async queryMessages(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
_options?: {
|
||||
startTime?: number;
|
||||
endTime?: number;
|
||||
limit?: number;
|
||||
}
|
||||
): Promise<IcebergMessage[]> {
|
||||
this.logger.debug({
|
||||
userId,
|
||||
sessionId,
|
||||
table: `${this.namespace}.conversations`,
|
||||
}, 'Querying messages from Iceberg');
|
||||
|
||||
// TODO: Implement actual Iceberg query
|
||||
// Options:
|
||||
// 1. Use iceberg-js to get table metadata and Parquet file locations
|
||||
// 2. Query Parquet files directly via S3 + parquet-wasm
|
||||
// 3. Use external query engine (DuckDB, Trino, Presto)
|
||||
// 4. Use Flink SQL REST endpoint for queries
|
||||
|
||||
this.logger.warn('Iceberg query not yet implemented - returning empty array');
|
||||
return [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Query checkpoint from gateway.checkpoints table
|
||||
*/
|
||||
async queryCheckpoint(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
checkpointId?: string
|
||||
): Promise<IcebergCheckpoint | null> {
|
||||
this.logger.debug({
|
||||
userId,
|
||||
sessionId,
|
||||
checkpointId,
|
||||
table: `${this.namespace}.checkpoints`,
|
||||
}, 'Querying checkpoint from Iceberg');
|
||||
|
||||
// TODO: Implement actual Iceberg query
|
||||
this.logger.warn('Iceberg query not yet implemented - returning null');
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get table metadata
|
||||
*/
|
||||
async getTableMetadata(tableName: string): Promise<any> {
|
||||
try {
|
||||
const tableId = `${this.namespace}.${tableName}`;
|
||||
|
||||
// Note: iceberg-js provides catalog operations
|
||||
// For actual data queries, you'd need to:
|
||||
// 1. Get table metadata
|
||||
// 2. Find data file locations
|
||||
// 3. Query Parquet files from S3
|
||||
|
||||
this.logger.info({ table: tableId }, 'Getting table metadata');
|
||||
|
||||
// TODO: Implement table metadata query via REST API
|
||||
return null;
|
||||
} catch (error) {
|
||||
this.logger.error({ error, tableName }, 'Failed to get table metadata');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* List tables in namespace
|
||||
*/
|
||||
async listTables(): Promise<string[]> {
|
||||
try {
|
||||
this.logger.info({ namespace: this.namespace }, 'Listing tables');
|
||||
|
||||
// TODO: Use iceberg-js to list tables
|
||||
// const tables = await this.client.listTables(this.namespace);
|
||||
// return tables.map(t => t.name);
|
||||
|
||||
return [];
|
||||
} catch (error) {
|
||||
this.logger.error({ error }, 'Failed to list tables');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if table exists
|
||||
*/
|
||||
async tableExists(tableName: string): Promise<boolean> {
|
||||
try {
|
||||
const tables = await this.listTables();
|
||||
return tables.includes(tableName);
|
||||
} catch (error) {
|
||||
this.logger.error({ error, tableName }, 'Failed to check table existence');
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Note on Iceberg Writes:
|
||||
*
|
||||
* For appending messages and checkpoints to Iceberg, we have two options:
|
||||
*
|
||||
* 1. **Via Kafka + Flink** (Recommended):
|
||||
* - Gateway writes to Kafka topics (gateway_messages, gateway_checkpoints)
|
||||
* - Flink consumes and writes to Iceberg with proper partitioning
|
||||
* - Benefits: Proven architecture, handles backpressure, deduplication
|
||||
*
|
||||
* 2. **Direct REST Catalog Write**:
|
||||
* - Use Iceberg REST API to commit new data files
|
||||
* - More complex: need to create Parquet files, upload to S3, commit transaction
|
||||
* - Library like parquet-wasm could help
|
||||
*
|
||||
* For now, recommend Option 1 (Kafka + Flink) for consistency with existing architecture.
|
||||
*/
|
||||
319
gateway/src/clients/qdrant-client.ts
Normal file
319
gateway/src/clients/qdrant-client.ts
Normal file
@@ -0,0 +1,319 @@
|
||||
import { QdrantClient as QdrantRestClient } from '@qdrant/js-client-rest';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
|
||||
/**
 * Qdrant client configuration
 */
export interface QdrantConfig {
  url: string; // Qdrant REST endpoint
  apiKey?: string; // Optional API key (for hosted Qdrant)
  collectionName?: string; // Defaults to 'gateway_memory' (see constructor)
}
|
||||
|
||||
/**
|
||||
* Qdrant client wrapper for RAG vector storage
|
||||
*
|
||||
* Features:
|
||||
* - Global namespace (user_id = "0") for platform knowledge
|
||||
* - User-specific namespaces for personal memories
|
||||
* - Payload-indexed by user_id for GDPR compliance
|
||||
* - Cosine similarity search
|
||||
*/
|
||||
export class QdrantClient {
|
||||
private client: QdrantRestClient;
|
||||
private collectionName: string;
|
||||
private vectorDimension: number;
|
||||
private logger: FastifyBaseLogger;
|
||||
|
||||
constructor(config: QdrantConfig, logger: FastifyBaseLogger, vectorDimension: number = 1536) {
|
||||
this.logger = logger;
|
||||
this.collectionName = config.collectionName || 'gateway_memory';
|
||||
this.vectorDimension = vectorDimension;
|
||||
|
||||
// Initialize Qdrant REST client
|
||||
this.client = new QdrantRestClient({
|
||||
url: config.url,
|
||||
apiKey: config.apiKey,
|
||||
});
|
||||
|
||||
this.logger.info({
|
||||
url: config.url,
|
||||
collection: this.collectionName,
|
||||
vectorDimension,
|
||||
}, 'Qdrant client initialized');
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize collection with proper schema and indexes
|
||||
*/
|
||||
async initialize(): Promise<void> {
|
||||
this.logger.info({ collection: this.collectionName }, 'Initializing Qdrant collection');
|
||||
|
||||
try {
|
||||
// Check if collection exists
|
||||
const collections = await this.client.getCollections();
|
||||
const exists = collections.collections.some(c => c.name === this.collectionName);
|
||||
|
||||
if (!exists) {
|
||||
this.logger.info({ collection: this.collectionName }, 'Creating new collection');
|
||||
|
||||
// Create collection with vector configuration
|
||||
await this.client.createCollection(this.collectionName, {
|
||||
vectors: {
|
||||
size: this.vectorDimension,
|
||||
distance: 'Cosine',
|
||||
},
|
||||
});
|
||||
|
||||
// Create payload indexes for efficient filtering
|
||||
await this.client.createPayloadIndex(this.collectionName, {
|
||||
field_name: 'user_id',
|
||||
field_schema: 'keyword',
|
||||
});
|
||||
|
||||
await this.client.createPayloadIndex(this.collectionName, {
|
||||
field_name: 'session_id',
|
||||
field_schema: 'keyword',
|
||||
});
|
||||
|
||||
await this.client.createPayloadIndex(this.collectionName, {
|
||||
field_name: 'timestamp',
|
||||
field_schema: 'integer',
|
||||
});
|
||||
|
||||
this.logger.info({ collection: this.collectionName }, 'Collection created successfully');
|
||||
} else {
|
||||
this.logger.info({ collection: this.collectionName }, 'Collection already exists');
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error({ error, collection: this.collectionName }, 'Failed to initialize collection');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Store a vector point with payload
|
||||
*/
|
||||
async upsertPoint(
|
||||
id: string,
|
||||
vector: number[],
|
||||
payload: Record<string, any>
|
||||
): Promise<void> {
|
||||
try {
|
||||
await this.client.upsert(this.collectionName, {
|
||||
wait: true,
|
||||
points: [{
|
||||
id,
|
||||
vector,
|
||||
payload,
|
||||
}],
|
||||
});
|
||||
} catch (error) {
|
||||
this.logger.error({ error, id }, 'Failed to upsert point');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
  /**
   * Search for similar vectors
   * Queries both global (user_id="0") and user-specific vectors
   *
   * @param userId - owner whose vectors to search (combined with the global "0" namespace)
   * @param queryVector - embedding to match; must have the collection's dimension
   * @param options - limit (default 5), scoreThreshold (default 0.7), plus
   *                  optional session and timestamp-range narrowing
   * @returns matches sorted by similarity, each with id, score and payload
   */
  async search(
    userId: string,
    queryVector: number[],
    options?: {
      limit?: number;
      scoreThreshold?: number;
      sessionId?: string;
      timeRange?: { start: number; end: number };
    }
  ): Promise<Array<{
    id: string;
    score: number;
    payload: Record<string, any>;
  }>> {
    // NOTE(review): `||` (not `??`) means explicit 0 values fall back to the
    // defaults — limit 0 becomes 5, threshold 0 becomes 0.7. Confirm intended.
    const limit = options?.limit || 5;
    const scoreThreshold = options?.scoreThreshold || 0.7;

    try {
      // Build filter: (user_id = userId OR user_id = "0") AND other conditions
      // Relies on Qdrant filter semantics where `should` requires at least one
      // clause to match alongside all `must` clauses — confirm against the
      // Qdrant filtering docs for the deployed version.
      const mustConditions: any[] = [];
      const shouldConditions: any[] = [
        { key: 'user_id', match: { value: userId } },
        { key: 'user_id', match: { value: '0' } }, // Global namespace
      ];

      // Add session filter if provided
      if (options?.sessionId) {
        mustConditions.push({
          key: 'session_id',
          match: { value: options.sessionId },
        });
      }

      // Add time range filter if provided (inclusive bounds)
      if (options?.timeRange) {
        mustConditions.push({
          key: 'timestamp',
          range: {
            gte: options.timeRange.start,
            lte: options.timeRange.end,
          },
        });
      }

      // Perform search; `must: undefined` when there are no must-clauses so
      // only the should-clauses constrain the result.
      const results = await this.client.search(this.collectionName, {
        vector: queryVector,
        filter: {
          must: mustConditions.length > 0 ? mustConditions : undefined,
          should: shouldConditions,
        },
        limit,
        score_threshold: scoreThreshold,
        with_payload: true,
      });

      return results.map(r => ({
        id: r.id as string,
        score: r.score,
        payload: r.payload || {},
      }));
    } catch (error) {
      this.logger.error({ error, userId }, 'Search failed');
      throw error;
    }
  }
|
||||
|
||||
/**
|
||||
* Get points by filter (without vector search)
|
||||
*/
|
||||
async scroll(
|
||||
userId: string,
|
||||
options?: {
|
||||
limit?: number;
|
||||
sessionId?: string;
|
||||
offset?: string;
|
||||
}
|
||||
): Promise<{
|
||||
points: Array<{ id: string; payload: Record<string, any> }>;
|
||||
nextOffset?: string;
|
||||
}> {
|
||||
try {
|
||||
const filter: any = {
|
||||
must: [
|
||||
{ key: 'user_id', match: { value: userId } },
|
||||
],
|
||||
};
|
||||
|
||||
if (options?.sessionId) {
|
||||
filter.must.push({
|
||||
key: 'session_id',
|
||||
match: { value: options.sessionId },
|
||||
});
|
||||
}
|
||||
|
||||
const result = await this.client.scroll(this.collectionName, {
|
||||
filter,
|
||||
limit: options?.limit || 10,
|
||||
offset: options?.offset,
|
||||
with_payload: true,
|
||||
with_vector: false,
|
||||
});
|
||||
|
||||
return {
|
||||
points: result.points.map(p => ({
|
||||
id: p.id as string,
|
||||
payload: p.payload || {},
|
||||
})),
|
||||
nextOffset: result.next_page_offset as string | undefined,
|
||||
};
|
||||
} catch (error) {
|
||||
this.logger.error({ error, userId }, 'Scroll failed');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all points for a user (GDPR compliance)
|
||||
*/
|
||||
async deleteUserData(userId: string): Promise<void> {
|
||||
this.logger.info({ userId }, 'Deleting user vectors for GDPR compliance');
|
||||
|
||||
try {
|
||||
await this.client.delete(this.collectionName, {
|
||||
wait: true,
|
||||
filter: {
|
||||
must: [
|
||||
{ key: 'user_id', match: { value: userId } },
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
this.logger.info({ userId }, 'User vectors deleted');
|
||||
} catch (error) {
|
||||
this.logger.error({ error, userId }, 'Failed to delete user data');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete points for a specific session
|
||||
*/
|
||||
async deleteSession(userId: string, sessionId: string): Promise<void> {
|
||||
this.logger.info({ userId, sessionId }, 'Deleting session vectors');
|
||||
|
||||
try {
|
||||
await this.client.delete(this.collectionName, {
|
||||
wait: true,
|
||||
filter: {
|
||||
must: [
|
||||
{ key: 'user_id', match: { value: userId } },
|
||||
{ key: 'session_id', match: { value: sessionId } },
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
this.logger.info({ userId, sessionId }, 'Session vectors deleted');
|
||||
} catch (error) {
|
||||
this.logger.error({ error, userId, sessionId }, 'Failed to delete session');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get collection info and statistics
|
||||
*/
|
||||
async getCollectionInfo(): Promise<{
|
||||
vectorsCount: number;
|
||||
indexedVectorsCount: number;
|
||||
pointsCount: number;
|
||||
}> {
|
||||
try {
|
||||
const info = await this.client.getCollection(this.collectionName);
|
||||
|
||||
return {
|
||||
vectorsCount: (info as any).vectors_count || 0,
|
||||
indexedVectorsCount: info.indexed_vectors_count || 0,
|
||||
pointsCount: info.points_count || 0,
|
||||
};
|
||||
} catch (error) {
|
||||
this.logger.error({ error }, 'Failed to get collection info');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
  /**
   * Store global platform knowledge (user_id = "0").
   *
   * Thin wrapper over `upsertPoint` that forces the payload's `user_id` to
   * the reserved value '0'; any `user_id` supplied in `payload` is
   * overwritten by the spread order below.
   *
   * @param id - Point ID to upsert.
   * @param vector - Embedding vector for the point.
   * @param payload - Arbitrary payload; its `user_id` (if any) is replaced.
   */
  async storeGlobalKnowledge(
    id: string,
    vector: number[],
    payload: Omit<Record<string, any>, 'user_id'>
  ): Promise<void> {
    return this.upsertPoint(id, vector, {
      ...payload,
      user_id: '0', // Global namespace
    });
  }
|
||||
}
|
||||
@@ -1,9 +1,11 @@
|
||||
import { Pool, PoolClient } from 'pg';
|
||||
import { Pool } from 'pg';
|
||||
import type { UserLicense } from '../types/user.js';
|
||||
import { UserLicenseSchema } from '../types/user.js';
|
||||
import type { AuthService } from '../auth/auth-service.js';
|
||||
|
||||
export class UserService {
|
||||
private pool: Pool;
|
||||
private authService?: AuthService;
|
||||
|
||||
constructor(connectionString: string) {
|
||||
this.pool = new Pool({
|
||||
@@ -14,6 +16,21 @@ export class UserService {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the database pool (for AuthService)
|
||||
*/
|
||||
getPool(): Pool {
|
||||
return this.pool;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set auth service for JWT verification
|
||||
* Called after AuthService is initialized
|
||||
*/
|
||||
setAuthService(authService: AuthService): void {
|
||||
this.authService = authService;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get user license by user ID
|
||||
*/
|
||||
@@ -83,19 +100,24 @@ export class UserService {
|
||||
|
||||
/**
|
||||
* Verify JWT token from web client
|
||||
* TODO: Implement JWT verification with JWKS
|
||||
* Uses Better Auth for proper JWT verification
|
||||
*/
|
||||
async verifyWebToken(token: string): Promise<string | null> {
|
||||
// Placeholder - implement JWT verification
|
||||
// For now, decode without verification (INSECURE - FOR DEV ONLY)
|
||||
try {
|
||||
const payload = JSON.parse(
|
||||
Buffer.from(token.split('.')[1], 'base64').toString()
|
||||
);
|
||||
return payload.sub || null;
|
||||
} catch {
|
||||
return null;
|
||||
if (!this.authService) {
|
||||
// Fallback for development - decode without verification
|
||||
// This allows backward compatibility during migration
|
||||
try {
|
||||
const payload = JSON.parse(
|
||||
Buffer.from(token.split('.')[1], 'base64').toString()
|
||||
);
|
||||
return payload.sub || null;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Use Better Auth for proper verification
|
||||
return await this.authService.verifyToken(token);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
507
gateway/src/events/delivery-service.ts
Normal file
507
gateway/src/events/delivery-service.ts
Normal file
@@ -0,0 +1,507 @@
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import {
|
||||
EventType,
|
||||
parseEventPayload,
|
||||
getEventTypeName,
|
||||
type UserEvent,
|
||||
type OrderEventPayload,
|
||||
type AlertEventPayload,
|
||||
type PositionEventPayload,
|
||||
type StrategyEventPayload,
|
||||
} from './types.js';
|
||||
|
||||
/**
|
||||
* User channel configuration loaded from database.
|
||||
*/
|
||||
export interface UserChannelConfig {
|
||||
telegramChatId?: string;
|
||||
email?: string;
|
||||
pushToken?: string;
|
||||
discordWebhook?: string;
|
||||
slackWebhook?: string;
|
||||
}
|
||||
|
||||
export interface DeliveryServiceConfig {
|
||||
telegramBotToken?: string;
|
||||
emailServiceKey?: string;
|
||||
emailFromAddress?: string;
|
||||
pushServiceKey?: string;
|
||||
logger: FastifyBaseLogger;
|
||||
}
|
||||
|
||||
/**
|
||||
* DeliveryService handles actual delivery to external channels.
|
||||
*
|
||||
* Owns credentials for:
|
||||
* - Telegram bot
|
||||
* - Email service (SendGrid, SES, etc.)
|
||||
* - Push notifications (Firebase, APNs)
|
||||
* - Discord/Slack webhooks
|
||||
*
|
||||
* User-specific channel configs (chat IDs, emails, tokens) are loaded
|
||||
* from the database on demand.
|
||||
*/
|
||||
export class DeliveryService {
|
||||
private telegramBotToken?: string;
|
||||
private emailServiceKey?: string;
|
||||
private pushServiceKey?: string;
|
||||
private logger: FastifyBaseLogger;
|
||||
|
||||
// Cache of user channel configs
|
||||
// In production, this should have TTL and be backed by Redis
|
||||
private userConfigs = new Map<string, UserChannelConfig>();
|
||||
|
||||
  /**
   * @param config - Service credentials plus the parent logger; a child
   *   logger scoped to this component is kept.
   */
  constructor(config: DeliveryServiceConfig) {
    this.telegramBotToken = config.telegramBotToken;
    this.emailServiceKey = config.emailServiceKey;
    this.pushServiceKey = config.pushServiceKey;
    // NOTE(review): DeliveryServiceConfig also carries `emailFromAddress`,
    // but it is never stored here even though the email stub's commented-out
    // example reads `this.emailFromAddress` — confirm whether it should be
    // persisted on the instance.
    this.logger = config.logger.child({ component: 'DeliveryService' });
  }
|
||||
|
||||
/**
|
||||
* Load user's channel configuration from database.
|
||||
* TODO: Implement actual database lookup.
|
||||
*/
|
||||
async loadUserConfig(userId: string): Promise<UserChannelConfig> {
|
||||
// Check cache first
|
||||
const cached = this.userConfigs.get(userId);
|
||||
if (cached) return cached;
|
||||
|
||||
// TODO: Load from database
|
||||
// For now, return empty config
|
||||
const config: UserChannelConfig = {};
|
||||
this.userConfigs.set(userId, config);
|
||||
return config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update cached user config (called when user updates their settings).
|
||||
*/
|
||||
updateUserConfig(userId: string, config: Partial<UserChannelConfig>): void {
|
||||
const existing = this.userConfigs.get(userId) || {};
|
||||
this.userConfigs.set(userId, { ...existing, ...config });
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear cached user config.
|
||||
*/
|
||||
clearUserConfig(userId: string): void {
|
||||
this.userConfigs.delete(userId);
|
||||
}
|
||||
|
||||
// ===========================================================================
|
||||
// Telegram
|
||||
// ===========================================================================
|
||||
|
||||
/**
|
||||
* Send event notification via Telegram.
|
||||
*/
|
||||
async sendTelegram(userId: string, event: UserEvent): Promise<void> {
|
||||
if (!this.telegramBotToken) {
|
||||
throw new Error('Telegram bot token not configured');
|
||||
}
|
||||
|
||||
const config = await this.loadUserConfig(userId);
|
||||
if (!config.telegramChatId) {
|
||||
throw new Error('User has no Telegram chat ID configured');
|
||||
}
|
||||
|
||||
const message = this.formatTelegramMessage(event);
|
||||
|
||||
const response = await fetch(
|
||||
`https://api.telegram.org/bot${this.telegramBotToken}/sendMessage`,
|
||||
{
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
chat_id: config.telegramChatId,
|
||||
text: message,
|
||||
parse_mode: 'Markdown',
|
||||
}),
|
||||
}
|
||||
);
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.text();
|
||||
throw new Error(`Telegram API error: ${error}`);
|
||||
}
|
||||
|
||||
this.logger.info(
|
||||
{ userId, eventId: event.eventId, chatId: config.telegramChatId },
|
||||
'Sent Telegram notification'
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Format event as Telegram message with Markdown.
|
||||
*/
|
||||
private formatTelegramMessage(event: UserEvent): string {
|
||||
switch (event.eventType) {
|
||||
case EventType.ORDER_PLACED:
|
||||
case EventType.ORDER_FILLED:
|
||||
case EventType.ORDER_CANCELLED:
|
||||
case EventType.ORDER_REJECTED:
|
||||
case EventType.ORDER_EXPIRED: {
|
||||
const payload = parseEventPayload<OrderEventPayload>(event);
|
||||
if (!payload) return this.formatGenericMessage(event);
|
||||
|
||||
const emoji = this.getOrderEmoji(event.eventType);
|
||||
const title = this.getOrderTitle(event.eventType);
|
||||
|
||||
return (
|
||||
`${emoji} *${title}*\n\n` +
|
||||
`Symbol: \`${payload.symbol}\`\n` +
|
||||
`Side: ${payload.side.toUpperCase()}\n` +
|
||||
`Quantity: ${payload.quantity}\n` +
|
||||
(payload.fillPrice ? `Fill Price: ${payload.fillPrice}\n` : '') +
|
||||
(payload.price ? `Limit Price: ${payload.price}\n` : '') +
|
||||
`Exchange: ${payload.exchange}\n` +
|
||||
(payload.strategyId ? `Strategy: ${payload.strategyId}\n` : '') +
|
||||
(payload.errorMessage ? `Error: ${payload.errorMessage}\n` : '')
|
||||
);
|
||||
}
|
||||
|
||||
case EventType.ALERT_TRIGGERED: {
|
||||
const payload = parseEventPayload<AlertEventPayload>(event);
|
||||
if (!payload) return this.formatGenericMessage(event);
|
||||
|
||||
return (
|
||||
`🔔 *Alert Triggered*\n\n` +
|
||||
`Symbol: \`${payload.symbol}\`\n` +
|
||||
`Condition: ${payload.condition}\n` +
|
||||
`Price: ${payload.triggeredPrice}`
|
||||
);
|
||||
}
|
||||
|
||||
case EventType.POSITION_OPENED:
|
||||
case EventType.POSITION_CLOSED:
|
||||
case EventType.POSITION_UPDATED:
|
||||
case EventType.POSITION_LIQUIDATED: {
|
||||
const payload = parseEventPayload<PositionEventPayload>(event);
|
||||
if (!payload) return this.formatGenericMessage(event);
|
||||
|
||||
const emoji = this.getPositionEmoji(event.eventType);
|
||||
const title = this.getPositionTitle(event.eventType);
|
||||
|
||||
return (
|
||||
`${emoji} *${title}*\n\n` +
|
||||
`Symbol: \`${payload.symbol}\`\n` +
|
||||
`Side: ${payload.side.toUpperCase()}\n` +
|
||||
`Size: ${payload.size}\n` +
|
||||
`Entry: ${payload.entryPrice}\n` +
|
||||
`Current: ${payload.currentPrice}\n` +
|
||||
`Unrealized PnL: ${payload.unrealizedPnl}\n` +
|
||||
(payload.realizedPnl ? `Realized PnL: ${payload.realizedPnl}\n` : '') +
|
||||
(payload.liquidationPrice ? `Liquidation: ${payload.liquidationPrice}\n` : '')
|
||||
);
|
||||
}
|
||||
|
||||
case EventType.STRATEGY_ERROR: {
|
||||
const payload = parseEventPayload<StrategyEventPayload>(event);
|
||||
if (!payload) return this.formatGenericMessage(event);
|
||||
|
||||
return (
|
||||
`🚨 *Strategy Error*\n\n` +
|
||||
`Strategy: ${payload.strategyName}\n` +
|
||||
`Message: ${payload.message}\n` +
|
||||
(payload.details ? `Details: ${payload.details}` : '')
|
||||
);
|
||||
}
|
||||
|
||||
default:
|
||||
return this.formatGenericMessage(event);
|
||||
}
|
||||
}
|
||||
|
||||
private formatGenericMessage(event: UserEvent): string {
|
||||
const payload = parseEventPayload(event);
|
||||
return (
|
||||
`📌 *${getEventTypeName(event.eventType)}*\n\n` +
|
||||
`\`\`\`\n${JSON.stringify(payload, null, 2)}\n\`\`\``
|
||||
);
|
||||
}
|
||||
|
||||
private getOrderEmoji(eventType: EventType): string {
|
||||
switch (eventType) {
|
||||
case EventType.ORDER_FILLED:
|
||||
return '✅';
|
||||
case EventType.ORDER_PLACED:
|
||||
return '📝';
|
||||
case EventType.ORDER_CANCELLED:
|
||||
return '❌';
|
||||
case EventType.ORDER_REJECTED:
|
||||
return '🚫';
|
||||
case EventType.ORDER_EXPIRED:
|
||||
return '⏰';
|
||||
default:
|
||||
return '📋';
|
||||
}
|
||||
}
|
||||
|
||||
private getOrderTitle(eventType: EventType): string {
|
||||
switch (eventType) {
|
||||
case EventType.ORDER_FILLED:
|
||||
return 'Order Filled';
|
||||
case EventType.ORDER_PLACED:
|
||||
return 'Order Placed';
|
||||
case EventType.ORDER_CANCELLED:
|
||||
return 'Order Cancelled';
|
||||
case EventType.ORDER_REJECTED:
|
||||
return 'Order Rejected';
|
||||
case EventType.ORDER_EXPIRED:
|
||||
return 'Order Expired';
|
||||
default:
|
||||
return 'Order Update';
|
||||
}
|
||||
}
|
||||
|
||||
private getPositionEmoji(eventType: EventType): string {
|
||||
switch (eventType) {
|
||||
case EventType.POSITION_OPENED:
|
||||
return '📈';
|
||||
case EventType.POSITION_CLOSED:
|
||||
return '📉';
|
||||
case EventType.POSITION_UPDATED:
|
||||
return '📊';
|
||||
case EventType.POSITION_LIQUIDATED:
|
||||
return '💥';
|
||||
default:
|
||||
return '📊';
|
||||
}
|
||||
}
|
||||
|
||||
private getPositionTitle(eventType: EventType): string {
|
||||
switch (eventType) {
|
||||
case EventType.POSITION_OPENED:
|
||||
return 'Position Opened';
|
||||
case EventType.POSITION_CLOSED:
|
||||
return 'Position Closed';
|
||||
case EventType.POSITION_UPDATED:
|
||||
return 'Position Updated';
|
||||
case EventType.POSITION_LIQUIDATED:
|
||||
return 'Position Liquidated';
|
||||
default:
|
||||
return 'Position Update';
|
||||
}
|
||||
}
|
||||
|
||||
// ===========================================================================
|
||||
// Email
|
||||
// ===========================================================================
|
||||
|
||||
  /**
   * Send event notification via email.
   * TODO: Implement with SendGrid, SES, or similar.
   *
   * Currently a stub: validates configuration, formats the subject, and
   * logs — no mail is actually sent.
   *
   * @param userId - Recipient; their address is loaded from config.
   * @param event - Event to format into subject (and, eventually, body).
   * @throws When the email service key or the user's address is missing.
   */
  async sendEmail(userId: string, event: UserEvent): Promise<void> {
    if (!this.emailServiceKey) {
      throw new Error('Email service not configured');
    }

    const config = await this.loadUserConfig(userId);
    if (!config.email) {
      throw new Error('User has no email configured');
    }

    const subject = this.formatEmailSubject(event);
    // const body = this.formatEmailBody(event);

    // TODO: Implement actual email sending
    // Example with SendGrid:
    // await sendgrid.send({
    //   to: config.email,
    //   from: this.emailFromAddress,
    //   subject,
    //   html: body,
    // });

    this.logger.info(
      { userId, eventId: event.eventId, email: config.email, subject },
      'Sent email notification (stub)'
    );
  }
|
||||
|
||||
private formatEmailSubject(event: UserEvent): string {
|
||||
switch (event.eventType) {
|
||||
case EventType.ORDER_FILLED: {
|
||||
const payload = parseEventPayload<OrderEventPayload>(event);
|
||||
return `Order Filled: ${payload?.symbol || 'Unknown'}`;
|
||||
}
|
||||
case EventType.ALERT_TRIGGERED: {
|
||||
const payload = parseEventPayload<AlertEventPayload>(event);
|
||||
return `Alert: ${payload?.symbol || 'Unknown'} - ${payload?.condition || ''}`;
|
||||
}
|
||||
case EventType.POSITION_LIQUIDATED: {
|
||||
const payload = parseEventPayload<PositionEventPayload>(event);
|
||||
return `⚠️ Position Liquidated: ${payload?.symbol || 'Unknown'}`;
|
||||
}
|
||||
default:
|
||||
return `Dexorder: ${getEventTypeName(event.eventType)}`;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
// Push Notifications
|
||||
// ===========================================================================
|
||||
|
||||
  /**
   * Send push notification via Firebase/APNs.
   * TODO: Implement with Firebase Admin SDK.
   *
   * Currently a stub: validates configuration, formats the title, and
   * logs — no notification is actually sent.
   *
   * @param userId - Recipient; their device token is loaded from config.
   * @param event - Event to summarize into a push title/body.
   * @throws When the push service key or the user's device token is missing.
   */
  async sendPush(userId: string, event: UserEvent): Promise<void> {
    if (!this.pushServiceKey) {
      throw new Error('Push service not configured');
    }

    const config = await this.loadUserConfig(userId);
    if (!config.pushToken) {
      throw new Error('User has no push token configured');
    }

    const title = this.formatPushTitle(event);
    // const body = this.formatPushBody(event);

    // TODO: Implement actual push notification
    // Example with Firebase:
    // await admin.messaging().send({
    //   token: config.pushToken,
    //   notification: { title, body },
    //   data: { eventId: event.eventId, eventType: String(event.eventType) },
    // });

    this.logger.info(
      { userId, eventId: event.eventId, title },
      'Sent push notification (stub)'
    );
  }
|
||||
|
||||
private formatPushTitle(event: UserEvent): string {
|
||||
switch (event.eventType) {
|
||||
case EventType.ORDER_FILLED:
|
||||
return 'Order Filled';
|
||||
case EventType.ALERT_TRIGGERED:
|
||||
return 'Alert Triggered';
|
||||
case EventType.POSITION_LIQUIDATED:
|
||||
return '⚠️ Position Liquidated';
|
||||
default:
|
||||
return 'Dexorder';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
// Discord
|
||||
// ===========================================================================
|
||||
|
||||
/**
|
||||
* Send event notification via Discord webhook.
|
||||
*/
|
||||
async sendDiscord(userId: string, event: UserEvent): Promise<void> {
|
||||
const config = await this.loadUserConfig(userId);
|
||||
if (!config.discordWebhook) {
|
||||
throw new Error('User has no Discord webhook configured');
|
||||
}
|
||||
|
||||
const embed = this.formatDiscordEmbed(event);
|
||||
|
||||
const response = await fetch(config.discordWebhook, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ embeds: [embed] }),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.text();
|
||||
throw new Error(`Discord webhook error: ${error}`);
|
||||
}
|
||||
|
||||
this.logger.info({ userId, eventId: event.eventId }, 'Sent Discord notification');
|
||||
}
|
||||
|
||||
private formatDiscordEmbed(event: UserEvent): object {
|
||||
const payload = parseEventPayload(event);
|
||||
return {
|
||||
title: getEventTypeName(event.eventType),
|
||||
description: JSON.stringify(payload, null, 2).slice(0, 2000),
|
||||
color: this.getDiscordColor(event.eventType),
|
||||
timestamp: new Date(event.timestamp).toISOString(),
|
||||
footer: { text: `Event ID: ${event.eventId}` },
|
||||
};
|
||||
}
|
||||
|
||||
private getDiscordColor(eventType: EventType): number {
|
||||
switch (eventType) {
|
||||
case EventType.ORDER_FILLED:
|
||||
return 0x00ff00; // Green
|
||||
case EventType.ORDER_REJECTED:
|
||||
case EventType.POSITION_LIQUIDATED:
|
||||
case EventType.STRATEGY_ERROR:
|
||||
return 0xff0000; // Red
|
||||
case EventType.ALERT_TRIGGERED:
|
||||
return 0xffff00; // Yellow
|
||||
default:
|
||||
return 0x0099ff; // Blue
|
||||
}
|
||||
}
|
||||
|
||||
// ===========================================================================
|
||||
// Slack
|
||||
// ===========================================================================
|
||||
|
||||
/**
|
||||
* Send event notification via Slack webhook.
|
||||
*/
|
||||
async sendSlack(userId: string, event: UserEvent): Promise<void> {
|
||||
const config = await this.loadUserConfig(userId);
|
||||
if (!config.slackWebhook) {
|
||||
throw new Error('User has no Slack webhook configured');
|
||||
}
|
||||
|
||||
const blocks = this.formatSlackBlocks(event);
|
||||
|
||||
const response = await fetch(config.slackWebhook, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ blocks }),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.text();
|
||||
throw new Error(`Slack webhook error: ${error}`);
|
||||
}
|
||||
|
||||
this.logger.info({ userId, eventId: event.eventId }, 'Sent Slack notification');
|
||||
}
|
||||
|
||||
private formatSlackBlocks(event: UserEvent): object[] {
|
||||
const payload = parseEventPayload(event);
|
||||
return [
|
||||
{
|
||||
type: 'header',
|
||||
text: {
|
||||
type: 'plain_text',
|
||||
text: getEventTypeName(event.eventType),
|
||||
},
|
||||
},
|
||||
{
|
||||
type: 'section',
|
||||
text: {
|
||||
type: 'mrkdwn',
|
||||
text: '```' + JSON.stringify(payload, null, 2).slice(0, 2000) + '```',
|
||||
},
|
||||
},
|
||||
{
|
||||
type: 'context',
|
||||
elements: [
|
||||
{
|
||||
type: 'mrkdwn',
|
||||
text: `Event ID: ${event.eventId}`,
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
}
|
||||
}
|
||||
338
gateway/src/events/event-router.ts
Normal file
338
gateway/src/events/event-router.ts
Normal file
@@ -0,0 +1,338 @@
|
||||
import { Router } from 'zeromq';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { SessionRegistry } from './session-registry.js';
|
||||
import type { DeliveryService } from './delivery-service.js';
|
||||
import {
|
||||
deserializeUserEvent,
|
||||
serializeEventAck,
|
||||
parseEventPayload,
|
||||
getEventTypeName,
|
||||
getChannelTypeName,
|
||||
AckStatus,
|
||||
ChannelType,
|
||||
Priority,
|
||||
type UserEvent,
|
||||
type EventAck,
|
||||
} from './types.js';
|
||||
|
||||
export interface EventRouterConfig {
|
||||
sessions: SessionRegistry;
|
||||
delivery: DeliveryService;
|
||||
logger: FastifyBaseLogger;
|
||||
bindEndpoint?: string;
|
||||
dedupTtlMs?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* EventRouter handles critical events from user containers.
|
||||
*
|
||||
* Uses ZMQ ROUTER socket that all containers connect to via DEALER.
|
||||
* Provides guaranteed delivery with acknowledgments.
|
||||
*
|
||||
* Flow:
|
||||
* 1. Container sends UserEvent via DEALER
|
||||
* 2. Router receives with identity frame
|
||||
* 3. Router delivers event through channel preferences
|
||||
* 4. Router sends EventAck back to container
|
||||
*
|
||||
* Features:
|
||||
* - Deduplication (5 minute window by default)
|
||||
* - Channel preference ordering (try first channel, then second, etc.)
|
||||
* - Acknowledgment with delivery status
|
||||
*/
|
||||
export class EventRouter {
|
||||
private socket: Router;
|
||||
private sessions: SessionRegistry;
|
||||
private delivery: DeliveryService;
|
||||
private logger: FastifyBaseLogger;
|
||||
private bindEndpoint: string;
|
||||
private dedupTtlMs: number;
|
||||
|
||||
// Deduplication: track recently processed event IDs
|
||||
// Map: eventId -> timestamp when processed
|
||||
private processedEvents = new Map<string, number>();
|
||||
|
||||
private running = false;
|
||||
private messageLoopPromise: Promise<void> | null = null;
|
||||
private cleanupInterval: NodeJS.Timeout | null = null;
|
||||
|
||||
  /**
   * @param config - Wires in the session registry, delivery service, and
   *   logger, plus optional overrides for the bind endpoint and dedup TTL.
   */
  constructor(config: EventRouterConfig) {
    this.socket = new Router();
    this.sessions = config.sessions;
    this.delivery = config.delivery;
    this.logger = config.logger.child({ component: 'EventRouter' });
    // Defaults apply when the option is omitted (note: `||` means a falsy
    // override such as 0 also falls back to the default).
    this.bindEndpoint = config.bindEndpoint || 'tcp://*:5571';
    this.dedupTtlMs = config.dedupTtlMs || 5 * 60 * 1000; // 5 minutes
  }
|
||||
|
||||
  /**
   * Start the event router.
   *
   * Binds the ROUTER socket, then launches the receive loop — kept as a
   * promise so stop() can await its completion — and the periodic dedup
   * cleanup timer.
   */
  async start(): Promise<void> {
    await this.socket.bind(this.bindEndpoint);
    this.logger.info({ endpoint: this.bindEndpoint }, 'Event router bound');

    this.running = true;
    this.messageLoopPromise = this.messageLoop();
    this.startCleanupTimer();

    this.logger.info('Event router started');
  }
|
||||
|
||||
  /**
   * Stop the event router.
   *
   * Shutdown order matters: flip the running flag first so the loop exits
   * on its next iteration, cancel the cleanup timer, close the socket
   * (which ends the loop's async iteration), then await the loop so the
   * caller knows processing has fully drained.
   */
  async stop(): Promise<void> {
    this.running = false;

    if (this.cleanupInterval) {
      clearInterval(this.cleanupInterval);
      this.cleanupInterval = null;
    }

    this.socket.close();

    if (this.messageLoopPromise) {
      try {
        await this.messageLoopPromise;
      } catch {
        // Ignore errors during shutdown
      }
    }

    this.logger.info('Event router stopped');
  }
|
||||
|
||||
  /**
   * Main message processing loop.
   *
   * Iterates the ROUTER socket; each message arrives as [identity, payload]
   * frames. Per-message failures are logged and deliberately NOT acked so
   * the sending container can retry. Errors from the iteration itself are
   * only logged while running — closing the socket in stop() also surfaces
   * here and is expected.
   */
  private async messageLoop(): Promise<void> {
    this.logger.debug('Starting message loop');

    try {
      for await (const [identity, payload] of this.socket) {
        if (!this.running) break;

        try {
          await this.processMessage(identity, payload);
        } catch (error) {
          this.logger.error({ error }, 'Error processing critical event');
          // Don't ack on error - let container retry
        }
      }
    } catch (error) {
      if (this.running) {
        this.logger.error({ error }, 'Message loop error');
      }
    }

    this.logger.debug('Message loop ended');
  }
|
||||
|
||||
  /**
   * Process a single message from the ROUTER socket.
   *
   * Deserializes the event, short-circuits duplicates with an immediate ack,
   * otherwise delivers through the user's channel preferences, records the
   * event ID for deduplication, and acks the delivery result.
   *
   * NOTE(review): duplicates are always acked as DELIVERED, even if the
   * first attempt ended in ACK_ERROR (only the event ID is stored, not the
   * outcome) — confirm that is the intended retry contract.
   */
  private async processMessage(identity: Buffer, payload: Buffer): Promise<void> {
    // Deserialize the event
    const event = deserializeUserEvent(payload);

    this.logger.info(
      {
        userId: event.userId,
        eventId: event.eventId,
        eventType: getEventTypeName(event.eventType),
        priority: Priority[event.delivery.priority],
      },
      'Received critical event'
    );

    // Deduplication check
    if (this.processedEvents.has(event.eventId)) {
      this.logger.debug({ eventId: event.eventId }, 'Duplicate event, sending cached ack');
      await this.sendAck(identity, {
        eventId: event.eventId,
        status: AckStatus.DELIVERED,
        errorMessage: '',
      });
      return;
    }

    // Deliver through channel preferences
    const result = await this.deliverEvent(event);

    // Mark as processed (for deduplication)
    this.processedEvents.set(event.eventId, Date.now());

    // Send ack back to container
    await this.sendAck(identity, result);

    this.logger.info(
      {
        eventId: event.eventId,
        status: AckStatus[result.status],
        deliveredVia: result.deliveredVia ? getChannelTypeName(result.deliveredVia) : undefined,
      },
      'Event processed'
    );
  }
|
||||
|
||||
  /**
   * Deliver an event through channel preferences.
   * Tries each channel in order until one succeeds.
   *
   * A preference flagged onlyIfActive is skipped when the user has no
   * registered session. A channel that throws or returns false falls
   * through to the next preference; ACK_ERROR is returned only after every
   * preference has been exhausted.
   */
  private async deliverEvent(event: UserEvent): Promise<EventAck> {
    for (const pref of event.delivery.channels) {
      // Skip if channel requires active session but none exists
      if (pref.onlyIfActive && !this.sessions.has(event.userId)) {
        this.logger.debug(
          {
            eventId: event.eventId,
            channel: getChannelTypeName(pref.channel),
          },
          'Skipping channel (requires active session)'
        );
        continue;
      }

      try {
        const delivered = await this.deliverToChannel(event, pref.channel);
        if (delivered) {
          return {
            eventId: event.eventId,
            status: AckStatus.DELIVERED,
            errorMessage: '',
            deliveredVia: pref.channel,
          };
        }
      } catch (error) {
        this.logger.warn(
          {
            error,
            eventId: event.eventId,
            channel: getChannelTypeName(pref.channel),
          },
          'Channel delivery failed, trying next'
        );
        // Continue to next channel preference
      }
    }

    // All channels failed
    this.logger.error({ eventId: event.eventId }, 'All delivery channels failed');
    return {
      eventId: event.eventId,
      status: AckStatus.ACK_ERROR,
      errorMessage: 'All delivery channels failed',
    };
  }
|
||||
|
||||
  /**
   * Deliver event to a specific channel.
   * Returns true if delivery succeeded.
   *
   * Session-bound channels (ACTIVE_SESSION, WEB) return false rather than
   * throwing when no usable session exists, so the caller quietly falls
   * through to the next preference; the external channels signal failure by
   * throwing from the DeliveryService instead.
   */
  private async deliverToChannel(event: UserEvent, channel: ChannelType): Promise<boolean> {
    switch (channel) {
      case ChannelType.ACTIVE_SESSION: {
        const session = this.sessions.get(event.userId);
        if (!session) return false;

        const message = this.formatWebSocketMessage(event);
        session.socket.send(message);
        return true;
      }

      case ChannelType.WEB: {
        // WEB is same as ACTIVE_SESSION for WebSocket connections
        const session = this.sessions.get(event.userId);
        if (!session || session.channelType !== 'websocket') return false;

        const message = this.formatWebSocketMessage(event);
        session.socket.send(message);
        return true;
      }

      case ChannelType.TELEGRAM:
        await this.delivery.sendTelegram(event.userId, event);
        return true;

      case ChannelType.EMAIL:
        await this.delivery.sendEmail(event.userId, event);
        return true;

      case ChannelType.PUSH:
        await this.delivery.sendPush(event.userId, event);
        return true;

      case ChannelType.DISCORD:
        await this.delivery.sendDiscord(event.userId, event);
        return true;

      case ChannelType.SLACK:
        await this.delivery.sendSlack(event.userId, event);
        return true;

      default:
        this.logger.warn({ channel }, 'Unknown channel type');
        return false;
    }
  }
|
||||
|
||||
/**
|
||||
* Send an EventAck back to a container.
|
||||
*/
|
||||
private async sendAck(identity: Buffer, ack: EventAck): Promise<void> {
|
||||
const payload = serializeEventAck(ack);
|
||||
await this.socket.send([identity, payload]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Format a UserEvent as a WebSocket message.
|
||||
*/
|
||||
private formatWebSocketMessage(event: UserEvent): string {
|
||||
const payload = parseEventPayload(event);
|
||||
|
||||
return JSON.stringify({
|
||||
type: 'event',
|
||||
eventType: getEventTypeName(event.eventType),
|
||||
eventId: event.eventId,
|
||||
timestamp: event.timestamp,
|
||||
payload: payload,
|
||||
priority: Priority[event.delivery.priority],
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the deduplication cleanup timer.
|
||||
*/
|
||||
private startCleanupTimer(): void {
|
||||
this.cleanupInterval = setInterval(() => {
|
||||
this.cleanupProcessedEvents();
|
||||
}, 60000); // Cleanup every minute
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove expired entries from the processed events map.
|
||||
*/
|
||||
private cleanupProcessedEvents(): void {
|
||||
const now = Date.now();
|
||||
let cleaned = 0;
|
||||
|
||||
for (const [eventId, timestamp] of this.processedEvents) {
|
||||
if (now - timestamp > this.dedupTtlMs) {
|
||||
this.processedEvents.delete(eventId);
|
||||
cleaned++;
|
||||
}
|
||||
}
|
||||
|
||||
if (cleaned > 0) {
|
||||
this.logger.debug({ cleaned }, 'Cleaned up processed events');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get count of tracked processed events (for monitoring).
|
||||
*/
|
||||
getProcessedEventCount(): number {
|
||||
return this.processedEvents.size;
|
||||
}
|
||||
}
|
||||
218
gateway/src/events/event-subscriber.ts
Normal file
218
gateway/src/events/event-subscriber.ts
Normal file
@@ -0,0 +1,218 @@
|
||||
import { Subscriber } from 'zeromq';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { SessionRegistry, Session } from './session-registry.js';
|
||||
import {
|
||||
deserializeUserEvent,
|
||||
parseEventPayload,
|
||||
getEventTypeName,
|
||||
type UserEvent,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * EventSubscriber handles informational events from user containers.
 *
 * Uses ZMQ SUB socket to connect to container XPUB sockets.
 * When a user session connects, we subscribe to their events.
 * When they disconnect, we unsubscribe.
 *
 * This is for fire-and-forget events that only matter if the user
 * is actively connected (e.g., chart updates, strategy logs).
 */
export class EventSubscriber {
  // Single SUB socket shared across all container connections.
  private socket: Subscriber;
  // Used to find the active session an event should be forwarded to.
  private sessions: SessionRegistry;
  private logger: FastifyBaseLogger;

  // Track which container endpoints we're connected to
  // Map: userId -> containerEndpoint
  private containerConnections = new Map<string, string>();

  // Track active subscriptions
  // Set of topic strings ("USER:<userId>") we're subscribed to
  private activeSubscriptions = new Set<string>();

  // Set false by stop(); the message loop also exits when the socket closes.
  private running = false;
  // Pending message-loop promise; awaited in stop() for a clean shutdown.
  private messageLoopPromise: Promise<void> | null = null;

  constructor(sessions: SessionRegistry, logger: FastifyBaseLogger) {
    this.socket = new Subscriber();
    this.sessions = sessions;
    this.logger = logger.child({ component: 'EventSubscriber' });
  }

  /**
   * Start the event subscriber.
   * Kicks off the background message loop; does not block on it.
   */
  async start(): Promise<void> {
    this.running = true;
    this.messageLoopPromise = this.messageLoop();
    this.logger.info('Event subscriber started');
  }

  /**
   * Stop the event subscriber.
   * Closes the socket first so the loop's async iteration terminates,
   * then awaits the loop so shutdown is fully drained.
   */
  async stop(): Promise<void> {
    this.running = false;
    this.socket.close();

    if (this.messageLoopPromise) {
      try {
        await this.messageLoopPromise;
      } catch {
        // Ignore errors during shutdown
      }
    }

    this.logger.info('Event subscriber stopped');
  }

  /**
   * Called when a user session connects.
   * Connects to the user's container XPUB and subscribes to their events.
   * On connection failure it logs and returns without subscribing.
   */
  async onSessionConnect(session: Session): Promise<void> {
    const topic = `USER:${session.userId}`;
    const endpoint = session.containerEndpoint;

    // Connect to container if not already connected
    // NOTE(review): keyed by userId, so a *changed* endpoint for the same
    // user is never re-connected — confirm container endpoints are stable.
    if (!this.containerConnections.has(session.userId)) {
      try {
        this.socket.connect(endpoint);
        this.containerConnections.set(session.userId, endpoint);
        this.logger.info(
          { userId: session.userId, endpoint },
          'Connected to container XPUB'
        );
      } catch (error) {
        this.logger.error(
          { error, userId: session.userId, endpoint },
          'Failed to connect to container XPUB'
        );
        return;
      }
    }

    // Subscribe to user's topic
    if (!this.activeSubscriptions.has(topic)) {
      this.socket.subscribe(topic);
      this.activeSubscriptions.add(topic);
      this.logger.info({ userId: session.userId, topic }, 'Subscribed to user events');
    }
  }

  /**
   * Called when a user session disconnects.
   * Unsubscribes from their events. The container connection itself is
   * intentionally left open (see note below).
   */
  async onSessionDisconnect(session: Session): Promise<void> {
    const topic = `USER:${session.userId}`;

    // Unsubscribe from user's topic
    if (this.activeSubscriptions.has(topic)) {
      this.socket.unsubscribe(topic);
      this.activeSubscriptions.delete(topic);
      this.logger.info({ userId: session.userId, topic }, 'Unsubscribed from user events');
    }

    // Optionally disconnect from container after a delay
    // (in case user reconnects quickly)
    // For now, we keep the connection open
  }

  /**
   * Main message processing loop.
   * Runs until stop() closes the socket; per-message failures are logged
   * and do not terminate the loop.
   */
  private async messageLoop(): Promise<void> {
    this.logger.debug('Starting message loop');

    try {
      for await (const [topicBuf, payloadBuf] of this.socket) {
        if (!this.running) break;

        try {
          await this.processMessage(topicBuf, payloadBuf);
        } catch (error) {
          this.logger.error({ error }, 'Error processing informational event');
        }
      }
    } catch (error) {
      // Socket closure during stop() also lands here; only log when the
      // loop died while we still expected to be running.
      if (this.running) {
        this.logger.error({ error }, 'Message loop error');
      }
    }

    this.logger.debug('Message loop ended');
  }

  /**
   * Process a single message from the SUB socket.
   * Drops the event when the user has no active session (informational
   * events are best-effort by design).
   */
  private async processMessage(topicBuf: Buffer, payloadBuf: Buffer): Promise<void> {
    const topic = topicBuf.toString();
    // Topic format is "USER:<userId>" — see onSessionConnect.
    const userId = topic.replace('USER:', '');

    // Deserialize the event
    const event = deserializeUserEvent(payloadBuf);

    this.logger.debug(
      {
        userId,
        eventId: event.eventId,
        eventType: getEventTypeName(event.eventType),
      },
      'Received informational event'
    );

    // Forward to active session
    const session = this.sessions.get(userId);
    if (!session) {
      this.logger.debug({ userId }, 'No active session for event, dropping');
      return;
    }

    // Format and send to WebSocket
    const wsMessage = this.formatWebSocketMessage(event);
    try {
      session.socket.send(wsMessage);
      this.logger.debug(
        { userId, eventId: event.eventId },
        'Forwarded event to WebSocket'
      );
    } catch (error) {
      // Best-effort: a failed send is logged, not retried.
      this.logger.warn(
        { error, userId, eventId: event.eventId },
        'Failed to send event to WebSocket'
      );
    }
  }

  /**
   * Format a UserEvent as a WebSocket message.
   * Payload is the decoded JSON body (null when not valid JSON).
   */
  private formatWebSocketMessage(event: UserEvent): string {
    const payload = parseEventPayload(event);

    return JSON.stringify({
      type: 'event',
      eventType: getEventTypeName(event.eventType),
      eventId: event.eventId,
      timestamp: event.timestamp,
      payload: payload,
    });
  }

  /**
   * Get current subscription count (for monitoring).
   */
  getSubscriptionCount(): number {
    return this.activeSubscriptions.size;
  }

  /**
   * Get current connection count (for monitoring).
   */
  getConnectionCount(): number {
    return this.containerConnections.size;
  }
}
|
||||
15
gateway/src/events/index.ts
Normal file
15
gateway/src/events/index.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
/**
 * User Container Event System
 *
 * Handles events from user containers via dual ZMQ patterns:
 * - EventSubscriber (SUB): Informational events to active sessions
 * - EventRouter (ROUTER): Critical events with guaranteed delivery
 *
 * See doc/protocol.md and doc/user_container_events.md for details.
 */

export * from './types.js'; // wire types, enums, (de)serialization helpers
export * from './session-registry.js'; // active-session tracking
export * from './event-subscriber.js'; // SUB-socket informational path
export * from './event-router.js'; // ROUTER-socket critical path
export * from './delivery-service.js';
|
||||
134
gateway/src/events/session-registry.ts
Normal file
134
gateway/src/events/session-registry.ts
Normal file
@@ -0,0 +1,134 @@
|
||||
import type { WebSocket } from '@fastify/websocket';
|
||||
|
||||
/**
 * Session information for an active user connection.
 */
export interface Session {
  userId: string;
  // Identifies this particular connection; a user who reconnects gets a
  // new sessionId (SessionRegistry keeps only the most recent session).
  sessionId: string;
  // Channel socket used to push messages to the user.
  socket: WebSocket;
  channelType: 'websocket' | 'telegram';
  containerEndpoint: string; // Container's XPUB endpoint for informational events
  connectedAt: Date;
}
|
||||
|
||||
/**
|
||||
* Registry of active user sessions.
|
||||
* Used by event system to route events to connected users.
|
||||
*/
|
||||
export class SessionRegistry {
|
||||
// Map of userId -> Session (only most recent session per user)
|
||||
private sessions = new Map<string, Session>();
|
||||
|
||||
// Map of sessionId -> userId (for reverse lookup)
|
||||
private sessionToUser = new Map<string, string>();
|
||||
|
||||
/**
|
||||
* Register a new session for a user.
|
||||
* If user already has a session, the old one is replaced.
|
||||
*/
|
||||
register(session: Session): void {
|
||||
// Remove old session if exists
|
||||
const oldSession = this.sessions.get(session.userId);
|
||||
if (oldSession) {
|
||||
this.sessionToUser.delete(oldSession.sessionId);
|
||||
}
|
||||
|
||||
this.sessions.set(session.userId, session);
|
||||
this.sessionToUser.set(session.sessionId, session.userId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unregister a session.
|
||||
*/
|
||||
unregister(sessionId: string): Session | undefined {
|
||||
const userId = this.sessionToUser.get(sessionId);
|
||||
if (!userId) return undefined;
|
||||
|
||||
const session = this.sessions.get(userId);
|
||||
if (session && session.sessionId === sessionId) {
|
||||
this.sessions.delete(userId);
|
||||
this.sessionToUser.delete(sessionId);
|
||||
return session;
|
||||
}
|
||||
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get session by user ID.
|
||||
*/
|
||||
get(userId: string): Session | undefined {
|
||||
return this.sessions.get(userId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get session by session ID.
|
||||
*/
|
||||
getBySessionId(sessionId: string): Session | undefined {
|
||||
const userId = this.sessionToUser.get(sessionId);
|
||||
if (!userId) return undefined;
|
||||
return this.sessions.get(userId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if user has an active session.
|
||||
*/
|
||||
has(userId: string): boolean {
|
||||
return this.sessions.has(userId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all active sessions.
|
||||
*/
|
||||
all(): Session[] {
|
||||
return Array.from(this.sessions.values());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get count of active sessions.
|
||||
*/
|
||||
size(): number {
|
||||
return this.sessions.size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all user IDs with active sessions.
|
||||
*/
|
||||
userIds(): string[] {
|
||||
return Array.from(this.sessions.keys());
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a message to a user's active session.
|
||||
* Returns true if message was sent, false if no active session.
|
||||
*/
|
||||
sendToUser(userId: string, message: string | Buffer): boolean {
|
||||
const session = this.sessions.get(userId);
|
||||
if (!session) return false;
|
||||
|
||||
try {
|
||||
session.socket.send(message);
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Broadcast a message to all active sessions.
|
||||
* Returns number of sessions that received the message.
|
||||
*/
|
||||
broadcast(message: string | Buffer): number {
|
||||
let sent = 0;
|
||||
for (const session of this.sessions.values()) {
|
||||
try {
|
||||
session.socket.send(message);
|
||||
sent++;
|
||||
} catch {
|
||||
// Session may have disconnected
|
||||
}
|
||||
}
|
||||
return sent;
|
||||
}
|
||||
}
|
||||
289
gateway/src/events/types.ts
Normal file
289
gateway/src/events/types.ts
Normal file
@@ -0,0 +1,289 @@
|
||||
/**
 * User Event Types for Container → Gateway communication
 *
 * These types mirror the protobuf definitions in protobuf/user_events.proto
 * Message Type IDs:
 * - UserEvent: 0x20
 * - EventAck: 0x21
 */

// Message type IDs (must match protocol.md).
// Each is prepended as the first byte of the corresponding wire message
// (see serializeUserEvent / serializeEventAck below).
export const MSG_TYPE_USER_EVENT = 0x20;
export const MSG_TYPE_EVENT_ACK = 0x21;
|
||||
|
||||
// =============================================================================
|
||||
// Enums
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Kinds of events a user container can emit.
 * Numeric values are wire-significant (they mirror
 * protobuf/user_events.proto) and are grouped by category in blocks of
 * ten — do not renumber existing members.
 */
export enum EventType {
  // Trading events
  ORDER_PLACED = 0,
  ORDER_FILLED = 1,
  ORDER_CANCELLED = 2,
  ORDER_REJECTED = 3,
  ORDER_EXPIRED = 4,

  // Alert events
  ALERT_TRIGGERED = 10,
  ALERT_CREATED = 11,
  ALERT_DELETED = 12,

  // Position events
  POSITION_OPENED = 20,
  POSITION_CLOSED = 21,
  POSITION_UPDATED = 22,
  POSITION_LIQUIDATED = 23,

  // Workspace/chart events
  WORKSPACE_CHANGED = 30,
  CHART_ANNOTATION_ADDED = 31,
  CHART_ANNOTATION_REMOVED = 32,
  INDICATOR_UPDATED = 33,

  // Strategy events
  STRATEGY_STARTED = 40,
  STRATEGY_STOPPED = 41,
  STRATEGY_LOG = 42,
  STRATEGY_ERROR = 43,
  BACKTEST_COMPLETED = 44,

  // System events
  CONTAINER_STARTING = 50,
  CONTAINER_READY = 51,
  CONTAINER_SHUTTING_DOWN = 52,
  EVENT_ERROR = 53,
}
|
||||
|
||||
/**
 * Delivery priority carried in a DeliverySpec.
 * INFORMATIONAL events travel the fire-and-forget SUB path; CRITICAL
 * events use the acknowledged ROUTER path (see the events index header).
 */
export enum Priority {
  INFORMATIONAL = 0,
  NORMAL = 1,
  CRITICAL = 2,
}
|
||||
|
||||
/**
 * Channels an event can be delivered over.
 * ACTIVE_SESSION presumably means "whichever channel the user is
 * currently connected on" — confirm against the delivery service.
 */
export enum ChannelType {
  ACTIVE_SESSION = 0,
  WEB = 1,
  TELEGRAM = 2,
  EMAIL = 3,
  PUSH = 4,
  DISCORD = 5,
  SLACK = 6,
}
|
||||
|
||||
/**
 * Outcome of an event delivery attempt, reported via EventAck.
 * On ACK_ERROR, EventAck.errorMessage carries the detail.
 */
export enum AckStatus {
  DELIVERED = 0,
  QUEUED = 1,
  ACK_ERROR = 2,
}
|
||||
|
||||
// =============================================================================
|
||||
// Message Types
|
||||
// =============================================================================
|
||||
|
||||
/** One candidate delivery channel within a DeliverySpec. */
export interface ChannelPreference {
  channel: ChannelType;
  // Presumably: use this channel only when the user has an active
  // session — confirm with the container-side producer.
  onlyIfActive: boolean;
}
|
||||
|
||||
/** How a UserEvent should be delivered: priority plus candidate channels. */
export interface DeliverySpec {
  priority: Priority;
  channels: ChannelPreference[];
}
|
||||
|
||||
/**
 * An event emitted by a user container (wire message type 0x20).
 */
export interface UserEvent {
  userId: string;
  eventId: string; // unique per event; referenced by EventAck and dedup tracking
  timestamp: number; // Unix milliseconds
  eventType: EventType;
  payload: Buffer; // event-specific bytes; base64-encoded on the wire
  delivery: DeliverySpec;
}
|
||||
|
||||
/**
 * Acknowledgement for a UserEvent (wire message type 0x21).
 */
export interface EventAck {
  eventId: string; // id of the acknowledged UserEvent
  status: AckStatus;
  errorMessage: string; // empty string when there is no error
  deliveredVia?: ChannelType; // presumably set on successful delivery — confirm
}
|
||||
|
||||
// =============================================================================
|
||||
// Serialization
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* Serialize UserEvent to wire format.
|
||||
* Format: [1 byte msg type][JSON payload]
|
||||
*
|
||||
* Note: In production, replace with proper protobuf serialization.
|
||||
*/
|
||||
export function serializeUserEvent(event: UserEvent): Buffer {
|
||||
const json = JSON.stringify({
|
||||
user_id: event.userId,
|
||||
event_id: event.eventId,
|
||||
timestamp: event.timestamp,
|
||||
event_type: event.eventType,
|
||||
payload: event.payload.toString('base64'),
|
||||
delivery: {
|
||||
priority: event.delivery.priority,
|
||||
channels: event.delivery.channels.map((c) => ({
|
||||
channel: c.channel,
|
||||
only_if_active: c.onlyIfActive,
|
||||
})),
|
||||
},
|
||||
});
|
||||
const msgType = Buffer.from([MSG_TYPE_USER_EVENT]);
|
||||
return Buffer.concat([msgType, Buffer.from(json)]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Deserialize UserEvent from wire format.
|
||||
*/
|
||||
export function deserializeUserEvent(data: Buffer): UserEvent {
|
||||
const msgType = data[0];
|
||||
if (msgType !== MSG_TYPE_USER_EVENT) {
|
||||
throw new Error(`Invalid message type: expected ${MSG_TYPE_USER_EVENT}, got ${msgType}`);
|
||||
}
|
||||
|
||||
const json = JSON.parse(data.subarray(1).toString());
|
||||
|
||||
return {
|
||||
userId: json.user_id,
|
||||
eventId: json.event_id,
|
||||
timestamp: json.timestamp,
|
||||
eventType: json.event_type as EventType,
|
||||
payload: Buffer.from(json.payload, 'base64'),
|
||||
delivery: {
|
||||
priority: json.delivery.priority as Priority,
|
||||
channels: json.delivery.channels.map(
|
||||
(c: { channel: number; only_if_active: boolean }) => ({
|
||||
channel: c.channel as ChannelType,
|
||||
onlyIfActive: c.only_if_active,
|
||||
})
|
||||
),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize EventAck to wire format.
|
||||
*/
|
||||
export function serializeEventAck(ack: EventAck): Buffer {
|
||||
const json = JSON.stringify({
|
||||
event_id: ack.eventId,
|
||||
status: ack.status,
|
||||
error_message: ack.errorMessage,
|
||||
delivered_via: ack.deliveredVia,
|
||||
});
|
||||
const msgType = Buffer.from([MSG_TYPE_EVENT_ACK]);
|
||||
return Buffer.concat([msgType, Buffer.from(json)]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Deserialize EventAck from wire format.
|
||||
*/
|
||||
export function deserializeEventAck(data: Buffer): EventAck {
|
||||
const msgType = data[0];
|
||||
if (msgType !== MSG_TYPE_EVENT_ACK) {
|
||||
throw new Error(`Invalid message type: expected ${MSG_TYPE_EVENT_ACK}, got ${msgType}`);
|
||||
}
|
||||
|
||||
const json = JSON.parse(data.subarray(1).toString());
|
||||
|
||||
return {
|
||||
eventId: json.event_id,
|
||||
status: json.status as AckStatus,
|
||||
errorMessage: json.error_message || '',
|
||||
deliveredVia: json.delivered_via as ChannelType | undefined,
|
||||
};
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Payload Parsing Helpers
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Payload for ORDER_* events (placed/filled/cancelled/rejected/expired).
 * Quantities and prices travel as strings — presumably to preserve
 * decimal precision; confirm with the container-side producer.
 */
export interface OrderEventPayload {
  orderId: string;
  symbol: string;
  side: string;
  orderType: string;
  quantity: string;
  price?: string;
  fillPrice?: string;
  fillQuantity?: string;
  status: string;
  exchange: string;
  timestamp: number;
  strategyId?: string; // presumably set when a strategy placed the order — confirm
  errorMessage?: string; // presumably populated for ORDER_REJECTED — confirm
}
|
||||
|
||||
/** Payload for ALERT_* events. */
export interface AlertEventPayload {
  alertId: string;
  symbol: string;
  condition: string;
  triggeredPrice: string;
  timestamp: number;
}
|
||||
|
||||
/**
 * Payload for POSITION_* events.
 * Sizes, prices, and PnL travel as strings (decimal-precision wire format).
 */
export interface PositionEventPayload {
  positionId: string;
  symbol: string;
  side: string;
  size: string;
  entryPrice: string;
  currentPrice: string;
  unrealizedPnl: string;
  realizedPnl?: string;
  leverage?: string;
  liquidationPrice?: string;
  exchange: string;
  timestamp: number;
}
|
||||
|
||||
/**
 * Payload for workspace/chart events (WORKSPACE_CHANGED,
 * CHART_ANNOTATION_*, INDICATOR_UPDATED).
 * Presumably only the fields relevant to changeType are populated —
 * confirm with the producer.
 */
export interface WorkspaceEventPayload {
  workspaceId: string;
  changeType: string;
  symbol?: string;
  timeframe?: string;
  annotationId?: string;
  annotationType?: string;
  annotationData?: string;
  indicatorName?: string;
  indicatorParams?: string;
  timestamp: number;
}
|
||||
|
||||
/** Payload for STRATEGY_* lifecycle and log events. */
export interface StrategyEventPayload {
  strategyId: string;
  strategyName: string;
  logLevel: string;
  message: string;
  details?: string;
  timestamp: number;
}
|
||||
|
||||
/**
|
||||
* Parse event payload as JSON.
|
||||
* Returns the parsed object or null if parsing fails.
|
||||
*/
|
||||
export function parseEventPayload<T>(event: UserEvent): T | null {
|
||||
try {
|
||||
return JSON.parse(event.payload.toString()) as T;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get human-readable event type name.
|
||||
*/
|
||||
export function getEventTypeName(eventType: EventType): string {
|
||||
return EventType[eventType] || `UNKNOWN(${eventType})`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get human-readable channel type name.
|
||||
*/
|
||||
export function getChannelTypeName(channelType: ChannelType): string {
|
||||
return ChannelType[channelType] || `UNKNOWN(${channelType})`;
|
||||
}
|
||||
351
gateway/src/harness/README.md
Normal file
351
gateway/src/harness/README.md
Normal file
@@ -0,0 +1,351 @@
|
||||
# Agent Harness
|
||||
|
||||
Comprehensive agent orchestration system for Dexorder AI platform, built on LangChain.js and LangGraph.js.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
gateway/src/harness/
|
||||
├── memory/ # Storage layer (Redis + Iceberg + Qdrant)
|
||||
├── skills/ # Individual capabilities (markdown + TypeScript)
|
||||
├── subagents/ # Specialized agents with multi-file memory
|
||||
├── workflows/ # LangGraph state machines
|
||||
├── tools/ # Platform tools (non-MCP)
|
||||
├── config/ # Configuration files
|
||||
└── index.ts # Main exports
|
||||
```
|
||||
|
||||
## Core Components
|
||||
|
||||
### 1. Memory Layer (`memory/`)
|
||||
|
||||
Tiered storage architecture as per [architecture discussion](/chat/harness-rag.txt):
|
||||
|
||||
- **Redis**: Hot state (active sessions, checkpoints)
|
||||
- **Iceberg**: Cold storage (durable conversations, analytics)
|
||||
- **Qdrant**: Vector search (RAG, semantic memory)
|
||||
|
||||
**Key Files:**
|
||||
- `checkpoint-saver.ts`: LangGraph checkpoint persistence
|
||||
- `conversation-store.ts`: Message history management
|
||||
- `rag-retriever.ts`: Vector similarity search
|
||||
- `embedding-service.ts`: Text→vector conversion
|
||||
- `session-context.ts`: User context with channel metadata
|
||||
|
||||
### 2. Skills (`skills/`)
|
||||
|
||||
Self-contained capabilities with markdown definitions:
|
||||
|
||||
- `*.skill.md`: Human-readable documentation
|
||||
- `*.ts`: Implementation extending `BaseSkill`
|
||||
- Input validation and error handling
|
||||
- Can use LLM, MCP tools, or platform tools
|
||||
|
||||
**Example:**
|
||||
```typescript
|
||||
import { MarketAnalysisSkill } from './skills';
|
||||
|
||||
const skill = new MarketAnalysisSkill(logger, model);
|
||||
const result = await skill.execute({
|
||||
context: userContext,
|
||||
parameters: { ticker: 'BTC/USDT', period: '4h' }
|
||||
});
|
||||
```
|
||||
|
||||
See [skills/README.md](skills/README.md) for authoring guide.
|
||||
|
||||
### 3. Subagents (`subagents/`)
|
||||
|
||||
Specialized agents with multi-file memory:
|
||||
|
||||
```
|
||||
subagents/
|
||||
code-reviewer/
|
||||
config.yaml # Model, memory files, capabilities
|
||||
system-prompt.md # System instructions
|
||||
memory/ # Multi-file knowledge base
|
||||
review-guidelines.md
|
||||
common-patterns.md
|
||||
best-practices.md
|
||||
index.ts # Implementation
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- Dedicated system prompts
|
||||
- Split memory into logical files (better organization)
|
||||
- Model overrides
|
||||
- Capability tagging
|
||||
|
||||
**Example:**
|
||||
```typescript
|
||||
const codeReviewer = await createCodeReviewerSubagent(model, logger, basePath);
|
||||
const review = await codeReviewer.execute({ userContext }, strategyCode);
|
||||
```
|
||||
|
||||
### 4. Workflows (`workflows/`)
|
||||
|
||||
LangGraph state machines with:
|
||||
|
||||
- Validation loops (retry with fixes)
|
||||
- Human-in-the-loop (approval gates)
|
||||
- Multi-step orchestration
|
||||
- Error recovery
|
||||
|
||||
**Example Workflows:**
|
||||
- `strategy-validation/`: Code review → backtest → risk → approval
|
||||
- `trading-request/`: Analysis → risk → approval → execute
|
||||
|
||||
See individual workflow READMEs for details.
|
||||
|
||||
### 5. Configuration (`config/`)
|
||||
|
||||
YAML-based configuration:
|
||||
|
||||
- `models.yaml`: LLM providers, routing, rate limits
|
||||
- `subagent-routing.yaml`: When to use which subagent
|
||||
|
||||
## User Context
|
||||
|
||||
Enhanced session context with channel awareness for multi-channel support:
|
||||
|
||||
```typescript
|
||||
interface UserContext {
|
||||
userId: string;
|
||||
sessionId: string;
|
||||
license: UserLicense;
|
||||
|
||||
activeChannel: {
|
||||
type: 'websocket' | 'telegram' | 'slack' | 'discord';
|
||||
channelUserId: string;
|
||||
capabilities: {
|
||||
supportsMarkdown: boolean;
|
||||
supportsImages: boolean;
|
||||
supportsButtons: boolean;
|
||||
maxMessageLength: number;
|
||||
};
|
||||
};
|
||||
|
||||
conversationHistory: BaseMessage[];
|
||||
relevantMemories: MemoryChunk[];
|
||||
workspaceState: WorkspaceContext;
|
||||
}
|
||||
```
|
||||
|
||||
This allows workflows to:
|
||||
- Route responses to correct channel
|
||||
- Format output for channel capabilities
|
||||
- Handle channel-specific interactions (buttons, voice, etc.)
|
||||
|
||||
## Storage Architecture
|
||||
|
||||
Based on [harness-rag.txt discussion](../../chat/harness-rag.txt):
|
||||
|
||||
### Hot Path (Redis)
|
||||
- Active checkpoints (TTL: 1 hour)
|
||||
- Recent messages (last 50)
|
||||
- Session metadata
|
||||
- Fast reads for active conversations
|
||||
|
||||
### Cold Path (Iceberg)
|
||||
- Full conversation history (partitioned by user_id, session_id)
|
||||
- Checkpoint snapshots
|
||||
- Time-travel queries
|
||||
- GDPR-compliant deletion with compaction
|
||||
|
||||
### Vector Search (Qdrant)
|
||||
- Conversation embeddings
|
||||
- Long-term memory
|
||||
- RAG retrieval
|
||||
- Payload-indexed by user_id for fast GDPR deletion
|
||||
- **Global knowledge base** (user_id="0") loaded from markdown files
|
||||
|
||||
### GDPR Compliance
|
||||
|
||||
```typescript
|
||||
// Delete user data across all stores
|
||||
await conversationStore.deleteUserData(userId);
|
||||
await ragRetriever.deleteUserData(userId);
|
||||
await checkpointSaver.delete(userId);
|
||||
await containerManager.deleteContainer(userId);
|
||||
|
||||
// Iceberg physical delete
|
||||
await icebergTable.expire_snapshots();
|
||||
await icebergTable.rewrite_data_files();
|
||||
```
|
||||
|
||||
## Standard Patterns
|
||||
|
||||
### Validation Loop (Retry with Fixes)
|
||||
|
||||
```typescript
|
||||
graph.addConditionalEdges('validate', (state) => {
|
||||
if (state.errors.length > 0 && state.retryCount < 3) {
|
||||
return 'fix_errors'; // Loop back
|
||||
}
|
||||
return state.errors.length === 0 ? 'approve' : 'reject';
|
||||
});
|
||||
```
|
||||
|
||||
### Human-in-the-Loop (Approval Gates)
|
||||
|
||||
```typescript
|
||||
const approvalNode = async (state) => {
|
||||
// Send to user's channel
|
||||
await sendToChannel(state.userContext.activeChannel, {
|
||||
type: 'approval_request',
|
||||
data: { /* details */ }
|
||||
});
|
||||
|
||||
// LangGraph pauses here via Interrupt
|
||||
// Resume with user input: graph.invoke(state, { ...resumeConfig })
|
||||
|
||||
return { approvalRequested: true };
|
||||
};
|
||||
```
|
||||
|
||||
## Getting Started
|
||||
|
||||
### 1. Install Dependencies
|
||||
|
||||
Already in `gateway/package.json`:
|
||||
```json
|
||||
{
|
||||
"@langchain/core": "^0.3.24",
|
||||
"@langchain/langgraph": "^0.2.26",
|
||||
"@langchain/anthropic": "^0.3.8",
|
||||
"ioredis": "^5.4.2"
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Initialize Memory Layer
|
||||
|
||||
```typescript
|
||||
import Redis from 'ioredis';
|
||||
import {
|
||||
TieredCheckpointSaver,
|
||||
ConversationStore,
|
||||
EmbeddingService,
|
||||
RAGRetriever
|
||||
} from './harness/memory';
|
||||
|
||||
const redis = new Redis(process.env.REDIS_URL);
|
||||
|
||||
const checkpointSaver = new TieredCheckpointSaver(redis, logger);
|
||||
const conversationStore = new ConversationStore(redis, logger);
|
||||
const embeddings = new EmbeddingService({ provider: 'openai', apiKey }, logger);
|
||||
const ragRetriever = new RAGRetriever({ url: QDRANT_URL }, logger);
|
||||
|
||||
await ragRetriever.initialize();
|
||||
```
|
||||
|
||||
### 3. Create Subagents
|
||||
|
||||
```typescript
|
||||
import { createCodeReviewerSubagent } from './harness/subagents';
|
||||
import { ModelRouter } from './llm/router';
|
||||
|
||||
const model = await modelRouter.route(query, license);
|
||||
const codeReviewer = await createCodeReviewerSubagent(
|
||||
model,
|
||||
logger,
|
||||
'gateway/src/harness/subagents/code-reviewer'
|
||||
);
|
||||
```
|
||||
|
||||
### 4. Build Workflows
|
||||
|
||||
```typescript
|
||||
import { createStrategyValidationWorkflow } from './harness/workflows';
|
||||
|
||||
const workflow = await createStrategyValidationWorkflow(
|
||||
model,
|
||||
codeReviewer,
|
||||
mcpBacktestFn,
|
||||
logger,
|
||||
'gateway/src/harness/workflows/strategy-validation/config.yaml'
|
||||
);
|
||||
|
||||
const result = await workflow.execute({
|
||||
userContext,
|
||||
strategyCode: '...',
|
||||
ticker: 'BTC/USDT',
|
||||
timeframe: '4h'
|
||||
});
|
||||
```
|
||||
|
||||
### 5. Use Skills
|
||||
|
||||
```typescript
|
||||
import { MarketAnalysisSkill } from './harness/skills';
|
||||
|
||||
const skill = new MarketAnalysisSkill(logger, model);
|
||||
const analysis = await skill.execute({
|
||||
context: userContext,
|
||||
parameters: { ticker: 'BTC/USDT', period: '1h' }
|
||||
});
|
||||
```
|
||||
|
||||
## Global Knowledge System
|
||||
|
||||
The harness includes a document loader that automatically loads markdown files from `gateway/knowledge/` into Qdrant as global knowledge (user_id="0").
|
||||
|
||||
### Directory Structure
|
||||
```
|
||||
gateway/knowledge/
|
||||
├── platform/ # Platform capabilities and architecture
|
||||
├── trading/ # Trading concepts and fundamentals
|
||||
├── indicators/ # Indicator development guides
|
||||
└── strategies/ # Strategy patterns and examples
|
||||
```
|
||||
|
||||
### How It Works
|
||||
|
||||
1. **Startup**: Documents are loaded automatically when gateway starts
|
||||
2. **Chunking**: Intelligent splitting by markdown headers (~1000 tokens/chunk)
|
||||
3. **Embedding**: Chunks are embedded using configured embedding service
|
||||
4. **Storage**: Stored in Qdrant with user_id="0" (global namespace)
|
||||
5. **Updates**: Content hashing detects changes for incremental updates
|
||||
|
||||
### RAG Query Flow
|
||||
|
||||
When a user sends a message:
|
||||
1. Query is embedded using same embedding service
|
||||
2. Qdrant searches vectors with filter: `user_id = current_user OR user_id = "0"`
|
||||
3. Results include both user-specific and global knowledge
|
||||
4. Relevant chunks are added to LLM context
|
||||
5. LLM generates response with platform knowledge
|
||||
|
||||
### Managing Knowledge
|
||||
|
||||
**Add new documents**:
|
||||
```bash
|
||||
# Create markdown file in appropriate directory
|
||||
echo "# New Topic" > gateway/knowledge/platform/new-topic.md
|
||||
|
||||
# Reload knowledge (development)
|
||||
curl -X POST http://localhost:3000/admin/reload-knowledge
|
||||
```
|
||||
|
||||
**Check stats**:
|
||||
```bash
|
||||
curl http://localhost:3000/admin/knowledge-stats
|
||||
```
|
||||
|
||||
**In production**: simply deploy the updated markdown files — they are loaded automatically on startup.
|
||||
|
||||
See [gateway/knowledge/README.md](../../knowledge/README.md) for detailed documentation.
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Implement Iceberg Integration**: Complete TODOs in checkpoint-saver.ts and conversation-store.ts
|
||||
2. **Add More Subagents**: Risk analyzer, market analyst, etc.
|
||||
3. **Implement Interrupts**: Full human-in-the-loop with LangGraph interrupts
|
||||
4. **Add Platform Tools**: Market data queries, chart rendering, etc.
|
||||
5. **Expand Knowledge Base**: Add more platform documentation to knowledge/
|
||||
|
||||
## References
|
||||
|
||||
- Architecture discussion: [chat/harness-rag.txt](../../chat/harness-rag.txt)
|
||||
- LangGraph docs: https://langchain-ai.github.io/langgraphjs/
|
||||
- Qdrant docs: https://qdrant.tech/documentation/
|
||||
- Apache Iceberg: https://iceberg.apache.org/docs/latest/
|
||||
@@ -1,4 +1,4 @@
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
|
||||
import type { BaseMessage } from '@langchain/core/messages';
|
||||
import { HumanMessage, AIMessage, SystemMessage } from '@langchain/core/messages';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
@@ -286,15 +286,7 @@ Available features: ${JSON.stringify(this.config.license.features, null, 2)}`;
|
||||
return prompt;
|
||||
}
|
||||
|
||||
/**
 * Get platform tools (non-user-specific tools).
 * Currently a stub returning an empty list — see the TODO below.
 */
private getPlatformTools(): Array<{ name: string; description?: string }> {
  // Platform tools that don't need user's MCP
  return [
    // TODO: Add platform tools like market data queries, chart rendering, etc.
  ];
}
|
||||
|
||||
|
||||
/**
|
||||
* Cleanup resources
|
||||
|
||||
110
gateway/src/harness/config/models.yaml
Normal file
110
gateway/src/harness/config/models.yaml
Normal file
@@ -0,0 +1,110 @@
|
||||
# Default LLM Model Configuration
|
||||
|
||||
# Default model for general agent tasks
|
||||
default:
|
||||
provider: anthropic
|
||||
model: claude-3-5-sonnet-20241022
|
||||
temperature: 0.7
|
||||
maxTokens: 4096
|
||||
|
||||
# Model overrides for specific use cases
|
||||
models:
|
||||
# Fast model for simple tasks (routing, classification)
|
||||
fast:
|
||||
provider: anthropic
|
||||
model: claude-3-haiku-20240307
|
||||
temperature: 0.3
|
||||
maxTokens: 1024
|
||||
|
||||
# Reasoning model for complex analysis
|
||||
reasoning:
|
||||
provider: anthropic
|
||||
model: claude-3-5-sonnet-20241022
|
||||
temperature: 0.5
|
||||
maxTokens: 8192
|
||||
|
||||
# Precise model for code generation/review
|
||||
code:
|
||||
provider: anthropic
|
||||
model: claude-3-5-sonnet-20241022
|
||||
temperature: 0.2
|
||||
maxTokens: 8192
|
||||
|
||||
# Creative model for strategy brainstorming
|
||||
creative:
|
||||
provider: anthropic
|
||||
model: claude-3-5-sonnet-20241022
|
||||
temperature: 0.9
|
||||
maxTokens: 4096
|
||||
|
||||
# Embedding model configuration
|
||||
embeddings:
|
||||
provider: openai
|
||||
model: text-embedding-3-small
|
||||
dimensions: 1536
|
||||
|
||||
# Model routing rules (complexity-based)
|
||||
routing:
|
||||
# Simple queries → fast model
|
||||
simple:
|
||||
keywords:
|
||||
- "what is"
|
||||
- "define"
|
||||
- "list"
|
||||
- "show me"
|
||||
maxTokens: 100
|
||||
model: fast
|
||||
|
||||
# Code-related → code model
|
||||
code:
|
||||
keywords:
|
||||
- "code"
|
||||
- "function"
|
||||
- "implement"
|
||||
- "debug"
|
||||
- "review"
|
||||
model: code
|
||||
|
||||
# Analysis tasks → reasoning model
|
||||
analysis:
|
||||
keywords:
|
||||
- "analyze"
|
||||
- "compare"
|
||||
- "evaluate"
|
||||
- "assess"
|
||||
model: reasoning
|
||||
|
||||
# Everything else → default
|
||||
default:
|
||||
model: default
|
||||
|
||||
# Cost optimization settings
|
||||
costControl:
|
||||
# Cache system prompts (Anthropic prompt caching)
|
||||
cacheSystemPrompts: true
|
||||
|
||||
# Token limits per license type
|
||||
tokenLimits:
|
||||
free:
|
||||
maxTokensPerMessage: 2048
|
||||
maxTokensPerDay: 50000
|
||||
pro:
|
||||
maxTokensPerMessage: 8192
|
||||
maxTokensPerDay: 500000
|
||||
enterprise:
|
||||
maxTokensPerMessage: 16384
|
||||
maxTokensPerDay: -1 # unlimited
|
||||
|
||||
# Rate limiting
|
||||
rateLimits:
|
||||
# Requests per minute by license
|
||||
requestsPerMinute:
|
||||
free: 10
|
||||
pro: 60
|
||||
enterprise: 120
|
||||
|
||||
# Concurrent requests
|
||||
concurrentRequests:
|
||||
free: 1
|
||||
pro: 3
|
||||
enterprise: 10
|
||||
98
gateway/src/harness/config/subagent-routing.yaml
Normal file
98
gateway/src/harness/config/subagent-routing.yaml
Normal file
@@ -0,0 +1,98 @@
|
||||
# Subagent Routing Configuration
|
||||
|
||||
# When to use which subagent based on task type
|
||||
|
||||
subagents:
|
||||
# Code Reviewer Subagent
|
||||
code-reviewer:
|
||||
enabled: true
|
||||
path: src/harness/subagents/code-reviewer
|
||||
triggers:
|
||||
keywords:
|
||||
- "review code"
|
||||
- "check code"
|
||||
- "code review"
|
||||
- "analyze code"
|
||||
- "audit code"
|
||||
patterns:
|
||||
- "review.*code"
|
||||
- "check.*strategy"
|
||||
- "analyze.*function"
|
||||
priority: high
|
||||
timeout: 60000 # 1 minute
|
||||
|
||||
# Risk Analyzer Subagent (TODO: implement)
|
||||
risk-analyzer:
|
||||
enabled: false
|
||||
path: src/harness/subagents/risk-analyzer
|
||||
triggers:
|
||||
keywords:
|
||||
- "risk"
|
||||
- "exposure"
|
||||
- "drawdown"
|
||||
- "volatility"
|
||||
patterns:
|
||||
- "assess.*risk"
|
||||
- "calculate.*risk"
|
||||
- "risk.*analysis"
|
||||
priority: high
|
||||
timeout: 30000
|
||||
|
||||
# Market Analyst Subagent (TODO: implement)
|
||||
market-analyst:
|
||||
enabled: false
|
||||
path: src/harness/subagents/market-analyst
|
||||
triggers:
|
||||
keywords:
|
||||
- "market"
|
||||
- "trend"
|
||||
- "technical analysis"
|
||||
- "price action"
|
||||
patterns:
|
||||
- "analyze.*market"
|
||||
- "market.*conditions"
|
||||
priority: medium
|
||||
timeout: 45000
|
||||
|
||||
# Routing strategy
|
||||
routing:
|
||||
# Check triggers in priority order
|
||||
strategy: priority
|
||||
|
||||
# Fallback to main agent if no subagent matches
|
||||
fallback: main_agent
|
||||
|
||||
# Allow chaining (one subagent can invoke another)
|
||||
allowChaining: true
|
||||
maxChainDepth: 3
|
||||
|
||||
# Subagent memory settings
|
||||
memory:
|
||||
# Reload memory files on every request (dev mode)
|
||||
hotReload: false
|
||||
|
||||
# Cache memory files in production
|
||||
cacheMemory: true
|
||||
cacheTTL: 3600000 # 1 hour
|
||||
|
||||
# Parallel execution
|
||||
parallel:
|
||||
# Allow multiple subagents to run in parallel
|
||||
enabled: true
|
||||
|
||||
# Max concurrent subagents
|
||||
maxConcurrent: 3
|
||||
|
||||
# Combine results strategy
|
||||
combineStrategy: merge # merge | first | best
|
||||
|
||||
# Monitoring
|
||||
monitoring:
|
||||
# Log subagent performance
|
||||
logPerformance: true
|
||||
|
||||
# Track usage by subagent
|
||||
trackUsage: true
|
||||
|
||||
# Alert on slow subagents
|
||||
alertThreshold: 30000 # 30 seconds
|
||||
17
gateway/src/harness/index.ts
Normal file
17
gateway/src/harness/index.ts
Normal file
@@ -0,0 +1,17 @@
|
||||
// Main harness exports
|
||||
|
||||
// Memory
|
||||
export * from './memory/index.js';
|
||||
|
||||
// Skills
|
||||
export * from './skills/index.js';
|
||||
|
||||
// Subagents
|
||||
export * from './subagents/index.js';
|
||||
|
||||
// Workflows
|
||||
export * from './workflows/index.js';
|
||||
|
||||
// Re-export agent harness (for backward compatibility)
|
||||
export { AgentHarness, type AgentHarnessConfig } from './agent-harness.js';
|
||||
export { MCPClientConnector } from './mcp-client.js';
|
||||
@@ -1,5 +1,5 @@
|
||||
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
||||
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
|
||||
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
|
||||
export interface MCPClientConfig {
|
||||
@@ -44,10 +44,9 @@ export class MCPClientConnector {
|
||||
},
|
||||
{
|
||||
capabilities: {
|
||||
tools: {},
|
||||
resources: {},
|
||||
sampling: {},
|
||||
},
|
||||
}
|
||||
} as any
|
||||
);
|
||||
|
||||
// TODO: Replace with HTTP transport when user containers are ready
|
||||
|
||||
236
gateway/src/harness/memory/checkpoint-saver.ts
Normal file
236
gateway/src/harness/memory/checkpoint-saver.ts
Normal file
@@ -0,0 +1,236 @@
|
||||
import { BaseCheckpointSaver } from '@langchain/langgraph';
|
||||
import type { Checkpoint, CheckpointMetadata, CheckpointTuple } from '@langchain/langgraph';
|
||||
import type { RunnableConfig } from '@langchain/core/runnables';
|
||||
import type Redis from 'ioredis';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
|
||||
/**
|
||||
* Tiered checkpoint saver: Redis (hot) + Iceberg (cold)
|
||||
*
|
||||
* Hot path: Active checkpoints stored in Redis with TTL
|
||||
* Cold path: Durable storage in Iceberg for long-term retention
|
||||
*
|
||||
* Based on architecture discussion: Redis for active sessions,
|
||||
* Iceberg for durable storage with time-travel capabilities.
|
||||
*/
|
||||
export class TieredCheckpointSaver extends BaseCheckpointSaver<number> {
|
||||
private readonly HOT_TTL_SECONDS = 3600; // 1 hour
|
||||
private readonly KEY_PREFIX = 'ckpt:';
|
||||
|
||||
constructor(
|
||||
private redis: Redis,
|
||||
private logger: FastifyBaseLogger,
|
||||
// Note: Iceberg writes are handled via Kafka + Flink for consistency
|
||||
// Reads can be implemented when needed using IcebergClient
|
||||
// private iceberg?: IcebergClient
|
||||
) {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get checkpoint from Redis (hot) or Iceberg (cold)
|
||||
*/
|
||||
async getTuple(config: RunnableConfig): Promise<CheckpointTuple | undefined> {
|
||||
const threadId = config.configurable?.thread_id as string;
|
||||
if (!threadId) {
|
||||
throw new Error('thread_id required in config.configurable');
|
||||
}
|
||||
|
||||
const checkpointId = config.configurable?.checkpoint_id as string | undefined;
|
||||
|
||||
this.logger.debug({ threadId, checkpointId }, 'Getting checkpoint');
|
||||
|
||||
// Hot path: Try Redis first
|
||||
const key = this.getRedisKey(threadId, checkpointId);
|
||||
const cached = await this.redis.get(key);
|
||||
|
||||
if (cached) {
|
||||
this.logger.debug({ threadId, checkpointId }, 'Checkpoint found in Redis (hot)');
|
||||
return this.deserialize(cached);
|
||||
}
|
||||
|
||||
// Cold path: Load from Iceberg (if needed)
|
||||
// Note: Implement when Iceberg query is required
|
||||
// Can use IcebergClient to query gateway.checkpoints table
|
||||
// or set up a Kafka topic for checkpoint persistence
|
||||
|
||||
this.logger.debug({ threadId, checkpointId }, 'Checkpoint not in Redis, Iceberg cold storage not yet implemented');
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save checkpoint to Redis (hot) and async flush to Iceberg (cold)
|
||||
*/
|
||||
async put(
|
||||
config: RunnableConfig,
|
||||
checkpoint: Checkpoint,
|
||||
metadata: CheckpointMetadata
|
||||
): Promise<RunnableConfig> {
|
||||
const threadId = config.configurable?.thread_id as string;
|
||||
if (!threadId) {
|
||||
throw new Error('thread_id required in config.configurable');
|
||||
}
|
||||
|
||||
this.logger.debug({ threadId, checkpointId: checkpoint.id }, 'Saving checkpoint');
|
||||
|
||||
const serialized = this.serialize(checkpoint, metadata);
|
||||
|
||||
// Hot: Redis with TTL
|
||||
const key = this.getRedisKey(threadId, checkpoint.id);
|
||||
await this.redis.set(key, serialized, 'EX', this.HOT_TTL_SECONDS);
|
||||
|
||||
// Also store latest checkpoint pointer
|
||||
const latestKey = this.getRedisKey(threadId);
|
||||
await this.redis.set(latestKey, serialized, 'EX', this.HOT_TTL_SECONDS);
|
||||
|
||||
// Cold: Async flush to Iceberg (fire and forget)
|
||||
this.flushToIceberg(threadId, checkpoint, metadata).catch((error) => {
|
||||
this.logger.error({ error, threadId }, 'Failed to flush checkpoint to Iceberg');
|
||||
});
|
||||
|
||||
return {
|
||||
configurable: {
|
||||
...config.configurable,
|
||||
thread_id: threadId,
|
||||
checkpoint_id: checkpoint.id,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* List all checkpoints for a thread
|
||||
*/
|
||||
async *list(
|
||||
config: RunnableConfig
|
||||
): AsyncGenerator<CheckpointTuple> {
|
||||
const threadId = config.configurable?.thread_id as string;
|
||||
if (!threadId) {
|
||||
throw new Error('thread_id required in config.configurable');
|
||||
}
|
||||
|
||||
// Try to get from Redis first
|
||||
const pattern = `${this.KEY_PREFIX}${threadId}:*`;
|
||||
const keys = await this.redis.keys(pattern);
|
||||
|
||||
for (const key of keys) {
|
||||
const data = await this.redis.get(key);
|
||||
if (data) {
|
||||
const tuple = this.deserialize(data);
|
||||
if (tuple) {
|
||||
yield tuple;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Also scan Iceberg for historical checkpoints
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete thread (for GDPR compliance)
|
||||
*/
|
||||
async deleteThread(threadId: string): Promise<void> {
|
||||
this.logger.info({ threadId }, 'Deleting thread');
|
||||
|
||||
const pattern = `${this.KEY_PREFIX}${threadId}*`;
|
||||
const keys = await this.redis.keys(pattern);
|
||||
if (keys.length > 0) {
|
||||
await this.redis.del(...keys);
|
||||
}
|
||||
|
||||
// TODO: Also delete from Iceberg
|
||||
// await this.deleteFromIceberg(threadId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Put writes (required by BaseCheckpointSaver)
|
||||
*/
|
||||
async putWrites(
|
||||
config: RunnableConfig,
|
||||
writes: [string, unknown][],
|
||||
taskId: string
|
||||
): Promise<void> {
|
||||
// For this simple implementation, we just log writes
|
||||
// In a full implementation, you'd store pending writes separately
|
||||
const threadId = config.configurable?.thread_id;
|
||||
this.logger.debug({ threadId, taskId, writes }, 'Put writes called');
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Generate Redis key for checkpoint
|
||||
*/
|
||||
private getRedisKey(threadId: string, checkpointId?: string): string {
|
||||
if (checkpointId) {
|
||||
return `${this.KEY_PREFIX}${threadId}:${checkpointId}`;
|
||||
}
|
||||
return `${this.KEY_PREFIX}${threadId}:latest`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize checkpoint to JSON string
|
||||
*/
|
||||
private serialize(checkpoint: Checkpoint, metadata: CheckpointMetadata): string {
|
||||
return JSON.stringify({
|
||||
checkpoint,
|
||||
metadata,
|
||||
savedAt: new Date().toISOString(),
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Deserialize checkpoint from JSON string
|
||||
*/
|
||||
private deserialize(data: string): CheckpointTuple | undefined {
|
||||
try {
|
||||
const parsed = JSON.parse(data);
|
||||
return {
|
||||
config: {
|
||||
configurable: {
|
||||
thread_id: parsed.checkpoint.id,
|
||||
checkpoint_id: parsed.checkpoint.id,
|
||||
},
|
||||
},
|
||||
checkpoint: parsed.checkpoint,
|
||||
metadata: parsed.metadata,
|
||||
parentConfig: undefined,
|
||||
};
|
||||
} catch (error) {
|
||||
this.logger.error({ error }, 'Failed to deserialize checkpoint');
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Async flush checkpoint to Iceberg for durable storage
|
||||
*
|
||||
* Note: For production, send to Kafka topic that Flink consumes:
|
||||
* - Topic: gateway_checkpoints
|
||||
* - Flink job writes to gateway.checkpoints Iceberg table
|
||||
* - Ensures consistent write pattern with rest of system
|
||||
*/
|
||||
private async flushToIceberg(
|
||||
_threadId: string,
|
||||
checkpoint: Checkpoint,
|
||||
_metadata: CheckpointMetadata
|
||||
): Promise<void> {
|
||||
// TODO: Send to Kafka topic for Flink processing
|
||||
// const kafkaMessage = {
|
||||
// user_id: metadata.userId || '0',
|
||||
// session_id: threadId,
|
||||
// checkpoint_id: checkpoint.id,
|
||||
// checkpoint_data: JSON.stringify(checkpoint),
|
||||
// metadata: JSON.stringify(metadata),
|
||||
// timestamp: Date.now() * 1000, // microseconds
|
||||
// };
|
||||
// await this.kafkaProducer.send({
|
||||
// topic: 'gateway_checkpoints',
|
||||
// messages: [{ value: JSON.stringify(kafkaMessage) }]
|
||||
// });
|
||||
|
||||
this.logger.debug({ threadId: _threadId, checkpointId: checkpoint.id },
|
||||
'Checkpoint flush to Iceberg (via Kafka) not yet implemented');
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
252
gateway/src/harness/memory/conversation-store.ts
Normal file
252
gateway/src/harness/memory/conversation-store.ts
Normal file
@@ -0,0 +1,252 @@
|
||||
import type Redis from 'ioredis';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { BaseMessage } from '@langchain/core/messages';
|
||||
import { HumanMessage, AIMessage, SystemMessage } from '@langchain/core/messages';
|
||||
|
||||
/**
|
||||
* Message record for storage
|
||||
*/
|
||||
export interface StoredMessage {
|
||||
id: string;
|
||||
userId: string;
|
||||
sessionId: string;
|
||||
role: 'user' | 'assistant' | 'system';
|
||||
content: string;
|
||||
timestamp: number; // microseconds (Iceberg convention)
|
||||
metadata?: Record<string, unknown>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Conversation store: Redis (hot) + Iceberg (cold)
|
||||
*
|
||||
* Hot path: Recent messages in Redis for fast access
|
||||
* Cold path: Full history in Iceberg for durability and analytics
|
||||
*
|
||||
* Architecture:
|
||||
* - Redis stores last N messages per session with TTL
|
||||
* - Iceberg stores all messages partitioned by user_id, session_id
|
||||
* - Supports time-travel queries for debugging and analysis
|
||||
*/
|
||||
export class ConversationStore {
|
||||
private readonly HOT_MESSAGE_LIMIT = 50; // Keep last 50 messages in Redis
|
||||
private readonly HOT_TTL_SECONDS = 3600; // 1 hour
|
||||
|
||||
constructor(
|
||||
private redis: Redis,
|
||||
private logger: FastifyBaseLogger
|
||||
// TODO: Add Iceberg catalog
|
||||
// private iceberg: IcebergCatalog
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Save a message to both Redis and Iceberg
|
||||
*/
|
||||
async saveMessage(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
role: 'user' | 'assistant' | 'system',
|
||||
content: string,
|
||||
metadata?: Record<string, unknown>
|
||||
): Promise<void> {
|
||||
const message: StoredMessage = {
|
||||
id: `${userId}:${sessionId}:${Date.now()}`,
|
||||
userId,
|
||||
sessionId,
|
||||
role,
|
||||
content,
|
||||
timestamp: Date.now() * 1000, // Convert to microseconds
|
||||
metadata,
|
||||
};
|
||||
|
||||
this.logger.debug({ userId, sessionId, role }, 'Saving message');
|
||||
|
||||
// Hot: Add to Redis list (LPUSH for newest first)
|
||||
const key = this.getRedisKey(userId, sessionId);
|
||||
await this.redis.lpush(key, JSON.stringify(message));
|
||||
|
||||
// Trim to keep only recent messages
|
||||
await this.redis.ltrim(key, 0, this.HOT_MESSAGE_LIMIT - 1);
|
||||
|
||||
// Set TTL
|
||||
await this.redis.expire(key, this.HOT_TTL_SECONDS);
|
||||
|
||||
// Cold: Async append to Iceberg
|
||||
this.appendToIceberg(message).catch((error) => {
|
||||
this.logger.error({ error, userId, sessionId }, 'Failed to append message to Iceberg');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get recent messages from Redis (hot path)
|
||||
*/
|
||||
async getRecentMessages(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
limit: number = 20
|
||||
): Promise<StoredMessage[]> {
|
||||
const key = this.getRedisKey(userId, sessionId);
|
||||
const messages = await this.redis.lrange(key, 0, limit - 1);
|
||||
|
||||
return messages
|
||||
.map((msg) => {
|
||||
try {
|
||||
return JSON.parse(msg) as StoredMessage;
|
||||
} catch (error) {
|
||||
this.logger.error({ error, message: msg }, 'Failed to parse message');
|
||||
return null;
|
||||
}
|
||||
})
|
||||
.filter((msg): msg is StoredMessage => msg !== null)
|
||||
.reverse(); // Oldest first
|
||||
}
|
||||
|
||||
/**
|
||||
* Get full conversation history from Iceberg (cold path)
|
||||
*/
|
||||
async getFullHistory(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
timeRange?: { start: number; end: number }
|
||||
): Promise<StoredMessage[]> {
|
||||
this.logger.debug({ userId, sessionId, timeRange }, 'Loading full history from Iceberg');
|
||||
|
||||
// TODO: Implement Iceberg query
|
||||
// const table = this.iceberg.loadTable('gateway.conversations');
|
||||
// const filters = [
|
||||
// EqualTo('user_id', userId),
|
||||
// EqualTo('session_id', sessionId),
|
||||
// ];
|
||||
//
|
||||
// if (timeRange) {
|
||||
// filters.push(GreaterThanOrEqual('timestamp', timeRange.start));
|
||||
// filters.push(LessThanOrEqual('timestamp', timeRange.end));
|
||||
// }
|
||||
//
|
||||
// const df = await table.scan({
|
||||
// row_filter: And(...filters)
|
||||
// }).to_pandas();
|
||||
//
|
||||
// if (!df.empty) {
|
||||
// return df.sort_values('timestamp').to_dict('records');
|
||||
// }
|
||||
|
||||
// Fallback to Redis if Iceberg not available
|
||||
return await this.getRecentMessages(userId, sessionId, 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert stored messages to LangChain message format
|
||||
*/
|
||||
toLangChainMessages(messages: StoredMessage[]): BaseMessage[] {
|
||||
return messages.map((msg) => {
|
||||
switch (msg.role) {
|
||||
case 'user':
|
||||
return new HumanMessage(msg.content);
|
||||
case 'assistant':
|
||||
return new AIMessage(msg.content);
|
||||
case 'system':
|
||||
return new SystemMessage(msg.content);
|
||||
default:
|
||||
throw new Error(`Unknown role: ${msg.role}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all messages for a session (Redis only, Iceberg handled separately)
|
||||
*/
|
||||
async deleteSession(userId: string, sessionId: string): Promise<void> {
|
||||
this.logger.info({ userId, sessionId }, 'Deleting session from Redis');
|
||||
const key = this.getRedisKey(userId, sessionId);
|
||||
await this.redis.del(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all messages for a user (GDPR compliance)
|
||||
*/
|
||||
async deleteUserData(userId: string): Promise<void> {
|
||||
this.logger.info({ userId }, 'Deleting all user messages for GDPR compliance');
|
||||
|
||||
// Delete from Redis
|
||||
const pattern = `conv:${userId}:*`;
|
||||
const keys = await this.redis.keys(pattern);
|
||||
if (keys.length > 0) {
|
||||
await this.redis.del(...keys);
|
||||
}
|
||||
|
||||
// Delete from Iceberg
|
||||
// Note: For GDPR compliance, need to:
|
||||
// 1. Send delete command via Kafka OR
|
||||
// 2. Use Iceberg REST API to delete rows (if supported) OR
|
||||
// 3. Coordinate with Flink job to handle deletes
|
||||
//
|
||||
// Iceberg delete flow:
|
||||
// - Mark rows for deletion (equality delete files)
|
||||
// - Run compaction to physically remove
|
||||
// - Expire old snapshots
|
||||
|
||||
this.logger.info({ userId }, 'User messages deleted from Redis - Iceberg GDPR delete not yet implemented');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Redis key for conversation
|
||||
*/
|
||||
private getRedisKey(userId: string, sessionId: string): string {
|
||||
return `conv:${userId}:${sessionId}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Append message to Iceberg for durable storage
|
||||
*
|
||||
* Note: For production, send to Kafka topic that Flink consumes:
|
||||
* - Topic: gateway_conversations
|
||||
* - Flink job writes to gateway.conversations Iceberg table
|
||||
* - Ensures consistent write pattern with rest of system
|
||||
*/
|
||||
private async appendToIceberg(message: StoredMessage): Promise<void> {
|
||||
// TODO: Send to Kafka topic for Flink processing
|
||||
// const kafkaMessage = {
|
||||
// id: message.id,
|
||||
// user_id: message.userId,
|
||||
// session_id: message.sessionId,
|
||||
// role: message.role,
|
||||
// content: message.content,
|
||||
// metadata: JSON.stringify(message.metadata || {}),
|
||||
// timestamp: message.timestamp,
|
||||
// };
|
||||
// await this.kafkaProducer.send({
|
||||
// topic: 'gateway_conversations',
|
||||
// messages: [{ value: JSON.stringify(kafkaMessage) }]
|
||||
// });
|
||||
|
||||
this.logger.debug(
|
||||
{ messageId: message.id, userId: message.userId, sessionId: message.sessionId },
|
||||
'Message append to Iceberg (via Kafka) not yet implemented'
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get conversation statistics
|
||||
*/
|
||||
async getStats(userId: string, sessionId: string): Promise<{
|
||||
messageCount: number;
|
||||
firstMessage?: Date;
|
||||
lastMessage?: Date;
|
||||
}> {
|
||||
const key = this.getRedisKey(userId, sessionId);
|
||||
const count = await this.redis.llen(key);
|
||||
|
||||
if (count === 0) {
|
||||
return { messageCount: 0 };
|
||||
}
|
||||
|
||||
const messages = await this.getRecentMessages(userId, sessionId, count);
|
||||
const timestamps = messages.map((m) => m.timestamp / 1000); // Convert to milliseconds
|
||||
|
||||
return {
|
||||
messageCount: count,
|
||||
firstMessage: new Date(Math.min(...timestamps)),
|
||||
lastMessage: new Date(Math.max(...timestamps)),
|
||||
};
|
||||
}
|
||||
}
|
||||
356
gateway/src/harness/memory/document-loader.ts
Normal file
356
gateway/src/harness/memory/document-loader.ts
Normal file
@@ -0,0 +1,356 @@
|
||||
import { readdir, readFile } from 'fs/promises';
|
||||
import { join, relative } from 'path';
|
||||
import { createHash } from 'crypto';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import { RAGRetriever } from './rag-retriever.js';
|
||||
import { EmbeddingService } from './embedding-service.js';
|
||||
|
||||
/**
|
||||
* Document metadata stored with each chunk
|
||||
*/
|
||||
export interface DocumentMetadata {
|
||||
document_id: string;
|
||||
chunk_index: number;
|
||||
content_hash: string;
|
||||
last_updated: number;
|
||||
tags: string[];
|
||||
heading?: string;
|
||||
file_path: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Document chunk with content and metadata
|
||||
*/
|
||||
export interface DocumentChunk {
|
||||
content: string;
|
||||
metadata: DocumentMetadata;
|
||||
}
|
||||
|
||||
/**
|
||||
* Document loader configuration
|
||||
*/
|
||||
export interface DocumentLoaderConfig {
|
||||
knowledgeDir: string;
|
||||
maxChunkSize?: number; // in tokens (approximate by chars)
|
||||
chunkOverlap?: number; // overlap between chunks
|
||||
}
|
||||
|
||||
/**
|
||||
* Global knowledge document loader
|
||||
*
|
||||
* Loads markdown documents from a directory structure and stores them
|
||||
* as global knowledge (user_id="0") in Qdrant for RAG retrieval.
|
||||
*
|
||||
* Features:
|
||||
* - Intelligent chunking by markdown headers
|
||||
* - Content hashing for change detection
|
||||
* - Metadata extraction (tags, headings)
|
||||
* - Automatic embedding generation
|
||||
* - Incremental updates (only changed docs)
|
||||
*
|
||||
* Directory structure:
|
||||
* gateway/knowledge/
|
||||
* platform/
|
||||
* trading/
|
||||
* indicators/
|
||||
* strategies/
|
||||
*/
|
||||
export class DocumentLoader {
|
||||
private config: DocumentLoaderConfig;
|
||||
private logger: FastifyBaseLogger;
|
||||
private embeddings: EmbeddingService;
|
||||
private rag: RAGRetriever;
|
||||
private loadedDocs: Map<string, string> = new Map(); // path -> hash
|
||||
|
||||
constructor(
|
||||
config: DocumentLoaderConfig,
|
||||
embeddings: EmbeddingService,
|
||||
rag: RAGRetriever,
|
||||
logger: FastifyBaseLogger
|
||||
) {
|
||||
this.config = {
|
||||
maxChunkSize: 4000, // ~1000 tokens
|
||||
chunkOverlap: 200,
|
||||
...config,
|
||||
};
|
||||
this.embeddings = embeddings;
|
||||
this.rag = rag;
|
||||
this.logger = logger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load all documents from knowledge directory
|
||||
*/
|
||||
async loadAll(): Promise<{ loaded: number; updated: number; skipped: number }> {
|
||||
this.logger.info({ dir: this.config.knowledgeDir }, 'Loading knowledge documents');
|
||||
|
||||
const stats = { loaded: 0, updated: 0, skipped: 0 };
|
||||
|
||||
try {
|
||||
const files = await this.findMarkdownFiles(this.config.knowledgeDir);
|
||||
|
||||
for (const filePath of files) {
|
||||
const result = await this.loadDocument(filePath);
|
||||
|
||||
if (result === 'loaded') stats.loaded++;
|
||||
else if (result === 'updated') stats.updated++;
|
||||
else stats.skipped++;
|
||||
}
|
||||
|
||||
this.logger.info(stats, 'Knowledge documents loaded');
|
||||
return stats;
|
||||
} catch (error) {
|
||||
this.logger.error({ error }, 'Failed to load knowledge documents');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a single document
|
||||
*/
|
||||
async loadDocument(filePath: string): Promise<'loaded' | 'updated' | 'skipped'> {
|
||||
try {
|
||||
// Read file content
|
||||
const content = await readFile(filePath, 'utf-8');
|
||||
const contentHash = this.hashContent(content);
|
||||
|
||||
// Check if document has changed
|
||||
const relativePath = relative(this.config.knowledgeDir, filePath);
|
||||
const existingHash = this.loadedDocs.get(relativePath);
|
||||
|
||||
if (existingHash === contentHash) {
|
||||
this.logger.debug({ file: relativePath }, 'Document unchanged, skipping');
|
||||
return 'skipped';
|
||||
}
|
||||
|
||||
const isUpdate = !!existingHash;
|
||||
|
||||
// Parse and chunk document
|
||||
const chunks = this.chunkDocument(content, relativePath);
|
||||
|
||||
this.logger.info(
|
||||
{ file: relativePath, chunks: chunks.length, update: isUpdate },
|
||||
'Processing document'
|
||||
);
|
||||
|
||||
// Generate embeddings and store chunks
|
||||
for (const chunk of chunks) {
|
||||
const embedding = await this.embeddings.embed(chunk.content);
|
||||
|
||||
// Create unique ID for this chunk
|
||||
const chunkId = `global:${chunk.metadata.document_id}:${chunk.metadata.chunk_index}`;
|
||||
|
||||
// Store in Qdrant as global knowledge
|
||||
await this.rag.storeGlobalKnowledge(
|
||||
chunkId,
|
||||
chunk.content,
|
||||
embedding,
|
||||
{
|
||||
...chunk.metadata,
|
||||
type: 'knowledge_doc',
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// Update loaded docs tracking
|
||||
this.loadedDocs.set(relativePath, contentHash);
|
||||
|
||||
return isUpdate ? 'updated' : 'loaded';
|
||||
} catch (error) {
|
||||
this.logger.error({ error, file: filePath }, 'Failed to load document');
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reload a specific document (for updates)
|
||||
*/
|
||||
async reloadDocument(filePath: string): Promise<void> {
|
||||
this.logger.info({ file: filePath }, 'Reloading document');
|
||||
await this.loadDocument(filePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk document by markdown headers with smart splitting
|
||||
*/
|
||||
private chunkDocument(content: string, documentId: string): DocumentChunk[] {
|
||||
const chunks: DocumentChunk[] = [];
|
||||
const tags = this.extractTags(content);
|
||||
const lastModified = Date.now();
|
||||
|
||||
// Split by headers (## or ###)
|
||||
const sections = this.splitByHeaders(content);
|
||||
|
||||
let chunkIndex = 0;
|
||||
|
||||
for (const section of sections) {
|
||||
// If section is too large, split it further
|
||||
const subChunks = this.splitLargeSection(section.content);
|
||||
|
||||
for (const subContent of subChunks) {
|
||||
if (subContent.trim().length === 0) continue;
|
||||
|
||||
chunks.push({
|
||||
content: subContent,
|
||||
metadata: {
|
||||
document_id: documentId,
|
||||
chunk_index: chunkIndex++,
|
||||
content_hash: this.hashContent(content),
|
||||
last_updated: lastModified,
|
||||
tags,
|
||||
heading: section.heading,
|
||||
file_path: documentId,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Split document by markdown headers
|
||||
*/
|
||||
private splitByHeaders(content: string): Array<{ heading?: string; content: string }> {
|
||||
const lines = content.split('\n');
|
||||
const sections: Array<{ heading?: string; content: string }> = [];
|
||||
let currentSection: string[] = [];
|
||||
let currentHeading: string | undefined;
|
||||
|
||||
for (const line of lines) {
|
||||
// Check for markdown header (##, ###, ####)
|
||||
const headerMatch = line.match(/^(#{2,4})\s+(.+)$/);
|
||||
|
||||
if (headerMatch) {
|
||||
// Save previous section
|
||||
if (currentSection.length > 0) {
|
||||
sections.push({
|
||||
heading: currentHeading,
|
||||
content: currentSection.join('\n'),
|
||||
});
|
||||
}
|
||||
|
||||
// Start new section
|
||||
currentHeading = headerMatch[2].trim();
|
||||
currentSection = [line];
|
||||
} else {
|
||||
currentSection.push(line);
|
||||
}
|
||||
}
|
||||
|
||||
// Add final section
|
||||
if (currentSection.length > 0) {
|
||||
sections.push({
|
||||
heading: currentHeading,
|
||||
content: currentSection.join('\n'),
|
||||
});
|
||||
}
|
||||
|
||||
return sections;
|
||||
}
|
||||
|
||||
/**
|
||||
* Split large sections into smaller chunks
|
||||
*/
|
||||
private splitLargeSection(content: string): string[] {
|
||||
const maxSize = this.config.maxChunkSize!;
|
||||
const overlap = this.config.chunkOverlap!;
|
||||
|
||||
if (content.length <= maxSize) {
|
||||
return [content];
|
||||
}
|
||||
|
||||
const chunks: string[] = [];
|
||||
let start = 0;
|
||||
|
||||
while (start < content.length) {
|
||||
const end = Math.min(start + maxSize, content.length);
|
||||
let chunkEnd = end;
|
||||
|
||||
// Try to break at sentence boundary
|
||||
if (end < content.length) {
|
||||
const sentenceEnd = content.lastIndexOf('.', end);
|
||||
const paragraphEnd = content.lastIndexOf('\n\n', end);
|
||||
|
||||
if (paragraphEnd > start + maxSize / 2) {
|
||||
chunkEnd = paragraphEnd;
|
||||
} else if (sentenceEnd > start + maxSize / 2) {
|
||||
chunkEnd = sentenceEnd + 1;
|
||||
}
|
||||
}
|
||||
|
||||
chunks.push(content.substring(start, chunkEnd));
|
||||
start = chunkEnd - overlap;
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract tags from document (frontmatter or first heading)
|
||||
*/
|
||||
private extractTags(content: string): string[] {
|
||||
const tags: string[] = [];
|
||||
|
||||
// Try to extract from YAML frontmatter
|
||||
const frontmatterMatch = content.match(/^---\n([\s\S]*?)\n---/);
|
||||
if (frontmatterMatch) {
|
||||
const frontmatter = frontmatterMatch[1];
|
||||
const tagsMatch = frontmatter.match(/tags:\s*\[([^\]]+)\]/);
|
||||
if (tagsMatch) {
|
||||
tags.push(...tagsMatch[1].split(',').map((t) => t.trim()));
|
||||
}
|
||||
}
|
||||
|
||||
// Extract from first heading
|
||||
const headingMatch = content.match(/^#\s+(.+)$/m);
|
||||
if (headingMatch) {
|
||||
tags.push(headingMatch[1].toLowerCase().replace(/\s+/g, '-'));
|
||||
}
|
||||
|
||||
return tags;
|
||||
}
|
||||
|
||||
/**
|
||||
* Hash content for change detection
|
||||
*/
|
||||
private hashContent(content: string): string {
|
||||
return createHash('md5').update(content).digest('hex');
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively find all markdown files
|
||||
*/
|
||||
private async findMarkdownFiles(dir: string): Promise<string[]> {
|
||||
const files: string[] = [];
|
||||
|
||||
try {
|
||||
const entries = await readdir(dir, { withFileTypes: true });
|
||||
|
||||
for (const entry of entries) {
|
||||
const fullPath = join(dir, entry.name);
|
||||
|
||||
if (entry.isDirectory()) {
|
||||
const subFiles = await this.findMarkdownFiles(fullPath);
|
||||
files.push(...subFiles);
|
||||
} else if (entry.isFile() && entry.name.endsWith('.md')) {
|
||||
files.push(fullPath);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.warn({ error, dir }, 'Failed to read directory');
|
||||
}
|
||||
|
||||
return files;
|
||||
}
|
||||
|
||||
  /**
   * Get loaded document stats
   *
   * Returns the number of entries tracked in `loadedDocs` and the summed
   * string length of the stored values.
   *
   * NOTE(review): `totalSize` sums the lengths of the stored hash strings,
   * not the sizes of the documents themselves — for fixed-width hex digests
   * this is just a constant times `totalDocs`. Confirm whether callers
   * expect document byte sizes here.
   */
  getStats(): { totalDocs: number; totalSize: number } {
    return {
      totalDocs: this.loadedDocs.size,
      totalSize: Array.from(this.loadedDocs.values()).reduce((sum, hash) => sum + hash.length, 0),
    };
  }
|
||||
}
|
||||
270
gateway/src/harness/memory/embedding-service.ts
Normal file
270
gateway/src/harness/memory/embedding-service.ts
Normal file
@@ -0,0 +1,270 @@
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import { Ollama } from 'ollama';
|
||||
|
||||
/**
 * Embedding provider configuration
 */
export interface EmbeddingConfig {
  /** Backend used to generate embeddings; 'none' yields zero vectors (dev only). */
  provider: 'ollama' | 'openai' | 'anthropic' | 'local' | 'voyage' | 'cohere' | 'none';
  /** Model name; each provider has its own default when omitted (e.g. 'all-minilm' for ollama). */
  model?: string;
  /** API key for hosted providers (used by the OpenAI path; Voyage/Cohere paths are TODO). */
  apiKey?: string;
  /** Output vector dimensionality; defaults depend on the provider (384/1024/1536). */
  dimensions?: number;
  /** Ollama server URL; defaults to http://localhost:11434. */
  ollamaUrl?: string;
}
|
||||
|
||||
/**
|
||||
* Embedding service for generating vectors from text
|
||||
*
|
||||
* Supports multiple providers:
|
||||
* - Ollama (all-minilm, nomic-embed-text, mxbai-embed-large) - RECOMMENDED
|
||||
* - OpenAI (text-embedding-3-small/large)
|
||||
* - Voyage AI (voyage-2)
|
||||
* - Cohere (embed-english-v3.0)
|
||||
* - Local models (via transformers.js or Python sidecar)
|
||||
* - None (for development without embeddings)
|
||||
*
|
||||
* Used by RAGRetriever to generate embeddings for storage and search.
|
||||
*
|
||||
* For production, use Ollama with all-minilm (90MB model, runs on CPU, ~100MB RAM).
|
||||
* Ollama can run in-container or as a separate pod/sidecar.
|
||||
*/
|
||||
export class EmbeddingService {
|
||||
private readonly model: string;
|
||||
private readonly dimensions: number;
|
||||
private ollama?: Ollama;
|
||||
|
||||
constructor(
|
||||
private config: EmbeddingConfig,
|
||||
private logger: FastifyBaseLogger
|
||||
) {
|
||||
// Set defaults based on provider
|
||||
switch (config.provider) {
|
||||
case 'ollama':
|
||||
this.model = config.model || 'all-minilm';
|
||||
this.dimensions = config.dimensions || 384;
|
||||
this.ollama = new Ollama({
|
||||
host: config.ollamaUrl || 'http://localhost:11434',
|
||||
});
|
||||
break;
|
||||
case 'openai':
|
||||
this.model = config.model || 'text-embedding-3-small';
|
||||
this.dimensions = config.dimensions || 1536;
|
||||
break;
|
||||
case 'anthropic':
|
||||
case 'voyage':
|
||||
this.model = config.model || 'voyage-2';
|
||||
this.dimensions = config.dimensions || 1024;
|
||||
break;
|
||||
case 'cohere':
|
||||
this.model = config.model || 'embed-english-v3.0';
|
||||
this.dimensions = config.dimensions || 1024;
|
||||
break;
|
||||
case 'local':
|
||||
this.model = config.model || 'all-MiniLM-L6-v2';
|
||||
this.dimensions = config.dimensions || 384;
|
||||
break;
|
||||
case 'none':
|
||||
// No embeddings configured - will return zero vectors
|
||||
this.model = 'none';
|
||||
this.dimensions = config.dimensions || 1536;
|
||||
this.logger.warn('Embedding service initialized with provider=none - RAG will not function properly');
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unknown embedding provider: ${config.provider}`);
|
||||
}
|
||||
|
||||
if (config.provider !== 'none') {
|
||||
this.logger.info(
|
||||
{ provider: config.provider, model: this.model, dimensions: this.dimensions },
|
||||
'Initialized embedding service'
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate embedding for a single text
|
||||
*/
|
||||
async embed(text: string): Promise<number[]> {
|
||||
if (this.config.provider === 'none') {
|
||||
// Return zero vector when no embeddings configured
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
|
||||
this.logger.debug({ textLength: text.length, provider: this.config.provider }, 'Generating embedding');
|
||||
|
||||
try {
|
||||
switch (this.config.provider) {
|
||||
case 'ollama':
|
||||
return await this.embedOllama(text);
|
||||
case 'openai':
|
||||
return await this.embedOpenAI(text);
|
||||
case 'anthropic':
|
||||
case 'voyage':
|
||||
return await this.embedVoyage(text);
|
||||
case 'cohere':
|
||||
return await this.embedCohere(text);
|
||||
case 'local':
|
||||
return await this.embedLocal(text);
|
||||
default:
|
||||
throw new Error(`Unknown provider: ${this.config.provider}`);
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error({ error, provider: this.config.provider }, 'Failed to generate embedding');
|
||||
// Return zero vector as fallback to prevent crashes
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate embeddings for multiple texts (batch)
|
||||
*/
|
||||
async embedBatch(texts: string[]): Promise<number[][]> {
|
||||
this.logger.debug({ count: texts.length, provider: this.config.provider }, 'Generating batch embeddings');
|
||||
|
||||
// Ollama supports native batch operations
|
||||
if (this.config.provider === 'ollama' && this.ollama) {
|
||||
try {
|
||||
const response = await this.ollama.embed({
|
||||
model: this.model,
|
||||
input: texts,
|
||||
});
|
||||
return response.embeddings;
|
||||
} catch (error) {
|
||||
this.logger.error({ error }, 'Ollama batch embedding failed, falling back to sequential');
|
||||
// Fall through to sequential processing
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: call embed() for each text sequentially
|
||||
const embeddings = await Promise.all(texts.map((text) => this.embed(text)));
|
||||
|
||||
return embeddings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get embedding dimensions
|
||||
*/
|
||||
getDimensions(): number {
|
||||
return this.dimensions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get model name
|
||||
*/
|
||||
getModel(): string {
|
||||
return this.model;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate embedding using Ollama
|
||||
*/
|
||||
private async embedOllama(text: string): Promise<number[]> {
|
||||
if (!this.ollama) {
|
||||
this.logger.error('Ollama client not initialized');
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await this.ollama.embed({
|
||||
model: this.model,
|
||||
input: text,
|
||||
});
|
||||
|
||||
// Ollama returns single embedding for single input
|
||||
return response.embeddings[0];
|
||||
} catch (error) {
|
||||
this.logger.error({ error }, 'Ollama embedding failed, returning zero vector');
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate embedding using OpenAI API
|
||||
*/
|
||||
private async embedOpenAI(text: string): Promise<number[]> {
|
||||
if (!this.config.apiKey) {
|
||||
this.logger.warn('OpenAI API key not configured, returning zero vector');
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch('https://api.openai.com/v1/embeddings', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': `Bearer ${this.config.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: this.model,
|
||||
input: text,
|
||||
}),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
throw new Error(`OpenAI API error: ${response.status} ${errorText}`);
|
||||
}
|
||||
|
||||
const data = await response.json() as { data: Array<{ embedding: number[] }> };
|
||||
return data.data[0].embedding;
|
||||
} catch (error) {
|
||||
this.logger.error({ error }, 'OpenAI embedding failed, returning zero vector');
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate embedding using Voyage AI API (Anthropic partnership)
|
||||
*/
|
||||
private async embedVoyage(_text: string): Promise<number[]> {
|
||||
// TODO: Implement Voyage AI embedding when API key available
|
||||
// API endpoint: https://api.voyageai.com/v1/embeddings
|
||||
this.logger.warn('Voyage AI embedding not yet implemented, returning zero vector');
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate embedding using Cohere API
|
||||
*/
|
||||
private async embedCohere(_text: string): Promise<number[]> {
|
||||
// TODO: Implement Cohere embedding when API key available
|
||||
// API endpoint: https://api.cohere.ai/v1/embed
|
||||
this.logger.warn('Cohere embedding not yet implemented, returning zero vector');
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate embedding using local model
|
||||
*/
|
||||
private async embedLocal(_text: string): Promise<number[]> {
|
||||
// TODO: Implement local embedding (via transformers.js or Python sidecar)
|
||||
// Options:
|
||||
// 1. transformers.js (pure JS/WebAssembly) - slower but self-contained
|
||||
// 2. Python sidecar service running sentence-transformers - faster
|
||||
// 3. ONNX runtime with pre-exported models - good balance
|
||||
|
||||
this.logger.warn('Local embedding not implemented, returning zero vector');
|
||||
return new Array(this.dimensions).fill(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate cosine similarity between two embeddings
|
||||
*/
|
||||
static cosineSimilarity(a: number[], b: number[]): number {
|
||||
if (a.length !== b.length) {
|
||||
throw new Error('Embeddings must have same dimensions');
|
||||
}
|
||||
|
||||
let dotProduct = 0;
|
||||
let normA = 0;
|
||||
let normB = 0;
|
||||
|
||||
for (let i = 0; i < a.length; i++) {
|
||||
dotProduct += a[i] * b[i];
|
||||
normA += a[i] * a[i];
|
||||
normB += b[i] * b[i];
|
||||
}
|
||||
|
||||
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
|
||||
}
|
||||
}
|
||||
20
gateway/src/harness/memory/index.ts
Normal file
20
gateway/src/harness/memory/index.ts
Normal file
@@ -0,0 +1,20 @@
|
||||
// Memory layer exports
//
// Barrel module for the harness memory subsystem: checkpoint saving,
// conversation persistence, embedding generation, RAG retrieval, markdown
// document loading, and the user session-context helpers/types.

export { TieredCheckpointSaver } from './checkpoint-saver.js';
export { ConversationStore } from './conversation-store.js';
export { EmbeddingService } from './embedding-service.js';
export { RAGRetriever } from './rag-retriever.js';
export { DocumentLoader } from './document-loader.js';
export {
  createUserContext,
  touchContext,
  isContextExpired,
  serializeContext,
  deserializeContext,
  getDefaultCapabilities,
  type UserContext,
  type ActiveChannel,
  type ChannelCapabilities,
  type WorkspaceContext,
  type MemoryChunk,
} from './session-context.js';
|
||||
210
gateway/src/harness/memory/rag-retriever.ts
Normal file
210
gateway/src/harness/memory/rag-retriever.ts
Normal file
@@ -0,0 +1,210 @@
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import { QdrantClient } from '../../clients/qdrant-client.js';
|
||||
|
||||
/**
 * Vector point with metadata for Qdrant
 */
export interface VectorPoint {
  /** Unique point id (messages use `${userId}:${sessionId}:${Date.now()}`). */
  id: string;
  /** Embedding vector. */
  vector: number[];
  /** Payload stored alongside the vector; fields are used for filtering. */
  payload: {
    /** Owning user; "0" denotes the global/platform knowledge namespace. */
    user_id: string;
    /** Conversation/session the message belongs to ('global' for platform knowledge). */
    session_id: string;
    /** Raw message or knowledge text. */
    content: string;
    role: 'user' | 'assistant' | 'system';
    /** Unix epoch milliseconds (Date.now() at store time). */
    timestamp: number;
    /** Extra caller-supplied metadata merged into the payload. */
    [key: string]: unknown;
  };
}
|
||||
|
||||
/**
 * Search result from Qdrant
 */
export interface SearchResult {
  /** Matched point id. */
  id: string;
  /** Similarity score (set to 1.0 for scroll-based history reads, which are not ranked). */
  score: number;
  /** Stored payload of the matched point. */
  payload: VectorPoint['payload'];
}
|
||||
|
||||
/**
 * Qdrant client configuration
 */
export interface QdrantConfig {
  /** Qdrant server URL. */
  url: string;
  /** API key, if the server requires authentication. */
  apiKey?: string;
  /** Collection to use; presumably the client supplies a default when omitted — verify in QdrantClient. */
  collectionName?: string;
}
|
||||
|
||||
/**
|
||||
* RAG retriever using Qdrant for vector similarity search
|
||||
*
|
||||
* Features:
|
||||
* - **Global namespace** (user_id="0") for platform knowledge
|
||||
* - **User-specific namespaces** for personal memories
|
||||
* - **Queries join both** global and user memories
|
||||
* - Semantic search across conversation history
|
||||
* - Context retrieval for agent prompts
|
||||
* - User preference and pattern learning
|
||||
*
|
||||
* Architecture: Gateway-side vector store, user_id indexed for GDPR compliance
|
||||
*/
|
||||
export class RAGRetriever {
|
||||
private qdrant: QdrantClient;
|
||||
|
||||
constructor(
|
||||
config: QdrantConfig,
|
||||
private logger: FastifyBaseLogger,
|
||||
vectorDimension: number = 1536
|
||||
) {
|
||||
this.qdrant = new QdrantClient(config, logger, vectorDimension);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize Qdrant collection with proper schema
|
||||
*/
|
||||
async initialize(): Promise<void> {
|
||||
await this.qdrant.initialize();
|
||||
}
|
||||
|
||||
/**
|
||||
* Store conversation message as vector
|
||||
*/
|
||||
async storeMessage(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
role: 'user' | 'assistant' | 'system',
|
||||
content: string,
|
||||
embedding: number[],
|
||||
metadata?: Record<string, unknown>
|
||||
): Promise<void> {
|
||||
const id = `${userId}:${sessionId}:${Date.now()}`;
|
||||
|
||||
const payload = {
|
||||
user_id: userId,
|
||||
session_id: sessionId,
|
||||
content,
|
||||
role,
|
||||
timestamp: Date.now(),
|
||||
...metadata,
|
||||
};
|
||||
|
||||
this.logger.debug(
|
||||
{ userId, sessionId, role, contentLength: content.length },
|
||||
'Storing message vector'
|
||||
);
|
||||
|
||||
await this.qdrant.upsertPoint(id, embedding, payload);
|
||||
}
|
||||
|
||||
/**
|
||||
* Store global platform knowledge (user_id = "0")
|
||||
*/
|
||||
async storeGlobalKnowledge(
|
||||
id: string,
|
||||
content: string,
|
||||
embedding: number[],
|
||||
metadata?: Record<string, unknown>
|
||||
): Promise<void> {
|
||||
this.logger.debug({ id, contentLength: content.length }, 'Storing global knowledge');
|
||||
|
||||
await this.qdrant.storeGlobalKnowledge(id, embedding, {
|
||||
session_id: 'global',
|
||||
content,
|
||||
role: 'system',
|
||||
timestamp: Date.now(),
|
||||
...metadata,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Search for relevant memories using vector similarity
|
||||
* Queries BOTH global (user_id="0") and user-specific memories
|
||||
*/
|
||||
async search(
|
||||
userId: string,
|
||||
queryEmbedding: number[],
|
||||
options?: {
|
||||
limit?: number;
|
||||
sessionId?: string;
|
||||
minScore?: number;
|
||||
timeRange?: { start: number; end: number };
|
||||
}
|
||||
): Promise<SearchResult[]> {
|
||||
const limit = options?.limit || 5;
|
||||
const minScore = options?.minScore || 0.7;
|
||||
|
||||
this.logger.debug(
|
||||
{ userId, limit, sessionId: options?.sessionId },
|
||||
'Searching for relevant memories (global + user)'
|
||||
);
|
||||
|
||||
// Qdrant client handles the "should" logic: user_id = userId OR user_id = "0"
|
||||
const results = await this.qdrant.search(userId, queryEmbedding, {
|
||||
limit,
|
||||
scoreThreshold: minScore,
|
||||
sessionId: options?.sessionId,
|
||||
timeRange: options?.timeRange,
|
||||
});
|
||||
|
||||
return results.map(r => ({
|
||||
id: r.id,
|
||||
score: r.score,
|
||||
payload: r.payload as VectorPoint['payload'],
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get recent conversation history for context
|
||||
*/
|
||||
async getRecentHistory(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
limit: number = 10
|
||||
): Promise<SearchResult[]> {
|
||||
this.logger.debug({ userId, sessionId, limit }, 'Getting recent conversation history');
|
||||
|
||||
const result = await this.qdrant.scroll(userId, {
|
||||
sessionId,
|
||||
limit,
|
||||
});
|
||||
|
||||
return result.points.map(p => ({
|
||||
id: p.id,
|
||||
score: 1.0, // Not a search result, so score is 1.0
|
||||
payload: p.payload as VectorPoint['payload'],
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all vectors for a user (GDPR compliance)
|
||||
*/
|
||||
async deleteUserData(userId: string): Promise<void> {
|
||||
this.logger.info({ userId }, 'Deleting all user vectors for GDPR compliance');
|
||||
await this.qdrant.deleteUserData(userId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all vectors for a session
|
||||
*/
|
||||
async deleteSession(userId: string, sessionId: string): Promise<void> {
|
||||
this.logger.info({ userId, sessionId }, 'Deleting session vectors');
|
||||
await this.qdrant.deleteSession(userId, sessionId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get collection statistics
|
||||
*/
|
||||
async getStats(): Promise<{
|
||||
vectorCount: number;
|
||||
indexedCount: number;
|
||||
collectionSize: number;
|
||||
}> {
|
||||
const info = await this.qdrant.getCollectionInfo();
|
||||
|
||||
return {
|
||||
vectorCount: info.vectorsCount,
|
||||
indexedCount: info.indexedVectorsCount,
|
||||
collectionSize: info.pointsCount,
|
||||
};
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user