// Source: ai/gateway/src/clients/duckdb-client.ts (765 lines, 22 KiB, TypeScript)
/**
* DuckDB Client for querying Apache Iceberg tables
*
* Uses DuckDB's native Iceberg and Parquet support to query data
* directly from S3/MinIO without needing catalog-only libraries.
*/
import duckdb from 'duckdb';
import type { FastifyBaseLogger } from 'fastify';
import { promisify } from 'util';
// Type-only aliases for annotations, plus the one runtime constructor we
// actually instantiate. Connections are obtained via Database#connect(),
// so the Connection *value* is never needed — only its type.
type Database = duckdb.Database;
type Connection = duckdb.Connection;
const { Database } = duckdb;
/** Construction options for DuckDBClient. */
export interface DuckDBConfig {
// Iceberg REST catalog base URI for the gateway tables
// (conversations, checkpoints, blobs).
catalogUri: string;
// Catalog namespace containing the gateway tables.
namespace: string;
// Optional separate catalog/namespace for OHLC market data; when omitted the
// constructor falls back to `catalogUri` and the 'trading' namespace.
ohlcCatalogUri?: string;
ohlcNamespace?: string;
// S3/MinIO connection settings. All three must be provided for S3 access to
// be configured during initialize(); otherwise they are ignored.
s3Endpoint?: string;
s3AccessKey?: string;
s3SecretKey?: string;
conversationsBucket?: string; // S3 bucket for conversation cold storage
}
/**
* DuckDB Client with Iceberg support
*
* Provides SQL-based queries against Iceberg tables stored in S3/MinIO.
*/
export class DuckDBClient {
// In-memory DuckDB handle and its connection; both are created lazily in
// initialize() and reset to null by close().
private db: Database | null = null;
private conn: Connection | null = null;
// Namespace for gateway tables (conversations, checkpoints, blobs).
private namespace: string;
// Namespace for OHLC market data tables (defaults to 'trading').
private ohlcNamespace: string;
// REST catalog base URIs for the two table groups (may be the same).
private catalogUri: string;
private ohlcCatalogUri: string;
// S3/MinIO credentials applied to the DuckDB session in initialize().
private s3Config: {
endpoint?: string;
accessKey?: string;
secretKey?: string;
};
// Bucket used for Parquet cold-storage fallback; optional.
private conversationsBucket?: string;
private logger: FastifyBaseLogger;
// Guards initialize() so extension install/config runs only once.
private initialized = false;
/**
 * @param config - catalog, namespace, and S3 settings for this client
 * @param logger - fastify logger used for all client diagnostics
 */
constructor(config: DuckDBConfig, logger: FastifyBaseLogger) {
  this.logger = logger;
  // Gateway tables (conversations/checkpoints/blobs).
  this.namespace = config.namespace;
  this.catalogUri = config.catalogUri;
  // OHLC data may live in its own catalog/namespace; fall back to the
  // gateway catalog and the conventional 'trading' namespace.
  this.ohlcCatalogUri = config.ohlcCatalogUri || config.catalogUri;
  this.ohlcNamespace = config.ohlcNamespace || 'trading';
  this.conversationsBucket = config.conversationsBucket;
  const { s3Endpoint, s3AccessKey, s3SecretKey } = config;
  this.s3Config = {
    endpoint: s3Endpoint,
    accessKey: s3AccessKey,
    secretKey: s3SecretKey,
  };
}
/**
 * Initialize DuckDB connection and configure S3/Iceberg extensions
 *
 * Idempotent: subsequent calls return immediately once initialization has
 * succeeded. S3 settings are applied only when endpoint, access key, and
 * secret key are all provided.
 *
 * @throws when extension install/load or S3 configuration fails
 */
async initialize(): Promise<void> {
  if (this.initialized) {
    return;
  }
  try {
    this.db = new Database(':memory:');
    this.conn = this.db.connect();
    const all = promisify(this.conn.all.bind(this.conn));
    // Install and load required extensions
    await all('INSTALL httpfs;');
    await all('LOAD httpfs;');
    await all('INSTALL iceberg;');
    await all('LOAD iceberg;');
    // Configure S3 credentials if provided
    if (this.s3Config.endpoint && this.s3Config.accessKey && this.s3Config.secretKey) {
      // Escape single quotes so credential values cannot break out of the
      // single-quoted SQL literals below (SET has no parameter binding).
      const esc = (v: string) => v.replace(/'/g, "''");
      const s3Url = new URL(this.s3Config.endpoint);
      const useSSL = s3Url.protocol === 'https:';
      // Default plaintext port 9000 matches MinIO; HTTPS defaults to 443.
      await all(`SET s3_endpoint='${esc(s3Url.hostname)}:${s3Url.port || (useSSL ? 443 : 9000)}';`);
      await all(`SET s3_access_key_id='${esc(this.s3Config.accessKey)}';`);
      await all(`SET s3_secret_access_key='${esc(this.s3Config.secretKey)}';`);
      await all(`SET s3_use_ssl=${useSSL};`);
      await all(`SET s3_url_style='path';`);
      await all(`SET s3_region='us-east-1';`);
      await all(`SET s3_url_compatibility_mode=true;`);
      this.logger.info({
        endpoint: this.s3Config.endpoint,
        useSSL,
      }, 'Configured DuckDB S3 settings');
    }
    this.initialized = true;
    this.logger.info({
      catalogUri: this.catalogUri,
      namespace: this.namespace,
      ohlcCatalogUri: this.ohlcCatalogUri,
      ohlcNamespace: this.ohlcNamespace,
    }, 'DuckDB client initialized');
  } catch (error) {
    this.logger.error({ error }, 'Failed to initialize DuckDB');
    throw error;
  }
}
/**
 * Execute a SQL query and return all rows.
 *
 * @param sql - statement to run (may contain `?` placeholders)
 * @param params - positional values bound to the placeholders
 * @returns all result rows
 * @throws when the connection is not initialized or the statement fails
 *         (failures are logged with the offending SQL before rethrowing)
 */
private async query<T = any>(sql: string, params?: any[]): Promise<T[]> {
  if (!this.conn) {
    throw new Error('DuckDB connection not initialized');
  }
  try {
    // conn.all() is callback-based; wrap it per call.
    const run = promisify(this.conn.all.bind(this.conn)) as (sql: string, ...params: any[]) => Promise<any[]>;
    const hasParams = (params?.length ?? 0) > 0;
    const rows = hasParams ? await run(sql, ...(params as any[])) : await run(sql);
    return rows as T[];
  } catch (error) {
    this.logger.error({ error, sql, params }, 'DuckDB query failed');
    throw error;
  }
}
/**
 * Get the Iceberg table path from REST catalog
 *
 * Resolves `namespace.tableName` via the catalog's LoadTable endpoint and
 * returns the table's metadata location (S3 path to metadata.json).
 *
 * @returns the metadata location, or null when the table is absent, the
 *          response lacks a location, or the request fails (logged, not thrown)
 */
private async getTablePath(namespace: string, tableName: string, catalogUri: string): Promise<string | null> {
  try {
    // Encode path segments so names containing special characters still form
    // a valid URL instead of corrupting the request path.
    const tableUrl = `${catalogUri}/v1/namespaces/${encodeURIComponent(namespace)}/tables/${encodeURIComponent(tableName)}`;
    this.logger.debug({ tableUrl }, 'Fetching Iceberg table metadata');
    const response = await fetch(tableUrl, {
      method: 'GET',
      headers: {
        'Content-Type': 'application/json',
      },
    });
    if (!response.ok) {
      if (response.status === 404) {
        this.logger.debug({ namespace, tableName }, 'Table not found in catalog');
        return null;
      }
      throw new Error(`Failed to fetch table metadata: ${response.status} ${response.statusText}`);
    }
    const metadata = await response.json() as any;
    // Extract metadata location (S3 path to metadata.json)
    const metadataLocation = metadata['metadata-location'] || metadata.location;
    if (!metadataLocation) {
      this.logger.warn({ metadata }, 'No metadata-location found in table response');
      return null;
    }
    this.logger.debug({ metadataLocation }, 'Found Iceberg table location');
    return metadataLocation;
  } catch (error: any) {
    this.logger.error({ error: error.message, namespace, tableName }, 'Failed to get table path');
    return null;
  }
}
/**
* Query messages from gateway.conversations table
*/
async queryMessages(
userId: string,
sessionId: string,
options?: {
startTime?: number;
endTime?: number;
limit?: number;
}
): Promise<any[]> {
await this.initialize();
try {
const tablePath = await this.getTablePath(
this.namespace,
'conversations',
this.catalogUri
);
if (!tablePath) {
// Fallback: scan Parquet files written directly to conversations bucket
if (this.conversationsBucket) {
this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning Parquet cold storage');
const parquetPath = `s3://${this.conversationsBucket}/gateway/conversations/**/user_id=${userId}/${sessionId}.parquet`;
const fallbackSql = `
SELECT id, user_id, session_id, role, content, metadata, timestamp
FROM read_parquet('${parquetPath}')
ORDER BY timestamp ASC
${options?.limit ? `LIMIT ${options.limit}` : ''}
`;
try {
return await this.query(fallbackSql);
} catch {
// File may not exist yet
}
}
this.logger.warn('Conversations table not found and no cold storage configured');
return [];
}
// Build SQL query with optional filters
let sql = `
SELECT
id,
user_id,
session_id,
role,
content,
metadata,
timestamp
FROM iceberg_scan('${tablePath}')
WHERE user_id = ?
AND session_id = ?
`;
const params: any[] = [userId, sessionId];
if (options?.startTime) {
sql += ' AND timestamp >= ?';
params.push(options.startTime.toString());
}
if (options?.endTime) {
sql += ' AND timestamp <= ?';
params.push(options.endTime.toString());
}
sql += ' ORDER BY timestamp ASC';
if (options?.limit) {
sql += ' LIMIT ?';
params.push(options.limit);
}
this.logger.debug({ userId, sessionId, options }, 'Querying conversation messages');
const rows = await this.query(sql, params);
this.logger.info({
userId,
sessionId,
count: rows.length
}, 'Loaded conversation messages from Iceberg');
// Convert timestamp strings back to numbers
return rows.map((row: any) => ({
...row,
timestamp: Number(row.timestamp)
}));
} catch (error: any) {
this.logger.error({
error: error.message,
userId,
sessionId
}, 'Failed to query conversation messages');
return [];
}
}
/**
 * Query checkpoint from gateway.checkpoints table
 *
 * Returns the most recent checkpoint row for the session (optionally pinned
 * to a specific checkpoint_id), or null when none exists or the query fails.
 */
async queryCheckpoint(
  userId: string,
  sessionId: string,
  checkpointId?: string
): Promise<any | null> {
  await this.initialize();
  try {
    const tablePath = await this.getTablePath(this.namespace, 'checkpoints', this.catalogUri);
    if (!tablePath) {
      this.logger.warn('Checkpoints table not found');
      return null;
    }
    const filters: any[] = [userId, sessionId];
    let sql = `
SELECT
user_id,
session_id,
checkpoint_id,
checkpoint_data,
metadata,
timestamp
FROM iceberg_scan('${tablePath}')
WHERE user_id = ?
AND session_id = ?
`;
    if (checkpointId) {
      sql += ' AND checkpoint_id = ?';
      filters.push(checkpointId);
    }
    sql += ' ORDER BY timestamp DESC LIMIT 1';
    this.logger.debug({ userId, sessionId, checkpointId }, 'Querying checkpoint');
    const [latest] = await this.query(sql, filters);
    if (!latest) {
      return null;
    }
    this.logger.info({
      userId,
      sessionId,
      checkpointId: latest.checkpoint_id
    }, 'Loaded checkpoint from Iceberg');
    // Convert timestamp string back to number
    return {
      ...latest,
      timestamp: Number(latest.timestamp)
    };
  } catch (error: any) {
    this.logger.error({
      error: error.message,
      userId,
      sessionId,
      checkpointId
    }, 'Failed to query checkpoint');
    return null;
  }
}
/**
* Query symbol metadata from trading.symbol_metadata table
*/
async queryAllSymbols(): Promise<any[]> {
await this.initialize();
try {
const tablePath = await this.getTablePath(
this.ohlcNamespace,
'symbol_metadata',
this.ohlcCatalogUri
);
if (!tablePath) {
this.logger.warn('Symbol metadata table not found');
return [];
}
// Query the Iceberg table using DuckDB
const sql = `SELECT * FROM iceberg_scan('${tablePath}')`;
this.logger.debug({ sql }, 'Querying symbol metadata');
const rows = await this.query(sql);
this.logger.info({ count: rows.length }, 'Loaded symbol metadata from Iceberg');
return rows;
} catch (error: any) {
this.logger.error({ error: error.message }, 'Failed to query symbol metadata');
return [];
}
}
/**
 * Query OHLC data from trading.ohlc table
 *
 * Timestamps are nanoseconds since the epoch; the scanned range is
 * [start_time, end_time). When repeated ingestion runs wrote duplicate rows
 * for a timestamp, only the most recently ingested row is returned.
 *
 * @param ticker - symbol whose bars to load
 * @param period_seconds - bar period in seconds
 * @param start_time - inclusive lower bound (nanoseconds)
 * @param end_time - exclusive upper bound (nanoseconds)
 * @returns rows ordered by timestamp with `timestamp` coerced to bigint;
 *          [] when the table is missing or the query fails
 */
async queryOHLC(
ticker: string,
period_seconds: number,
start_time: bigint, // nanoseconds
end_time: bigint // nanoseconds
): Promise<any[]> {
await this.initialize();
try {
const tablePath = await this.getTablePath(
this.ohlcNamespace,
'ohlc',
this.ohlcCatalogUri
);
if (!tablePath) {
this.logger.warn('OHLC table not found');
return [];
}
// Query the Iceberg table with filters, deduplicating by ingested_at so that
// duplicate parquet files (e.g. from repeated Flink job runs on the same key
// range) never produce more than one row per (ticker, period_seconds, timestamp).
// Partitioning only on timestamp is sufficient because ticker and
// period_seconds are already pinned by the inner WHERE clause.
const sql = `
SELECT timestamp, ticker, period_seconds, open, high, low, close, volume
FROM (
SELECT
timestamp, ticker, period_seconds, open, high, low, close, volume, ingested_at,
ROW_NUMBER() OVER (
PARTITION BY timestamp
ORDER BY ingested_at DESC
) AS rn
FROM iceberg_scan('${tablePath}')
WHERE ticker = ?
AND period_seconds = ?
AND timestamp >= ?
AND timestamp < ?
)
WHERE rn = 1
ORDER BY timestamp ASC
`;
// Bigint bounds are serialized to strings for binding (the driver has no
// native bigint parameter type here).
const params = [
ticker,
period_seconds,
start_time.toString(),
end_time.toString()
];
this.logger.debug({ ticker, period_seconds, start_time, end_time }, 'Querying OHLC data');
const rows = await this.query(sql, params);
this.logger.info({
ticker,
period_seconds,
count: rows.length
}, 'Loaded OHLC data from Iceberg');
// Keep timestamp as bigint to preserve full microsecond precision.
// Convert to seconds (divide first) only when producing TradingView bars.
return rows.map((row: any) => ({
...row,
timestamp: BigInt(row.timestamp)
}));
} catch (error: any) {
this.logger.error({
error: error.message,
ticker,
period_seconds
}, 'Failed to query OHLC data');
return [];
}
}
/**
 * Check if OHLC data exists for the given parameters
 *
 * @param ticker - symbol to check
 * @param period_seconds - bar period in seconds
 * @param start_time - inclusive lower bound, nanoseconds since epoch
 * @param end_time - exclusive upper bound, nanoseconds since epoch
 * @returns true when at least one row exists; false when the table is
 *          missing or the query fails
 */
async hasOHLCData(
  ticker: string,
  period_seconds: number,
  start_time: bigint,
  end_time: bigint
): Promise<boolean> {
  await this.initialize();
  try {
    const tablePath = await this.getTablePath(
      this.ohlcNamespace,
      'ohlc',
      this.ohlcCatalogUri
    );
    if (!tablePath) {
      return false;
    }
    const sql = `
SELECT COUNT(*) as count
FROM iceberg_scan('${tablePath}')
WHERE ticker = ?
AND period_seconds = ?
AND timestamp >= ?
AND timestamp < ?
`;
    const params = [
      ticker,
      period_seconds,
      start_time.toString(),
      end_time.toString()
    ];
    // COUNT(*) is a BIGINT; the node driver may surface it as a JS bigint,
    // so coerce before comparing rather than assuming `number`.
    const rows = await this.query<{ count: number | bigint }>(sql, params);
    return rows.length > 0 && Number(rows[0].count) > 0;
  } catch (error: any) {
    this.logger.error({ error: error.message }, 'Failed to check OHLC data existence');
    return false;
  }
}
/**
 * Find missing OHLC data ranges by checking for absent timestamps.
 *
 * Any timestamp slot in [start_time, min(end_time, now)) that has no row in
 * Iceberg is treated as missing and collected into contiguous ranges that the
 * caller should request from the relay/ingestor.
 */
async findMissingOHLCRanges(
  ticker: string,
  period_seconds: number,
  start_time: bigint,
  end_time: bigint
): Promise<Array<[bigint, bigint]>> {
  await this.initialize();
  try {
    const rows = await this.queryOHLC(ticker, period_seconds, start_time, end_time);
    const step = BigInt(period_seconds) * 1_000_000_000n;
    // Cap at current time — future slots are not "missing", they don't exist yet.
    const nowNanos = BigInt(Date.now()) * 1_000_000n;
    const scanEnd = end_time < nowNanos ? end_time : nowNanos;
    // Timestamps we already hold (bigints compare by value inside a Set).
    const have = new Set<bigint>(rows.map((row: any) => row.timestamp));
    // Walk every expected slot and record the absent ones.
    const gaps: bigint[] = [];
    for (let slot = start_time; slot < scanEnd; slot += step) {
      if (!have.has(slot)) {
        gaps.push(slot);
      }
    }
    if (gaps.length === 0) {
      return [];
    }
    // Coalesce adjacent gaps into contiguous [rangeStart, rangeEnd) intervals:
    // a slot extends the last range exactly when it starts where that range ends.
    const ranges: Array<[bigint, bigint]> = [];
    for (const slot of gaps) {
      const last = ranges[ranges.length - 1];
      if (last && last[1] === slot) {
        last[1] = slot + step;
      } else {
        ranges.push([slot, slot + step]);
      }
    }
    this.logger.debug({
      ticker,
      period_seconds,
      missingSlots: gaps.length,
      ranges: ranges.length,
    }, 'OHLC gap detection complete');
    return ranges;
  } catch (error: any) {
    this.logger.error({ error: error.message }, 'Failed to find missing OHLC ranges');
    // Return full range on error (safe default — triggers a backfill)
    return [[start_time, end_time]];
  }
}
/**
 * Append a batch of conversation messages as a Parquet file in S3.
 * Called once per session at session end to avoid small-file fragmentation.
 *
 * No-op when cold storage is not configured or the batch is empty.
 *
 * @param userId - owner of the conversation (becomes a path partition)
 * @param sessionId - session identifier (becomes the file name)
 * @param messages - rows to persist; written via a temp table + COPY
 */
async appendMessages(
  userId: string,
  sessionId: string,
  messages: Array<{
    id: string;
    user_id: string;
    session_id: string;
    role: string;
    content: string;
    metadata: string;
    timestamp: number;
  }>
): Promise<void> {
  // Check preconditions first so an unused client never spins up DuckDB.
  if (!this.conversationsBucket || messages.length === 0) {
    return;
  }
  await this.initialize();
  const now = new Date();
  const year = now.getUTCFullYear();
  const month = String(now.getUTCMonth() + 1).padStart(2, '0');
  // Escape single quotes so ids cannot terminate the COPY target literal.
  const esc = (v: string) => v.replace(/'/g, "''");
  const s3Path = `s3://${this.conversationsBucket}/gateway/conversations/year=${year}/month=${month}/user_id=${esc(userId)}/${esc(sessionId)}.parquet`;
  // Timestamp plus random suffix: Date.now() alone collides when two
  // sessions flush within the same millisecond.
  const tempTable = `msg_flush_${Date.now()}_${Math.floor(Math.random() * 0xffffff)}`;
  try {
    await this.query(`
CREATE TEMP TABLE ${tempTable} (
id VARCHAR,
user_id VARCHAR,
session_id VARCHAR,
role VARCHAR,
content VARCHAR,
metadata VARCHAR,
timestamp BIGINT
)
`);
    for (const msg of messages) {
      await this.query(
        `INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?)`,
        [msg.id, msg.user_id, msg.session_id, msg.role, msg.content, msg.metadata, msg.timestamp]
      );
    }
    await this.query(`COPY ${tempTable} TO '${s3Path}' (FORMAT PARQUET)`);
    this.logger.info({ userId, sessionId, count: messages.length, s3Path }, 'Conversation flushed to Parquet');
  } finally {
    // Best-effort cleanup; the temp table dies with the session anyway.
    await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
  }
}
/**
 * Append a batch of image/audio blobs as a Parquet file in S3.
 * Called once per assistant turn that produces binary output.
 *
 * No-op when cold storage is not configured or the batch is empty.
 *
 * @param userId - owner of the conversation (becomes a path partition)
 * @param sessionId - session identifier (file name prefix)
 * @param messageId - producing message (file name suffix)
 * @param blobs - rows to persist; written via a temp table + COPY
 */
async appendBlobs(
  userId: string,
  sessionId: string,
  messageId: string,
  blobs: Array<{
    id: string;
    user_id: string;
    session_id: string;
    message_id: string;
    blob_type: string;
    mime_type: string;
    data: string;
    caption: string | null;
    timestamp: number;
  }>
): Promise<void> {
  // Check preconditions first so an unused client never spins up DuckDB.
  if (!this.conversationsBucket || blobs.length === 0) {
    return;
  }
  await this.initialize();
  const now = new Date();
  const year = now.getUTCFullYear();
  const month = String(now.getUTCMonth() + 1).padStart(2, '0');
  // Escape single quotes so ids cannot terminate the COPY target literal.
  const esc = (v: string) => v.replace(/'/g, "''");
  const s3Path = `s3://${this.conversationsBucket}/gateway/blobs/year=${year}/month=${month}/user_id=${esc(userId)}/${esc(sessionId)}_${esc(messageId)}.parquet`;
  // Timestamp plus random suffix: Date.now() alone collides when two turns
  // flush within the same millisecond.
  const tempTable = `blob_flush_${Date.now()}_${Math.floor(Math.random() * 0xffffff)}`;
  try {
    await this.query(`
CREATE TEMP TABLE ${tempTable} (
id VARCHAR,
user_id VARCHAR,
session_id VARCHAR,
message_id VARCHAR,
blob_type VARCHAR,
mime_type VARCHAR,
data VARCHAR,
caption VARCHAR,
timestamp BIGINT
)
`);
    for (const blob of blobs) {
      await this.query(
        `INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
        [blob.id, blob.user_id, blob.session_id, blob.message_id, blob.blob_type, blob.mime_type, blob.data, blob.caption, blob.timestamp]
      );
    }
    await this.query(`COPY ${tempTable} TO '${s3Path}' (FORMAT PARQUET)`);
    this.logger.info({ userId, sessionId, messageId, count: blobs.length, s3Path }, 'Blobs flushed to Parquet');
  } finally {
    // Best-effort cleanup; the temp table dies with the session anyway.
    await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
  }
}
/**
 * Query blobs from S3 by userId/sessionId, optionally filtered to specific blob IDs.
 *
 * Prefers the Iceberg `blobs` table; falls back to scanning per-turn Parquet
 * files in cold storage when the table is not in the catalog. Never throws;
 * returns [] on failure.
 */
async queryBlobs(
  userId: string,
  sessionId: string,
  blobIds?: string[]
): Promise<any[]> {
  await this.initialize();
  try {
    const tablePath = await this.getTablePath(this.namespace, 'blobs', this.catalogUri);
    if (!tablePath) {
      // Fallback: scan per-turn Parquet files written directly to S3
      if (this.conversationsBucket) {
        this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning blob Parquet files');
        // Escape single quotes in ALL interpolated values — previously only
        // blob IDs were escaped, leaving userId/sessionId able to break the
        // SQL string literal.
        const esc = (v: string) => v.replace(/'/g, "''");
        const parquetPath = `s3://${this.conversationsBucket}/gateway/blobs/**/user_id=${esc(userId)}/${esc(sessionId)}_*.parquet`;
        const idClause = blobIds?.length
          ? `WHERE id IN (${blobIds.map(id => `'${esc(id)}'`).join(', ')})`
          : '';
        try {
          return await this.query(`SELECT * FROM read_parquet('${parquetPath}') ${idClause} ORDER BY timestamp ASC`);
        } catch {
          // No blobs yet for this session
        }
      }
      return [];
    }
    const idFilter = blobIds?.length
      ? `AND id IN (${blobIds.map(() => '?').join(', ')})`
      : '';
    const params: any[] = [userId, sessionId, ...(blobIds ?? [])];
    const sql = `
SELECT id, user_id, session_id, message_id, blob_type, mime_type, data, caption, timestamp
FROM iceberg_scan('${tablePath}')
WHERE user_id = ? AND session_id = ? ${idFilter}
ORDER BY timestamp ASC
`;
    const rows = await this.query(sql, params);
    this.logger.info({ userId, sessionId, count: rows.length }, 'Loaded blobs from Iceberg');
    // Convert timestamp strings back to numbers
    return rows.map((row: any) => ({ ...row, timestamp: Number(row.timestamp) }));
  } catch (error: any) {
    this.logger.error({ error: error.message, userId, sessionId }, 'Failed to query blobs');
    return [];
  }
}
/**
* Close the DuckDB connection
*/
async close(): Promise<void> {
if (this.conn) {
const close = promisify(this.conn.close.bind(this.conn));
await close();
this.conn = null;
}
if (this.db) {
const close = promisify(this.db.close.bind(this.db));
await close();
this.db = null;
}
this.initialized = false;
this.logger.info('DuckDB client closed');
}
}