/**
 * DuckDB Client for querying Apache Iceberg tables
 *
 * Uses DuckDB's native Iceberg and Parquet support to query data
 * directly from S3/MinIO without needing catalog-only libraries.
 */

import duckdb from 'duckdb';
import type { FastifyBaseLogger } from 'fastify';
import { promisify } from 'util';

type Database = duckdb.Database;
type Connection = duckdb.Connection;
const { Database, Connection } = duckdb;

export interface DuckDBConfig {
  catalogUri: string;
  namespace: string;
  ohlcCatalogUri?: string;
  ohlcNamespace?: string;
  s3Endpoint?: string;
  s3AccessKey?: string;
  s3SecretKey?: string;
  conversationsBucket?: string; // S3 bucket for conversation cold storage
}
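
// Example configuration (illustrative values only; the endpoints, catalog
// URIs, and bucket names below are deployment-specific assumptions, not
// defaults of this client):
//
//   const config: DuckDBConfig = {
//     catalogUri: 'http://iceberg-rest:8181',   // Iceberg REST catalog
//     namespace: 'gateway',
//     ohlcNamespace: 'trading',
//     s3Endpoint: 'http://minio:9000',
//     s3AccessKey: process.env.S3_ACCESS_KEY,
//     s3SecretKey: process.env.S3_SECRET_KEY,
//     conversationsBucket: 'conversations',
//   };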

/**
 * DuckDB Client with Iceberg support
 *
 * Provides SQL-based queries against Iceberg tables stored in S3/MinIO.
 */
export class DuckDBClient {
  private db: Database | null = null;
  private conn: Connection | null = null;
  private namespace: string;
  private ohlcNamespace: string;
  private catalogUri: string;
  private ohlcCatalogUri: string;
  private s3Config: {
    endpoint?: string;
    accessKey?: string;
    secretKey?: string;
  };
  private conversationsBucket?: string;
  private logger: FastifyBaseLogger;
  private initialized = false;

  constructor(config: DuckDBConfig, logger: FastifyBaseLogger) {
    this.logger = logger;
    this.namespace = config.namespace;
    this.catalogUri = config.catalogUri;
    this.ohlcCatalogUri = config.ohlcCatalogUri || config.catalogUri;
    this.ohlcNamespace = config.ohlcNamespace || 'trading';
    this.conversationsBucket = config.conversationsBucket;
    this.s3Config = {
      endpoint: config.s3Endpoint,
      accessKey: config.s3AccessKey,
      secretKey: config.s3SecretKey,
    };
  }
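
  // Usage sketch (assumes a Fastify instance whose `app.log` satisfies
  // FastifyBaseLogger; every query method calls initialize() lazily, so the
  // explicit call is optional):
  //
  //   const client = new DuckDBClient(config, app.log);
  //   await client.initialize();
  //   const msgs = await client.queryMessages('user-1', 'session-1', { limit: 100 });
  //   await client.close();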

  /**
   * Initialize DuckDB connection and configure S3/Iceberg extensions
   */
  async initialize(): Promise<void> {
    if (this.initialized) {
      return;
    }

    try {
      this.db = new Database(':memory:');
      this.conn = this.db.connect();

      const all = promisify(this.conn.all.bind(this.conn));

      // Install and load required extensions
      await all('INSTALL httpfs;');
      await all('LOAD httpfs;');
      await all('INSTALL iceberg;');
      await all('LOAD iceberg;');

      // Configure S3 credentials if provided
      if (this.s3Config.endpoint && this.s3Config.accessKey && this.s3Config.secretKey) {
        const s3Url = new URL(this.s3Config.endpoint);
        const useSSL = s3Url.protocol === 'https:';

        await all(`SET s3_endpoint='${s3Url.hostname}:${s3Url.port || (useSSL ? 443 : 9000)}';`);
        await all(`SET s3_access_key_id='${this.s3Config.accessKey}';`);
        await all(`SET s3_secret_access_key='${this.s3Config.secretKey}';`);
        await all(`SET s3_use_ssl=${useSSL};`);
        await all(`SET s3_url_style='path';`);
        await all(`SET s3_region='us-east-1';`);
        await all(`SET s3_url_compatibility_mode=true;`);

        this.logger.info({
          endpoint: this.s3Config.endpoint,
          useSSL,
        }, 'Configured DuckDB S3 settings');
      }

      this.initialized = true;
      this.logger.info({
        catalogUri: this.catalogUri,
        namespace: this.namespace,
        ohlcCatalogUri: this.ohlcCatalogUri,
        ohlcNamespace: this.ohlcNamespace,
      }, 'DuckDB client initialized');
    } catch (error) {
      this.logger.error({ error }, 'Failed to initialize DuckDB');
      throw error;
    }
  }
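
  // Note: recent DuckDB releases also expose a secrets API as an alternative
  // to the per-session SET options above. A minimal sketch (assuming
  // DuckDB >= 0.10; exact option names per the httpfs secrets docs):
  //
  //   CREATE SECRET minio_secret (
  //     TYPE S3,
  //     KEY_ID 'access-key', SECRET 'secret-key',
  //     ENDPOINT 'minio:9000', URL_STYLE 'path', USE_SSL false
  //   );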

  /**
   * Execute a SQL query and return all rows
   */
  private async query<T = any>(sql: string, params?: any[]): Promise<T[]> {
    if (!this.conn) {
      throw new Error('DuckDB connection not initialized');
    }

    try {
      const all = promisify(this.conn.all.bind(this.conn)) as (sql: string, ...params: any[]) => Promise<any[]>;
      const rows = params && params.length > 0 ? await all(sql, ...params) : await all(sql);
      return rows as T[];
    } catch (error) {
      this.logger.error({ error, sql, params }, 'DuckDB query failed');
      throw error;
    }
  }

  /**
   * Get the Iceberg table path from the REST catalog
   */
  private async getTablePath(namespace: string, tableName: string, catalogUri: string): Promise<string | null> {
    try {
      const tableUrl = `${catalogUri}/v1/namespaces/${namespace}/tables/${tableName}`;

      this.logger.debug({ tableUrl }, 'Fetching Iceberg table metadata');

      const response = await fetch(tableUrl, {
        method: 'GET',
        headers: {
          'Content-Type': 'application/json',
        },
      });

      if (!response.ok) {
        if (response.status === 404) {
          this.logger.debug({ namespace, tableName }, 'Table not found in catalog');
          return null;
        }
        throw new Error(`Failed to fetch table metadata: ${response.status} ${response.statusText}`);
      }

      const metadata = await response.json() as any;

      // Extract the metadata location (S3 path to metadata.json)
      const metadataLocation = metadata['metadata-location'] || metadata.location;

      if (!metadataLocation) {
        this.logger.warn({ metadata }, 'No metadata-location found in table response');
        return null;
      }

      this.logger.debug({ metadataLocation }, 'Found Iceberg table location');
      return metadataLocation;
    } catch (error: any) {
      this.logger.error({ error: error.message, namespace, tableName }, 'Failed to get table path');
      return null;
    }
  }
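
  // For reference, a successful LoadTableResult from the Iceberg REST catalog
  // spec looks roughly like this (abridged sketch; exact fields vary by
  // catalog implementation):
  //
  //   {
  //     "metadata-location": "s3://warehouse/gateway/conversations/metadata/00003-<uuid>.metadata.json",
  //     "metadata": { "format-version": 2, "location": "s3://warehouse/gateway/conversations" }
  //   }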

  /**
   * Query messages from the gateway.conversations table
   */
  async queryMessages(
    userId: string,
    sessionId: string,
    options?: {
      startTime?: number;
      endTime?: number;
      limit?: number;
    }
  ): Promise<any[]> {
    await this.initialize();

    try {
      const tablePath = await this.getTablePath(
        this.namespace,
        'conversations',
        this.catalogUri
      );

      if (!tablePath) {
        // Fallback: scan Parquet files written directly to the conversations bucket
        if (this.conversationsBucket) {
          this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning Parquet cold storage');
          const parquetPath = `s3://${this.conversationsBucket}/gateway/conversations/**/user_id=${userId}/${sessionId}.parquet`;
          const fallbackSql = `
            SELECT id, user_id, session_id, role, content, metadata, timestamp
            FROM read_parquet('${parquetPath}')
            ORDER BY timestamp ASC
            ${options?.limit ? `LIMIT ${options.limit}` : ''}
          `;
          try {
            return await this.query(fallbackSql);
          } catch {
            // File may not exist yet
          }
        }
        this.logger.warn('Conversations table not found and no cold storage configured');
        return [];
      }

      // Build the SQL query with optional filters
      let sql = `
        SELECT
          id,
          user_id,
          session_id,
          role,
          content,
          metadata,
          timestamp
        FROM iceberg_scan('${tablePath}')
        WHERE user_id = ?
          AND session_id = ?
      `;

      const params: any[] = [userId, sessionId];

      // Compare against undefined so a legitimate bound of 0 is not skipped
      if (options?.startTime !== undefined) {
        sql += ' AND timestamp >= ?';
        params.push(options.startTime.toString());
      }

      if (options?.endTime !== undefined) {
        sql += ' AND timestamp <= ?';
        params.push(options.endTime.toString());
      }

      sql += ' ORDER BY timestamp ASC';

      if (options?.limit) {
        sql += ' LIMIT ?';
        params.push(options.limit);
      }

      this.logger.debug({ userId, sessionId, options }, 'Querying conversation messages');

      const rows = await this.query(sql, params);

      this.logger.info({
        userId,
        sessionId,
        count: rows.length
      }, 'Loaded conversation messages from Iceberg');

      // Convert timestamp strings back to numbers
      return rows.map((row: any) => ({
        ...row,
        timestamp: Number(row.timestamp)
      }));
    } catch (error: any) {
      this.logger.error({
        error: error.message,
        userId,
        sessionId
      }, 'Failed to query conversation messages');
      return [];
    }
  }

  /**
   * Query checkpoint from the gateway.checkpoints table
   */
  async queryCheckpoint(
    userId: string,
    sessionId: string,
    checkpointId?: string
  ): Promise<any | null> {
    await this.initialize();

    try {
      const tablePath = await this.getTablePath(
        this.namespace,
        'checkpoints',
        this.catalogUri
      );

      if (!tablePath) {
        this.logger.warn('Checkpoints table not found');
        return null;
      }

      let sql = `
        SELECT
          user_id,
          session_id,
          checkpoint_id,
          checkpoint_data,
          metadata,
          timestamp
        FROM iceberg_scan('${tablePath}')
        WHERE user_id = ?
          AND session_id = ?
      `;

      const params: any[] = [userId, sessionId];

      if (checkpointId) {
        sql += ' AND checkpoint_id = ?';
        params.push(checkpointId);
      }

      sql += ' ORDER BY timestamp DESC LIMIT 1';

      this.logger.debug({ userId, sessionId, checkpointId }, 'Querying checkpoint');

      const rows = await this.query(sql, params);

      if (rows.length === 0) {
        return null;
      }

      const row = rows[0];

      this.logger.info({
        userId,
        sessionId,
        checkpointId: row.checkpoint_id
      }, 'Loaded checkpoint from Iceberg');

      // Convert timestamp string back to number
      return {
        ...row,
        timestamp: Number(row.timestamp)
      };
    } catch (error: any) {
      this.logger.error({
        error: error.message,
        userId,
        sessionId,
        checkpointId
      }, 'Failed to query checkpoint');
      return null;
    }
  }

  /**
   * Query symbol metadata from the trading.symbol_metadata table
   */
  async queryAllSymbols(): Promise<any[]> {
    await this.initialize();

    try {
      const tablePath = await this.getTablePath(
        this.ohlcNamespace,
        'symbol_metadata',
        this.ohlcCatalogUri
      );

      if (!tablePath) {
        this.logger.warn('Symbol metadata table not found');
        return [];
      }

      // Query the Iceberg table using DuckDB
      const sql = `SELECT * FROM iceberg_scan('${tablePath}')`;

      this.logger.debug({ sql }, 'Querying symbol metadata');

      const rows = await this.query(sql);

      this.logger.info({ count: rows.length }, 'Loaded symbol metadata from Iceberg');

      return rows;
    } catch (error: any) {
      this.logger.error({ error: error.message }, 'Failed to query symbol metadata');
      return [];
    }
  }

  /**
   * Query OHLC data from the trading.ohlc table
   */
  async queryOHLC(
    ticker: string,
    period_seconds: number,
    start_time: bigint, // nanoseconds
    end_time: bigint // nanoseconds
  ): Promise<any[]> {
    await this.initialize();

    try {
      const tablePath = await this.getTablePath(
        this.ohlcNamespace,
        'ohlc',
        this.ohlcCatalogUri
      );

      if (!tablePath) {
        this.logger.warn('OHLC table not found');
        return [];
      }

      // Query the Iceberg table with filters, deduplicating by ingested_at so that
      // duplicate parquet files (e.g. from repeated Flink job runs on the same key
      // range) never produce more than one row per (ticker, period_seconds, timestamp).
      // Partitioning by timestamp alone is sufficient because the WHERE clause pins
      // ticker and period_seconds to single values.
      const sql = `
        SELECT timestamp, ticker, period_seconds, open, high, low, close, volume
        FROM (
          SELECT
            timestamp, ticker, period_seconds, open, high, low, close, volume, ingested_at,
            ROW_NUMBER() OVER (
              PARTITION BY timestamp
              ORDER BY ingested_at DESC
            ) AS rn
          FROM iceberg_scan('${tablePath}')
          WHERE ticker = ?
            AND period_seconds = ?
            AND timestamp >= ?
            AND timestamp < ?
        )
        WHERE rn = 1
        ORDER BY timestamp ASC
      `;

      const params = [
        ticker,
        period_seconds,
        start_time.toString(),
        end_time.toString()
      ];

      this.logger.debug({ ticker, period_seconds, start_time, end_time }, 'Querying OHLC data');

      const rows = await this.query(sql, params);

      this.logger.info({
        ticker,
        period_seconds,
        count: rows.length
      }, 'Loaded OHLC data from Iceberg');

      // Keep timestamp as bigint to preserve full microsecond precision.
      // Convert to seconds (divide first) only when producing TradingView bars.
      return rows.map((row: any) => ({
        ...row,
        timestamp: BigInt(row.timestamp)
      }));
    } catch (error: any) {
      this.logger.error({
        error: error.message,
        ticker,
        period_seconds
      }, 'Failed to query OHLC data');
      return [];
    }
  }
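
  // Example of the dedup behaviour: if two Flink runs wrote the same
  // (ticker='AAPL', period_seconds=60, timestamp=T) bar with ingested_at
  // values 1000 and 2000, only the ingested_at=2000 row survives rn = 1.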

  /**
   * Check if OHLC data exists for the given parameters
   */
  async hasOHLCData(
    ticker: string,
    period_seconds: number,
    start_time: bigint,
    end_time: bigint
  ): Promise<boolean> {
    await this.initialize();

    try {
      const tablePath = await this.getTablePath(
        this.ohlcNamespace,
        'ohlc',
        this.ohlcCatalogUri
      );

      if (!tablePath) {
        return false;
      }

      const sql = `
        SELECT COUNT(*) as count
        FROM iceberg_scan('${tablePath}')
        WHERE ticker = ?
          AND period_seconds = ?
          AND timestamp >= ?
          AND timestamp < ?
      `;

      const params = [
        ticker,
        period_seconds,
        start_time.toString(),
        end_time.toString()
      ];

      const rows = await this.query<{ count: number }>(sql, params);
      return rows.length > 0 && rows[0].count > 0;
    } catch (error: any) {
      this.logger.error({ error: error.message }, 'Failed to check OHLC data existence');
      return false;
    }
  }

  /**
   * Find missing OHLC data ranges by checking for absent timestamps.
   *
   * Any timestamp slot in [start_time, min(end_time, now)) that has no row in
   * Iceberg is treated as missing and collected into contiguous ranges that the
   * caller should request from the relay/ingestor.
   */
  async findMissingOHLCRanges(
    ticker: string,
    period_seconds: number,
    start_time: bigint,
    end_time: bigint
  ): Promise<Array<[bigint, bigint]>> {
    await this.initialize();

    try {
      const data = await this.queryOHLC(ticker, period_seconds, start_time, end_time);

      const periodNanos = BigInt(period_seconds) * 1_000_000_000n;

      // Cap at current time — future slots are not "missing", they don't exist yet.
      const nowNanos = BigInt(Date.now()) * 1_000_000n;
      const effectiveEnd = end_time < nowNanos ? end_time : nowNanos;

      // Build a set of timestamps we already have (queryOHLC returns every row
      // with a bigint timestamp, so Set membership checks are exact).
      const present = new Set(data.map((row: any) => row.timestamp));

      // Collect every expected slot that is absent.
      const missing: bigint[] = [];
      for (let t = start_time; t < effectiveEnd; t += periodNanos) {
        if (!present.has(t)) {
          missing.push(t);
        }
      }

      if (missing.length === 0) {
        return [];
      }

      // Coalesce adjacent missing slots into contiguous [rangeStart, rangeEnd) intervals.
      const ranges: Array<[bigint, bigint]> = [];
      let rangeStart = missing[0];
      let prev = missing[0];
      for (let i = 1; i < missing.length; i++) {
        if (missing[i] !== prev + periodNanos) {
          ranges.push([rangeStart, prev + periodNanos]);
          rangeStart = missing[i];
        }
        prev = missing[i];
      }
      ranges.push([rangeStart, prev + periodNanos]);

      this.logger.debug({
        ticker,
        period_seconds,
        missingSlots: missing.length,
        ranges: ranges.length,
      }, 'OHLC gap detection complete');

      return ranges;
    } catch (error: any) {
      this.logger.error({ error: error.message }, 'Failed to find missing OHLC ranges');
      // Return the full range on error (safe default — triggers a backfill)
      return [[start_time, end_time]];
    }
  }
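
  // Worked example: with period p = periodNanos and bars present only at
  // slots t0 and t0 + 3p over [t0, t0 + 5p), the missing slots t0 + p,
  // t0 + 2p, and t0 + 4p coalesce into two half-open ranges:
  //   [[t0 + p, t0 + 3p], [t0 + 4p, t0 + 5p]]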

  /**
   * Append a batch of conversation messages as a Parquet file in S3.
   * Called once per session at session end to avoid small-file fragmentation.
   */
  async appendMessages(
    userId: string,
    sessionId: string,
    messages: Array<{
      id: string;
      user_id: string;
      session_id: string;
      role: string;
      content: string;
      metadata: string;
      timestamp: number;
    }>
  ): Promise<void> {
    await this.initialize();

    if (!this.conversationsBucket || messages.length === 0) {
      return;
    }

    const now = new Date();
    const year = now.getUTCFullYear();
    const month = String(now.getUTCMonth() + 1).padStart(2, '0');
    const s3Path = `s3://${this.conversationsBucket}/gateway/conversations/year=${year}/month=${month}/user_id=${userId}/${sessionId}.parquet`;

    // Use a timestamp-based name to avoid cross-session collisions
    const tempTable = `msg_flush_${Date.now()}`;

    try {
      await this.query(`
        CREATE TEMP TABLE ${tempTable} (
          id VARCHAR,
          user_id VARCHAR,
          session_id VARCHAR,
          role VARCHAR,
          content VARCHAR,
          metadata VARCHAR,
          timestamp BIGINT
        )
      `);

      for (const msg of messages) {
        await this.query(
          `INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?)`,
          [msg.id, msg.user_id, msg.session_id, msg.role, msg.content, msg.metadata, msg.timestamp]
        );
      }

      await this.query(`COPY ${tempTable} TO '${s3Path}' (FORMAT PARQUET)`);

      this.logger.info({ userId, sessionId, count: messages.length, s3Path }, 'Conversation flushed to Parquet');
    } finally {
      await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
    }
  }
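
  // Resulting cold-storage layout (one file per session, Hive-style
  // year/month/user_id partitions; bucket name is deployment-specific):
  //
  //   s3://<bucket>/gateway/conversations/year=<YYYY>/month=<MM>/user_id=<userId>/<sessionId>.parquet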

  /**
   * Append a batch of image/audio blobs as a Parquet file in S3.
   * Called once per assistant turn that produces binary output.
   */
  async appendBlobs(
    userId: string,
    sessionId: string,
    messageId: string,
    blobs: Array<{
      id: string;
      user_id: string;
      session_id: string;
      message_id: string;
      blob_type: string;
      mime_type: string;
      data: string;
      caption: string | null;
      timestamp: number;
    }>
  ): Promise<void> {
    await this.initialize();

    if (!this.conversationsBucket || blobs.length === 0) {
      return;
    }

    const now = new Date();
    const year = now.getUTCFullYear();
    const month = String(now.getUTCMonth() + 1).padStart(2, '0');
    const s3Path = `s3://${this.conversationsBucket}/gateway/blobs/year=${year}/month=${month}/user_id=${userId}/${sessionId}_${messageId}.parquet`;
    const tempTable = `blob_flush_${Date.now()}`;

    try {
      await this.query(`
        CREATE TEMP TABLE ${tempTable} (
          id VARCHAR,
          user_id VARCHAR,
          session_id VARCHAR,
          message_id VARCHAR,
          blob_type VARCHAR,
          mime_type VARCHAR,
          data VARCHAR,
          caption VARCHAR,
          timestamp BIGINT
        )
      `);

      for (const blob of blobs) {
        await this.query(
          `INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
          [blob.id, blob.user_id, blob.session_id, blob.message_id, blob.blob_type, blob.mime_type, blob.data, blob.caption, blob.timestamp]
        );
      }

      await this.query(`COPY ${tempTable} TO '${s3Path}' (FORMAT PARQUET)`);
      this.logger.info({ userId, sessionId, messageId, count: blobs.length, s3Path }, 'Blobs flushed to Parquet');
    } finally {
      await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
    }
  }

  /**
   * Query blobs from S3 by userId/sessionId, optionally filtered to specific blob IDs.
   */
  async queryBlobs(
    userId: string,
    sessionId: string,
    blobIds?: string[]
  ): Promise<any[]> {
    await this.initialize();

    try {
      const tablePath = await this.getTablePath(this.namespace, 'blobs', this.catalogUri);

      if (!tablePath) {
        // Fallback: scan per-turn Parquet files written directly to S3
        if (this.conversationsBucket) {
          this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning blob Parquet files');
          const parquetPath = `s3://${this.conversationsBucket}/gateway/blobs/**/user_id=${userId}/${sessionId}_*.parquet`;
          const idClause = blobIds?.length
            ? `WHERE id IN (${blobIds.map(id => `'${id.replace(/'/g, "''")}'`).join(', ')})`
            : '';
          try {
            return await this.query(`SELECT * FROM read_parquet('${parquetPath}') ${idClause} ORDER BY timestamp ASC`);
          } catch {
            // No blobs yet for this session
          }
        }
        return [];
      }

      const idFilter = blobIds?.length
        ? `AND id IN (${blobIds.map(() => '?').join(', ')})`
        : '';
      const params: any[] = [userId, sessionId, ...(blobIds ?? [])];

      const sql = `
        SELECT id, user_id, session_id, message_id, blob_type, mime_type, data, caption, timestamp
        FROM iceberg_scan('${tablePath}')
        WHERE user_id = ? AND session_id = ? ${idFilter}
        ORDER BY timestamp ASC
      `;

      const rows = await this.query(sql, params);
      this.logger.info({ userId, sessionId, count: rows.length }, 'Loaded blobs from Iceberg');
      return rows.map((row: any) => ({ ...row, timestamp: Number(row.timestamp) }));
    } catch (error: any) {
      this.logger.error({ error: error.message, userId, sessionId }, 'Failed to query blobs');
      return [];
    }
  }

  /**
   * Close the DuckDB connection
   */
  async close(): Promise<void> {
    if (this.conn) {
      const close = promisify(this.conn.close.bind(this.conn));
      await close();
      this.conn = null;
    }
    if (this.db) {
      const close = promisify(this.db.close.bind(this.db));
      await close();
      this.db = null;
    }
    this.initialized = false;
    this.logger.info('DuckDB client closed');
  }
}