data pipeline refactor and fix

This commit is contained in:
2026-04-13 18:30:04 -04:00
parent 6418729b16
commit 326bf80846
96 changed files with 7107 additions and 1763 deletions

View File

@@ -632,6 +632,118 @@ export class DuckDBClient {
}
}
/**
 * Append a batch of image/audio blobs as a Parquet file in S3.
 * Called once per assistant turn that produces binary output.
 *
 * Writes go to a per-turn object keyed by year/month/user/session/message;
 * no-op when the conversations bucket is unset or `blobs` is empty.
 *
 * @param userId - Owner of the session; becomes part of the S3 partition path.
 * @param sessionId - Session the blobs belong to; part of the object name.
 * @param messageId - Message that produced the blobs; part of the object name.
 * @param blobs - Rows to persist; `data` is the (string-encoded) payload.
 */
async appendBlobs(
  userId: string,
  sessionId: string,
  messageId: string,
  blobs: Array<{
    id: string;
    user_id: string;
    session_id: string;
    message_id: string;
    blob_type: string;
    mime_type: string;
    data: string;
    caption: string | null;
    timestamp: number;
  }>
): Promise<void> {
  await this.initialize();
  if (!this.conversationsBucket || blobs.length === 0) {
    return;
  }
  const now = new Date();
  const year = now.getUTCFullYear();
  const month = String(now.getUTCMonth() + 1).padStart(2, '0');
  const s3Path = `s3://${this.conversationsBucket}/gateway/blobs/year=${year}/month=${month}/user_id=${userId}/${sessionId}_${messageId}.parquet`;
  // Date.now() alone can collide when two turns flush within the same
  // millisecond on one connection; add a random suffix to keep the temp
  // table name unique.
  const tempTable = `blob_flush_${Date.now()}_${Math.floor(Math.random() * 1e6)}`;
  // Escape single quotes so path components (user/session/message IDs)
  // cannot break out of the SQL string literal in the COPY statement.
  const escapedS3Path = s3Path.replace(/'/g, "''");
  try {
    await this.query(`
      CREATE TEMP TABLE ${tempTable} (
        id VARCHAR,
        user_id VARCHAR,
        session_id VARCHAR,
        message_id VARCHAR,
        blob_type VARCHAR,
        mime_type VARCHAR,
        data VARCHAR,
        caption VARCHAR,
        timestamp BIGINT
      )
    `);
    // Rows are parameterized individually; batches are small (one turn's
    // binary outputs), so per-row INSERT cost is acceptable.
    for (const blob of blobs) {
      await this.query(
        `INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
        [blob.id, blob.user_id, blob.session_id, blob.message_id, blob.blob_type, blob.mime_type, blob.data, blob.caption, blob.timestamp]
      );
    }
    await this.query(`COPY ${tempTable} TO '${escapedS3Path}' (FORMAT PARQUET)`);
    this.logger.info({ userId, sessionId, messageId, count: blobs.length, s3Path }, 'Blobs flushed to Parquet');
  } finally {
    // Best-effort cleanup; a failed DROP must not mask the original error.
    await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
  }
}
/**
 * Query blobs from S3 by userId/sessionId, optionally filtered to specific blob IDs.
 *
 * Prefers the Iceberg table resolved via the REST catalog; on a catalog miss
 * it falls back to globbing the per-turn Parquet files written by
 * `appendBlobs`. Returns `[]` on any failure rather than throwing.
 *
 * @param userId - Owner of the session.
 * @param sessionId - Session whose blobs to load.
 * @param blobIds - Optional subset of blob IDs to return.
 * @returns Rows ordered by ascending timestamp, with `timestamp` coerced to number.
 */
async queryBlobs(
  userId: string,
  sessionId: string,
  blobIds?: string[]
): Promise<any[]> {
  await this.initialize();
  // Double single quotes so user-controlled values cannot break out of SQL
  // string literals (the fallback path cannot use bound parameters for the
  // read_parquet glob).
  const escapeSql = (value: string) => value.replace(/'/g, "''");
  // Both branches must agree on the timestamp type: BIGINT columns may
  // surface as BigInt, so normalize to number before returning.
  const normalize = (rows: any[]) =>
    rows.map((row: any) => ({ ...row, timestamp: Number(row.timestamp) }));
  try {
    const tablePath = await this.getTablePath(this.namespace, 'blobs', this.catalogUri);
    if (!tablePath) {
      // Fallback: scan per-turn Parquet files written directly to S3
      if (this.conversationsBucket) {
        this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning blob Parquet files');
        const parquetPath = `s3://${this.conversationsBucket}/gateway/blobs/**/user_id=${escapeSql(userId)}/${escapeSql(sessionId)}_*.parquet`;
        const idClause = blobIds?.length
          ? `WHERE id IN (${blobIds.map(id => `'${escapeSql(id)}'`).join(', ')})`
          : '';
        try {
          const rows = await this.query(`SELECT * FROM read_parquet('${parquetPath}') ${idClause} ORDER BY timestamp ASC`);
          return normalize(rows);
        } catch (scanError: any) {
          // Usually just means no blobs exist yet for this session, but log
          // at debug so genuine scan failures are not invisible.
          this.logger.debug({ error: scanError?.message, userId, sessionId }, 'Blob Parquet scan failed; treating as empty');
        }
      }
      return [];
    }
    const idFilter = blobIds?.length
      ? `AND id IN (${blobIds.map(() => '?').join(', ')})`
      : '';
    const params: any[] = [userId, sessionId, ...(blobIds ?? [])];
    const sql = `
      SELECT id, user_id, session_id, message_id, blob_type, mime_type, data, caption, timestamp
      FROM iceberg_scan('${tablePath}')
      WHERE user_id = ? AND session_id = ? ${idFilter}
      ORDER BY timestamp ASC
    `;
    const rows = await this.query(sql, params);
    this.logger.info({ userId, sessionId, count: rows.length }, 'Loaded blobs from Iceberg');
    return normalize(rows);
  } catch (error: any) {
    this.logger.error({ error: error.message, userId, sessionId }, 'Failed to query blobs');
    return [];
  }
}
/**
* Close the DuckDB connection
*/