data pipeline refactor and fix
@@ -632,6 +632,118 @@ export class DuckDBClient {
    }
  }

  /**
   * Append a batch of image/audio blobs as a Parquet file in S3.
   * Called once per assistant turn that produces binary output.
   */
  async appendBlobs(
    userId: string,
    sessionId: string,
    messageId: string,
    blobs: Array<{
      id: string;
      user_id: string;
      session_id: string;
      message_id: string;
      blob_type: string;
      mime_type: string;
      data: string;
      caption: string | null;
      timestamp: number;
    }>
  ): Promise<void> {
    await this.initialize();

    if (!this.conversationsBucket || blobs.length === 0) {
      return;
    }

    const now = new Date();
    const year = now.getUTCFullYear();
    const month = String(now.getUTCMonth() + 1).padStart(2, '0');
    const s3Path = `s3://${this.conversationsBucket}/gateway/blobs/year=${year}/month=${month}/user_id=${userId}/${sessionId}_${messageId}.parquet`;
    const tempTable = `blob_flush_${Date.now()}`;

    try {
      await this.query(`
        CREATE TEMP TABLE ${tempTable} (
          id VARCHAR,
          user_id VARCHAR,
          session_id VARCHAR,
          message_id VARCHAR,
          blob_type VARCHAR,
          mime_type VARCHAR,
          data VARCHAR,
          caption VARCHAR,
          timestamp BIGINT
        )
      `);

      for (const blob of blobs) {
        await this.query(
          `INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
          [blob.id, blob.user_id, blob.session_id, blob.message_id, blob.blob_type, blob.mime_type, blob.data, blob.caption, blob.timestamp]
        );
      }

      await this.query(`COPY ${tempTable} TO '${s3Path}' (FORMAT PARQUET)`);
      this.logger.info({ userId, sessionId, messageId, count: blobs.length, s3Path }, 'Blobs flushed to Parquet');
    } finally {
      await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
    }
  }
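
  // Usage sketch (hypothetical, not part of this commit): how a gateway turn handler
  // might flush one assistant turn's binary output. `client` and the blob payload are
  // illustrative; only appendBlobs' signature above comes from the source. Each call
  // writes one Parquet file under the Hive-style year=/month=/user_id= prefix that
  // queryBlobs' fallback glob relies on.
  //
  // async function flushTurnBlobs(client: DuckDBClient, pngBase64: string) {
  //   await client.appendBlobs('user-123', 'session-abc', 'msg-001', [
  //     {
  //       id: 'blob-001',
  //       user_id: 'user-123',
  //       session_id: 'session-abc',
  //       message_id: 'msg-001',
  //       blob_type: 'image',
  //       mime_type: 'image/png',
  //       data: pngBase64, // `data` is VARCHAR; base64 encoding is an assumption here
  //       caption: null,
  //       timestamp: Date.now(),
  //     },
  //   ]);
  // }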

  /**
   * Query blobs from S3 by userId/sessionId, optionally filtered to specific blob IDs.
   */
  async queryBlobs(
    userId: string,
    sessionId: string,
    blobIds?: string[]
  ): Promise<any[]> {
    await this.initialize();

    try {
      const tablePath = await this.getTablePath(this.namespace, 'blobs', this.catalogUri);

      if (!tablePath) {
        // Fallback: scan per-turn Parquet files written directly to S3
        if (this.conversationsBucket) {
          this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning blob Parquet files');
          const parquetPath = `s3://${this.conversationsBucket}/gateway/blobs/**/user_id=${userId}/${sessionId}_*.parquet`;
          const idClause = blobIds?.length
            ? `WHERE id IN (${blobIds.map(id => `'${id.replace(/'/g, "''")}'`).join(', ')})`
            : '';
          try {
            const rows = await this.query(`SELECT * FROM read_parquet('${parquetPath}') ${idClause} ORDER BY timestamp ASC`);
            // Normalize BIGINT timestamps to JS numbers, matching the Iceberg path below.
            return rows.map((row: any) => ({ ...row, timestamp: Number(row.timestamp) }));
          } catch {
            // No blobs yet for this session
          }
        }
        return [];
      }

      const idFilter = blobIds?.length
        ? `AND id IN (${blobIds.map(() => '?').join(', ')})`
        : '';
      const params: any[] = [userId, sessionId, ...(blobIds ?? [])];

      const sql = `
        SELECT id, user_id, session_id, message_id, blob_type, mime_type, data, caption, timestamp
        FROM iceberg_scan('${tablePath}')
        WHERE user_id = ? AND session_id = ? ${idFilter}
        ORDER BY timestamp ASC
      `;

      const rows = await this.query(sql, params);
      this.logger.info({ userId, sessionId, count: rows.length }, 'Loaded blobs from Iceberg');
      return rows.map((row: any) => ({ ...row, timestamp: Number(row.timestamp) }));
    } catch (error: any) {
      this.logger.error({ error: error.message, userId, sessionId }, 'Failed to query blobs');
      return [];
    }
  }
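
  // Usage sketch (hypothetical, not part of this commit): reading blobs back with the
  // optional ID filter. queryBlobs prefers the Iceberg table registered in the REST
  // catalog and only falls back to globbing the per-turn Parquet files; callers see
  // plain-number timestamps either way.
  //
  // async function loadSessionBlobs(client: DuckDBClient) {
  //   const blobs = await client.queryBlobs('user-123', 'session-abc', ['blob-001']);
  //   for (const blob of blobs) {
  //     console.log(blob.mime_type, blob.caption ?? '(no caption)', blob.timestamp);
  //   }
  // }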

  /**
   * Close the DuckDB connection.
   */