data pipeline refactor and fix

This commit is contained in:
2026-04-13 18:30:04 -04:00
parent 6418729b16
commit 326bf80846
96 changed files with 7107 additions and 1763 deletions

View File

@@ -632,6 +632,118 @@ export class DuckDBClient {
}
}
/**
 * Append a batch of image/audio blobs as a Parquet file in S3.
 * Called once per assistant turn that produces binary output.
 *
 * Writes go to a per-turn object keyed by year/month/user/session/message;
 * no-op when the conversations bucket is unset or `blobs` is empty.
 *
 * @param userId - Owner of the session; becomes part of the S3 partition path.
 * @param sessionId - Session the blobs belong to; part of the object name.
 * @param messageId - Message that produced the blobs; part of the object name.
 * @param blobs - Rows to persist; `data` is the (string-encoded) payload.
 */
async appendBlobs(
  userId: string,
  sessionId: string,
  messageId: string,
  blobs: Array<{
    id: string;
    user_id: string;
    session_id: string;
    message_id: string;
    blob_type: string;
    mime_type: string;
    data: string;
    caption: string | null;
    timestamp: number;
  }>
): Promise<void> {
  await this.initialize();
  if (!this.conversationsBucket || blobs.length === 0) {
    return;
  }
  const now = new Date();
  const year = now.getUTCFullYear();
  const month = String(now.getUTCMonth() + 1).padStart(2, '0');
  const s3Path = `s3://${this.conversationsBucket}/gateway/blobs/year=${year}/month=${month}/user_id=${userId}/${sessionId}_${messageId}.parquet`;
  // Date.now() alone can collide when two turns flush within the same
  // millisecond on one connection; add a random suffix to keep the temp
  // table name unique.
  const tempTable = `blob_flush_${Date.now()}_${Math.floor(Math.random() * 1e6)}`;
  // Escape single quotes so path components (user/session/message IDs)
  // cannot break out of the SQL string literal in the COPY statement.
  const escapedS3Path = s3Path.replace(/'/g, "''");
  try {
    await this.query(`
      CREATE TEMP TABLE ${tempTable} (
        id VARCHAR,
        user_id VARCHAR,
        session_id VARCHAR,
        message_id VARCHAR,
        blob_type VARCHAR,
        mime_type VARCHAR,
        data VARCHAR,
        caption VARCHAR,
        timestamp BIGINT
      )
    `);
    // Rows are parameterized individually; batches are small (one turn's
    // binary outputs), so per-row INSERT cost is acceptable.
    for (const blob of blobs) {
      await this.query(
        `INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
        [blob.id, blob.user_id, blob.session_id, blob.message_id, blob.blob_type, blob.mime_type, blob.data, blob.caption, blob.timestamp]
      );
    }
    await this.query(`COPY ${tempTable} TO '${escapedS3Path}' (FORMAT PARQUET)`);
    this.logger.info({ userId, sessionId, messageId, count: blobs.length, s3Path }, 'Blobs flushed to Parquet');
  } finally {
    // Best-effort cleanup; a failed DROP must not mask the original error.
    await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
  }
}
/**
 * Query blobs from S3 by userId/sessionId, optionally filtered to specific blob IDs.
 *
 * Prefers the Iceberg table resolved via the REST catalog; on a catalog miss
 * it falls back to globbing the per-turn Parquet files written by
 * `appendBlobs`. Returns `[]` on any failure rather than throwing.
 *
 * @param userId - Owner of the session.
 * @param sessionId - Session whose blobs to load.
 * @param blobIds - Optional subset of blob IDs to return.
 * @returns Rows ordered by ascending timestamp, with `timestamp` coerced to number.
 */
async queryBlobs(
  userId: string,
  sessionId: string,
  blobIds?: string[]
): Promise<any[]> {
  await this.initialize();
  // Double single quotes so user-controlled values cannot break out of SQL
  // string literals (the fallback path cannot use bound parameters for the
  // read_parquet glob).
  const escapeSql = (value: string) => value.replace(/'/g, "''");
  // Both branches must agree on the timestamp type: BIGINT columns may
  // surface as BigInt, so normalize to number before returning.
  const normalize = (rows: any[]) =>
    rows.map((row: any) => ({ ...row, timestamp: Number(row.timestamp) }));
  try {
    const tablePath = await this.getTablePath(this.namespace, 'blobs', this.catalogUri);
    if (!tablePath) {
      // Fallback: scan per-turn Parquet files written directly to S3
      if (this.conversationsBucket) {
        this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning blob Parquet files');
        const parquetPath = `s3://${this.conversationsBucket}/gateway/blobs/**/user_id=${escapeSql(userId)}/${escapeSql(sessionId)}_*.parquet`;
        const idClause = blobIds?.length
          ? `WHERE id IN (${blobIds.map(id => `'${escapeSql(id)}'`).join(', ')})`
          : '';
        try {
          const rows = await this.query(`SELECT * FROM read_parquet('${parquetPath}') ${idClause} ORDER BY timestamp ASC`);
          return normalize(rows);
        } catch (scanError: any) {
          // Usually just means no blobs exist yet for this session, but log
          // at debug so genuine scan failures are not invisible.
          this.logger.debug({ error: scanError?.message, userId, sessionId }, 'Blob Parquet scan failed; treating as empty');
        }
      }
      return [];
    }
    const idFilter = blobIds?.length
      ? `AND id IN (${blobIds.map(() => '?').join(', ')})`
      : '';
    const params: any[] = [userId, sessionId, ...(blobIds ?? [])];
    const sql = `
      SELECT id, user_id, session_id, message_id, blob_type, mime_type, data, caption, timestamp
      FROM iceberg_scan('${tablePath}')
      WHERE user_id = ? AND session_id = ? ${idFilter}
      ORDER BY timestamp ASC
    `;
    const rows = await this.query(sql, params);
    this.logger.info({ userId, sessionId, count: rows.length }, 'Loaded blobs from Iceberg');
    return normalize(rows);
  } catch (error: any) {
    this.logger.error({ error: error.message, userId, sessionId }, 'Failed to query blobs');
    return [];
  }
}
/**
* Close the DuckDB connection
*/