redesign fully scaffolded and web login works
gateway/src/harness/memory/document-loader.ts (new file, 356 lines)
@@ -0,0 +1,356 @@
import { readdir, readFile } from 'fs/promises';
import { join, relative } from 'path';
import { createHash } from 'crypto';
import type { FastifyBaseLogger } from 'fastify';
import { RAGRetriever } from './rag-retriever.js';
import { EmbeddingService } from './embedding-service.js';

/**
 * Document metadata stored with each chunk
 */
export interface DocumentMetadata {
  document_id: string;
  chunk_index: number;
  content_hash: string;
  last_updated: number;
  tags: string[];
  heading?: string;
  file_path: string;
}

/**
 * Document chunk with content and metadata
 */
export interface DocumentChunk {
  content: string;
  metadata: DocumentMetadata;
}

/**
 * Document loader configuration
 */
export interface DocumentLoaderConfig {
  knowledgeDir: string;
  maxChunkSize?: number; // maximum chunk size in characters (~4 chars per token)
  chunkOverlap?: number; // character overlap between consecutive chunks
}

/**
 * Global knowledge document loader
 *
 * Loads markdown documents from a directory structure and stores them
 * as global knowledge (user_id="0") in Qdrant for RAG retrieval.
 *
 * Features:
 * - Intelligent chunking by markdown headers
 * - Content hashing for change detection
 * - Metadata extraction (tags, headings)
 * - Automatic embedding generation
 * - Incremental updates (only changed docs)
 *
 * Directory structure:
 *   gateway/knowledge/
 *     platform/
 *     trading/
 *     indicators/
 *     strategies/
 */
export class DocumentLoader {
  private config: DocumentLoaderConfig;
  private logger: FastifyBaseLogger;
  private embeddings: EmbeddingService;
  private rag: RAGRetriever;
  // relative path -> content hash and size of the last loaded version
  private loadedDocs: Map<string, { hash: string; size: number }> = new Map();

  constructor(
    config: DocumentLoaderConfig,
    embeddings: EmbeddingService,
    rag: RAGRetriever,
    logger: FastifyBaseLogger
  ) {
    this.config = {
      maxChunkSize: 4000, // ~1000 tokens
      chunkOverlap: 200,
      ...config,
    };
    this.embeddings = embeddings;
    this.rag = rag;
    this.logger = logger;
  }

  /**
   * Load all documents from the knowledge directory
   */
  async loadAll(): Promise<{ loaded: number; updated: number; skipped: number }> {
    this.logger.info({ dir: this.config.knowledgeDir }, 'Loading knowledge documents');

    const stats = { loaded: 0, updated: 0, skipped: 0 };

    try {
      const files = await this.findMarkdownFiles(this.config.knowledgeDir);

      for (const filePath of files) {
        const result = await this.loadDocument(filePath);

        if (result === 'loaded') stats.loaded++;
        else if (result === 'updated') stats.updated++;
        else stats.skipped++;
      }

      this.logger.info(stats, 'Knowledge documents loaded');
      return stats;
    } catch (error) {
      this.logger.error({ error }, 'Failed to load knowledge documents');
      throw error;
    }
  }

  /**
   * Load a single document
   */
  async loadDocument(filePath: string): Promise<'loaded' | 'updated' | 'skipped'> {
    try {
      // Read file content
      const content = await readFile(filePath, 'utf-8');
      const contentHash = this.hashContent(content);

      // Check if the document has changed since the last load
      const relativePath = relative(this.config.knowledgeDir, filePath);
      const existing = this.loadedDocs.get(relativePath);

      if (existing?.hash === contentHash) {
        this.logger.debug({ file: relativePath }, 'Document unchanged, skipping');
        return 'skipped';
      }

      const isUpdate = !!existing;

      // Parse and chunk document
      const chunks = this.chunkDocument(content, relativePath);

      this.logger.info(
        { file: relativePath, chunks: chunks.length, update: isUpdate },
        'Processing document'
      );

      // Generate embeddings and store chunks
      for (const chunk of chunks) {
        const embedding = await this.embeddings.embed(chunk.content);

        // Create a unique ID for this chunk
        const chunkId = `global:${chunk.metadata.document_id}:${chunk.metadata.chunk_index}`;

        // Store in Qdrant as global knowledge
        await this.rag.storeGlobalKnowledge(
          chunkId,
          chunk.content,
          embedding,
          {
            ...chunk.metadata,
            type: 'knowledge_doc',
          }
        );
      }

      // Update loaded-docs tracking
      this.loadedDocs.set(relativePath, { hash: contentHash, size: content.length });

      return isUpdate ? 'updated' : 'loaded';
    } catch (error) {
      this.logger.error({ error, file: filePath }, 'Failed to load document');
      throw error;
    }
  }

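  // For illustration, one chunk of a hypothetical trading/rsi.md would be passed to
  // rag.storeGlobalKnowledge() roughly as:
  //   id:       'global:trading/rsi.md:2'
  //   metadata: { document_id: 'trading/rsi.md', chunk_index: 2, content_hash: '<md5 hex>',
  //               last_updated: 1700000000000, tags: ['rsi-indicator'], heading: 'Usage',
  //               file_path: 'trading/rsi.md', type: 'knowledge_doc' }
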
  /**
   * Reload a specific document (for updates)
   */
  async reloadDocument(filePath: string): Promise<void> {
    this.logger.info({ file: filePath }, 'Reloading document');
    await this.loadDocument(filePath);
  }

  /**
   * Chunk document by markdown headers with smart splitting
   */
  private chunkDocument(content: string, documentId: string): DocumentChunk[] {
    const chunks: DocumentChunk[] = [];
    const tags = this.extractTags(content);
    const lastModified = Date.now();
    const contentHash = this.hashContent(content); // hash once; shared by all chunks of this document

    // Split by headers (## through ####)
    const sections = this.splitByHeaders(content);

    let chunkIndex = 0;

    for (const section of sections) {
      // If a section is too large, split it further
      const subChunks = this.splitLargeSection(section.content);

      for (const subContent of subChunks) {
        if (subContent.trim().length === 0) continue;

        chunks.push({
          content: subContent,
          metadata: {
            document_id: documentId,
            chunk_index: chunkIndex++,
            content_hash: contentHash,
            last_updated: lastModified,
            tags,
            heading: section.heading,
            file_path: documentId,
          },
        });
      }
    }

    return chunks;
  }

  /**
   * Split document by markdown headers
   */
  private splitByHeaders(content: string): Array<{ heading?: string; content: string }> {
    const lines = content.split('\n');
    const sections: Array<{ heading?: string; content: string }> = [];
    let currentSection: string[] = [];
    let currentHeading: string | undefined;

    for (const line of lines) {
      // Check for a markdown header (##, ###, ####)
      const headerMatch = line.match(/^(#{2,4})\s+(.+)$/);

      if (headerMatch) {
        // Save previous section
        if (currentSection.length > 0) {
          sections.push({
            heading: currentHeading,
            content: currentSection.join('\n'),
          });
        }

        // Start new section
        currentHeading = headerMatch[2].trim();
        currentSection = [line];
      } else {
        currentSection.push(line);
      }
    }

    // Add final section
    if (currentSection.length > 0) {
      sections.push({
        heading: currentHeading,
        content: currentSection.join('\n'),
      });
    }

    return sections;
  }

  /**
   * Split large sections into smaller chunks
   */
  private splitLargeSection(content: string): string[] {
    const maxSize = this.config.maxChunkSize!;
    const overlap = this.config.chunkOverlap!;

    if (content.length <= maxSize) {
      return [content];
    }

    const chunks: string[] = [];
    let start = 0;

    while (start < content.length) {
      const end = Math.min(start + maxSize, content.length);
      let chunkEnd = end;

      // Try to break at a paragraph or sentence boundary
      if (end < content.length) {
        const sentenceEnd = content.lastIndexOf('.', end);
        const paragraphEnd = content.lastIndexOf('\n\n', end);

        if (paragraphEnd > start + maxSize / 2) {
          chunkEnd = paragraphEnd;
        } else if (sentenceEnd > start + maxSize / 2) {
          chunkEnd = sentenceEnd + 1;
        }
      }

      chunks.push(content.substring(start, chunkEnd));

      // Stop once the end of the content has been emitted; stepping back by the
      // overlap here would otherwise re-emit the final tail forever.
      if (chunkEnd >= content.length) break;
      start = chunkEnd - overlap;
    }

    return chunks;
  }

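  // Worked example of the splitting above (defaults: maxChunkSize 4000, chunkOverlap 200):
  // a 9,000-character section with no sentence or paragraph break in the second half of
  // any window becomes three chunks covering [0, 4000), [3800, 7800) and [7600, 9000),
  // so each chunk repeats the last 200 characters of the one before it.
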
  /**
   * Extract tags from document (frontmatter or first heading)
   */
  private extractTags(content: string): string[] {
    const tags: string[] = [];

    // Try to extract from YAML frontmatter
    const frontmatterMatch = content.match(/^---\n([\s\S]*?)\n---/);
    if (frontmatterMatch) {
      const frontmatter = frontmatterMatch[1];
      const tagsMatch = frontmatter.match(/tags:\s*\[([^\]]+)\]/);
      if (tagsMatch) {
        tags.push(...tagsMatch[1].split(',').map((t) => t.trim()));
      }
    }

    // Extract from first heading
    const headingMatch = content.match(/^#\s+(.+)$/m);
    if (headingMatch) {
      tags.push(headingMatch[1].toLowerCase().replace(/\s+/g, '-'));
    }

    return tags;
  }

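  // Illustrative knowledge doc header (hypothetical content) and what extractTags() returns:
  //
  //   ---
  //   tags: [trading, indicators]
  //   ---
  //   # RSI Indicator
  //
  // -> ['trading', 'indicators', 'rsi-indicator']: the frontmatter list plus the first
  //    top-level heading, lowercased with spaces replaced by dashes.
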
  /**
   * Hash content for change detection
   */
  private hashContent(content: string): string {
    return createHash('md5').update(content).digest('hex');
  }

  /**
   * Recursively find all markdown files
   */
  private async findMarkdownFiles(dir: string): Promise<string[]> {
    const files: string[] = [];

    try {
      const entries = await readdir(dir, { withFileTypes: true });

      for (const entry of entries) {
        const fullPath = join(dir, entry.name);

        if (entry.isDirectory()) {
          const subFiles = await this.findMarkdownFiles(fullPath);
          files.push(...subFiles);
        } else if (entry.isFile() && entry.name.endsWith('.md')) {
          files.push(fullPath);
        }
      }
    } catch (error) {
      this.logger.warn({ error, dir }, 'Failed to read directory');
    }

    return files;
  }

  /**
   * Get loaded document stats
   */
  getStats(): { totalDocs: number; totalSize: number } {
    return {
      totalDocs: this.loadedDocs.size,
      // Total characters across all loaded documents, recorded at load time
      totalSize: Array.from(this.loadedDocs.values()).reduce((sum, doc) => sum + doc.size, 0),
    };
  }
}
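
For reference, a minimal wiring sketch of the new loader. The declare const lines stand in for the EmbeddingService, RAGRetriever and Fastify logger instances, which are constructed elsewhere in the gateway and are not part of this diff; the knowledgeDir value follows the directory layout described in the class doc comment.

import type { FastifyBaseLogger } from 'fastify';
import { DocumentLoader } from './document-loader.js';
import { EmbeddingService } from './embedding-service.js';
import { RAGRetriever } from './rag-retriever.js';

// Created elsewhere in the gateway; their constructors are not shown in this commit.
declare const embeddings: EmbeddingService;
declare const rag: RAGRetriever;
declare const logger: FastifyBaseLogger;

const loader = new DocumentLoader({ knowledgeDir: 'gateway/knowledge' }, embeddings, rag, logger);

// First call embeds everything; later calls re-embed only documents whose content hash changed.
const stats = await loader.loadAll();
logger.info(stats, 'Knowledge documents ready'); // { loaded, updated, skipped }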