package vector import ( "context" "crypto/md5" "fmt" "log" "regexp" "strings" "time" ) // DocumentProcessor 文档处理器接口 type DocumentProcessor interface { // 处理文档内容,返回分块后的文档 ProcessDocument(ctx context.Context, content, filename, docType string) ([]Document, error) // 处理知识库文档上传 ProcessKnowledgeDocument(ctx context.Context, content, title, category string) ([]Document, error) } // documentProcessor 文档处理器实现 type documentProcessor struct { vectorService VectorService chunkSize int chunkOverlap int } // NewDocumentProcessor 创建文档处理器 func NewDocumentProcessor(vectorService VectorService) DocumentProcessor { return &documentProcessor{ vectorService: vectorService, chunkSize: 1000, // 默认分块大小 chunkOverlap: 200, // 默认重叠大小 } } // ProcessDocument 处理文档内容 func (dp *documentProcessor) ProcessDocument(ctx context.Context, content, filename, docType string) ([]Document, error) { // 清理和预处理文本 cleanContent := dp.cleanText(content) // 分块处理 chunks := dp.splitText(cleanContent, dp.chunkSize, dp.chunkOverlap) // 创建文档对象 var documents []Document for i, chunk := range chunks { if strings.TrimSpace(chunk) == "" { continue } // 生成唯一ID docID := dp.generateDocumentID(filename, i) doc := Document{ ID: docID, Content: chunk, Metadata: map[string]interface{}{ "filename": filename, "doc_type": docType, "chunk_id": fmt.Sprintf("%d", i), "timestamp": time.Now().Format(time.RFC3339), "source": "document_upload", }, } documents = append(documents, doc) } // 转换为指针切片 docPtrs := make([]*Document, len(documents)) for i := range documents { docPtrs[i] = &documents[i] } // 存储到向量数据库 if err := dp.vectorService.AddDocuments(ctx, docPtrs); err != nil { return nil, fmt.Errorf("failed to store documents in vector database: %w", err) } log.Printf("Successfully processed document %s into %d chunks", filename, len(documents)) return documents, nil } // ProcessKnowledgeDocument 处理知识库文档 func (dp *documentProcessor) ProcessKnowledgeDocument(ctx context.Context, content, title, category string) ([]Document, error) { // 清理和预处理文本 cleanContent := dp.cleanText(content) // 分块处理 chunks := dp.splitText(cleanContent, dp.chunkSize, dp.chunkOverlap) // 创建文档对象 var documents []Document for i, chunk := range chunks { if strings.TrimSpace(chunk) == "" { continue } // 生成唯一ID docID := dp.generateKnowledgeID(title, category, i) doc := Document{ ID: docID, Content: chunk, Metadata: map[string]interface{}{ "title": title, "category": category, "chunk_id": fmt.Sprintf("%d", i), "timestamp": time.Now().Format(time.RFC3339), "source": "knowledge_base", "type": "knowledge", }, } documents = append(documents, doc) } // 转换为指针切片 docPtrs := make([]*Document, len(documents)) for i := range documents { docPtrs[i] = &documents[i] } // 存储到向量数据库 if err := dp.vectorService.AddDocuments(ctx, docPtrs); err != nil { return nil, fmt.Errorf("failed to store knowledge documents in vector database: %w", err) } log.Printf("Successfully processed knowledge document '%s' into %d chunks", title, len(documents)) return documents, nil } // cleanText 清理文本内容 func (dp *documentProcessor) cleanText(text string) string { // 移除多余的空白字符 text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ") // 移除特殊字符(保留基本标点) text = regexp.MustCompile(`[^\p{L}\p{N}\p{P}\p{Z}]`).ReplaceAllString(text, "") // 移除多余的换行符 text = regexp.MustCompile(`\n+`).ReplaceAllString(text, "\n") return strings.TrimSpace(text) } // splitText 分块文本 func (dp *documentProcessor) splitText(text string, chunkSize, overlap int) []string { if len(text) <= chunkSize { return []string{text} } var chunks []string start := 0 for start < len(text) { end := start + chunkSize if end > len(text) { end = len(text) } // 尝试在句子边界分割 chunk := text[start:end] if end < len(text) { // 寻找最后一个句号、问号或感叹号 lastSentence := strings.LastIndexAny(chunk, ".!?。!?") if lastSentence > chunkSize/2 { // 确保块不会太小 end = start + lastSentence + 1 chunk = text[start:end] } } chunks = append(chunks, strings.TrimSpace(chunk)) // 计算下一个块的起始位置(考虑重叠) if end >= len(text) { break } start = end - overlap if start < 0 { start = 0 } } return chunks } // generateDocumentID 生成文档ID func (dp *documentProcessor) generateDocumentID(filename string, chunkIndex int) string { data := fmt.Sprintf("%s_%d_%d", filename, chunkIndex, time.Now().Unix()) hash := md5.Sum([]byte(data)) return fmt.Sprintf("doc_%x", hash) } // generateKnowledgeID 生成知识库文档ID func (dp *documentProcessor) generateKnowledgeID(title, category string, chunkIndex int) string { data := fmt.Sprintf("%s_%s_%d_%d", title, category, chunkIndex, time.Now().Unix()) hash := md5.Sum([]byte(data)) return fmt.Sprintf("kb_%x", hash) } // KnowledgeSearcher 知识检索器接口 type KnowledgeSearcher interface { // 搜索相关知识 SearchKnowledge(ctx context.Context, query string, limit int) ([]SearchResult, error) // 搜索特定类别的知识 SearchKnowledgeByCategory(ctx context.Context, query, category string, limit int) ([]SearchResult, error) // 获取知识摘要 GetKnowledgeSummary(ctx context.Context, query string) (string, error) } // knowledgeSearcher 知识检索器实现 type knowledgeSearcher struct { vectorService VectorService } // NewKnowledgeSearcher 创建知识检索器 func NewKnowledgeSearcher(vectorService VectorService) KnowledgeSearcher { return &knowledgeSearcher{ vectorService: vectorService, } } // SearchKnowledge 搜索相关知识 func (ks *knowledgeSearcher) SearchKnowledge(ctx context.Context, query string, limit int) ([]SearchResult, error) { results, err := ks.vectorService.SearchSimilar(ctx, query, limit) if err != nil { return nil, fmt.Errorf("failed to search knowledge: %w", err) } // 过滤知识库文档 var knowledgeResults []SearchResult for _, result := range results { if result.Document.Metadata["source"] == "knowledge_base" { knowledgeResults = append(knowledgeResults, *result) } } log.Printf("Found %d knowledge documents for query: %s", len(knowledgeResults), query) return knowledgeResults, nil } // SearchKnowledgeByCategory 搜索特定类别的知识 func (ks *knowledgeSearcher) SearchKnowledgeByCategory(ctx context.Context, query, category string, limit int) ([]SearchResult, error) { results, err := ks.vectorService.SearchSimilar(ctx, query, limit*2) // 获取更多结果用于过滤 if err != nil { return nil, fmt.Errorf("failed to search knowledge by category: %w", err) } // 过滤特定类别的知识库文档 var categoryResults []SearchResult for _, result := range results { if result.Document.Metadata["source"] == "knowledge_base" && result.Document.Metadata["category"] == category { categoryResults = append(categoryResults, *result) if len(categoryResults) >= limit { break } } } log.Printf("Found %d knowledge documents in category '%s' for query: %s", len(categoryResults), category, query) return categoryResults, nil } // GetKnowledgeSummary 获取知识摘要 func (ks *knowledgeSearcher) GetKnowledgeSummary(ctx context.Context, query string) (string, error) { // 搜索相关知识 results, err := ks.SearchKnowledge(ctx, query, 5) if err != nil { return "", fmt.Errorf("failed to search knowledge for summary: %w", err) } if len(results) == 0 { return "未找到相关知识内容。", nil } // 组合相关内容 var contents []string for _, result := range results { if result.Score > 0.7 { // 只包含高相关性的内容 contents = append(contents, result.Document.Content) } } if len(contents) == 0 { return "未找到高相关性的知识内容。", nil } // 简单的摘要生成(实际项目中可以使用AI生成摘要) summary := strings.Join(contents, "\n\n") if len(summary) > 500 { summary = summary[:500] + "..." } return summary, nil }