ai-courseware/eino-project/internal/domain/vector/document.go

296 lines
8.4 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package vector
import (
"context"
"crypto/md5"
"fmt"
"log"
"regexp"
"strings"
"time"
)
// DocumentProcessor chunks raw text and stores the resulting documents
// in the vector database.
type DocumentProcessor interface {
	// ProcessDocument cleans and chunks an uploaded file's content,
	// indexes each chunk, and returns the chunked documents.
	ProcessDocument(ctx context.Context, content, filename, docType string) ([]Document, error)
	// ProcessKnowledgeDocument does the same for a knowledge-base entry,
	// tagging each chunk with the given title and category.
	ProcessKnowledgeDocument(ctx context.Context, content, title, category string) ([]Document, error)
}
// documentProcessor is the default DocumentProcessor implementation.
type documentProcessor struct {
	vectorService VectorService // backing vector store that receives indexed chunks
	chunkSize     int           // maximum chunk length used by splitText
	chunkOverlap  int           // amount of text repeated between adjacent chunks
}
// NewDocumentProcessor builds a DocumentProcessor backed by the given
// vector service, using the default chunking configuration:
// 1000-character chunks with a 200-character overlap.
func NewDocumentProcessor(vectorService VectorService) DocumentProcessor {
	processor := &documentProcessor{vectorService: vectorService}
	processor.chunkSize = 1000  // default chunk size
	processor.chunkOverlap = 200 // default overlap between adjacent chunks
	return processor
}
// ProcessDocument cleans, chunks, and indexes an uploaded document.
// Each non-empty chunk becomes a Document carrying provenance metadata
// (filename, doc type, chunk index, timestamp) and is persisted through
// the vector service. The chunked documents are returned on success.
func (dp *documentProcessor) ProcessDocument(ctx context.Context, content, filename, docType string) ([]Document, error) {
	// Normalize the raw text, then cut it into overlapping chunks.
	normalized := dp.cleanText(content)
	pieces := dp.splitText(normalized, dp.chunkSize, dp.chunkOverlap)

	var documents []Document
	for idx, piece := range pieces {
		if strings.TrimSpace(piece) == "" {
			// Splitting can yield whitespace-only fragments; drop them.
			continue
		}
		documents = append(documents, Document{
			ID:      dp.generateDocumentID(filename, idx),
			Content: piece,
			Metadata: map[string]interface{}{
				"filename":  filename,
				"doc_type":  docType,
				"chunk_id":  fmt.Sprintf("%d", idx),
				"timestamp": time.Now().Format(time.RFC3339),
				"source":    "document_upload",
			},
		})
	}

	// The vector service expects pointers; alias into the slice we return.
	docPtrs := make([]*Document, len(documents))
	for i := range documents {
		docPtrs[i] = &documents[i]
	}
	if err := dp.vectorService.AddDocuments(ctx, docPtrs); err != nil {
		return nil, fmt.Errorf("failed to store documents in vector database: %w", err)
	}

	log.Printf("Successfully processed document %s into %d chunks", filename, len(documents))
	return documents, nil
}
// ProcessKnowledgeDocument cleans, chunks, and indexes a knowledge-base
// entry. Every non-empty chunk is stored with title/category metadata
// and tagged source="knowledge_base" so searches can filter on it. The
// chunked documents are returned on success.
func (dp *documentProcessor) ProcessKnowledgeDocument(ctx context.Context, content, title, category string) ([]Document, error) {
	// Normalize the raw text, then cut it into overlapping chunks.
	normalized := dp.cleanText(content)
	pieces := dp.splitText(normalized, dp.chunkSize, dp.chunkOverlap)

	var documents []Document
	for idx, piece := range pieces {
		if strings.TrimSpace(piece) == "" {
			// Splitting can yield whitespace-only fragments; drop them.
			continue
		}
		documents = append(documents, Document{
			ID:      dp.generateKnowledgeID(title, category, idx),
			Content: piece,
			Metadata: map[string]interface{}{
				"title":     title,
				"category":  category,
				"chunk_id":  fmt.Sprintf("%d", idx),
				"timestamp": time.Now().Format(time.RFC3339),
				"source":    "knowledge_base",
				"type":      "knowledge",
			},
		})
	}

	// AddDocuments takes pointers; alias into the slice we hand back.
	docPtrs := make([]*Document, len(documents))
	for i := range documents {
		docPtrs[i] = &documents[i]
	}
	if err := dp.vectorService.AddDocuments(ctx, docPtrs); err != nil {
		return nil, fmt.Errorf("failed to store knowledge documents in vector database: %w", err)
	}

	log.Printf("Successfully processed knowledge document '%s' into %d chunks", title, len(documents))
	return documents, nil
}
// Patterns used by cleanText, compiled once at package init instead of
// on every call (regexp.MustCompile in a hot path is an avoidable cost).
var (
	// cleanWhitespaceRe collapses any run of whitespace — spaces, tabs,
	// and newlines — into a single space.
	cleanWhitespaceRe = regexp.MustCompile(`\s+`)
	// cleanDisallowedRe strips everything that is not a letter, number,
	// punctuation, or separator (keeps CJK text and basic punctuation).
	cleanDisallowedRe = regexp.MustCompile(`[^\p{L}\p{N}\p{P}\p{Z}]`)
)

// cleanText normalizes raw document text before chunking: it collapses
// all whitespace runs to single spaces, removes control/symbol
// characters, and trims leading/trailing space.
//
// Note: the previous implementation also replaced `\n+` with "\n", but
// that pass was dead code — the `\s+` pass has already turned every
// newline into a space — so it is omitted here (behavior unchanged).
func (dp *documentProcessor) cleanText(text string) string {
	text = cleanWhitespaceRe.ReplaceAllString(text, " ")
	text = cleanDisallowedRe.ReplaceAllString(text, "")
	return strings.TrimSpace(text)
}
// splitText splits text into chunks of at most chunkSize characters,
// with `overlap` characters repeated between consecutive chunks. When
// possible a chunk is shortened to end at a sentence terminator
// (., !, ?, 。, !, ?) so chunks align with sentence boundaries, but
// never to less than half of chunkSize.
//
// Fix: the previous implementation indexed the string by BYTE offsets,
// so with multi-byte UTF-8 content (this pipeline handles Chinese text)
// a chunk boundary could land in the middle of a character and corrupt
// both neighboring chunks. All indexing is now done on runes. A guard
// is also added so the loop cannot stall when overlap is at least as
// large as the effective chunk length.
func (dp *documentProcessor) splitText(text string, chunkSize, overlap int) []string {
	runes := []rune(text)
	if len(runes) <= chunkSize {
		return []string{text}
	}

	const sentenceEnders = ".!?。!?"

	var chunks []string
	start := 0
	for start < len(runes) {
		end := start + chunkSize
		if end > len(runes) {
			end = len(runes)
		}

		// Prefer to break at the last sentence terminator inside the
		// window, as long as the resulting chunk stays reasonably large.
		if end < len(runes) {
			lastSentence := -1
			for i, r := range runes[start:end] {
				if strings.ContainsRune(sentenceEnders, r) {
					lastSentence = i
				}
			}
			if lastSentence > chunkSize/2 {
				end = start + lastSentence + 1
			}
		}

		chunks = append(chunks, strings.TrimSpace(string(runes[start:end])))

		if end >= len(runes) {
			break
		}
		// Advance with overlap; never step backwards or stay in place,
		// which would otherwise loop forever.
		next := end - overlap
		if next <= start {
			next = end
		}
		start = next
	}
	return chunks
}
// generateDocumentID derives a fixed-length identifier for one chunk of
// an uploaded document. MD5 is used purely as a cheap fingerprint of
// (filename, chunk index, current Unix time), not as a security control.
// NOTE(review): hashing the current time means re-processing the same
// file produces brand-new IDs, so earlier chunks are never overwritten —
// confirm that is intended.
func (dp *documentProcessor) generateDocumentID(filename string, chunkIndex int) string {
	seed := fmt.Sprintf("%s_%d_%d", filename, chunkIndex, time.Now().Unix())
	sum := md5.Sum([]byte(seed))
	return fmt.Sprintf("doc_%x", sum)
}
// generateKnowledgeID derives a fixed-length identifier for one chunk
// of a knowledge-base entry, fingerprinting (title, category, chunk
// index, current Unix time) with MD5 — non-cryptographic use.
// NOTE(review): as with generateDocumentID, the time component makes
// IDs non-deterministic across runs — confirm deduplication is not
// expected here.
func (dp *documentProcessor) generateKnowledgeID(title, category string, chunkIndex int) string {
	seed := fmt.Sprintf("%s_%s_%d_%d", title, category, chunkIndex, time.Now().Unix())
	sum := md5.Sum([]byte(seed))
	return fmt.Sprintf("kb_%x", sum)
}
// KnowledgeSearcher retrieves knowledge-base documents from the vector
// store.
type KnowledgeSearcher interface {
	// SearchKnowledge returns knowledge-base documents similar to query,
	// up to limit results.
	SearchKnowledge(ctx context.Context, query string, limit int) ([]SearchResult, error)
	// SearchKnowledgeByCategory restricts results to a single category.
	SearchKnowledgeByCategory(ctx context.Context, query, category string, limit int) ([]SearchResult, error)
	// GetKnowledgeSummary joins the most relevant document contents into
	// a short plain-text summary for the query.
	GetKnowledgeSummary(ctx context.Context, query string) (string, error)
}
// knowledgeSearcher is the default KnowledgeSearcher implementation.
type knowledgeSearcher struct {
	vectorService VectorService // underlying similarity-search backend
}
// NewKnowledgeSearcher constructs a KnowledgeSearcher that queries the
// supplied vector service.
func NewKnowledgeSearcher(vectorService VectorService) KnowledgeSearcher {
	searcher := &knowledgeSearcher{vectorService: vectorService}
	return searcher
}
// SearchKnowledge returns up to `limit` knowledge-base documents that
// are similar to query.
//
// Fix: the previous implementation asked the vector service for exactly
// `limit` results and then discarded every hit whose metadata source was
// not "knowledge_base", so callers could receive far fewer than `limit`
// matches even when more existed in the store. Consistent with
// SearchKnowledgeByCategory, we now over-fetch (limit*2) and cap the
// filtered results at `limit`.
func (ks *knowledgeSearcher) SearchKnowledge(ctx context.Context, query string, limit int) ([]SearchResult, error) {
	// Over-fetch to leave headroom for the source filter below.
	results, err := ks.vectorService.SearchSimilar(ctx, query, limit*2)
	if err != nil {
		return nil, fmt.Errorf("failed to search knowledge: %w", err)
	}

	var knowledgeResults []SearchResult
	for _, result := range results {
		if result.Document.Metadata["source"] == "knowledge_base" {
			knowledgeResults = append(knowledgeResults, *result)
			if len(knowledgeResults) >= limit {
				break
			}
		}
	}

	log.Printf("Found %d knowledge documents for query: %s", len(knowledgeResults), query)
	return knowledgeResults, nil
}
// SearchKnowledgeByCategory returns up to `limit` knowledge-base
// documents in the given category that are similar to query. It
// over-fetches from the vector service (limit*2) because the
// source/category filter below discards non-matching hits.
func (ks *knowledgeSearcher) SearchKnowledgeByCategory(ctx context.Context, query, category string, limit int) ([]SearchResult, error) {
	hits, err := ks.vectorService.SearchSimilar(ctx, query, limit*2)
	if err != nil {
		return nil, fmt.Errorf("failed to search knowledge by category: %w", err)
	}

	var matched []SearchResult
	for _, hit := range hits {
		meta := hit.Document.Metadata
		if meta["source"] != "knowledge_base" || meta["category"] != category {
			continue
		}
		matched = append(matched, *hit)
		if len(matched) >= limit {
			break
		}
	}

	log.Printf("Found %d knowledge documents in category '%s' for query: %s", len(matched), category, query)
	return matched, nil
}
// GetKnowledgeSummary builds a naive extractive summary for query: it
// searches the knowledge base (top 5), keeps the contents of
// high-relevance hits (score > 0.7), joins them with blank lines, and
// truncates the result to at most 500 bytes plus an ellipsis. The
// Chinese notice strings returned when nothing relevant is found are
// user-facing and preserved verbatim.
//
// Fix: the previous truncation sliced at a fixed byte offset
// (summary[:500]), which can cut a multi-byte UTF-8 character in half —
// likely here, since the content is Chinese — yielding an invalid
// string. strings.ToValidUTF8 drops any partial rune left at the cut.
func (ks *knowledgeSearcher) GetKnowledgeSummary(ctx context.Context, query string) (string, error) {
	results, err := ks.SearchKnowledge(ctx, query, 5)
	if err != nil {
		return "", fmt.Errorf("failed to search knowledge for summary: %w", err)
	}
	if len(results) == 0 {
		return "未找到相关知识内容。", nil
	}

	// Keep only high-relevance hits.
	var contents []string
	for _, result := range results {
		if result.Score > 0.7 {
			contents = append(contents, result.Document.Content)
		}
	}
	if len(contents) == 0 {
		return "未找到高相关性的知识内容。", nil
	}

	// Simple concatenation summary; a real system could ask an LLM here.
	summary := strings.Join(contents, "\n\n")
	if len(summary) > 500 {
		// Cut at 500 bytes, then discard any trailing partial rune so
		// the result is always valid UTF-8.
		summary = strings.ToValidUTF8(summary[:500], "") + "..."
	}
	return summary, nil
}