296 lines
8.4 KiB
Go
296 lines
8.4 KiB
Go
package vector
|
||
|
||
import (
|
||
"context"
|
||
"crypto/md5"
|
||
"fmt"
|
||
"log"
|
||
"regexp"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
// DocumentProcessor 文档处理器接口
|
||
type DocumentProcessor interface {
|
||
// 处理文档内容,返回分块后的文档
|
||
ProcessDocument(ctx context.Context, content, filename, docType string) ([]Document, error)
|
||
// 处理知识库文档上传
|
||
ProcessKnowledgeDocument(ctx context.Context, content, title, category string) ([]Document, error)
|
||
}
|
||
|
||
// documentProcessor 文档处理器实现
|
||
type documentProcessor struct {
|
||
vectorService VectorService
|
||
chunkSize int
|
||
chunkOverlap int
|
||
}
|
||
|
||
// NewDocumentProcessor 创建文档处理器
|
||
func NewDocumentProcessor(vectorService VectorService) DocumentProcessor {
|
||
return &documentProcessor{
|
||
vectorService: vectorService,
|
||
chunkSize: 1000, // 默认分块大小
|
||
chunkOverlap: 200, // 默认重叠大小
|
||
}
|
||
}
|
||
|
||
// ProcessDocument 处理文档内容
|
||
func (dp *documentProcessor) ProcessDocument(ctx context.Context, content, filename, docType string) ([]Document, error) {
|
||
// 清理和预处理文本
|
||
cleanContent := dp.cleanText(content)
|
||
|
||
// 分块处理
|
||
chunks := dp.splitText(cleanContent, dp.chunkSize, dp.chunkOverlap)
|
||
|
||
// 创建文档对象
|
||
var documents []Document
|
||
for i, chunk := range chunks {
|
||
if strings.TrimSpace(chunk) == "" {
|
||
continue
|
||
}
|
||
|
||
// 生成唯一ID
|
||
docID := dp.generateDocumentID(filename, i)
|
||
|
||
doc := Document{
|
||
ID: docID,
|
||
Content: chunk,
|
||
Metadata: map[string]interface{}{
|
||
"filename": filename,
|
||
"doc_type": docType,
|
||
"chunk_id": fmt.Sprintf("%d", i),
|
||
"timestamp": time.Now().Format(time.RFC3339),
|
||
"source": "document_upload",
|
||
},
|
||
}
|
||
documents = append(documents, doc)
|
||
}
|
||
|
||
// 转换为指针切片
|
||
docPtrs := make([]*Document, len(documents))
|
||
for i := range documents {
|
||
docPtrs[i] = &documents[i]
|
||
}
|
||
|
||
// 存储到向量数据库
|
||
if err := dp.vectorService.AddDocuments(ctx, docPtrs); err != nil {
|
||
return nil, fmt.Errorf("failed to store documents in vector database: %w", err)
|
||
}
|
||
|
||
log.Printf("Successfully processed document %s into %d chunks", filename, len(documents))
|
||
return documents, nil
|
||
}
|
||
|
||
// ProcessKnowledgeDocument 处理知识库文档
|
||
func (dp *documentProcessor) ProcessKnowledgeDocument(ctx context.Context, content, title, category string) ([]Document, error) {
|
||
// 清理和预处理文本
|
||
cleanContent := dp.cleanText(content)
|
||
|
||
// 分块处理
|
||
chunks := dp.splitText(cleanContent, dp.chunkSize, dp.chunkOverlap)
|
||
|
||
// 创建文档对象
|
||
var documents []Document
|
||
for i, chunk := range chunks {
|
||
if strings.TrimSpace(chunk) == "" {
|
||
continue
|
||
}
|
||
|
||
// 生成唯一ID
|
||
docID := dp.generateKnowledgeID(title, category, i)
|
||
|
||
doc := Document{
|
||
ID: docID,
|
||
Content: chunk,
|
||
Metadata: map[string]interface{}{
|
||
"title": title,
|
||
"category": category,
|
||
"chunk_id": fmt.Sprintf("%d", i),
|
||
"timestamp": time.Now().Format(time.RFC3339),
|
||
"source": "knowledge_base",
|
||
"type": "knowledge",
|
||
},
|
||
}
|
||
documents = append(documents, doc)
|
||
}
|
||
|
||
// 转换为指针切片
|
||
docPtrs := make([]*Document, len(documents))
|
||
for i := range documents {
|
||
docPtrs[i] = &documents[i]
|
||
}
|
||
|
||
// 存储到向量数据库
|
||
if err := dp.vectorService.AddDocuments(ctx, docPtrs); err != nil {
|
||
return nil, fmt.Errorf("failed to store knowledge documents in vector database: %w", err)
|
||
}
|
||
|
||
log.Printf("Successfully processed knowledge document '%s' into %d chunks", title, len(documents))
|
||
return documents, nil
|
||
}
|
||
|
||
// cleanText 清理文本内容
|
||
func (dp *documentProcessor) cleanText(text string) string {
|
||
// 移除多余的空白字符
|
||
text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
|
||
|
||
// 移除特殊字符(保留基本标点)
|
||
text = regexp.MustCompile(`[^\p{L}\p{N}\p{P}\p{Z}]`).ReplaceAllString(text, "")
|
||
|
||
// 移除多余的换行符
|
||
text = regexp.MustCompile(`\n+`).ReplaceAllString(text, "\n")
|
||
|
||
return strings.TrimSpace(text)
|
||
}
|
||
|
||
// splitText 分块文本
|
||
func (dp *documentProcessor) splitText(text string, chunkSize, overlap int) []string {
|
||
if len(text) <= chunkSize {
|
||
return []string{text}
|
||
}
|
||
|
||
var chunks []string
|
||
start := 0
|
||
|
||
for start < len(text) {
|
||
end := start + chunkSize
|
||
if end > len(text) {
|
||
end = len(text)
|
||
}
|
||
|
||
// 尝试在句子边界分割
|
||
chunk := text[start:end]
|
||
if end < len(text) {
|
||
// 寻找最后一个句号、问号或感叹号
|
||
lastSentence := strings.LastIndexAny(chunk, ".!?。!?")
|
||
if lastSentence > chunkSize/2 { // 确保块不会太小
|
||
end = start + lastSentence + 1
|
||
chunk = text[start:end]
|
||
}
|
||
}
|
||
|
||
chunks = append(chunks, strings.TrimSpace(chunk))
|
||
|
||
// 计算下一个块的起始位置(考虑重叠)
|
||
if end >= len(text) {
|
||
break
|
||
}
|
||
start = end - overlap
|
||
if start < 0 {
|
||
start = 0
|
||
}
|
||
}
|
||
|
||
return chunks
|
||
}
|
||
|
||
// generateDocumentID 生成文档ID
|
||
func (dp *documentProcessor) generateDocumentID(filename string, chunkIndex int) string {
|
||
data := fmt.Sprintf("%s_%d_%d", filename, chunkIndex, time.Now().Unix())
|
||
hash := md5.Sum([]byte(data))
|
||
return fmt.Sprintf("doc_%x", hash)
|
||
}
|
||
|
||
// generateKnowledgeID 生成知识库文档ID
|
||
func (dp *documentProcessor) generateKnowledgeID(title, category string, chunkIndex int) string {
|
||
data := fmt.Sprintf("%s_%s_%d_%d", title, category, chunkIndex, time.Now().Unix())
|
||
hash := md5.Sum([]byte(data))
|
||
return fmt.Sprintf("kb_%x", hash)
|
||
}
|
||
|
||
// KnowledgeSearcher 知识检索器接口
|
||
type KnowledgeSearcher interface {
|
||
// 搜索相关知识
|
||
SearchKnowledge(ctx context.Context, query string, limit int) ([]SearchResult, error)
|
||
// 搜索特定类别的知识
|
||
SearchKnowledgeByCategory(ctx context.Context, query, category string, limit int) ([]SearchResult, error)
|
||
// 获取知识摘要
|
||
GetKnowledgeSummary(ctx context.Context, query string) (string, error)
|
||
}
|
||
|
||
// knowledgeSearcher 知识检索器实现
|
||
type knowledgeSearcher struct {
|
||
vectorService VectorService
|
||
}
|
||
|
||
// NewKnowledgeSearcher 创建知识检索器
|
||
func NewKnowledgeSearcher(vectorService VectorService) KnowledgeSearcher {
|
||
return &knowledgeSearcher{
|
||
vectorService: vectorService,
|
||
}
|
||
}
|
||
|
||
// SearchKnowledge 搜索相关知识
|
||
func (ks *knowledgeSearcher) SearchKnowledge(ctx context.Context, query string, limit int) ([]SearchResult, error) {
|
||
results, err := ks.vectorService.SearchSimilar(ctx, query, limit)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to search knowledge: %w", err)
|
||
}
|
||
|
||
// 过滤知识库文档
|
||
var knowledgeResults []SearchResult
|
||
for _, result := range results {
|
||
if result.Document.Metadata["source"] == "knowledge_base" {
|
||
knowledgeResults = append(knowledgeResults, *result)
|
||
}
|
||
}
|
||
|
||
log.Printf("Found %d knowledge documents for query: %s", len(knowledgeResults), query)
|
||
return knowledgeResults, nil
|
||
}
|
||
|
||
// SearchKnowledgeByCategory 搜索特定类别的知识
|
||
func (ks *knowledgeSearcher) SearchKnowledgeByCategory(ctx context.Context, query, category string, limit int) ([]SearchResult, error) {
|
||
results, err := ks.vectorService.SearchSimilar(ctx, query, limit*2) // 获取更多结果用于过滤
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to search knowledge by category: %w", err)
|
||
}
|
||
|
||
// 过滤特定类别的知识库文档
|
||
var categoryResults []SearchResult
|
||
for _, result := range results {
|
||
if result.Document.Metadata["source"] == "knowledge_base" &&
|
||
result.Document.Metadata["category"] == category {
|
||
categoryResults = append(categoryResults, *result)
|
||
if len(categoryResults) >= limit {
|
||
break
|
||
}
|
||
}
|
||
}
|
||
|
||
log.Printf("Found %d knowledge documents in category '%s' for query: %s", len(categoryResults), category, query)
|
||
return categoryResults, nil
|
||
}
|
||
|
||
// GetKnowledgeSummary 获取知识摘要
|
||
func (ks *knowledgeSearcher) GetKnowledgeSummary(ctx context.Context, query string) (string, error) {
|
||
// 搜索相关知识
|
||
results, err := ks.SearchKnowledge(ctx, query, 5)
|
||
if err != nil {
|
||
return "", fmt.Errorf("failed to search knowledge for summary: %w", err)
|
||
}
|
||
|
||
if len(results) == 0 {
|
||
return "未找到相关知识内容。", nil
|
||
}
|
||
|
||
// 组合相关内容
|
||
var contents []string
|
||
for _, result := range results {
|
||
if result.Score > 0.7 { // 只包含高相关性的内容
|
||
contents = append(contents, result.Document.Content)
|
||
}
|
||
}
|
||
|
||
if len(contents) == 0 {
|
||
return "未找到高相关性的知识内容。", nil
|
||
}
|
||
|
||
// 简单的摘要生成(实际项目中可以使用AI生成摘要)
|
||
summary := strings.Join(contents, "\n\n")
|
||
if len(summary) > 500 {
|
||
summary = summary[:500] + "..."
|
||
}
|
||
|
||
return summary, nil
|
||
} |