package service

import (
    "context"
    "crypto/md5"
    "encoding/hex"
    "encoding/json"
    "errors"
    "fmt"
    "io"
    "mime/multipart"
    "os"
    "regexp"
    "slices"
    "sort"
    "strings"
    "sync"
    "time"

    "github.com/google/uuid"
    "go.opentelemetry.io/otel/attribute"
    "golang.org/x/sync/errgroup"

    "knowlege-lsxd/internal/application/service/retriever"
    "knowlege-lsxd/internal/config"
    werrors "knowlege-lsxd/internal/errors"
    "knowlege-lsxd/internal/logger"
    "knowlege-lsxd/internal/models/chat"
    "knowlege-lsxd/internal/models/utils"
    "knowlege-lsxd/internal/tracing"
    "knowlege-lsxd/internal/types"
    "knowlege-lsxd/internal/types/interfaces"
    "knowlege-lsxd/services/docreader/src/client"
    "knowlege-lsxd/services/docreader/src/proto"
)

// Error definitions for knowledge service operations
var (
    // ErrInvalidFileType is returned when an unsupported file type is provided
    ErrInvalidFileType = errors.New("unsupported file type")
    // ErrInvalidURL is returned when an invalid URL is provided
    ErrInvalidURL = errors.New("invalid URL")
    // ErrChunkNotFound is returned when a requested chunk cannot be found
    ErrChunkNotFound = errors.New("chunk not found")
    // ErrDuplicateFile is returned when trying to add a file that already exists
    ErrDuplicateFile = errors.New("file already exists")
    // ErrDuplicateURL is returned when trying to add a URL that already exists
    ErrDuplicateURL = errors.New("URL already exists")
    // ErrImageNotParse is returned when trying to parse image content without multimodal processing enabled
    ErrImageNotParse = errors.New("image cannot be parsed without multimodal enabled")
)

// knowledgeService implements the knowledge service interface
type knowledgeService struct {
    config          *config.Config
    repo            interfaces.KnowledgeRepository
    kbService       interfaces.KnowledgeBaseService
    tenantRepo      interfaces.TenantRepository
    docReaderClient *client.Client
    chunkService    interfaces.ChunkService
    chunkRepo       interfaces.ChunkRepository
    fileSvc         interfaces.FileService
    modelService    interfaces.ModelService
}

// NewKnowledgeService creates a new knowledge service instance
func NewKnowledgeService(
    config *config.Config,
    repo interfaces.KnowledgeRepository,
    docReaderClient *client.Client,
    kbService interfaces.KnowledgeBaseService,
    tenantRepo interfaces.TenantRepository,
    chunkService interfaces.ChunkService,
    chunkRepo interfaces.ChunkRepository,
    fileSvc interfaces.FileService,
    modelService interfaces.ModelService,
) (interfaces.KnowledgeService, error) {
    return &knowledgeService{
        config:          config,
        repo:            repo,
        kbService:       kbService,
        tenantRepo:      tenantRepo,
        docReaderClient: docReaderClient,
        chunkService:    chunkService,
        chunkRepo:       chunkRepo,
        fileSvc:         fileSvc,
        modelService:    modelService,
    }, nil
}

// CreateKnowledgeFromFile creates a knowledge entry from an uploaded file
func (s *knowledgeService) CreateKnowledgeFromFile(ctx context.Context, kbID string,
    file *multipart.FileHeader, metadata map[string]string, enableMultimodel *bool,
) (*types.Knowledge, error) {
    logger.Info(ctx, "Start creating knowledge from file")
    logger.Infof(ctx, "Knowledge base ID: %s, file: %s", kbID, file.Filename)
    if metadata != nil {
        logger.Infof(ctx, "Received metadata: %v", metadata)
    }

    // Get knowledge base configuration
    logger.Info(ctx, "Getting knowledge base configuration")
    kb, err := s.kbService.GetKnowledgeBaseByID(ctx, kbID)
    if err != nil {
        logger.Errorf(ctx, "Failed to get knowledge base: %v", err)
        return nil, err
    }
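
    // The branch below validates object-storage and VLM settings before an
    // image upload is accepted. A minimal COS configuration that passes the
    // check looks roughly like this (field names are the ones read below;
    // the values are purely illustrative):
    //
    //  kb.StorageConfig.Provider = "cos"
    //  kb.StorageConfig.SecretID = "AKID..."
    //  kb.StorageConfig.SecretKey = "..."
    //  kb.StorageConfig.Region = "ap-guangzhou"
    //  kb.StorageConfig.BucketName = "kb-assets"
    //  kb.StorageConfig.AppID = "125..."
    //  kb.VLMConfig.ModelName = "some-vlm"
    //  kb.VLMConfig.BaseURL = "https://example.com/v1"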
    // Validate multimodal configuration completeness, but only for image files
    if !IsImageType(getFileType(file.Filename)) {
        logger.Info(ctx, "Non-image file with multimodal enabled, skipping COS/VLM validation")
    } else {
        // Check object storage configuration
        switch kb.StorageConfig.Provider {
        case "cos":
            if kb.StorageConfig.SecretID == "" || kb.StorageConfig.SecretKey == "" ||
                kb.StorageConfig.Region == "" || kb.StorageConfig.BucketName == "" ||
                kb.StorageConfig.AppID == "" {
                logger.Error(ctx, "COS configuration incomplete for image multimodal processing")
                return nil, werrors.NewBadRequestError("上传图片文件需要完整的对象存储配置信息, 请前往系统设置页面进行补全")
            }
        case "minio":
            if kb.StorageConfig.BucketName == "" {
                logger.Error(ctx, "MinIO configuration incomplete for image multimodal processing")
                return nil, werrors.NewBadRequestError("上传图片文件需要完整的对象存储配置信息, 请前往系统设置页面进行补全")
            }
        }
        // Check VLM configuration
        if kb.VLMConfig.ModelName == "" || kb.VLMConfig.BaseURL == "" {
            logger.Error(ctx, "VLM configuration incomplete for image multimodal processing")
            return nil, werrors.NewBadRequestError("上传图片文件需要完整的VLM配置信息, 请前往系统设置页面进行补全")
        }
        logger.Info(ctx, "Image multimodal configuration validation passed")
    }

    // Validate file type
    logger.Infof(ctx, "Checking file type: %s", file.Filename)
    if !isValidFileType(file.Filename) {
        logger.Error(ctx, "Invalid file type")
        return nil, ErrInvalidFileType
    }

    // Calculate file hash for deduplication
    logger.Info(ctx, "Calculating file hash")
    hash, err := calculateFileHash(file)
    if err != nil {
        logger.Errorf(ctx, "Failed to calculate file hash: %v", err)
        return nil, err
    }

    // Check if file already exists
    tenantID := ctx.Value(types.TenantIDContextKey).(uint)
    logger.Infof(ctx, "Checking if file exists, tenant ID: %d", tenantID)
    exists, existingKnowledge, err := s.repo.CheckKnowledgeExists(ctx, tenantID, kbID, &types.KnowledgeCheckParams{
        Type:     "file",
        FileName: file.Filename,
        FileSize: file.Size,
        FileHash: hash,
    })
    if err != nil {
        logger.Errorf(ctx, "Failed to check knowledge existence: %v", err)
        return nil, err
    }
    if exists {
        logger.Infof(ctx, "File already exists: %s", file.Filename)
        // Update creation time for existing knowledge
        existingKnowledge.CreatedAt = time.Now()
        existingKnowledge.UpdatedAt = time.Now()
        if err := s.repo.UpdateKnowledge(ctx, existingKnowledge); err != nil {
            logger.Errorf(ctx, "Failed to update existing knowledge: %v", err)
            return nil, err
        }
        return existingKnowledge, types.NewDuplicateFileError(existingKnowledge)
    }

    // Check storage quota
    tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
    if tenantInfo.StorageQuota > 0 && tenantInfo.StorageUsed >= tenantInfo.StorageQuota {
        logger.Error(ctx, "Storage quota exceeded")
        return nil, types.NewStorageQuotaExceededError()
    }

    // Convert metadata to JSON format if provided
    var metadataJSON types.JSON
    if metadata != nil {
        metadataBytes, err := json.Marshal(metadata)
        if err != nil {
            logger.Errorf(ctx, "Failed to marshal metadata: %v", err)
            return nil, err
        }
        metadataJSON = types.JSON(metadataBytes)
    }

    // Create knowledge record
    logger.Info(ctx, "Creating knowledge record")
    knowledge := &types.Knowledge{
        TenantID:         tenantID,
        KnowledgeBaseID:  kbID,
        Type:             "file",
        Title:            file.Filename,
        FileName:         file.Filename,
        FileType:         getFileType(file.Filename),
        FileSize:         file.Size,
        FileHash:         hash,
        ParseStatus:      "pending",
        EnableStatus:     "disabled",
        CreatedAt:        time.Now(),
        UpdatedAt:        time.Now(),
        EmbeddingModelID: kb.EmbeddingModelID,
        Metadata:         metadataJSON,
    }

    // Save knowledge record to database
    logger.Info(ctx, "Saving knowledge record to database")
    if err := s.repo.CreateKnowledge(ctx, knowledge); err != nil {
        logger.Errorf(ctx, "Failed to create knowledge record, ID: %s, error: %v", knowledge.ID, err)
        return nil, err
    }
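
    // Dedup recap: a second upload with the same (name, size, MD5) never
    // reaches this point; it refreshes the existing row's timestamps above
    // and returns types.NewDuplicateFileError, so callers can tell reuse
    // apart from a fresh creation.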
    // Save the file to storage
    logger.Infof(ctx, "Saving file, knowledge ID: %s", knowledge.ID)
    filePath, err := s.fileSvc.SaveFile(ctx, file, knowledge.TenantID, knowledge.ID)
    if err != nil {
        logger.Errorf(ctx, "Failed to save file, knowledge ID: %s, error: %v", knowledge.ID, err)
        return nil, err
    }
    knowledge.FilePath = filePath

    // Update knowledge record with file path
    logger.Info(ctx, "Updating knowledge record with file path")
    if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
        logger.Errorf(ctx, "Failed to update knowledge with file path, ID: %s, error: %v", knowledge.ID, err)
        return nil, err
    }

    // Process document asynchronously
    logger.Info(ctx, "Starting asynchronous document processing")
    newCtx := logger.CloneContext(ctx)
    if enableMultimodel == nil {
        enableMultimodel = &kb.ChunkingConfig.EnableMultimodal
    }
    go s.processDocument(newCtx, kb, knowledge, file, *enableMultimodel)

    logger.Infof(ctx, "Knowledge from file created successfully, ID: %s", knowledge.ID)
    return knowledge, nil
}
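
// Caller-side sketch for the file path (illustrative only; handler wiring
// and variable names are not part of this package's API surface):
//
//  k, err := svc.CreateKnowledgeFromFile(ctx, kbID, fileHeader, nil, nil)
//  if err != nil {
//      // duplicate uploads return the existing record together with the
//      // error produced by types.NewDuplicateFileError
//  }
//  _ = k // parsing continues asynchronously; poll k.ParseStatus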
// CreateKnowledgeFromURL creates a knowledge entry from a URL source
func (s *knowledgeService) CreateKnowledgeFromURL(ctx context.Context,
    kbID string, url string, enableMultimodel *bool,
) (*types.Knowledge, error) {
    logger.Info(ctx, "Start creating knowledge from URL")
    logger.Infof(ctx, "Knowledge base ID: %s, URL: %s", kbID, url)

    // Get knowledge base configuration
    logger.Info(ctx, "Getting knowledge base configuration")
    kb, err := s.kbService.GetKnowledgeBaseByID(ctx, kbID)
    if err != nil {
        logger.Errorf(ctx, "Failed to get knowledge base: %v", err)
        return nil, err
    }

    // Validate URL format
    logger.Info(ctx, "Validating URL")
    if !isValidURL(url) {
        logger.Error(ctx, "Invalid URL format")
        return nil, ErrInvalidURL
    }

    // Check if URL already exists in the knowledge base
    tenantID := ctx.Value(types.TenantIDContextKey).(uint)
    logger.Infof(ctx, "Checking if URL exists, tenant ID: %d", tenantID)
    exists, existingKnowledge, err := s.repo.CheckKnowledgeExists(ctx, tenantID, kbID, &types.KnowledgeCheckParams{
        Type: "url",
        URL:  url,
    })
    if err != nil {
        logger.Errorf(ctx, "Failed to check knowledge existence: %v", err)
        return nil, err
    }
    if exists {
        logger.Infof(ctx, "URL already exists: %s", url)
        // Update creation time for existing knowledge
        existingKnowledge.CreatedAt = time.Now()
        existingKnowledge.UpdatedAt = time.Now()
        if err := s.repo.UpdateKnowledge(ctx, existingKnowledge); err != nil {
            logger.Errorf(ctx, "Failed to update existing knowledge: %v", err)
            return nil, err
        }
        return existingKnowledge, types.NewDuplicateURLError(existingKnowledge)
    }

    // Check storage quota
    tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
    if tenantInfo.StorageQuota > 0 && tenantInfo.StorageUsed >= tenantInfo.StorageQuota {
        logger.Error(ctx, "Storage quota exceeded")
        return nil, types.NewStorageQuotaExceededError()
    }

    // Create knowledge record
    logger.Info(ctx, "Creating knowledge record")
    knowledge := &types.Knowledge{
        ID:               uuid.New().String(),
        TenantID:         tenantID,
        KnowledgeBaseID:  kbID,
        Type:             "url",
        Source:           url,
        ParseStatus:      "pending",
        EnableStatus:     "disabled",
        CreatedAt:        time.Now(),
        UpdatedAt:        time.Now(),
        EmbeddingModelID: kb.EmbeddingModelID,
    }

    // Save knowledge record
    logger.Infof(ctx, "Saving knowledge record to database, ID: %s", knowledge.ID)
    if err := s.repo.CreateKnowledge(ctx, knowledge); err != nil {
        logger.Errorf(ctx, "Failed to create knowledge record: %v", err)
        return nil, err
    }

    // Process URL asynchronously
    logger.Info(ctx, "Starting asynchronous URL processing")
    if enableMultimodel == nil {
        enableMultimodel = &kb.ChunkingConfig.EnableMultimodal
    }
    newCtx := logger.CloneContext(ctx)
    go s.processDocumentFromURL(newCtx, kb, knowledge, url, *enableMultimodel)

    logger.Infof(ctx, "Knowledge from URL created successfully, ID: %s", knowledge.ID)
    return knowledge, nil
}

// CreateKnowledgeFromPassage creates a knowledge entry from text passages
func (s *knowledgeService) CreateKnowledgeFromPassage(ctx context.Context, kbID string, passage []string,
) (*types.Knowledge, error) {
    logger.Info(ctx, "Start creating knowledge from passage")
    logger.Infof(ctx, "Knowledge base ID: %s, passage count: %d", kbID, len(passage))

    // Get knowledge base configuration
    logger.Info(ctx, "Getting knowledge base configuration")
    kb, err := s.kbService.GetKnowledgeBaseByID(ctx, kbID)
    if err != nil {
        logger.Errorf(ctx, "Failed to get knowledge base: %v", err)
        return nil, err
    }

    // Create knowledge record
    logger.Info(ctx, "Creating knowledge record")
    knowledge := &types.Knowledge{
        ID:               uuid.New().String(),
        TenantID:         ctx.Value(types.TenantIDContextKey).(uint),
        KnowledgeBaseID:  kbID,
        Type:             "passage",
        ParseStatus:      "pending",
        EnableStatus:     "disabled",
        CreatedAt:        time.Now(),
        UpdatedAt:        time.Now(),
        EmbeddingModelID: kb.EmbeddingModelID,
    }

    // Save knowledge record
    logger.Infof(ctx, "Saving knowledge record to database, ID: %s", knowledge.ID)
    if err := s.repo.CreateKnowledge(ctx, knowledge); err != nil {
        logger.Errorf(ctx, "Failed to create knowledge record: %v", err)
        return nil, err
    }

    // Process passages asynchronously
    logger.Info(ctx, "Starting asynchronous passage processing")
    go s.processDocumentFromPassage(ctx, kb, knowledge, passage)

    logger.Infof(ctx, "Knowledge from passage created successfully, ID: %s", knowledge.ID)
    return knowledge, nil
}

// GetKnowledgeByID retrieves a knowledge entry by its ID
func (s *knowledgeService) GetKnowledgeByID(ctx context.Context, id string) (*types.Knowledge, error) {
    logger.Info(ctx, "Start getting knowledge by ID")
    logger.Infof(ctx, "Knowledge ID: %s", id)

    tenantID := ctx.Value(types.TenantIDContextKey).(uint)
    logger.Infof(ctx, "Tenant ID: %d", tenantID)

    knowledge, err := s.repo.GetKnowledgeByID(ctx, tenantID, id)
    if err != nil {
        logger.ErrorWithFields(ctx, err, map[string]interface{}{
            "knowledge_id": id,
            "tenant_id":    tenantID,
        })
        return nil, err
    }

    logger.Infof(ctx, "Knowledge retrieved successfully, ID: %s, type: %s", knowledge.ID, knowledge.Type)
    return knowledge, nil
}

// ListKnowledgeByKnowledgeBaseID returns all knowledge entries in a knowledge base
func (s *knowledgeService) ListKnowledgeByKnowledgeBaseID(ctx context.Context, kbID string,
) ([]*types.Knowledge, error) {
    return s.repo.ListKnowledgeByKnowledgeBaseID(ctx, ctx.Value(types.TenantIDContextKey).(uint), kbID)
}

// ListPagedKnowledgeByKnowledgeBaseID returns paginated knowledge entries in a knowledge base
func (s *knowledgeService) ListPagedKnowledgeByKnowledgeBaseID(ctx context.Context,
    kbID string, page *types.Pagination,
) (*types.PageResult, error) {
    knowledges, total, err := s.repo.ListPagedKnowledgeByKnowledgeBaseID(ctx,
        ctx.Value(types.TenantIDContextKey).(uint), kbID, page)
    if err != nil {
        return nil, err
    }
    return types.NewPageResult(total, page, knowledges), nil
}
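
// Pagination sketch (hypothetical caller; page numbering follows
// types.Pagination):
//
//  res, err := svc.ListPagedKnowledgeByKnowledgeBaseID(ctx, kbID,
//      &types.Pagination{Page: 1, PageSize: 20})
//  // res wraps the total count, the page descriptor, and the rows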
// DeleteKnowledge deletes a knowledge entry and all related resources
func (s *knowledgeService) DeleteKnowledge(ctx context.Context, id string) error {
    // Get the knowledge entry
    knowledge, err := s.repo.GetKnowledgeByID(ctx, ctx.Value(types.TenantIDContextKey).(uint), id)
    if err != nil {
        return err
    }

    wg := errgroup.Group{}
    // Delete knowledge embeddings from vector store
    wg.Go(func() error {
        tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
        retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
        if err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
            return err
        }
        embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, knowledge.EmbeddingModelID)
        if err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
            return err
        }
        if err := retrieveEngine.DeleteByKnowledgeIDList(ctx, []string{knowledge.ID}, embeddingModel.GetDimensions()); err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
            return err
        }
        return nil
    })
    // Delete all chunks associated with this knowledge
    wg.Go(func() error {
        if err := s.chunkService.DeleteChunksByKnowledgeID(ctx, knowledge.ID); err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete chunks failed")
            return err
        }
        return nil
    })
    // Delete the physical file if it exists
    wg.Go(func() error {
        if knowledge.FilePath != "" {
            if err := s.fileSvc.DeleteFile(ctx, knowledge.FilePath); err != nil {
                logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete file failed")
            }
        }
        tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
        tenantInfo.StorageUsed -= knowledge.StorageSize
        if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, -knowledge.StorageSize); err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge update tenant storage used failed")
        }
        return nil
    })
    if err = wg.Wait(); err != nil {
        return err
    }
    // Delete the knowledge entry itself from the database
    return s.repo.DeleteKnowledge(ctx, ctx.Value(types.TenantIDContextKey).(uint), id)
}
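
// The three cleanup steps above are independent, so they fan out through
// errgroup: Wait returns the first non-nil error, and the database row is
// only removed once every step has succeeded. The pattern in miniature:
//
//  wg := errgroup.Group{}
//  wg.Go(deleteVectors)
//  wg.Go(deleteChunks)
//  wg.Go(deleteFileAndQuota)
//  if err := wg.Wait(); err != nil { return err }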
// DeleteKnowledgeList deletes multiple knowledge entries and all related resources
func (s *knowledgeService) DeleteKnowledgeList(ctx context.Context, ids []string) error {
    if len(ids) == 0 {
        return nil
    }
    // 1. Get the knowledge entries
    tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
    knowledgeList, err := s.repo.GetKnowledgeBatch(ctx, tenantInfo.ID, ids)
    if err != nil {
        return err
    }
    wg := errgroup.Group{}
    // 2. Delete knowledge embeddings from vector store
    wg.Go(func() error {
        tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
        retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
        if err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
            return err
        }
        group := map[string][]string{}
        for _, knowledge := range knowledgeList {
            group[knowledge.EmbeddingModelID] = append(group[knowledge.EmbeddingModelID], knowledge.ID)
        }
        for embeddingModelID, knowledgeList := range group {
            embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, embeddingModelID)
            if err != nil {
                logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge get embedding model failed")
                return err
            }
            if err := retrieveEngine.DeleteByKnowledgeIDList(ctx, knowledgeList, embeddingModel.GetDimensions()); err != nil {
                logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete knowledge embedding failed")
                return err
            }
        }
        return nil
    })
    // 3. Delete all chunks associated with these knowledge entries
    wg.Go(func() error {
        if err := s.chunkService.DeleteByKnowledgeList(ctx, ids); err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete chunks failed")
            return err
        }
        return nil
    })
    // 4. Delete the physical files if they exist
    wg.Go(func() error {
        storageAdjust := int64(0)
        for _, knowledge := range knowledgeList {
            if knowledge.FilePath != "" {
                if err := s.fileSvc.DeleteFile(ctx, knowledge.FilePath); err != nil {
                    logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge delete file failed")
                }
            }
            storageAdjust -= knowledge.StorageSize
        }
        tenantInfo.StorageUsed += storageAdjust
        if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, storageAdjust); err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("DeleteKnowledge update tenant storage used failed")
        }
        return nil
    })
    if err = wg.Wait(); err != nil {
        return err
    }
    // 5. Delete the knowledge entries themselves from the database
    return s.repo.DeleteKnowledgeList(ctx, tenantInfo.ID, ids)
}
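
// Vector deletions are grouped by embedding model because each model has
// its own dimensionality. For rows {k1: modelA, k2: modelA, k3: modelB}
// the loop above issues two calls: (modelA dims, [k1 k2]) and
// (modelB dims, [k3]).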
func (s *knowledgeService) cloneKnowledge(ctx context.Context,
    src *types.Knowledge, targetKB *types.KnowledgeBase) (err error) {
    if src.ParseStatus != "completed" {
        logger.GetLogger(ctx).WithField("knowledge_id", src.ID).Errorf("MoveKnowledge parse status is not completed")
        return nil
    }
    tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
    dst := &types.Knowledge{
        ID:               uuid.New().String(),
        TenantID:         targetKB.TenantID,
        KnowledgeBaseID:  targetKB.ID,
        Type:             src.Type,
        Title:            src.Title,
        Description:      src.Description,
        Source:           src.Source,
        ParseStatus:      "processing",
        EnableStatus:     "disabled",
        EmbeddingModelID: targetKB.EmbeddingModelID,
        FileName:         src.FileName,
        FileType:         src.FileType,
        FileSize:         src.FileSize,
        FileHash:         src.FileHash,
        FilePath:         src.FilePath,
        StorageSize:      src.StorageSize,
        Metadata:         src.Metadata,
    }
    defer func() {
        if err != nil {
            dst.ParseStatus = "failed"
            dst.ErrorMessage = err.Error()
            _ = s.repo.UpdateKnowledge(ctx, dst)
            logger.GetLogger(ctx).WithField("error", err).Errorf("MoveKnowledge failed to move knowledge")
        } else {
            dst.ParseStatus = "completed"
            dst.EnableStatus = "enabled"
            _ = s.repo.UpdateKnowledge(ctx, dst)
            logger.GetLogger(ctx).WithField("knowledge_id", dst.ID).Infof("MoveKnowledge move knowledge successfully")
        }
    }()
    if err = s.repo.CreateKnowledge(ctx, dst); err != nil {
        logger.GetLogger(ctx).WithField("error", err).Errorf("MoveKnowledge create knowledge failed")
        return
    }
    tenantInfo.StorageUsed += dst.StorageSize
    if err = s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, dst.StorageSize); err != nil {
        logger.GetLogger(ctx).WithField("error", err).Errorf("MoveKnowledge update tenant storage used failed")
        return
    }
    if err = s.CloneChunk(ctx, src, dst); err != nil {
        logger.GetLogger(ctx).WithField("knowledge_id", dst.ID).
            WithField("error", err).Errorf("MoveKnowledge move chunks failed")
        return
    }
    return
}
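
// cloneKnowledge finalizes its status in a single deferred writer: because
// the method uses a named error return, whatever err holds when the
// function exits decides between "failed" (plus the error message) and
// "completed"/"enabled", so no early-return path can leave the clone stuck
// in "processing".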
// processDocument handles asynchronous processing of document files
func (s *knowledgeService) processDocument(ctx context.Context,
    kb *types.KnowledgeBase, knowledge *types.Knowledge, file *multipart.FileHeader, enableMultimodel bool,
) {
    logger.GetLogger(ctx).Infof("processDocument enableMultimodel: %v", enableMultimodel)
    ctx, span := tracing.ContextWithSpan(ctx, "knowledgeService.processDocument")
    defer span.End()
    span.SetAttributes(
        attribute.String("request_id", ctx.Value(types.RequestIDContextKey).(string)),
        attribute.String("knowledge_base_id", kb.ID),
        attribute.Int("tenant_id", int(kb.TenantID)),
        attribute.String("knowledge_id", knowledge.ID),
        attribute.String("file_name", knowledge.FileName),
        attribute.String("file_type", knowledge.FileType),
        attribute.String("file_path", knowledge.FilePath),
        attribute.Int64("file_size", knowledge.FileSize),
        attribute.String("embedding_model", knowledge.EmbeddingModelID),
        attribute.Bool("enable_multimodal", enableMultimodel),
    )
    if !enableMultimodel && IsImageType(knowledge.FileType) {
        logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
            WithField("error", ErrImageNotParse).Errorf("processDocument image without multimodal enabled")
        knowledge.ParseStatus = "failed"
        knowledge.ErrorMessage = ErrImageNotParse.Error()
        s.repo.UpdateKnowledge(ctx, knowledge)
        span.RecordError(ErrImageNotParse)
        return
    }

    // Update status to processing
    knowledge.ParseStatus = "processing"
    knowledge.UpdatedAt = time.Now()
    if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
        span.RecordError(err)
        return
    }

    // Read and chunk the document
    f, err := file.Open()
    if err != nil {
        logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
            WithField("error", err).Errorf("processDocument open file failed")
        knowledge.ParseStatus = "failed"
        knowledge.ErrorMessage = err.Error()
        knowledge.UpdatedAt = time.Now()
        s.repo.UpdateKnowledge(ctx, knowledge)
        span.RecordError(err)
        return
    }
    defer f.Close()
    span.AddEvent("start read file")
    contentBytes, err := io.ReadAll(f)
    if err != nil {
        knowledge.ParseStatus = "failed"
        knowledge.ErrorMessage = err.Error()
        knowledge.UpdatedAt = time.Now()
        s.repo.UpdateKnowledge(ctx, knowledge)
        span.RecordError(err)
        return
    }

    // Split file into chunks using document reader service
    span.AddEvent("start split file")
    resp, err := s.docReaderClient.ReadFromFile(ctx, &proto.ReadFromFileRequest{
        FileContent: contentBytes,
        FileName:    knowledge.FileName,
        FileType:    knowledge.FileType,
        ReadConfig: &proto.ReadConfig{
            ChunkSize:        int32(kb.ChunkingConfig.ChunkSize),
            ChunkOverlap:     int32(kb.ChunkingConfig.ChunkOverlap),
            Separators:       kb.ChunkingConfig.Separators,
            EnableMultimodal: enableMultimodel,
            StorageConfig: &proto.StorageConfig{
                Provider:        proto.StorageProvider(proto.StorageProvider_value[strings.ToUpper(kb.StorageConfig.Provider)]),
                Region:          kb.StorageConfig.Region,
                BucketName:      kb.StorageConfig.BucketName,
                AccessKeyId:     kb.StorageConfig.SecretID,
                SecretAccessKey: kb.StorageConfig.SecretKey,
                AppId:           kb.StorageConfig.AppID,
                PathPrefix:      kb.StorageConfig.PathPrefix,
            },
            VlmConfig: &proto.VLMConfig{
                ModelName:     kb.VLMConfig.ModelName,
                BaseUrl:       kb.VLMConfig.BaseURL,
                ApiKey:        kb.VLMConfig.APIKey,
                InterfaceType: kb.VLMConfig.InterfaceType,
            },
        },
        RequestId: ctx.Value(types.RequestIDContextKey).(string),
    })
    if err != nil {
        logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
            WithField("error", err).Errorf("processDocument read file failed")
        knowledge.ParseStatus = "failed"
        knowledge.ErrorMessage = err.Error()
        knowledge.UpdatedAt = time.Now()
        s.repo.UpdateKnowledge(ctx, knowledge)
        span.RecordError(err)
        return
    }

    // Process and store chunks
    span.AddEvent("start process chunks")
    s.processChunks(ctx, kb, knowledge, resp.Chunks)
}

// processDocumentFromURL handles asynchronous processing of URL content
func (s *knowledgeService) processDocumentFromURL(ctx context.Context,
    kb *types.KnowledgeBase, knowledge *types.Knowledge, url string, enableMultimodel bool,
) {
    // Update status to processing
    knowledge.ParseStatus = "processing"
    knowledge.UpdatedAt = time.Now()
    if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
        return
    }
    logger.GetLogger(ctx).Infof("processDocumentFromURL enableMultimodel: %v", enableMultimodel)
    // Fetch and chunk content from URL
    resp, err := s.docReaderClient.ReadFromURL(ctx, &proto.ReadFromURLRequest{
        Url:   url,
        Title: knowledge.Title,
        ReadConfig: &proto.ReadConfig{
            ChunkSize:        int32(kb.ChunkingConfig.ChunkSize),
            ChunkOverlap:     int32(kb.ChunkingConfig.ChunkOverlap),
            Separators:       kb.ChunkingConfig.Separators,
            EnableMultimodal: enableMultimodel,
            StorageConfig: &proto.StorageConfig{
                Provider:        proto.StorageProvider(proto.StorageProvider_value[strings.ToUpper(kb.StorageConfig.Provider)]),
                Region:          kb.StorageConfig.Region,
                BucketName:      kb.StorageConfig.BucketName,
                AccessKeyId:     kb.StorageConfig.SecretID,
                SecretAccessKey: kb.StorageConfig.SecretKey,
                AppId:           kb.StorageConfig.AppID,
                PathPrefix:      kb.StorageConfig.PathPrefix,
            },
            VlmConfig: &proto.VLMConfig{
                ModelName:     kb.VLMConfig.ModelName,
                BaseUrl:       kb.VLMConfig.BaseURL,
                ApiKey:        kb.VLMConfig.APIKey,
                InterfaceType: kb.VLMConfig.InterfaceType,
            },
        },
        RequestId: ctx.Value(types.RequestIDContextKey).(string),
    })
    if err != nil {
        knowledge.ParseStatus = "failed"
        knowledge.ErrorMessage = err.Error()
        knowledge.UpdatedAt = time.Now()
        s.repo.UpdateKnowledge(ctx, knowledge)
        return
    }

    // Process and store chunks
    s.processChunks(ctx, kb, knowledge, resp.Chunks)
}

// processDocumentFromPassage handles asynchronous processing of text passages
func (s *knowledgeService) processDocumentFromPassage(ctx context.Context,
    kb *types.KnowledgeBase, knowledge *types.Knowledge, passage []string,
) {
    // Update status to processing
    knowledge.ParseStatus = "processing"
    knowledge.UpdatedAt = time.Now()
    if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
        return
    }

    // Convert passages to chunks
    chunks := make([]*proto.Chunk, 0, len(passage))
    start, end := 0, 0
    for i, p := range passage {
        if p == "" {
            continue
        }
        end += len([]rune(p))
        chunk := &proto.Chunk{
            Content: p,
            Seq:     int32(i),
            Start:   int32(start),
            End:     int32(end),
        }
        start = end
        chunks = append(chunks, chunk)
    }

    // Process and store chunks
    s.processChunks(ctx, kb, knowledge, chunks)
}
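
// Passage offsets are counted in runes, not bytes, so multi-byte UTF-8 text
// gets stable positions: for passages ["你好", "world"] the chunks carry
// (Start, End) = (0, 2) and (2, 7).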
// processChunks processes chunks and creates embeddings for knowledge content
func (s *knowledgeService) processChunks(ctx context.Context,
    kb *types.KnowledgeBase, knowledge *types.Knowledge, chunks []*proto.Chunk,
) {
    ctx, span := tracing.ContextWithSpan(ctx, "knowledgeService.processChunks")
    defer span.End()
    span.SetAttributes(
        attribute.Int("tenant_id", int(knowledge.TenantID)),
        attribute.String("knowledge_base_id", knowledge.KnowledgeBaseID),
        attribute.String("knowledge_id", knowledge.ID),
        attribute.String("embedding_model_id", kb.EmbeddingModelID),
        attribute.Int("chunk_count", len(chunks)),
    )
    // Get embedding model for vectorization
    embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, kb.EmbeddingModelID)
    if err != nil {
        logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks get embedding model failed")
        span.RecordError(err)
        return
    }
    // Get the chat model for document summary generation (only text-type chunks are summarized)
    chatModel, err := s.modelService.GetChatModel(ctx, kb.SummaryModelID)
    if err != nil {
        logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks get summary model failed")
        span.RecordError(err)
        return
    }
    enableGraphRAG := os.Getenv("ENABLE_GRAPH_RAG") == "true"
    // Create chunk objects from proto chunks
    maxSeq := 0
    // Count image-derived sub-chunks so insertChunks can be sized up front
    imageChunkCount := 0
    for _, chunkData := range chunks {
        if len(chunkData.Images) > 0 {
            // Each image yields one OCR chunk and one caption chunk
            imageChunkCount += len(chunkData.Images) * 2
        }
        if int(chunkData.Seq) > maxSeq {
            maxSeq = int(chunkData.Seq)
        }
    }
    // Allocate capacity with the image-derived chunks included
    insertChunks := make([]*types.Chunk, 0, len(chunks)+imageChunkCount)
    for _, chunkData := range chunks {
        if strings.TrimSpace(chunkData.Content) == "" {
            continue
        }
        // Create the main text chunk
        textChunk := &types.Chunk{
            ID:              uuid.New().String(),
            TenantID:        knowledge.TenantID,
            KnowledgeID:     knowledge.ID,
            KnowledgeBaseID: knowledge.KnowledgeBaseID,
            Content:         chunkData.Content,
            ChunkIndex:      int(chunkData.Seq),
            IsEnabled:       true,
            CreatedAt:       time.Now(),
            UpdatedAt:       time.Now(),
            StartAt:         int(chunkData.Start),
            EndAt:           int(chunkData.End),
            ChunkType:       types.ChunkTypeText,
        }
        var chunkImages []types.ImageInfo
        insertChunks = append(insertChunks, textChunk)
        // Handle image information
        if len(chunkData.Images) > 0 {
            logger.GetLogger(ctx).Infof("Processing %d images in chunk #%d", len(chunkData.Images), chunkData.Seq)
            for i, img := range chunkData.Images {
                // Record the image on the text chunk
                imageInfo := types.ImageInfo{
                    URL:         img.Url,
                    OriginalURL: img.OriginalUrl,
                    StartPos:    int(img.Start),
                    EndPos:      int(img.End),
                    OCRText:     img.OcrText,
                    Caption:     img.Caption,
                }
                chunkImages = append(chunkImages, imageInfo)
                // Serialize the single ImageInfo to JSON for the derived chunks
                imageInfoJSON, err := json.Marshal([]types.ImageInfo{imageInfo})
                if err != nil {
                    logger.GetLogger(ctx).WithField("error", err).Errorf("Failed to marshal image info to JSON")
                    continue
                }
                // If OCR text exists, create an OCR chunk
                if img.OcrText != "" {
                    ocrChunk := &types.Chunk{
                        ID:              uuid.New().String(),
                        TenantID:        knowledge.TenantID,
                        KnowledgeID:     knowledge.ID,
                        KnowledgeBaseID: knowledge.KnowledgeBaseID,
                        Content:         img.OcrText,
                        ChunkIndex:      maxSeq + i*100 + 1, // non-conflicting index scheme
                        IsEnabled:       true,
                        CreatedAt:       time.Now(),
                        UpdatedAt:       time.Now(),
                        StartAt:         int(img.Start),
                        EndAt:           int(img.End),
                        ChunkType:       types.ChunkTypeImageOCR,
                        ParentChunkID:   textChunk.ID,
                        ImageInfo:       string(imageInfoJSON),
                    }
                    insertChunks = append(insertChunks, ocrChunk)
                    logger.GetLogger(ctx).Infof("Created OCR chunk for image %d in chunk #%d", i, chunkData.Seq)
                }
                // If a caption exists, create a caption chunk
                if img.Caption != "" {
                    captionChunk := &types.Chunk{
                        ID:              uuid.New().String(),
                        TenantID:        knowledge.TenantID,
                        KnowledgeID:     knowledge.ID,
                        KnowledgeBaseID: knowledge.KnowledgeBaseID,
                        Content:         img.Caption,
                        ChunkIndex:      maxSeq + i*100 + 2, // non-conflicting index scheme
                        IsEnabled:       true,
                        CreatedAt:       time.Now(),
                        UpdatedAt:       time.Now(),
                        StartAt:         int(img.Start),
                        EndAt:           int(img.End),
                        ChunkType:       types.ChunkTypeImageCaption,
                        ParentChunkID:   textChunk.ID,
                        ImageInfo:       string(imageInfoJSON),
                    }
                    insertChunks = append(insertChunks, captionChunk)
                    logger.GetLogger(ctx).Infof("Created caption chunk for image %d in chunk #%d", i, chunkData.Seq)
                }
            }
            imageInfoJSON, err := json.Marshal(chunkImages)
            if err != nil {
                logger.GetLogger(ctx).WithField("error", err).Errorf("Failed to marshal image info to JSON")
                continue
            }
            textChunk.ImageInfo = string(imageInfoJSON)
        }
    }
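
    // Derived-chunk indexing, worked through: with maxSeq = 9, image i = 0
    // gets OCR index 10 and caption index 11 (maxSeq + i*100 + 1/2); image
    // i = 1 gets 110 and 111, comfortably clear of the text chunks'
    // 0..maxSeq range.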
    // Sort chunks by index for proper ordering
    sort.Slice(insertChunks, func(i, j int) bool {
        return insertChunks[i].ChunkIndex < insertChunks[j].ChunkIndex
    })
    // Only text-type chunks take part in the prev/next linking
    textChunks := make([]*types.Chunk, 0, len(chunks))
    for _, chunk := range insertChunks {
        if chunk.ChunkType == types.ChunkTypeText {
            textChunks = append(textChunks, chunk)
        }
    }
    // Wire up the prev/next relationships between text chunks
    for i, chunk := range textChunks {
        if i > 0 {
            textChunks[i-1].NextChunkID = chunk.ID
        }
        if i < len(textChunks)-1 {
            textChunks[i+1].PreChunkID = chunk.ID
        }
    }
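
    // The text chunks now form a doubly linked list: for [c0 c1 c2],
    // c0.NextChunkID == c1.ID, c1.PreChunkID == c0.ID, c1.NextChunkID ==
    // c2.ID, and c2.PreChunkID == c1.ID; the ends keep empty IDs.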
    if enableGraphRAG {
        relationChunkSize := 5
        indirectRelationChunkSize := 5
        graphBuilder := NewGraphBuilder(s.config, chatModel)
        err = graphBuilder.BuildGraph(ctx, textChunks)
        if err != nil {
            logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks build graph failed")
            span.RecordError(err)
        } else {
            for _, chunk := range textChunks {
                chunk.RelationChunks, _ = json.Marshal(graphBuilder.GetRelationChunks(chunk.ID, relationChunkSize))
                chunk.IndirectRelationChunks, _ = json.Marshal(graphBuilder.GetIndirectRelationChunks(chunk.ID, indirectRelationChunkSize))
            }
            for i, entity := range graphBuilder.GetAllEntities() {
                relationChunks, _ := json.Marshal(entity.ChunkIDs)
                entityChunk := &types.Chunk{
                    ID:              entity.ID,
                    TenantID:        knowledge.TenantID,
                    KnowledgeID:     knowledge.ID,
                    KnowledgeBaseID: knowledge.KnowledgeBaseID,
                    Content:         entity.Description,
                    ChunkIndex:      maxSeq + i*100 + 3,
                    IsEnabled:       true,
                    CreatedAt:       time.Now(),
                    UpdatedAt:       time.Now(),
                    ChunkType:       types.ChunkTypeEntity,
                    RelationChunks:  types.JSON(relationChunks),
                }
                insertChunks = append(insertChunks, entityChunk)
            }
            for i, relationship := range graphBuilder.GetAllRelationships() {
                relationChunks, _ := json.Marshal(relationship.ChunkIDs)
                relationshipChunk := &types.Chunk{
                    ID:              relationship.ID,
                    TenantID:        knowledge.TenantID,
                    KnowledgeID:     knowledge.ID,
                    KnowledgeBaseID: knowledge.KnowledgeBaseID,
                    Content:         relationship.Description,
                    ChunkIndex:      maxSeq + i*100 + 4,
                    IsEnabled:       true,
                    CreatedAt:       time.Now(),
                    UpdatedAt:       time.Now(),
                    ChunkType:       types.ChunkTypeRelationship,
                    RelationChunks:  types.JSON(relationChunks),
                }
                insertChunks = append(insertChunks, relationshipChunk)
            }
        }
    }
    span.AddEvent("extract summary")
    summary, err := s.getSummary(ctx, chatModel, knowledge, textChunks)
    if err != nil {
        logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
            WithField("error", err).Errorf("processChunks get summary failed, use first chunk as description")
        if len(textChunks) > 0 {
            knowledge.Description = textChunks[0].Content
        }
    } else {
        knowledge.Description = summary
    }
    span.SetAttributes(attribute.String("summary", knowledge.Description))
    // Add a dedicated summary chunk for batch indexing
    if strings.TrimSpace(knowledge.Description) != "" && len(textChunks) > 0 {
        sChunk := &types.Chunk{
            ID:              uuid.New().String(),
            TenantID:        knowledge.TenantID,
            KnowledgeID:     knowledge.ID,
            KnowledgeBaseID: knowledge.KnowledgeBaseID,
            Content:         fmt.Sprintf("# 文档名称\n%s\n\n# 摘要\n%s", knowledge.FileName, knowledge.Description),
            ChunkIndex:      maxSeq + 3, // non-conflicting index scheme
            IsEnabled:       true,
            CreatedAt:       time.Now(),
            UpdatedAt:       time.Now(),
            StartAt:         0,
            EndAt:           0,
            ChunkType:       types.ChunkTypeSummary,
            ParentChunkID:   textChunks[0].ID,
        }
        logger.GetLogger(ctx).Infof("Created summary chunk for %s with index %d", sChunk.ParentChunkID, sChunk.ChunkIndex)
        insertChunks = append(insertChunks, sChunk)
    }
    // Create index information for each chunk
    indexInfoList := utils.MapSlice(insertChunks, func(chunk *types.Chunk) *types.IndexInfo {
        return &types.IndexInfo{
            Content:         chunk.Content,
            SourceID:        chunk.ID,
            SourceType:      types.ChunkSourceType,
            ChunkID:         chunk.ID,
            KnowledgeID:     knowledge.ID,
            KnowledgeBaseID: knowledge.KnowledgeBaseID,
        }
    })
    // Initialize retrieval engine
    tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
    retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
    if err != nil {
        knowledge.ParseStatus = "failed"
        knowledge.ErrorMessage = err.Error()
        knowledge.UpdatedAt = time.Now()
        s.repo.UpdateKnowledge(ctx, knowledge)
        span.RecordError(err)
        return
    }
    // Calculate storage size required for embeddings
    span.AddEvent("estimate storage size")
    totalStorageSize := retrieveEngine.EstimateStorageSize(ctx, embeddingModel, indexInfoList)
    if tenantInfo.StorageQuota > 0 {
        // Re-fetch tenant storage information
        tenantInfo, err = s.tenantRepo.GetTenantByID(ctx, tenantInfo.ID)
        if err != nil {
            knowledge.ParseStatus = "failed"
            knowledge.ErrorMessage = err.Error()
            knowledge.UpdatedAt = time.Now()
            s.repo.UpdateKnowledge(ctx, knowledge)
            span.RecordError(err)
            return
        }
        // Check if there's enough storage quota available
        if tenantInfo.StorageUsed+totalStorageSize > tenantInfo.StorageQuota {
            knowledge.ParseStatus = "failed"
            knowledge.ErrorMessage = "存储空间不足"
            knowledge.UpdatedAt = time.Now()
            s.repo.UpdateKnowledge(ctx, knowledge)
            span.RecordError(errors.New("storage quota exceeded"))
            return
        }
    }
    // Save chunks to database
    span.AddEvent("create chunks")
    if err := s.chunkService.CreateChunks(ctx, insertChunks); err != nil {
        knowledge.ParseStatus = "failed"
        knowledge.ErrorMessage = err.Error()
        knowledge.UpdatedAt = time.Now()
        s.repo.UpdateKnowledge(ctx, knowledge)
        span.RecordError(err)
        return
    }
    span.AddEvent("batch index")
    err = retrieveEngine.BatchIndex(ctx, embeddingModel, indexInfoList)
    if err != nil {
        knowledge.ParseStatus = "failed"
        knowledge.ErrorMessage = err.Error()
        knowledge.UpdatedAt = time.Now()
        s.repo.UpdateKnowledge(ctx, knowledge)
        // delete failed chunks
        if err := s.chunkService.DeleteChunksByKnowledgeID(ctx, knowledge.ID); err != nil {
            logger.Errorf(ctx, "Delete chunks failed: %v", err)
        }
        // delete index
        if err := retrieveEngine.DeleteByKnowledgeIDList(
            ctx, []string{knowledge.ID}, embeddingModel.GetDimensions(),
        ); err != nil {
            logger.Errorf(ctx, "Delete index failed: %v", err)
        }
        span.RecordError(err)
        return
    }
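
    // A BatchIndex failure above is compensated for: the freshly created
    // chunks and any partially written vectors are deleted, so a retry of
    // the parse starts from a clean slate.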
    // Update knowledge status to completed
    knowledge.ParseStatus = "completed"
    knowledge.EnableStatus = "enabled"
    knowledge.StorageSize = totalStorageSize
    now := time.Now()
    knowledge.ProcessedAt = &now
    knowledge.UpdatedAt = now
    if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
        logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks update knowledge failed")
    }
    // Update tenant's storage usage
    tenantInfo.StorageUsed += totalStorageSize
    if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, totalStorageSize); err != nil {
        logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks update tenant storage used failed")
    }
}

// getSummary generates a summary for knowledge content using an AI model
func (s *knowledgeService) getSummary(ctx context.Context,
    summaryModel chat.Chat, knowledge *types.Knowledge, chunks []*types.Chunk,
) (string, error) {
    if len(chunks) == 0 {
        return "", fmt.Errorf("no chunks provided for summary generation")
    }
    chunkContents := ""
    allImageInfos := make([]*types.ImageInfo, 0)
    // Sort chunks by StartAt first
    sortedChunks := make([]*types.Chunk, len(chunks))
    copy(sortedChunks, chunks)
    sort.Slice(sortedChunks, func(i, j int) bool {
        return sortedChunks[i].StartAt < sortedChunks[j].StartAt
    })
    // Concatenate chunk contents and collect image infos
    for _, chunk := range sortedChunks {
        if chunk.EndAt > 4096 {
            break
        }
        chunkContents = string([]rune(chunkContents)[:chunk.StartAt]) + chunk.Content
        if chunk.ImageInfo != "" {
            var images []*types.ImageInfo
            if err := json.Unmarshal([]byte(chunk.ImageInfo), &images); err == nil {
                allImageInfos = append(allImageInfos, images...)
            }
        }
    }
    // Remove markdown image syntax
    re := regexp.MustCompile(`!\[[^\]]*\]\([^)]+\)`)
    chunkContents = re.ReplaceAllString(chunkContents, "")
    // Append the collected image annotations to the chunk contents
    if len(allImageInfos) > 0 {
        var imageAnnotations string
        for _, img := range allImageInfos {
            if img.Caption != "" {
                imageAnnotations += fmt.Sprintf("\n[图片描述: %s]", img.Caption)
            }
            if img.OCRText != "" {
                imageAnnotations += fmt.Sprintf("\n[图片文字: %s]", img.OCRText)
            }
        }
        chunkContents = chunkContents + imageAnnotations
    }
    if len(chunkContents) < 300 {
        return chunkContents, nil
    }
    // Prepare content with metadata for summary generation
    contentWithMetadata := chunkContents
    // Add knowledge metadata if available
    if knowledge != nil {
        metadataIntro := fmt.Sprintf("文档类型: %s\n文件名称: %s\n", knowledge.FileType, knowledge.FileName)
        // Add additional metadata if available
        if knowledge.Type != "" {
            metadataIntro += fmt.Sprintf("知识类型: %s\n", knowledge.Type)
        }
        // Prepend metadata to content
        contentWithMetadata = metadataIntro + "\n内容:\n" + contentWithMetadata
    }
    // Generate summary using AI model
    thinking := false
    summary, err := summaryModel.Chat(ctx, []chat.Message{
        {
            Role:    "system",
            Content: s.config.Conversation.GenerateSummaryPrompt,
        },
        {
            Role:    "user",
            Content: contentWithMetadata,
        },
    }, &chat.ChatOptions{
        Temperature: 0.3,
        MaxTokens:   1024,
        Thinking:    &thinking,
    })
    if err != nil {
        logger.GetLogger(ctx).WithField("error", err).Errorf("GetSummary failed")
        return "", err
    }
    logger.GetLogger(ctx).WithField("summary", summary.Content).Infof("GetSummary success")
    return summary.Content, nil
}
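
// getSummary short-circuits: chunks whose EndAt passes 4096 are skipped,
// and if the assembled text is under 300 bytes it is returned verbatim
// instead of being sent to the chat model. Note that len() counts bytes,
// so roughly 100 CJK characters (about 300 bytes in UTF-8) are already
// enough to cross the threshold.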
// GetKnowledgeFile retrieves the physical file associated with a knowledge entry
func (s *knowledgeService) GetKnowledgeFile(ctx context.Context, id string) (io.ReadCloser, string, error) {
    // Get knowledge record
    knowledge, err := s.repo.GetKnowledgeByID(ctx, ctx.Value(types.TenantIDContextKey).(uint), id)
    if err != nil {
        return nil, "", err
    }
    // Get the file from storage
    file, err := s.fileSvc.GetFile(ctx, knowledge.FilePath)
    if err != nil {
        return nil, "", err
    }
    return file, knowledge.FileName, nil
}

func (s *knowledgeService) UpdateKnowledge(ctx context.Context, knowledge *types.Knowledge) error {
    record, err := s.repo.GetKnowledgeByID(ctx, ctx.Value(types.TenantIDContextKey).(uint), knowledge.ID)
    if err != nil {
        logger.Errorf(ctx, "Failed to get knowledge record: %v", err)
        return err
    }
    // If other fields need updating, add them here
    if knowledge.Title != "" {
        record.Title = knowledge.Title
    }
    // Update knowledge record in the repository
    if err := s.repo.UpdateKnowledge(ctx, record); err != nil {
        logger.Errorf(ctx, "Failed to update knowledge: %v", err)
        return err
    }
    logger.Infof(ctx, "Knowledge updated successfully, ID: %s", knowledge.ID)
    return nil
}

// isValidFileType checks if a file type is supported
func isValidFileType(filename string) bool {
    switch strings.ToLower(getFileType(filename)) {
    case "pdf", "txt", "docx", "doc", "md", "markdown", "png", "jpg", "jpeg", "gif":
        return true
    default:
        return false
    }
}

// getFileType extracts the file extension from a filename
func getFileType(filename string) string {
    ext := strings.Split(filename, ".")
    if len(ext) < 2 {
        return "unknown"
    }
    return ext[len(ext)-1]
}

// isValidURL verifies that a URL uses the http or https scheme
func isValidURL(url string) bool {
    return strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
}

// GetKnowledgeBatch retrieves multiple knowledge entries by their IDs
func (s *knowledgeService) GetKnowledgeBatch(ctx context.Context,
    tenantID uint, ids []string,
) ([]*types.Knowledge, error) {
    if len(ids) == 0 {
        return nil, nil
    }
    return s.repo.GetKnowledgeBatch(ctx, tenantID, ids)
}

// calculateFileHash calculates the MD5 hash of a file
func calculateFileHash(file *multipart.FileHeader) (string, error) {
    f, err := file.Open()
    if err != nil {
        return "", err
    }
    defer f.Close()
    h := md5.New()
    if _, err := io.Copy(h, f); err != nil {
        return "", err
    }
    // Reset file pointer for subsequent operations
    if _, err := f.Seek(0, 0); err != nil {
        return "", err
    }
    return hex.EncodeToString(h.Sum(nil)), nil
}

// calculateStr returns the hex MD5 digest of the concatenated inputs
func calculateStr(strList ...string) string {
    h := md5.New()
    input := strings.Join(strList, "")
    h.Write([]byte(input))
    return hex.EncodeToString(h.Sum(nil))
}
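
// calculateStr hashes the plain concatenation of its inputs, so
// calculateStr("a", "b") == calculateStr("ab"); callers that need
// separator-sensitive hashes must add delimiters themselves.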
func (s *knowledgeService) CloneKnowledgeBase(ctx context.Context, srcID, dstID string) error {
    srcKB, dstKB, err := s.kbService.CopyKnowledgeBase(ctx, srcID, dstID)
    if err != nil {
        logger.Errorf(ctx, "Failed to copy knowledge base: %v", err)
        return err
    }
    addKnowledge, err := s.repo.AminusB(ctx, srcKB.TenantID, srcKB.ID, dstKB.TenantID, dstKB.ID)
    if err != nil {
        logger.Errorf(ctx, "Failed to get knowledge: %v", err)
        return err
    }
    delKnowledge, err := s.repo.AminusB(ctx, dstKB.TenantID, dstKB.ID, srcKB.TenantID, srcKB.ID)
    if err != nil {
        logger.Errorf(ctx, "Failed to get knowledge: %v", err)
        return err
    }
    logger.Infof(ctx, "Knowledge after update to add: %d, delete: %d", len(addKnowledge), len(delKnowledge))
    batch := 10
    wg := sync.WaitGroup{}
    errCh := make(chan error, len(delKnowledge))
    for ids := range slices.Chunk(delKnowledge, batch) {
        wg.Add(1)
        go func(ids []string) {
            defer wg.Done()
            if err := s.DeleteKnowledgeList(ctx, ids); err != nil {
                errCh <- fmt.Errorf("delete knowledge %v: %w", ids, err)
            }
        }(ids)
    }
    wg.Wait()
    close(errCh)
    for err := range errCh {
        if err != nil {
            return err
        }
    }
    wg = sync.WaitGroup{}
    errCh = make(chan error, len(addKnowledge)+len(delKnowledge))
    for ids := range slices.Chunk(addKnowledge, batch) {
        wg.Add(1)
        go func(ids []string) {
            defer wg.Done()
            for _, kID := range ids {
                srcKn, err := s.repo.GetKnowledgeByID(ctx, srcKB.TenantID, kID)
                if err != nil {
                    errCh <- fmt.Errorf("get knowledge %s: %w", kID, err)
                    continue
                }
                if err := s.cloneKnowledge(ctx, srcKn, dstKB); err != nil {
                    errCh <- fmt.Errorf("move knowledge %s: %w", kID, err)
                }
            }
        }(ids)
    }
    wg.Wait()
    close(errCh)
    for err := range errCh {
        if err != nil {
            return err
        }
    }
    return nil
}

func (s *knowledgeService) updateChunkVector(ctx context.Context, kbID string, chunks []*types.Chunk) error {
    // Get embedding model from knowledge base
    sourceKB, err := s.kbService.GetKnowledgeBaseByID(ctx, kbID)
    if err != nil {
        return err
    }
    embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, sourceKB.EmbeddingModelID)
    if err != nil {
        return err
    }
    // Build index info for the chunks that belong to this knowledge base
    indexInfo := make([]*types.IndexInfo, 0, len(chunks))
    ids := make([]string, 0, len(chunks))
    for _, chunk := range chunks {
        if chunk.KnowledgeBaseID != kbID {
            logger.Warnf(ctx, "Knowledge base ID mismatch: %s != %s", chunk.KnowledgeBaseID, kbID)
            continue
        }
        indexInfo = append(indexInfo, &types.IndexInfo{
            Content:         chunk.Content,
            SourceID:        chunk.ID,
            SourceType:      types.ChunkSourceType,
            ChunkID:         chunk.ID,
            KnowledgeID:     chunk.KnowledgeID,
            KnowledgeBaseID: chunk.KnowledgeBaseID,
        })
        ids = append(ids, chunk.ID)
    }
    // Initialize composite retrieve engine from tenant configuration
    tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
    retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
    if err != nil {
        return err
    }
    // Delete old vector representations of the chunks
    err = retrieveEngine.DeleteByChunkIDList(ctx, ids, embeddingModel.GetDimensions())
    if err != nil {
        return err
    }
    // Index updated chunk content with new vector representations
    err = retrieveEngine.BatchIndex(ctx, embeddingModel, indexInfo)
    if err != nil {
        return err
    }
    return nil
}
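
// updateChunkVector is deliberately delete-then-index: stale vectors for
// the affected chunk IDs are removed before the fresh embeddings are
// written, so the engines never hold two vectors for the same chunk.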
func (s *knowledgeService) UpdateImageInfo(ctx context.Context, knowledgeID string, chunkID string, imageInfo string) error {
    var images []*types.ImageInfo
    if err := json.Unmarshal([]byte(imageInfo), &images); err != nil {
        logger.Errorf(ctx, "Failed to unmarshal image info: %v", err)
        return err
    }
    if len(images) != 1 {
        logger.Warnf(ctx, "Expected exactly one image info, got %d", len(images))
        return nil
    }
    image := images[0]
    // Retrieve the chunk being updated
    chunk, err := s.chunkService.GetChunkByID(ctx, knowledgeID, chunkID)
    if err != nil {
        logger.Errorf(ctx, "Failed to get chunk: %v", err)
        return err
    }
    chunk.ImageInfo = imageInfo
    tenantID := ctx.Value(types.TenantIDContextKey).(uint)
    // Retrieve all chunks with the given parent chunk ID
    chunkChildren, err := s.chunkService.ListChunkByParentID(ctx, tenantID, chunkID)
    if err != nil {
        logger.ErrorWithFields(ctx, err, map[string]interface{}{
            "parent_chunk_id": chunkID,
            "tenant_id":       tenantID,
        })
        return err
    }
    logger.Infof(ctx, "Found %d chunks with parent chunk ID: %s", len(chunkChildren), chunkID)
    // Iterate through each child chunk and update its content based on the image information
    updateChunk := []*types.Chunk{chunk}
    var addChunk []*types.Chunk
    // Track whether we've found OCR and caption child chunks for this image
    hasOCRChunk := false
    hasCaptionChunk := false
    for i, child := range chunkChildren {
        // Skip chunks that are not image types
        var cImageInfo []*types.ImageInfo
        err = json.Unmarshal([]byte(child.ImageInfo), &cImageInfo)
        if err != nil {
            logger.Warnf(ctx, "Failed to unmarshal image %s info: %v", child.ID, err)
            continue
        }
        if len(cImageInfo) == 0 {
            continue
        }
        if cImageInfo[0].OriginalURL != image.OriginalURL {
            logger.Warnf(ctx, "Skipping chunk ID: %s, image URL mismatch: %s != %s",
                child.ID, cImageInfo[0].OriginalURL, image.OriginalURL)
            continue
        }
        // Mark that we've found chunks for this image
        if child.ChunkType == types.ChunkTypeImageCaption {
            hasCaptionChunk = true
            // Update caption if it has changed
            if image.Caption != cImageInfo[0].Caption {
                child.Content = image.Caption
                child.ImageInfo = imageInfo
                updateChunk = append(updateChunk, chunkChildren[i])
            }
        } else if child.ChunkType == types.ChunkTypeImageOCR {
            hasOCRChunk = true
            // Update OCR if it has changed
            if image.OCRText != cImageInfo[0].OCRText {
                child.Content = image.OCRText
                child.ImageInfo = imageInfo
                updateChunk = append(updateChunk, chunkChildren[i])
            }
        }
    }
    // Create a new caption chunk if it doesn't exist and we have caption data
    if !hasCaptionChunk && image.Caption != "" {
        captionChunk := &types.Chunk{
            ID:              uuid.New().String(),
            TenantID:        tenantID,
            KnowledgeID:     chunk.KnowledgeID,
            KnowledgeBaseID: chunk.KnowledgeBaseID,
            Content:         image.Caption,
            ChunkType:       types.ChunkTypeImageCaption,
            ParentChunkID:   chunk.ID,
            ImageInfo:       imageInfo,
        }
        addChunk = append(addChunk, captionChunk)
        logger.Infof(ctx, "Created new caption chunk ID: %s for image URL: %s", captionChunk.ID, image.OriginalURL)
    }
    // Create a new OCR chunk if it doesn't exist and we have OCR data
    if !hasOCRChunk && image.OCRText != "" {
        ocrChunk := &types.Chunk{
            ID:              uuid.New().String(),
            TenantID:        tenantID,
            KnowledgeID:     chunk.KnowledgeID,
            KnowledgeBaseID: chunk.KnowledgeBaseID,
            Content:         image.OCRText,
            ChunkType:       types.ChunkTypeImageOCR,
            ParentChunkID:   chunk.ID,
            ImageInfo:       imageInfo,
        }
        addChunk = append(addChunk, ocrChunk)
        logger.Infof(ctx, "Created new OCR chunk ID: %s for image URL: %s", ocrChunk.ID, image.OriginalURL)
    }
    logger.Infof(ctx, "Updated %d chunks out of %d total chunks", len(updateChunk), len(chunkChildren)+1)
    if len(addChunk) > 0 {
        err := s.chunkService.CreateChunks(ctx, addChunk)
        if err != nil {
            logger.ErrorWithFields(ctx, err, map[string]interface{}{
                "add_chunk_size": len(addChunk),
            })
            return err
        }
    }
    // Update the chunks
    for _, c := range updateChunk {
        err := s.chunkService.UpdateChunk(ctx, c)
        if err != nil {
            logger.ErrorWithFields(ctx, err, map[string]interface{}{
                "chunk_id":     c.ID,
                "knowledge_id": c.KnowledgeID,
            })
            return err
        }
    }
    // Update the chunk vectors
    err = s.updateChunkVector(ctx, chunk.KnowledgeBaseID, append(updateChunk, addChunk...))
    if err != nil {
        logger.ErrorWithFields(ctx, err, map[string]interface{}{
            "chunk_id":     chunk.ID,
            "knowledge_id": chunk.KnowledgeID,
        })
        return err
    }
    // Update the knowledge file hash
    knowledge, err := s.repo.GetKnowledgeByID(ctx, tenantID, knowledgeID)
    if err != nil {
        logger.Errorf(ctx, "Failed to get knowledge: %v", err)
        return err
    }
    fileHash := calculateStr(knowledgeID, knowledge.FileHash, imageInfo)
    knowledge.FileHash = fileHash
    err = s.repo.UpdateKnowledge(ctx, knowledge)
    if err != nil {
        logger.Warnf(ctx, "Failed to update knowledge file hash: %v", err)
    }
    logger.Infof(ctx, "Updated chunk successfully, chunk ID: %s, knowledge ID: %s", chunk.ID, chunk.KnowledgeID)
    return nil
}
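
// After a manual image edit the knowledge's FileHash is rotated to
// md5(knowledgeID + oldHash + imageInfo) via calculateStr, so upload-time
// deduplication no longer matches the pre-edit file.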
// CloneChunk clones chunks from one knowledge entry to another.
// It transfers chunks from a source knowledge document to a target knowledge
// document, creating new chunks in the target and updating the vector
// database accordingly.
//
// Parameters:
//   - ctx: Context with authentication and request information
//   - src: Source knowledge document containing the chunks to move
//   - dst: Target knowledge document where the chunks will be moved
//
// Returns:
//   - error: Any error encountered during the move operation
//
// Chunk relationships (pre, next, and parent chunk IDs) are preserved by
// mapping the source chunk IDs to the newly generated target chunk IDs.
func (s *knowledgeService) CloneChunk(ctx context.Context, src, dst *types.Knowledge) error {
    chunkPage := 1
    chunkPageSize := 100
    srcTodst := map[string]string{}
    targetChunks := make([]*types.Chunk, 0, 10)
    chunkType := []types.ChunkType{
        types.ChunkTypeText,
        types.ChunkTypeSummary,
        types.ChunkTypeImageCaption,
        types.ChunkTypeImageOCR,
    }
    for {
        sourceChunks, _, err := s.chunkRepo.ListPagedChunksByKnowledgeID(ctx,
            src.TenantID,
            src.ID,
            &types.Pagination{
                Page:     chunkPage,
                PageSize: chunkPageSize,
            },
            chunkType,
        )
        chunkPage++
        if err != nil {
            return err
        }
        if len(sourceChunks) == 0 {
            break
        }
        for _, sourceChunk := range sourceChunks {
            targetChunk := &types.Chunk{
                ID:              uuid.New().String(),
                TenantID:        dst.TenantID,
                KnowledgeID:     dst.ID,
                KnowledgeBaseID: dst.KnowledgeBaseID,
                Content:         sourceChunk.Content,
                ChunkIndex:      sourceChunk.ChunkIndex,
                IsEnabled:       sourceChunk.IsEnabled,
                StartAt:         sourceChunk.StartAt,
                EndAt:           sourceChunk.EndAt,
                PreChunkID:      sourceChunk.PreChunkID,
                NextChunkID:     sourceChunk.NextChunkID,
                ChunkType:       sourceChunk.ChunkType,
                ParentChunkID:   sourceChunk.ParentChunkID,
                ImageInfo:       sourceChunk.ImageInfo,
            }
            targetChunks = append(targetChunks, targetChunk)
            srcTodst[sourceChunk.ID] = targetChunk.ID
        }
    }
    // Remap relationship IDs from source chunks to their new target IDs;
    // dangling references are cleared
    for _, targetChunk := range targetChunks {
        if val, ok := srcTodst[targetChunk.PreChunkID]; ok {
            targetChunk.PreChunkID = val
        } else {
            targetChunk.PreChunkID = ""
        }
        if val, ok := srcTodst[targetChunk.NextChunkID]; ok {
            targetChunk.NextChunkID = val
        } else {
            targetChunk.NextChunkID = ""
        }
        if val, ok := srcTodst[targetChunk.ParentChunkID]; ok {
            targetChunk.ParentChunkID = val
        } else {
            targetChunk.ParentChunkID = ""
        }
    }
    for chunks := range slices.Chunk(targetChunks, chunkPageSize) {
        err := s.chunkRepo.CreateChunks(ctx, chunks)
        if err != nil {
            return err
        }
    }
    tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
    retrieveEngine, err := retriever.NewCompositeRetrieveEngine(tenantInfo.RetrieverEngines.Engines)
    if err != nil {
        return err
    }
    embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, dst.EmbeddingModelID)
    if err != nil {
        return err
    }
    if err := retrieveEngine.CopyIndices(ctx,
        src.KnowledgeBaseID,
        dst.KnowledgeBaseID,
        map[string]string{src.ID: dst.ID},
        srcTodst,
        embeddingModel.GetDimensions(),
    ); err != nil {
        return err
    }
    return nil
}

// IsImageType reports whether a file extension denotes an image format.
// The comparison is case-insensitive because getFileType preserves the
// extension's original case.
func IsImageType(fileType string) bool {
    switch strings.ToLower(fileType) {
    case "jpg", "jpeg", "png", "gif", "webp", "bmp", "svg", "tiff":
        return true
    default:
        return false
    }
}
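
// Example: getFileType("photo.JPG") returns "JPG"; IsImageType lower-cases
// its input, so IsImageType("JPG") == true, matching isValidFileType's
// case-insensitive behavior.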