l_ai_knowledge/internal/application/service/chat_pipline/preprocess.go

package chatpipline

import (
	"context"
	"github.com/yanyiwu/gojieba"
	"regexp"
	"strings"
	"unicode"

	"knowlege-lsxd/internal/config"
	"knowlege-lsxd/internal/logger"
	"knowlege-lsxd/internal/types"
	"knowlege-lsxd/internal/types/interfaces"
)

// PluginPreprocess is the query preprocessing plugin. On a PREPROCESS_QUERY
// event it cleans the rewritten query, tokenizes it with Jieba and filters
// stopwords out of the resulting tokens.
type PluginPreprocess struct {
	config    *config.Config      // Configuration handed to NewPluginPreprocess; not read by the methods visible here
	jieba     *gojieba.Jieba      // Tokenizer used by segmentText; freed and set to nil by Close
	stopwords map[string]struct{} // Set of tokens that filterStopwords removes
}

// Regular expressions used by cleanText. They are compiled once at package
// initialization so that no compilation happens per query.
var (
	multiSpaceRegex = regexp.MustCompile(`\s+`)                                 // A run of one or more whitespace characters
	urlRegex        = regexp.MustCompile(`https?://\S+`)                        // http:// or https:// followed by non-whitespace
	emailRegex      = regexp.MustCompile(`\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b`) // Email-like tokens of the form local@domain.tld
	punctRegex      = regexp.MustCompile(`[^\p{L}\p{N}\s]`)                     // Any single character that is not a letter, a number or whitespace
)

// NewPluginPreprocess builds the query preprocessing plugin, registers it with
// eventManager and returns it.
//
// The plugin owns a Jieba tokenizer created with the default dictionaries and
// the stopword set produced by loadStopwords. When cleaner is non-nil, a
// cleanup callback named "JiebaPreprocessor" is registered that calls Close on
// the plugin; a nil cleaner simply skips that registration. eventManager is
// used unconditionally.
func NewPluginPreprocess(
	eventManager *EventManager,
	config *config.Config,
	cleaner interfaces.ResourceCleaner,
) *PluginPreprocess {
	// Field expressions are evaluated top to bottom: tokenizer first, then
	// the stopword set.
	plugin := &PluginPreprocess{
		config:    config,
		jieba:     gojieba.NewJieba(),
		stopwords: loadStopwords(),
	}

	// Hand the tokenizer's release over to the resource cleaner, if any.
	if cleaner != nil {
		cleaner.RegisterWithName("JiebaPreprocessor", func() error {
			plugin.Close()
			return nil
		})
	}

	eventManager.Register(plugin)
	return plugin
}

// loadStopwords returns the stopword set used by filterStopwords.
//
// The words come from a fixed list of common Chinese and English function
// words hard-coded below; nothing is read from disk or from the tokenizer.
// Keys are stored exactly as written, including the upper-case "I".
func loadStopwords() map[string]struct{} {
	words := [...]string{
		"的", "了", "和", "是", "在", "我", "你", "他", "她", "它",
		"这", "那", "什么", "怎么", "如何", "为什么", "哪里", "什么时候",
		"the", "is", "are", "am", "I", "you", "he", "she", "it", "this",
		"that", "what", "how", "a", "an", "and", "or", "but", "if", "of",
		"to", "in", "on", "at", "by", "for", "with", "about", "from",
		"有", "无", "好", "来", "去", "说", "看", "想", "会", "可以",
		"吗", "呢", "啊", "吧", "的话", "就是", "只是", "因为", "所以",
	}

	// An empty struct value makes the map a plain set with no per-entry payload.
	set := make(map[string]struct{}, len(words))
	for i := range words {
		set[words[i]] = struct{}{}
	}
	return set
}

// ActivationEvents lists the event types this plugin handles. The only entry
// is types.PREPROCESS_QUERY.
func (p *PluginPreprocess) ActivationEvents() []types.EventType {
	events := []types.EventType{
		types.PREPROCESS_QUERY,
	}
	return events
}

// OnEvent preprocesses chatManage.RewriteQuery and stores the result in
// chatManage.ProcessedQuery, then hands control to next.
//
// When RewriteQuery is empty nothing is done and ProcessedQuery is left
// untouched. Otherwise the query is cleaned, tokenized and stopword-filtered,
// and the surviving tokens are joined with single spaces. eventType is not
// inspected. The return value is whatever next returns.
func (p *PluginPreprocess) OnEvent(ctx context.Context, eventType types.EventType, chatManage *types.ChatManage, next func() *PluginError) *PluginError {
	if chatManage.RewriteQuery != "" {
		logger.GetLogger(ctx).Infof("Starting query preprocessing, original query: %s", chatManage.RewriteQuery)

		// Pipeline: clean -> tokenize -> drop stopwords.
		cleaned := p.cleanText(chatManage.RewriteQuery)
		tokens := p.segmentText(cleaned)
		kept := p.filterStopwords(tokens)

		// Rebuild a space-separated query from the surviving tokens.
		chatManage.ProcessedQuery = strings.Join(kept, " ")

		logger.GetLogger(ctx).Infof("Query preprocessing complete, processed query: %s", chatManage.ProcessedQuery)
	}

	return next()
}

// cleanText performs basic text cleaning on a query before tokenization.
//
// URLs, email addresses and every character that is not a letter, a number or
// whitespace are each replaced with a single space. Whitespace runs are then
// collapsed to one space and the ends are trimmed.
//
// Whitespace is collapsed last on purpose: every replacement above inserts a
// space, so collapsing earlier would leave new runs behind (for example
// "hello, world" would come out as "hello  world").
func (p *PluginPreprocess) cleanText(text string) string {
	// Replace with a space rather than delete, so the words on either side
	// of a removed span do not get glued together.
	text = urlRegex.ReplaceAllString(text, " ")
	text = emailRegex.ReplaceAllString(text, " ")
	text = punctRegex.ReplaceAllString(text, " ")

	// Collapse whitespace only after all space-inserting steps have run.
	text = multiSpaceRegex.ReplaceAllString(text, " ")

	return strings.TrimSpace(text)
}

// segmentText tokenizes text with the plugin's Jieba tokenizer and returns the
// tokens. It uses CutForSearch (Jieba's search-engine mode) with the second
// argument set to true.
func (p *PluginPreprocess) segmentText(text string) []string {
	return p.jieba.CutForSearch(text, true)
}

// filterStopwords removes stopwords and whitespace-only tokens from segments
// and returns the tokens that remain, in their original order.
//
// A token counts as a stopword when either the token as written or its
// lower-cased form is in p.stopwords. The second lookup is what lets
// capitalised English words such as "The" or "What" be filtered even though
// the stopword set stores them in lower case.
//
// If nothing survives the filter, segments is returned unchanged so that the
// caller never ends up with an empty token list for a non-empty input.
func (p *PluginPreprocess) filterStopwords(segments []string) []string {
	filtered := make([]string, 0, len(segments))

	for _, word := range segments {
		if isBlank(word) {
			continue
		}
		// Exact match first: keeps entries such as "I" working as written.
		if _, ok := p.stopwords[word]; ok {
			continue
		}
		// Case-insensitive match for everything stored in lower case.
		if _, ok := p.stopwords[strings.ToLower(word)]; ok {
			continue
		}
		filtered = append(filtered, word)
	}

	// Fall back to the unfiltered tokens rather than returning nothing.
	if len(filtered) == 0 {
		return segments
	}

	return filtered
}

// isBlank reports whether str consists solely of characters for which
// unicode.IsSpace is true. The empty string is considered blank.
func isBlank(str string) bool {
	blank := true
	for _, ch := range str {
		// Stop at the first non-space character; the answer cannot change after that.
		if blank = unicode.IsSpace(ch); !blank {
			break
		}
	}
	return blank
}

// Close frees the Jieba tokenizer held by the plugin and clears the field.
// Calling it again after the tokenizer has been released is a no-op.
func (p *PluginPreprocess) Close() {
	if p.jieba == nil {
		return
	}
	p.jieba.Free()
	p.jieba = nil
}

// ShutdownHandler returns a function that, when invoked, calls Close on this
// plugin.
func (p *PluginPreprocess) ShutdownHandler() func() {
	// The method value binds p, so the returned function needs no arguments.
	return p.Close
}