l_ai_knowledge/internal/application/service/chat_pipline/preprocess.go

181 lines
4.8 KiB
Go

package chatpipline
import (
"context"
"github.com/yanyiwu/gojieba"
"regexp"
"strings"
"unicode"
"knowlege-lsxd/internal/config"
"knowlege-lsxd/internal/logger"
"knowlege-lsxd/internal/types"
"knowlege-lsxd/internal/types/interfaces"
)
// PluginPreprocess is the query preprocessing plugin. On a PREPROCESS_QUERY
// event it cleans the rewritten query, tokenizes it with Jieba and removes
// stopwords (see OnEvent).
type PluginPreprocess struct {
// config is the application configuration handed to NewPluginPreprocess.
config *config.Config
// jieba is the tokenizer handle; Close frees it and resets it to nil.
jieba *gojieba.Jieba
// stopwords is the set of tokens dropped by filterStopwords.
stopwords map[string]struct{}
}
// Regular expressions used by cleanText, compiled once at package
// initialization. Note that in Go's regexp syntax \s is ASCII whitespace only
// ([\t\n\f\r ]).
var (
multiSpaceRegex = regexp.MustCompile(`\s+`) // One or more consecutive whitespace characters
urlRegex = regexp.MustCompile(`https?://\S+`) // http:// or https:// URLs, up to the next whitespace
emailRegex = regexp.MustCompile(`\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b`) // Email addresses
// Matches every character that is neither a Unicode letter, a Unicode number
// nor ASCII whitespace.
// NOTE(review): this covers symbols and combining marks (\p{M}) as well as
// punctuation — confirm that stripping combining marks is intended.
punctRegex = regexp.MustCompile(`[^\p{L}\p{N}\s]`) // Punctuation and other non-alphanumeric characters
)
// NewPluginPreprocess builds the query preprocessing plugin and registers it
// with eventManager.
//
// The Jieba tokenizer is created with its default dictionaries and the
// stopword set comes from loadStopwords. When cleaner is non-nil, a cleanup
// callback named "JiebaPreprocessor" is registered that calls Close on the
// returned plugin.
func NewPluginPreprocess(
	eventManager *EventManager,
	config *config.Config,
	cleaner interfaces.ResourceCleaner,
) *PluginPreprocess {
	plugin := &PluginPreprocess{
		config:    config,
		jieba:     gojieba.NewJieba(),
		stopwords: loadStopwords(),
	}

	if cleaner != nil {
		// Close never fails, so the cleanup callback always reports success.
		release := func() error {
			plugin.Close()
			return nil
		}
		cleaner.RegisterWithName("JiebaPreprocessor", release)
	}

	eventManager.Register(plugin)
	return plugin
}
// loadStopwords returns the built-in stopword set used to filter query tokens.
//
// The words are hard-coded below (common Chinese and English function words);
// nothing is read from a dictionary file. Every entry is stored both exactly
// as written and in lower case, so the mixed-case entry "I" is also matched by
// a lower-case "i" token and by lookups that lower-case a token first.
func loadStopwords() map[string]struct{} {
	commonStopwords := []string{
		"的", "了", "和", "是", "在", "我", "你", "他", "她", "它",
		"这", "那", "什么", "怎么", "如何", "为什么", "哪里", "什么时候",
		"the", "is", "are", "am", "I", "you", "he", "she", "it", "this",
		"that", "what", "how", "a", "an", "and", "or", "but", "if", "of",
		"to", "in", "on", "at", "by", "for", "with", "about", "from",
		"有", "无", "好", "来", "去", "说", "看", "想", "会", "可以",
		"吗", "呢", "啊", "吧", "的话", "就是", "只是", "因为", "所以",
	}

	// The length is only a capacity hint; the map grows if the lower-cased
	// forms add extra keys.
	result := make(map[string]struct{}, len(commonStopwords))
	for _, word := range commonStopwords {
		result[word] = struct{}{}
		// ToLower leaves the Chinese entries unchanged, so this only adds keys
		// for entries that contain upper-case letters.
		result[strings.ToLower(word)] = struct{}{}
	}
	return result
}
// ActivationEvents lists the event types this plugin handles: only
// types.PREPROCESS_QUERY.
func (p *PluginPreprocess) ActivationEvents() []types.EventType {
	events := make([]types.EventType, 0, 1)
	events = append(events, types.PREPROCESS_QUERY)
	return events
}
// OnEvent preprocesses chatManage.RewriteQuery and stores the result in
// chatManage.ProcessedQuery before handing control to next.
//
// The query is cleaned, tokenized and stripped of stopwords; the remaining
// tokens are joined with single spaces. An empty RewriteQuery is passed
// straight to next without touching ProcessedQuery. eventType is not
// inspected. The return value is whatever next returns.
func (p *PluginPreprocess) OnEvent(
	ctx context.Context,
	eventType types.EventType,
	chatManage *types.ChatManage,
	next func() *PluginError,
) *PluginError {
	query := chatManage.RewriteQuery
	if query == "" {
		return next()
	}

	logger.GetLogger(ctx).Infof("Starting query preprocessing, original query: %s", query)

	// Pipeline: clean -> tokenize -> drop stopwords.
	tokens := p.filterStopwords(p.segmentText(p.cleanText(query)))
	chatManage.ProcessedQuery = strings.Join(tokens, " ")

	logger.GetLogger(ctx).Infof("Query preprocessing complete, processed query: %s", chatManage.ProcessedQuery)
	return next()
}
// cleanText performs basic normalization of a query string.
//
// URLs, email addresses and every character matched by punctRegex are each
// replaced by a single space. Runs of whitespace are then collapsed to one
// space and the result is trimmed. Collapsing happens last because the
// earlier replacements themselves insert spaces; collapsing first would leave
// runs of spaces in the output (e.g. "a, b" would become "a  b").
func (p *PluginPreprocess) cleanText(text string) string {
	// URLs go first so their punctuation is not split into stray fragments.
	text = urlRegex.ReplaceAllString(text, " ")
	text = emailRegex.ReplaceAllString(text, " ")
	text = punctRegex.ReplaceAllString(text, " ")
	// Collapse only after all space-inserting replacements are done.
	text = multiSpaceRegex.ReplaceAllString(text, " ")
	return strings.TrimSpace(text)
}
// segmentText splits text into tokens.
//
// It normally uses Jieba's search-engine mode (CutForSearch) with the second
// argument, gojieba's HMM flag, set to true. Close sets p.jieba to nil, so if
// the tokenizer has already been released this falls back to splitting on
// whitespace instead of dereferencing a nil handle.
func (p *PluginPreprocess) segmentText(text string) []string {
	if p.jieba == nil {
		// Tokenizer already freed: degrade to whitespace tokenization.
		return strings.Fields(text)
	}
	return p.jieba.CutForSearch(text, true)
}
// filterStopwords removes stopwords and whitespace-only tokens from segments.
//
// A token is treated as a stopword when either the token itself or its
// lower-cased form is in p.stopwords, so "The" and "What" are filtered like
// "the" and "what". If every non-blank token is a stopword, the non-blank
// tokens are returned unfiltered so the query does not lose all its words;
// whitespace-only tokens are never returned.
func (p *PluginPreprocess) filterStopwords(segments []string) []string {
	kept := make([]string, 0, len(segments))
	nonBlank := make([]string, 0, len(segments))

	for _, word := range segments {
		if isBlank(word) {
			continue
		}
		nonBlank = append(nonBlank, word)

		// Check the exact spelling first, then the case-folded one.
		_, exact := p.stopwords[word]
		_, folded := p.stopwords[strings.ToLower(word)]
		if !exact && !folded {
			kept = append(kept, word)
		}
	}

	// Everything was a stopword: fall back to the unfiltered (but non-blank)
	// tokens rather than returning nothing.
	if len(kept) == 0 {
		return nonBlank
	}
	return kept
}
// isBlank reports whether str contains no characters other than Unicode
// whitespace. The empty string counts as blank.
func isBlank(str string) bool {
	// Look for the first rune that is not whitespace; none means blank.
	firstVisible := strings.IndexFunc(str, func(r rune) bool {
		return !unicode.IsSpace(r)
	})
	return firstVisible == -1
}
// Close frees the Jieba tokenizer held by the plugin and clears the handle.
// Calling it again after the handle has been cleared is a no-op.
func (p *PluginPreprocess) Close() {
	if p.jieba == nil {
		return
	}
	p.jieba.Free()
	p.jieba = nil
}
// ShutdownHandler returns a function that, when invoked, calls Close on this
// plugin.
func (p *PluginPreprocess) ShutdownHandler() func() {
	return p.Close
}