package chatpipline

import (
	"context"
	"regexp"
	"strings"
	"unicode"

	"github.com/yanyiwu/gojieba"

	"knowlege-lsxd/internal/config"
	"knowlege-lsxd/internal/logger"
	"knowlege-lsxd/internal/types"
	"knowlege-lsxd/internal/types/interfaces"
)
// PluginPreprocess is a query preprocessing plugin that cleans, tokenizes,
// and stopword-filters the rewritten user query.
type PluginPreprocess struct {
	config    *config.Config
	jieba     *gojieba.Jieba
	stopwords map[string]struct{}
}

// Regular expressions for text cleaning.
var (
	multiSpaceRegex = regexp.MustCompile(`\s+`)                                 // Runs of whitespace
	urlRegex        = regexp.MustCompile(`https?://\S+`)                        // URLs
	emailRegex      = regexp.MustCompile(`\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b`) // Email addresses
	punctRegex      = regexp.MustCompile(`[^\p{L}\p{N}\s]`)                     // Anything that is not a letter, digit, or whitespace
)
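
// Illustrative matches (assumed, based on the patterns above):
//
//	urlRegex   matches "https://example.com/docs?q=1"
//	emailRegex matches "user.name+tag@example.co"
//	punctRegex matches "?", "，", "!" but not "A", "中", "3", or " "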

// NewPluginPreprocess creates a new query preprocessing plugin.
func NewPluginPreprocess(
	eventManager *EventManager,
	config *config.Config,
	cleaner interfaces.ResourceCleaner,
) *PluginPreprocess {
	// Use the default dictionary for the Jieba tokenizer.
	jieba := gojieba.NewJieba()

	// Load the built-in stopword set.
	stopwords := loadStopwords()

	res := &PluginPreprocess{
		config:    config,
		jieba:     jieba,
		stopwords: stopwords,
	}

	// Register a cleanup function so the Jieba resources are freed on shutdown.
	if cleaner != nil {
		cleaner.RegisterWithName("JiebaPreprocessor", func() error {
			res.Close()
			return nil
		})
	}

	eventManager.Register(res)
	return res
}
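
// Typical wiring (a sketch; the EventManager, config, and ResourceCleaner
// instances come from the surrounding application):
//
//	plugin := NewPluginPreprocess(eventManager, cfg, cleaner)
//	// The plugin now fires on types.PREPROCESS_QUERY events and frees its
//	// Jieba resources when the cleaner runs at shutdown.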

// loadStopwords returns a small hard-coded set of common Chinese and English
// stopwords. Words are stored lowercased so lookups can be case-insensitive.
func loadStopwords() map[string]struct{} {
	commonStopwords := []string{
		"的", "了", "和", "是", "在", "我", "你", "他", "她", "它",
		"这", "那", "什么", "怎么", "如何", "为什么", "哪里", "什么时候",
		"the", "is", "are", "am", "I", "you", "he", "she", "it", "this",
		"that", "what", "how", "a", "an", "and", "or", "but", "if", "of",
		"to", "in", "on", "at", "by", "for", "with", "about", "from",
		"有", "无", "好", "来", "去", "说", "看", "想", "会", "可以",
		"吗", "呢", "啊", "吧", "的话", "就是", "只是", "因为", "所以",
	}

	result := make(map[string]struct{}, len(commonStopwords))
	for _, word := range commonStopwords {
		result[strings.ToLower(word)] = struct{}{}
	}
	return result
}

// ActivationEvents returns the event types this plugin subscribes to.
func (p *PluginPreprocess) ActivationEvents() []types.EventType {
	return []types.EventType{types.PREPROCESS_QUERY}
}

// OnEvent preprocesses the rewritten query: clean, tokenize, filter stopwords.
func (p *PluginPreprocess) OnEvent(ctx context.Context, eventType types.EventType, chatManage *types.ChatManage, next func() *PluginError) *PluginError {
	if chatManage.RewriteQuery == "" {
		return next()
	}

	logger.GetLogger(ctx).Infof("Starting query preprocessing, original query: %s", chatManage.RewriteQuery)

	// 1. Basic text cleaning
	processed := p.cleanText(chatManage.RewriteQuery)

	// 2. Tokenization
	segments := p.segmentText(processed)

	// 3. Stopword filtering
	filteredSegments := p.filterStopwords(segments)

	// Reassemble the filtered tokens into the preprocessed query.
	chatManage.ProcessedQuery = strings.Join(filteredSegments, " ")

	logger.GetLogger(ctx).Infof("Query preprocessing complete, processed query: %s", chatManage.ProcessedQuery)

	return next()
}
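
// Illustrative end-to-end flow (exact tokens depend on the Jieba dictionary;
// whitespace tokens, if produced, are dropped by the blank check below):
//
//	RewriteQuery:    "什么是向量数据库? 参考 https://example.com"
//	cleanText:       "什么是向量数据库 参考"
//	segmentText:     e.g. ["什么", "是", "向量", "数据库", "参考"]
//	filterStopwords: ["向量", "数据库", "参考"]
//	ProcessedQuery:  "向量数据库 参考"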

// cleanText performs basic text cleaning.
func (p *PluginPreprocess) cleanText(text string) string {
	// Remove URLs.
	text = urlRegex.ReplaceAllString(text, " ")

	// Remove email addresses.
	text = emailRegex.ReplaceAllString(text, " ")

	// Replace punctuation and other symbols with spaces.
	text = punctRegex.ReplaceAllString(text, " ")

	// Collapse runs of whitespace last, so the spaces introduced above are merged.
	text = multiSpaceRegex.ReplaceAllString(text, " ")

	// Trim leading and trailing spaces.
	return strings.TrimSpace(text)
}
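
// Illustrative example (assumed behavior of the rules above):
//
//	in:  "What is RAG? See https://example.com or mail a@b.co!"
//	out: "What is RAG See or mail"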

// segmentText tokenizes text with the Jieba tokenizer in search-engine mode,
// which additionally emits sub-words of long compounds for better recall.
func (p *PluginPreprocess) segmentText(text string) []string {
	return p.jieba.CutForSearch(text, true)
}
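
// For example (assumed with the default dictionary), search-engine mode may
// yield both a compound and its sub-words:
//
//	"中国科学院" -> ["中国", "科学", "学院", "科学院", "中国科学院"]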

// filterStopwords drops stopwords and blank tokens.
func (p *PluginPreprocess) filterStopwords(segments []string) []string {
	var filtered []string

	for _, word := range segments {
		// Keep the token if it is neither a stopword (case-insensitive) nor blank.
		if _, isStopword := p.stopwords[strings.ToLower(word)]; !isStopword && !isBlank(word) {
			filtered = append(filtered, word)
		}
	}

	// If everything was filtered out, fall back to the original tokens.
	if len(filtered) == 0 {
		return segments
	}

	return filtered
}
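
// Illustrative example (with the stopword set above):
//
//	["What", "is", "a", "vector", "database"] -> ["vector", "database"]
//
// "What" is dropped because tokens are lowercased before the lookup.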

// isBlank reports whether a string is empty or consists only of whitespace.
func isBlank(str string) bool {
	for _, r := range str {
		if !unicode.IsSpace(r) {
			return false
		}
	}
	return true
}

// Close releases the Jieba tokenizer's resources. It is safe to call more than once.
func (p *PluginPreprocess) Close() {
	if p.jieba != nil {
		p.jieba.Free()
		p.jieba = nil
	}
}

// ShutdownHandler returns a function that releases the plugin's resources on shutdown.
func (p *PluginPreprocess) ShutdownHandler() func() {
	return func() {
		p.Close()
	}
}