geoGo/internal/collect/utils.go

145 lines
4.2 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package collect
import (
"fmt"
"regexp"
"strings"
)
// cleanHTMLTagPattern matches a single HTML tag (opening, closing or
// self-closing). It is compiled once instead of on every call.
var cleanHTMLTagPattern = regexp.MustCompile(`<[^>]*>`)

// cleanHTMLWhitespacePattern matches a run of whitespace characters.
var cleanHTMLWhitespacePattern = regexp.MustCompile(`\s+`)

// cleanHTMLEntityDecoder decodes the handful of HTML entities this package
// cares about. A Replacer scans the input once, so text produced by one
// replacement is never re-examined by another: "&amp;quot;" decodes to the
// literal text "&quot;" rather than being decoded a second time into a quote.
var cleanHTMLEntityDecoder = strings.NewReplacer(
	"&nbsp;", " ",
	"&lt;", "<",
	"&gt;", ">",
	"&amp;", "&",
	"&quot;", "\"",
	"&#39;", "'",
)

// CleanHTMLTags strips every HTML tag from html and returns the remaining
// plain text. It is the package-wide helper shared by all AI platform
// collectors.
//
// After the tags are removed, the entities &nbsp;, &lt;, &gt;, &amp;, &quot;
// and &#39; are decoded exactly once, leading and trailing whitespace is
// trimmed, and every remaining run of whitespace (including newlines) is
// collapsed into a single space. An empty input yields an empty string.
func CleanHTMLTags(html string) string {
	if html == "" {
		return ""
	}

	// Tags are removed before entities are decoded so that a decoded "<" or
	// ">" is never mistaken for markup.
	cleaned := cleanHTMLTagPattern.ReplaceAllString(html, "")
	cleaned = cleanHTMLEntityDecoder.Replace(cleaned)
	cleaned = strings.TrimSpace(cleaned)
	return cleanHTMLWhitespacePattern.ReplaceAllString(cleaned, " ")
}
// cleanDivTagPattern matches an opening, closing or self-closing <div> tag,
// case-insensitively. The tag name must be followed by whitespace, "/" or ">"
// so that other elements whose names merely start with "div" (for example
// <divider>) are left alone.
var cleanDivTagPattern = regexp.MustCompile(`(?i)</?div(?:[\s/][^>]*)?>`)

// cleanDivWhitespacePattern matches a run of whitespace characters.
var cleanDivWhitespacePattern = regexp.MustCompile(`\s+`)

// cleanDivEntityDecoder decodes the supported HTML entities in one pass, so
// the output of one replacement is never decoded again ("&amp;quot;" becomes
// the text "&quot;", not a quote character).
var cleanDivEntityDecoder = strings.NewReplacer(
	"&nbsp;", " ",
	"&lt;", "<",
	"&gt;", ">",
	"&amp;", "&",
	"&quot;", "\"",
	"&#39;", "'",
)

// CleanDivTags removes only <div> and </div> tags from html, keeping every
// other tag and all text content, including the content that was inside the
// removed divs.
//
// After the div tags are removed, the entities &nbsp;, &lt;, &gt;, &amp;,
// &quot; and &#39; are decoded exactly once, leading and trailing whitespace
// is trimmed, and every remaining run of whitespace (including newlines) is
// collapsed into a single space. An empty input yields an empty string.
//
// NOTE(review): because the remaining tags are kept, decoding &lt; and &gt;
// can turn escaped text into live markup. That is the existing contract and
// is preserved here; confirm it is acceptable for untrusted input.
func CleanDivTags(html string) string {
	if html == "" {
		return ""
	}

	cleaned := cleanDivTagPattern.ReplaceAllString(html, "")
	cleaned = cleanDivEntityDecoder.Replace(cleaned)
	cleaned = strings.TrimSpace(cleaned)
	return cleanDivWhitespacePattern.ReplaceAllString(cleaned, " ")
}
// keywordHighlightColors is the palette of CSS colors used for highlighting.
// The keyword at index i of pointKeys uses entry i modulo the palette size.
var keywordHighlightColors = []string{
	"#FF6B6B", // red
	"#4ECDC4", // cyan
	"#45B7D1", // blue
	"#FFA07A", // light orange
	"#98D8C8", // mint green
	"#F7DC6F", // yellow
	"#BB8FCE", // purple
	"#85C1E2", // sky blue
	"#F8B739", // orange
	"#52B788", // green
	"#E63946", // dark red
	"#457B9D", // dark blue
	"#2A9D8F", // teal
	"#E9C46A", // golden yellow
	"#F4A261", // tangerine
}

// keywordHighlightTagPattern matches a single HTML tag. It is used to keep
// markup out of the text that keywords are matched against.
var keywordHighlightTagPattern = regexp.MustCompile(`<[^>]*>`)

// HighlightKeywordsInHTML wraps every occurrence of each keyword in a colored,
// bold <span>.
//
// htmlContent is the original HTML and pointKeys is the list of keywords to
// highlight. Matching is case-insensitive and matches substrings (it is not
// restricted to whole words). Empty keywords are skipped but still consume
// their palette slot, so a keyword's color depends only on its index.
//
// Only text outside of tags is searched. Tag names, attributes and the <span>
// markup inserted for earlier keywords are never modified, so a keyword such
// as "span" or "color" cannot corrupt the document or be reported as found
// merely because it appears in markup.
//
// It returns the processed HTML and a flag that is true when at least one
// keyword was found in the text. When htmlContent is empty or pointKeys is
// empty, the input is returned unchanged with false.
func HighlightKeywordsInHTML(htmlContent string, pointKeys []string) (string, bool) {
	if htmlContent == "" || len(pointKeys) == 0 {
		return htmlContent, false
	}

	result := htmlContent
	isExposure := false
	for index, keyword := range pointKeys {
		if keyword == "" {
			continue
		}

		// QuoteMeta makes the keyword a literal pattern. Compile can still
		// reject it (for example when the keyword is not valid UTF-8); such a
		// keyword cannot match anything, so skip it instead of panicking.
		re, err := regexp.Compile("(?i)" + regexp.QuoteMeta(keyword))
		if err != nil {
			continue
		}

		color := keywordHighlightColors[index%len(keywordHighlightColors)]
		openTag := fmt.Sprintf(`<span style="color:%s;font-weight:bold;">`, color)

		highlighted, found := highlightKeywordOutsideTags(result, re, openTag)
		if found {
			result = highlighted
			isExposure = true
		}
	}
	return result, isExposure
}

// highlightKeywordOutsideTags wraps every match of re that lies outside an
// HTML tag in openTag ... </span>. It returns the rewritten content and
// whether at least one match was wrapped.
func highlightKeywordOutsideTags(content string, re *regexp.Regexp, openTag string) (string, bool) {
	const closeTag = "</span>"

	found := false
	wrap := func(match string) string {
		found = true
		return openTag + match + closeTag
	}

	out := make([]byte, 0, len(content))
	pos := 0
	for _, loc := range keywordHighlightTagPattern.FindAllStringIndex(content, -1) {
		// Text between the previous tag and this one is searched; the tag
		// itself is copied through untouched.
		out = append(out, re.ReplaceAllStringFunc(content[pos:loc[0]], wrap)...)
		out = append(out, content[loc[0]:loc[1]]...)
		pos = loc[1]
	}
	out = append(out, re.ReplaceAllStringFunc(content[pos:], wrap)...)
	return string(out), found
}
// HighlightKeywordsInText converts text content into HTML paragraphs and then
// highlights the given keywords in it.
//
// textContent is the text to process and pointKeys is the list of keywords to
// highlight. Each non-blank line of textContent is cleaned with CleanDivTags
// and becomes its own <p>...</p> paragraph. The result is passed to
// HighlightKeywordsInHTML, whose return values (the highlighted HTML and the
// "keyword found" flag) are returned as-is. An empty textContent is returned
// unchanged with false.
//
// The text is split into lines before cleaning because CleanDivTags collapses
// all whitespace, newlines included, into single spaces; cleaning the whole
// text first would erase the line breaks that delimit paragraphs. A known
// limitation of cleaning per line is that a <div ...> tag whose attributes
// are broken across several lines is not removed.
func HighlightKeywordsInText(textContent string, pointKeys []string) (string, bool) {
	if textContent == "" {
		return textContent, false
	}

	lines := strings.Split(textContent, "\n")
	paragraphs := make([]string, 0, len(lines))
	for _, line := range lines {
		// CleanDivTags also trims the line, which drops any trailing "\r"
		// left over from CRLF line endings.
		if cleaned := CleanDivTags(line); cleaned != "" {
			paragraphs = append(paragraphs, cleaned)
		}
	}

	htmlContent := fmt.Sprintf("<p>%s</p>", strings.Join(paragraphs, "</p><p>"))
	return HighlightKeywordsInHTML(htmlContent, pointKeys)
}