143 lines
4.2 KiB
Go
143 lines
4.2 KiB
Go
package collect
|
||
|
||
import (
|
||
"fmt"
|
||
"regexp"
|
||
"strings"
|
||
)
|
||
|
||
// Patterns and entity table used by CleanHTMLTags. They are built once at
// package initialisation instead of on every call.
var (
	// cleanHTMLTagRe matches any single HTML tag, opening or closing.
	cleanHTMLTagRe = regexp.MustCompile(`<[^>]*>`)

	// cleanHTMLSpaceRe matches a run of ASCII whitespace (space, tab, newline, ...).
	cleanHTMLSpaceRe = regexp.MustCompile(`\s+`)

	// cleanHTMLEntities decodes the common HTML entities in a single pass.
	// A single pass matters: a chain of ReplaceAll calls would decode
	// "&amp;quot;" twice (first to "&quot;", then to a quote character).
	cleanHTMLEntities = strings.NewReplacer(
		"&nbsp;", " ",
		"&lt;", "<",
		"&gt;", ">",
		"&amp;", "&",
		"&quot;", "\"",
		"&#39;", "'",
	)
)

// CleanHTMLTags strips every HTML tag from html and returns the remaining
// plain text. It is a shared package-level helper used by all AI platforms.
//
// Processing order:
//  1. remove all tags;
//  2. decode the common entities (&nbsp; &lt; &gt; &amp; &quot; &#39;);
//  3. trim leading and trailing whitespace;
//  4. collapse every run of whitespace into one space.
//
// An empty input yields an empty string.
func CleanHTMLTags(html string) string {
	if html == "" {
		return ""
	}

	// Tags are removed before entities are decoded, so an escaped "&lt;b&gt;"
	// in the text survives as the literal characters "<b>".
	cleaned := cleanHTMLTagRe.ReplaceAllString(html, "")
	cleaned = cleanHTMLEntities.Replace(cleaned)

	// Trim first, then collapse whitespace runs.
	cleaned = strings.TrimSpace(cleaned)
	return cleanHTMLSpaceRe.ReplaceAllString(cleaned, " ")
}
|
||
|
||
// Patterns and entity table used by CleanDivTags. They are built once at
// package initialisation instead of on every call.
var (
	// cleanDivTagRe matches <div>, <div ...>, <div/> and </div> in any letter
	// case. The character after "div" must be whitespace, "/" or ">" so that
	// tags which merely start with "div" (for example <divider>) are kept.
	cleanDivTagRe = regexp.MustCompile(`(?i)</?div(?:[\s/][^>]*)?>`)

	// cleanDivSpaceRe matches a run of ASCII whitespace (space, tab, newline, ...).
	cleanDivSpaceRe = regexp.MustCompile(`\s+`)

	// cleanDivEntities decodes the common HTML entities in a single pass, so
	// that input such as "&amp;quot;" is decoded only once (to "&quot;").
	cleanDivEntities = strings.NewReplacer(
		"&nbsp;", " ",
		"&lt;", "<",
		"&gt;", ">",
		"&amp;", "&",
		"&quot;", "\"",
		"&#39;", "'",
	)
)

// CleanDivTags removes only the <div> and </div> tags from html. The content
// inside the divs, all other tags and the plain text are kept.
//
// After the div tags are removed the common entities (&nbsp; &lt; &gt; &amp;
// &quot; &#39;) are decoded, surrounding whitespace is trimmed and every run
// of whitespace is collapsed into one space. An empty input yields an empty
// string.
//
// NOTE(review): the result still contains HTML, so decoding "&lt;" / "&gt;"
// here can turn escaped text into real markup. Confirm that callers only pass
// trusted content or render the result as text.
func CleanDivTags(html string) string {
	if html == "" {
		return ""
	}

	cleaned := cleanDivTagRe.ReplaceAllString(html, "")
	cleaned = cleanDivEntities.Replace(cleaned)

	// Trim first, then collapse whitespace runs.
	cleaned = strings.TrimSpace(cleaned)
	return cleanDivSpaceRe.ReplaceAllString(cleaned, " ")
}
|
||
|
||
// highlightColors is the palette used by HighlightKeywordsInHTML (CSS colour
// values). Keyword i gets highlightColors[i % len(highlightColors)].
var highlightColors = []string{
	"#FF6B6B", // red
	"#4ECDC4", // cyan
	"#45B7D1", // blue
	"#FFA07A", // light orange
	"#98D8C8", // mint green
	"#F7DC6F", // yellow
	"#BB8FCE", // purple
	"#85C1E2", // sky blue
	"#F8B739", // orange
	"#52B788", // green
	"#E63946", // dark red
	"#457B9D", // dark blue
	"#2A9D8F", // teal
	"#E9C46A", // golden yellow
	"#F4A261", // tangerine
}

// highlightTagRe matches a single HTML tag. It is used to separate markup
// from text so that keywords are never replaced inside a tag.
var highlightTagRe = regexp.MustCompile(`<[^>]*>`)

// HighlightKeywordsInHTML wraps every occurrence of the given keywords in a
// coloured, bold <span>.
//
// Parameters:
//   - htmlContent: the original HTML content.
//   - pointKeys: the keywords to highlight; empty entries are skipped but
//     still consume a position in the colour palette.
//
// Returns the processed HTML and a flag that is true when at least one
// keyword was found in the text.
//
// Matching is a case-insensitive substring match (not whole-word) and is
// applied only to the text between tags. Tag names, attributes and the span
// markup inserted for earlier keywords are left untouched, so keywords such
// as "span", "color" or "class" cannot corrupt the markup or report a false
// hit. Keywords are processed in order, so a later keyword that occurs inside
// already highlighted text produces a nested span.
func HighlightKeywordsInHTML(htmlContent string, pointKeys []string) (string, bool) {
	var isExposure bool
	if htmlContent == "" || len(pointKeys) == 0 {
		return htmlContent, isExposure
	}

	result := htmlContent
	for index, keyword := range pointKeys {
		if keyword == "" {
			continue
		}

		// Cycle through the palette when there are more keywords than colours.
		color := highlightColors[index%len(highlightColors)]
		openTag := fmt.Sprintf(`<span style="color:%s;font-weight:bold;">`, color)

		// QuoteMeta makes the keyword a literal pattern, so MustCompile
		// cannot panic here.
		re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(keyword))

		highlighted, matched := highlightOutsideTags(result, re, openTag)
		if matched {
			isExposure = true
			result = highlighted
		}
	}

	return result, isExposure
}

// highlightOutsideTags wraps every match of re found in the text segments of
// content (the parts between tags) in openTag ... </span>. Tags themselves
// are copied through unchanged. It reports whether anything matched.
func highlightOutsideTags(content string, re *regexp.Regexp, openTag string) (string, bool) {
	var b strings.Builder
	b.Grow(len(content))
	matched := false

	writeText := func(text string) {
		if text == "" {
			return
		}
		b.WriteString(re.ReplaceAllStringFunc(text, func(m string) string {
			matched = true
			return openTag + m + "</span>"
		}))
	}

	pos := 0
	for _, loc := range highlightTagRe.FindAllStringIndex(content, -1) {
		writeText(content[pos:loc[0]])
		b.WriteString(content[loc[0]:loc[1]]) // the tag itself, untouched
		pos = loc[1]
	}
	writeText(content[pos:])

	return b.String(), matched
}
|
||
|
||
// HighlightKeywordsInText 在纯文本中高亮显示指定的关键词(先转换为HTML)
|
||
// textContent: 纯文本内容
|
||
// pointKeys: 需要高亮的关键词列表
|
||
// 返回带有高亮标记的HTML内容
|
||
func HighlightKeywordsInText(textContent string, pointKeys []string) (string, bool) {
|
||
if textContent == "" || len(pointKeys) == 0 {
|
||
return textContent, false
|
||
}
|
||
|
||
// 将纯文本转换为HTML段落格式
|
||
htmlContent := fmt.Sprintf("<p>%s</p>", strings.ReplaceAll(textContent, "\n", "</p><p>"))
|
||
|
||
// 使用HTML高亮方法
|
||
return HighlightKeywordsInHTML(htmlContent, pointKeys)
|
||
}
|