// geoGo/internal/collect/utils.go

package collect

import (
	"fmt"
	"regexp"
	"strings"
)

// cleanHTMLTagRe matches any "<...>" sequence; compiled once at package scope.
var cleanHTMLTagRe = regexp.MustCompile(`<[^>]*>`)

// cleanHTMLSpaceRe matches a run of one or more whitespace characters.
var cleanHTMLSpaceRe = regexp.MustCompile(`\s+`)

// cleanHTMLEntities decodes the handful of entities this helper supports.
// A Replacer scans the input once and never rescans its own output, so an
// escaped entity such as "&amp;quot;" decodes to "&quot;" rather than being
// decoded a second time into a double quote.
var cleanHTMLEntities = strings.NewReplacer(
	"&nbsp;", " ",
	"&lt;", "<",
	"&gt;", ">",
	"&amp;", "&",
	"&quot;", "\"",
	"&#39;", "'",
)

// CleanHTMLTags strips every HTML tag from html and returns the remaining
// plain text. It is a shared package-level helper intended for all AI
// platform collectors.
//
// Processing order:
//  1. remove everything that looks like a tag ("<...>");
//  2. decode &nbsp; &lt; &gt; &amp; &quot; and &#39; (exactly once each);
//  3. trim leading/trailing whitespace;
//  4. collapse every remaining whitespace run (newlines included) into a
//     single space.
//
// An empty input yields an empty string.
func CleanHTMLTags(html string) string {
	if html == "" {
		return ""
	}

	// Tags are removed before entities are decoded, so text such as
	// "&lt;b&gt;" survives as the literal "<b>".
	cleaned := cleanHTMLTagRe.ReplaceAllString(html, "")
	cleaned = cleanHTMLEntities.Replace(cleaned)
	cleaned = strings.TrimSpace(cleaned)

	return cleanHTMLSpaceRe.ReplaceAllString(cleaned, " ")
}

// cleanDivTagRe matches opening, closing and self-closing <div> tags,
// case-insensitively. The tag name must be followed by whitespace, "/" or
// ">" so that other elements whose name merely starts with "div"
// (e.g. <divider>) are left untouched.
var cleanDivTagRe = regexp.MustCompile(`(?i)</?div(?:[\s/][^>]*)?>`)

// cleanDivSpaceRe matches a run of one or more whitespace characters.
var cleanDivSpaceRe = regexp.MustCompile(`\s+`)

// cleanDivEntities decodes the handful of entities this helper supports.
// A Replacer scans the input once and never rescans its own output, so an
// escaped entity such as "&amp;quot;" decodes to "&quot;" rather than being
// decoded a second time into a double quote.
var cleanDivEntities = strings.NewReplacer(
	"&nbsp;", " ",
	"&lt;", "<",
	"&gt;", ">",
	"&amp;", "&",
	"&quot;", "\"",
	"&#39;", "'",
)

// CleanDivTags removes only <div> and </div> tags from html, keeping the
// content inside them as well as every other HTML tag.
//
// After the tags are removed it decodes &nbsp; &lt; &gt; &amp; &quot; and
// &#39; (exactly once each), trims leading/trailing whitespace and collapses
// every whitespace run (newlines included) into a single space.
//
// An empty input yields an empty string.
func CleanDivTags(html string) string {
	if html == "" {
		return ""
	}

	cleaned := cleanDivTagRe.ReplaceAllString(html, "")
	cleaned = cleanDivEntities.Replace(cleaned)
	cleaned = strings.TrimSpace(cleaned)

	return cleanDivSpaceRe.ReplaceAllString(cleaned, " ")
}

// highlightColors is the palette (CSS colour values) assigned to keywords by
// their index in the keyword list; it wraps around once exhausted.
var highlightColors = []string{
	"#FF6B6B", // red
	"#4ECDC4", // cyan
	"#45B7D1", // blue
	"#FFA07A", // light orange
	"#98D8C8", // mint green
	"#F7DC6F", // yellow
	"#BB8FCE", // purple
	"#85C1E2", // sky blue
	"#F8B739", // orange
	"#52B788", // green
	"#E63946", // dark red
	"#457B9D", // dark blue
	"#2A9D8F", // teal
	"#E9C46A", // golden yellow
	"#F4A261", // tangerine
}

// highlightTagRe matches any "<...>" sequence, i.e. the markup that must not
// be touched while highlighting.
var highlightTagRe = regexp.MustCompile(`<[^>]*>`)

// HighlightKeywordsInHTML wraps every occurrence of each keyword in a
// coloured, bold <span>.
//
// Parameters:
//   - htmlContent: the HTML to process.
//   - pointKeys: keywords to highlight. Empty keywords are skipped, as are
//     keywords that cannot be compiled into a pattern (for example ones
//     containing invalid UTF-8).
//
// Returns the processed HTML and a flag reporting whether at least one
// keyword occurs in htmlContent. The flag is evaluated against the raw
// input, markup included.
//
// Matching is a case-insensitive substring match (no word boundaries); the
// matched text keeps its original casing. Keywords are applied in order and
// the colour is chosen by the keyword's index, so colours repeat when there
// are more keywords than palette entries. Only text between tags is
// rewritten: tag names, attributes and the spans inserted for earlier
// keywords are never modified.
func HighlightKeywordsInHTML(htmlContent string, pointKeys []string) (string, bool) {
	if htmlContent == "" || len(pointKeys) == 0 {
		return htmlContent, false
	}

	result := htmlContent
	isExposure := false

	for index, keyword := range pointKeys {
		if keyword == "" {
			continue
		}

		// QuoteMeta makes the keyword literal; the capture group lets the
		// replacement re-emit the text with its original casing.
		pattern := fmt.Sprintf(`(?i)(%s)`, regexp.QuoteMeta(keyword))
		re, err := regexp.Compile(pattern)
		if err != nil {
			// Best effort: an uncompilable keyword must not take down the
			// caller, so it is simply not highlighted.
			continue
		}

		// Checked against the untouched input so that markup injected for
		// earlier keywords can neither hide nor fake a hit.
		if !isExposure && re.MatchString(htmlContent) {
			isExposure = true
		}

		color := highlightColors[index%len(highlightColors)]
		replacement := fmt.Sprintf(`<span style="color:%s;font-weight:bold;">$1</span>`, color)
		result = highlightOutsideTags(result, re, replacement)
	}

	return result, isExposure
}

// highlightOutsideTags applies keywordRe/replacement to the text segments of
// content only, copying every "<...>" tag through unchanged.
func highlightOutsideTags(content string, keywordRe *regexp.Regexp, replacement string) string {
	var b strings.Builder
	b.Grow(len(content))

	last := 0
	for _, loc := range highlightTagRe.FindAllStringIndex(content, -1) {
		b.WriteString(keywordRe.ReplaceAllString(content[last:loc[0]], replacement))
		b.WriteString(content[loc[0]:loc[1]])
		last = loc[1]
	}
	b.WriteString(keywordRe.ReplaceAllString(content[last:], replacement))

	return b.String()
}

// HighlightKeywordsInText highlights keywords in text content by first
// turning it into HTML and then delegating to HighlightKeywordsInHTML.
//
// Parameters:
//   - textContent: the text to process; <div> tags in it are removed via
//     CleanDivTags before wrapping.
//   - pointKeys: keywords to highlight.
//
// Returns the highlighted HTML together with the flag produced by
// HighlightKeywordsInHTML. An empty textContent is returned unchanged with
// false.
func HighlightKeywordsInText(textContent string, pointKeys []string) (string, bool) {
	if len(textContent) == 0 {
		return textContent, false
	}

	// Each newline becomes a paragraph boundary.
	// NOTE(review): CleanDivTags in this file collapses every whitespace run,
	// newlines included, into a single space, so this replacement currently
	// has nothing to split on and the result is always a single <p>.
	// Confirm whether per-line paragraphs were intended.
	body := strings.ReplaceAll(CleanDivTags(textContent), "\n", "</p><p>")
	wrapped := fmt.Sprintf("<p>%s</p>", body)

	return HighlightKeywordsInHTML(wrapped, pointKeys)
}