geoGo/internal/collect/utils.go

36 lines
948 B
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package collect
import (
"regexp"
"strings"
)
// Patterns and replacer used by CleanHTMLTags. They are built once at package
// initialization instead of on every call; *regexp.Regexp and *strings.Replacer
// are both safe for concurrent use.
var (
	// htmlTagPattern matches anything that looks like a tag: "<", any run of
	// characters other than ">", then ">".
	htmlTagPattern = regexp.MustCompile(`<[^>]*>`)

	// whitespaceRunPattern matches one or more consecutive whitespace characters.
	whitespaceRunPattern = regexp.MustCompile(`\s+`)

	// htmlEntityReplacer decodes the supported entities in a single pass, so the
	// output of one replacement is never rescanned. This keeps escaped text such
	// as "&amp;quot;" as the literal "&quot;" instead of decoding it twice.
	htmlEntityReplacer = strings.NewReplacer(
		"&nbsp;", " ",
		"&lt;", "<",
		"&gt;", ">",
		"&amp;", "&",
		"&quot;", "\"",
		"&#39;", "'",
	)
)

// CleanHTMLTags strips HTML tags from html and returns the remaining plain text.
//
// Processing order:
//  1. Every substring matching <...> is removed (replaced with nothing).
//  2. The entities &nbsp;, &lt;, &gt;, &amp;, &quot; and &#39; are decoded once.
//  3. Leading and trailing whitespace is trimmed.
//  4. Each remaining run of whitespace is collapsed to a single space.
//
// An empty input yields an empty string. Entities outside the list above are
// left untouched.
func CleanHTMLTags(html string) string {
	if html == "" {
		return ""
	}

	// Strip tags before decoding entities so that text written as "&lt;b&gt;"
	// survives as the literal "<b>" rather than being removed as a tag.
	cleaned := htmlTagPattern.ReplaceAllString(html, "")

	// Decode the supported entities in one pass (no double decoding).
	cleaned = htmlEntityReplacer.Replace(cleaned)

	// Trim the ends, then collapse interior whitespace runs to one space.
	cleaned = strings.TrimSpace(cleaned)
	return whitespaceRunPattern.ReplaceAllString(cleaned, " ")
}