36 lines
948 B
Go
36 lines
948 B
Go
package collect
|
||
|
||
import (
|
||
"regexp"
|
||
"strings"
|
||
)
|
||
|
||
// cleanHTMLTagRe matches anything that looks like an HTML tag: a '<', any run
// of characters other than '>', and the closing '>'.
var cleanHTMLTagRe = regexp.MustCompile(`<[^>]*>`)

// cleanHTMLSpaceRe matches one or more consecutive whitespace characters.
var cleanHTMLSpaceRe = regexp.MustCompile(`\s+`)

// cleanHTMLEntities decodes the common HTML entities in a single pass, so the
// output of one replacement is never re-scanned (e.g. "&amp;quot;" decodes to
// "&quot;", not to a double quote).
var cleanHTMLEntities = strings.NewReplacer(
	"&nbsp;", " ",
	"&lt;", "<",
	"&gt;", ">",
	"&amp;", "&",
	"&quot;", "\"",
	"&#39;", "'",
)

// CleanHTMLTags strips HTML tags from html and returns only the plain text.
//
// It is a package-wide helper shared by all AI platform integrations.
//
// Processing steps:
//  1. Remove every substring of the form <...>. Only the tags themselves are
//     removed; the text between an opening and closing tag is kept.
//  2. Decode the entities &nbsp;, &lt;, &gt;, &amp;, &quot; and &#39;.
//     &nbsp; becomes a regular space so that it is collapsed in step 4.
//  3. Trim leading and trailing whitespace.
//  4. Collapse every run of whitespace (spaces, tabs, newlines) into a single
//     space.
//
// An empty input yields an empty string.
func CleanHTMLTags(html string) string {
	if html == "" {
		return ""
	}

	// Strip tags before decoding entities so that encoded text such as
	// "&lt;b&gt;" survives as the literal "<b>" instead of being removed.
	cleaned := cleanHTMLTagRe.ReplaceAllString(html, "")

	// Decode the common HTML entities.
	cleaned = cleanHTMLEntities.Replace(cleaned)

	// Drop surrounding whitespace, then normalize inner whitespace runs.
	cleaned = strings.TrimSpace(cleaned)
	return cleanHTMLSpaceRe.ReplaceAllString(cleaned, " ")
}
|