package collect

import (
	"regexp"
	"strings"
)

var (
	// htmlTagPattern matches anything shaped like an HTML tag: a "<", any run
	// of characters other than ">", and the closing ">".
	htmlTagPattern = regexp.MustCompile(`<[^>]*>`)

	// whitespaceRunPattern matches one or more consecutive whitespace
	// characters (spaces, tabs, newlines, ...).
	whitespaceRunPattern = regexp.MustCompile(`\s+`)

	// htmlEntityReplacer decodes the common HTML entities in a single pass.
	// Decoding in one pass (instead of chained ReplaceAll calls) ensures that
	// text produced by one replacement is never decoded a second time, e.g.
	// "&amp;quot;" becomes the literal text "&quot;" rather than a quote.
	htmlEntityReplacer = strings.NewReplacer(
		"&nbsp;", " ",
		"&lt;", "<",
		"&gt;", ">",
		"&amp;", "&",
		"&quot;", "\"",
		"&#39;", "'",
	)
)

// CleanHTMLTags strips HTML tags from html and returns the remaining plain
// text. It is a package-level helper shared by all AI platform collectors.
//
// Processing steps, in order:
//  1. Remove every "<...>" tag.
//  2. Decode the entities &nbsp;, &lt;, &gt;, &amp;, &quot; and &#39;.
//  3. Trim leading and trailing whitespace.
//  4. Collapse each run of whitespace into a single space.
//
// An empty input yields an empty string.
func CleanHTMLTags(html string) string {
	if html == "" {
		return ""
	}

	// Tags are removed before entities are decoded so that escaped markup
	// such as "&lt;b&gt;" survives as the visible text "<b>".
	text := htmlTagPattern.ReplaceAllString(html, "")
	text = htmlEntityReplacer.Replace(text)
	text = strings.TrimSpace(text)

	return whitespaceRunPattern.ReplaceAllString(text, " ")
}