package collect

import (
	"regexp"
	"strings"
)

// Precompiled helpers for CleanHTMLTags. They are built once at package
// initialisation instead of on every call; *regexp.Regexp and
// *strings.Replacer are both safe for concurrent use.
var (
	// cleanHTMLTagRe matches anything that looks like an HTML tag: "<", any
	// run of characters other than ">", then ">".
	cleanHTMLTagRe = regexp.MustCompile(`<[^>]*>`)

	// cleanHTMLSpaceRe matches a run of one or more whitespace characters.
	cleanHTMLSpaceRe = regexp.MustCompile(`\s+`)

	// cleanHTMLEntityReplacer decodes the common HTML entities in a single
	// left-to-right pass. Replacement output is never rescanned, so an
	// escaped entity such as "&amp;quot;" decodes to "&quot;" rather than
	// being decoded a second time into a double quote.
	cleanHTMLEntityReplacer = strings.NewReplacer(
		"&nbsp;", " ",
		"&lt;", "<",
		"&gt;", ">",
		"&amp;", "&",
		"&quot;", "\"",
		"&#39;", "'",
	)
)

// CleanHTMLTags strips HTML tags from html and returns only the plain text.
// It is a package-level shared helper intended for use by all AI platforms.
//
// Processing steps:
//  1. Remove every substring matching <...>.
//  2. Decode the entities &nbsp;, &lt;, &gt;, &amp;, &quot; and &#39;.
//  3. Trim leading and trailing whitespace.
//  4. Collapse each run of whitespace (spaces, tabs, newlines) into a single
//     space.
//
// An empty input yields an empty string. Text on either side of a removed tag
// is joined without inserting a separator.
func CleanHTMLTags(html string) string {
	if html == "" {
		return ""
	}

	// Strip tags first so that characters produced by entity decoding
	// (e.g. "<" from "&lt;") are never mistaken for markup.
	cleaned := cleanHTMLTagRe.ReplaceAllString(html, "")

	// Decode the common entities in one pass to avoid double decoding.
	cleaned = cleanHTMLEntityReplacer.Replace(cleaned)

	// Trim the ends, then squeeze interior whitespace runs to one space.
	cleaned = strings.TrimSpace(cleaned)
	return cleanHTMLSpaceRe.ReplaceAllString(cleaned, " ")
}