geoGo/internal/collect/deepseek.go

470 lines
12 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package collect
import (
"context"
"encoding/json"
"fmt"
"geo/internal/config"
"os"
"strings"
"time"
"github.com/go-rod/rod"
"github.com/go-rod/rod/lib/proto"
"github.com/gofiber/fiber/v2/log"
)
// DeepseekCollector is the collector for the DeepSeek chat site.
// It embeds *BaseCollector, so the base collector's fields and methods are
// promoted onto this type.
type DeepseekCollector struct {
*BaseCollector
}
// NewDeepseekCollector builds a DeepSeek collector on top of a freshly
// created base collector and points both its login URL and its chat URL at
// the DeepSeek chat site.
func NewDeepseekCollector(ctx context.Context, params *CollectParams, cfg *config.Config, logger log.AllLogger) CollectorInterface {
	// DeepSeek serves login and chat from the same address.
	const deepseekURL = "https://chat.deepseek.com/"

	base := NewBaseCollector(ctx, params, cfg, logger)
	base.LoginURL = deepseekURL
	base.ChatURL = deepseekURL

	return &DeepseekCollector{BaseCollector: base}
}
// saveLocalStorage dumps every key/value pair of the page's localStorage
// into a JSON file.
//
// The target path is derived from c.CookiesFile by dropping its last five
// bytes (the length of a ".json" extension) and appending
// "_localstorage.json". An error is returned when the cookies path is too
// short to derive a name from, when the in-page script fails, or when the
// file cannot be written.
func (c *DeepseekCollector) saveLocalStorage() error {
	// Guard the slice expression below: a cookies path shorter than the
	// extension would otherwise panic with an out-of-range index.
	const extLen = len(".json")
	if len(c.CookiesFile) < extLen {
		return fmt.Errorf("Cookies文件路径无效: %q", c.CookiesFile)
	}
	localStorageFile := c.CookiesFile[:len(c.CookiesFile)-extLen] + "_localstorage.json"

	// Serialize inside the page so the whole store comes back as one JSON string.
	result, err := c.Page.Eval(`() => {
const data = {};
for (let i = 0; i < localStorage.length; i++) {
const key = localStorage.key(i);
data[key] = localStorage.getItem(key);
}
return JSON.stringify(data);
}`)
	if err != nil {
		return fmt.Errorf("获取LocalStorage失败: %w", err)
	}

	// NOTE(review): the dump may hold login tokens; consider 0600 instead of
	// 0644 if no other user needs to read it — confirm with deployment.
	return os.WriteFile(localStorageFile, []byte(result.Value.Str()), 0644)
}
// loadLocalStorage restores localStorage entries from the JSON file written
// by saveLocalStorage.
//
// The file path is derived from c.CookiesFile by dropping its last five
// bytes (the length of a ".json" extension) and appending
// "_localstorage.json". An error is returned when the cookies path is too
// short, or when the file cannot be read or parsed. A failure to set an
// individual key is only logged as a warning and does not abort the restore.
func (c *DeepseekCollector) loadLocalStorage() error {
	// Guard the slice expression below: a cookies path shorter than the
	// extension would otherwise panic with an out-of-range index.
	const extLen = len(".json")
	if len(c.CookiesFile) < extLen {
		return fmt.Errorf("Cookies文件路径无效: %q", c.CookiesFile)
	}
	localStorageFile := c.CookiesFile[:len(c.CookiesFile)-extLen] + "_localstorage.json"

	data, err := os.ReadFile(localStorageFile)
	if err != nil {
		// %w keeps os.ErrNotExist reachable through errors.Is.
		return fmt.Errorf("读取LocalStorage文件失败: %w", err)
	}

	var storageData map[string]string
	if err := json.Unmarshal(data, &storageData); err != nil {
		return fmt.Errorf("解析LocalStorage文件失败: %w", err)
	}

	// Write each entry back through the page's own localStorage API.
	for key, value := range storageData {
		if _, err := c.Page.Eval(`(key, val) => localStorage.setItem(key, val)`, key, value); err != nil {
			c.Logger.Warnf("设置LocalStorage键 %s 失败: %v", key, err)
		}
	}
	return nil
}
// saveSessionStorage dumps every key/value pair of the page's sessionStorage
// into a JSON file.
//
// The target path is derived from c.CookiesFile by dropping its last five
// bytes (the length of a ".json" extension) and appending
// "_sessionstorage.json". An error is returned when the cookies path is too
// short to derive a name from, when the in-page script fails, or when the
// file cannot be written.
func (c *DeepseekCollector) saveSessionStorage() error {
	// Guard the slice expression below: a cookies path shorter than the
	// extension would otherwise panic with an out-of-range index.
	const extLen = len(".json")
	if len(c.CookiesFile) < extLen {
		return fmt.Errorf("Cookies文件路径无效: %q", c.CookiesFile)
	}
	sessionStorageFile := c.CookiesFile[:len(c.CookiesFile)-extLen] + "_sessionstorage.json"

	// Serialize inside the page so the whole store comes back as one JSON string.
	result, err := c.Page.Eval(`() => {
const data = {};
for (let i = 0; i < sessionStorage.length; i++) {
const key = sessionStorage.key(i);
data[key] = sessionStorage.getItem(key);
}
return JSON.stringify(data);
}`)
	if err != nil {
		return fmt.Errorf("获取SessionStorage失败: %w", err)
	}

	// NOTE(review): the dump may hold session data; consider 0600 instead of
	// 0644 if no other user needs to read it — confirm with deployment.
	return os.WriteFile(sessionStorageFile, []byte(result.Value.Str()), 0644)
}
// loadSessionStorage restores sessionStorage entries from the JSON file
// written by saveSessionStorage.
//
// The file path is derived from c.CookiesFile by dropping its last five
// bytes (the length of a ".json" extension) and appending
// "_sessionstorage.json". An error is returned when the cookies path is too
// short, or when the file cannot be read or parsed. A failure to set an
// individual key is only logged as a warning and does not abort the restore.
func (c *DeepseekCollector) loadSessionStorage() error {
	// Guard the slice expression below: a cookies path shorter than the
	// extension would otherwise panic with an out-of-range index.
	const extLen = len(".json")
	if len(c.CookiesFile) < extLen {
		return fmt.Errorf("Cookies文件路径无效: %q", c.CookiesFile)
	}
	sessionStorageFile := c.CookiesFile[:len(c.CookiesFile)-extLen] + "_sessionstorage.json"

	data, err := os.ReadFile(sessionStorageFile)
	if err != nil {
		// %w keeps os.ErrNotExist reachable through errors.Is.
		return fmt.Errorf("读取SessionStorage文件失败: %w", err)
	}

	var storageData map[string]string
	if err := json.Unmarshal(data, &storageData); err != nil {
		return fmt.Errorf("解析SessionStorage文件失败: %w", err)
	}

	// Write each entry back through the page's own sessionStorage API.
	for key, value := range storageData {
		if _, err := c.Page.Eval(`(key, val) => sessionStorage.setItem(key, val)`, key, value); err != nil {
			c.Logger.Warnf("设置SessionStorage键 %s 失败: %v", key, err)
		}
	}
	return nil
}
// SaveBrowserStorage persists all browser state: cookies, localStorage and
// sessionStorage, in that order.
//
// Each step is best-effort: a failure is logged as a warning and the
// remaining steps still run. The returned error is always nil.
func (c *DeepseekCollector) SaveBrowserStorage() error {
	// warnOnError logs a failed step without interrupting the sequence.
	warnOnError := func(format string, err error) {
		if err != nil {
			c.Logger.Warnf(format, err)
		}
	}

	warnOnError("保存Cookies失败: %v", c.SaveCookies())
	warnOnError("保存LocalStorage失败: %v", c.saveLocalStorage())
	warnOnError("保存SessionStorage失败: %v", c.saveSessionStorage())

	return nil
}
// LoadBrowserStorage restores all browser state: cookies first, then
// localStorage and sessionStorage.
//
// Cookies are mandatory: if loading them fails, the failure is logged and
// returned and nothing else is attempted. The two storage restores are
// best-effort; their failures are logged as warnings only.
func (c *DeepseekCollector) LoadBrowserStorage() error {
	err := c.LoadCookies()
	if err != nil {
		c.Logger.Warnf("加载Cookies失败: %v", err)
		return err
	}

	// Optional restores, executed in declaration order.
	optional := []struct {
		warnFormat string
		load       func() error
	}{
		{"加载LocalStorage失败: %v", c.loadLocalStorage},
		{"加载SessionStorage失败: %v", c.loadSessionStorage},
	}
	for _, step := range optional {
		if stepErr := step.load(); stepErr != nil {
			c.Logger.Warnf(step.warnFormat, stepErr)
		}
	}
	return nil
}
// CheckLoginStatus reports whether the current page looks like a logged-in
// DeepSeek session.
//
// It returns false unless the current URL contains "chat.deepseek.com".
// On that site it returns true as soon as one of two markers is present:
// an avatar/profile-like element, or a chat input (textarea or
// contenteditable element).
func (c *DeepseekCollector) CheckLoginStatus() bool {
	if !strings.Contains(c.GetCurrentURL(), "chat.deepseek.com") {
		return false
	}

	// Probed in order; the first selector that matches decides the result.
	loggedInMarkers := [...]string{
		// User avatar or other login indicator.
		".user-avatar, [class*='avatar'], [class*='profile']",
		// Chat input box (expected only after login).
		"textarea, [contenteditable='true']",
	}
	for _, selector := range loggedInMarkers {
		if el, err := c.SafeElement(selector); err == nil && el != nil {
			return true
		}
	}
	return false
}
// WaitLogin opens the login page and waits for the user to log in.
//
// It returns (true, "already_logged_in") when the session is already
// authenticated after the initial page load, (true, "login_success") once a
// login is detected while polling, and (false, <reason>) when the browser
// cannot be started, the page cannot be opened, or no login is detected
// within 300 polling rounds of roughly one second each. On success the
// browser storage is saved before returning. The browser is closed when the
// function returns.
func (c *DeepseekCollector) WaitLogin() (bool, string) {
	const (
		maxWaitSeconds  = 300 // polling rounds, one second apart
		reminderSeconds = 30  // emit a progress log every this many rounds
	)

	if err := c.SetupDriver(); err != nil {
		return false, fmt.Sprintf("浏览器启动失败: %v", err)
	}
	defer c.Close()

	// Use Navigate rather than MustNavigate: a failed navigation must be
	// reported through the return values, not as a panic.
	if err := c.Page.Navigate(c.LoginURL); err != nil {
		return false, fmt.Sprintf("页面导航失败: %v", err)
	}
	c.Sleep(3)

	if c.CheckLoginStatus() {
		// SaveBrowserStorage logs its own failures and always returns nil.
		_ = c.SaveBrowserStorage()
		return true, "already_logged_in"
	}

	c.LogInfo("等待用户登录...")
	for i := 0; i < maxWaitSeconds; i++ {
		if c.CheckLoginStatus() {
			// Pause briefly before snapshotting storage — presumably to let
			// the page finish writing its login state; TODO confirm.
			c.Sleep(2)
			_ = c.SaveBrowserStorage()
			return true, "login_success"
		}
		time.Sleep(time.Second)

		if waited := i + 1; waited%reminderSeconds == 0 {
			c.LogInfo(fmt.Sprintf("仍在等待登录... 已等待 %d 秒", waited))
		}
	}
	return false, "登录超时"
}
// InitPage opens the chat page and restores any saved browser state
// (cookies, localStorage, sessionStorage). It overrides the base
// collector's method in order to support localStorage.
//
// The page is navigated first and the storage loaded afterwards; when the
// load succeeds the page is reloaded so the restored state takes effect.
// A failed storage load is not an error: it is logged and the page is used
// as-is. An error is returned only when the navigation or the reload fails.
func (c *DeepseekCollector) InitPage() error {
	// Use Navigate/Reload rather than their Must* variants so failures are
	// returned to the caller instead of panicking.
	if err := c.Page.Navigate(c.ChatURL); err != nil {
		return fmt.Errorf("页面导航失败: %w", err)
	}
	c.WaitForPageReady(5)

	if err := c.LoadBrowserStorage(); err != nil {
		// LoadBrowserStorage has already logged the cause.
		c.LogInfo("未找到保存的浏览器存储")
		return nil
	}
	c.LogInfo("已加载浏览器存储")

	// Reload so the page picks up the restored storage.
	if err := c.Page.Reload(); err != nil {
		return fmt.Errorf("页面重新加载失败: %w", err)
	}
	c.WaitForPageReady(5)
	return nil
}
// AskQuestion submits question to DeepSeek and returns the collected answer.
//
// The flow is: start the browser, initialise the page, type the question,
// click the send button, and wait for the answer to finish. The answer is
// then passed through HighlightKeywordsInText together with c.KeyWords; the
// text it returns becomes CollectResult.Answer and its second result becomes
// CollectResult.IsExposure. ShareLink is always empty.
//
// Any failing step aborts the flow and is returned wrapped with a message
// naming the step; the underlying error stays reachable via errors.Is and
// errors.As. The browser is closed when the function returns.
func (c *DeepseekCollector) AskQuestion(question string) (*CollectResult, error) {
	c.LogInfo("开始提问流程...")

	if err := c.SetupDriver(); err != nil {
		return nil, fmt.Errorf("浏览器启动失败: %w", err)
	}
	defer c.Close()

	if err := c.InitPage(); err != nil {
		return nil, fmt.Errorf("页面初始化失败: %w", err)
	}
	if err := c.inputQuestion(question); err != nil {
		return nil, fmt.Errorf("输入问题失败: %w", err)
	}
	if err := c.clickSendButton(); err != nil {
		return nil, fmt.Errorf("点击发送按钮失败: %w", err)
	}

	answer, err := c.waitForAnswer()
	if err != nil {
		return nil, fmt.Errorf("获取答案失败: %w", err)
	}

	answerStr, isExposure := HighlightKeywordsInText(answer, c.KeyWords)
	return &CollectResult{
		Answer:     answerStr,
		ShareLink:  "",
		IsExposure: isExposure,
	}, nil
}
// inputQuestion locates the DeepSeek message box, focuses it, clears it and
// enters question.
//
// The text is set with c.SetInputValue first; if that fails, the element's
// own Input method is used as a fallback. An error is returned when no input
// box becomes visible, when the focusing click fails, or when both ways of
// entering the text fail. A failure to clear the box is only logged.
func (c *DeepseekCollector) inputQuestion(question string) error {
	c.LogInfo("输入问题...")

	// Candidate selectors for the DeepSeek input box, tried in order.
	inputSelectors := []string{
		"textarea[placeholder*='Message DeepSeek']",
	}

	var inputBox *rod.Element
	for _, selector := range inputSelectors {
		el, err := c.WaitForElementVisible(selector, 10)
		if err != nil || el == nil {
			// Never keep an element that came back together with an error.
			continue
		}
		c.LogInfo(fmt.Sprintf("找到输入框: %s", selector))
		inputBox = el
		break
	}
	if inputBox == nil {
		return fmt.Errorf("未找到输入框")
	}

	// Click to give the box focus.
	if err := inputBox.Click(proto.InputMouseButtonLeft, 1); err != nil {
		return fmt.Errorf("点击输入框失败: %w", err)
	}
	c.SleepMs(500)

	// Clearing is best-effort: the box is expected to be empty on a fresh
	// page, so a failure here is reported but does not abort.
	if err := c.ClearInput(inputBox); err != nil {
		c.Logger.Warnf("清空输入框失败: %v", err)
	}
	c.SleepMs(300)

	// Enter the question; fall back to element-level input when setting the
	// value fails, and report the failure if the fallback fails as well.
	if err := c.SetInputValue(inputBox, question); err != nil {
		if inputErr := inputBox.Input(question); inputErr != nil {
			return fmt.Errorf("输入问题内容失败: %w", inputErr)
		}
	}

	c.LogInfo(fmt.Sprintf("问题已输入: %s", question))
	c.SleepMs(1000)
	return nil
}
// clickSendButton clicks the send button by running a script in the page.
//
// The script takes the first <input> element in the document, looks at the
// direct child <div> elements of its parent, treats the second of those as
// the send-button container, and clicks that container's first direct child
// <div>. The script reports { success, error, divCount }; divCount is the
// number of direct child <div> elements found under the input's parent.
//
// An error is returned when the script cannot be evaluated or when it
// reports success == false (no input, no parent, fewer than two child divs,
// or an empty send-button container).
func (c *DeepseekCollector) clickSendButton() error {
	// Locate the second direct child div of the input's parent via
	// JavaScript and click the first div inside it.
	clickJS := `
() => {
// 找到页面上第一个input元素
const input = document.querySelector('input');
if (!input) {
return { success: false, error: '未找到input元素', divCount: 0 };
}
// 获取input的父级元素
const parent = input.parentElement;
if (!parent) {
return { success: false, error: '未找到input的父级元素', divCount: 0 };
}
// 找到父级下的直接子级div元素只找一级
const divs = parent.querySelectorAll(':scope > div');
const divCount = divs.length;
if (divs.length < 2) {
return { success: false, error: '父级下没有足够的直接子级div元素', divCount: divCount };
}
// 获取第2个div作为发送按钮
const sendBtn = divs[1];
const s = sendBtn.querySelectorAll(':scope > div');
console.log(s.length);
if (s.length === 0) {
return { success: false, error: '发送按钮容器内没有可点击的子级div元素', divCount: divCount };
}
console.log('开始点击');
// 点击发送按钮
s[0].click();
console.log('开始完成');
return { success: true, divCount: divCount };
}
`
	result, err := c.Page.Eval(clickJS)
	if err != nil {
		return fmt.Errorf("执行点击JavaScript失败: %w", err)
	}

	// Inspect the structured result reported by the script.
	success := result.Value.Get("success").Bool()
	divCount := result.Value.Get("divCount").Int()
	c.LogInfof("父级下共有 %d 个直接子级div元素", divCount)
	if !success {
		errorMsg := result.Value.Get("error").String()
		return fmt.Errorf("点击发送按钮失败: %s", errorMsg)
	}

	// Pause after the click — presumably to let the request start; TODO confirm.
	c.SleepMs(2000)
	return nil
}
// waitForAnswer polls the page about once per second until the AI answer
// stops changing, and returns the answer's HTML.
//
// On every poll the HTML of the latest answer element is fetched (see
// latestAnswerHTML). The answer is considered complete once that HTML has
// been identical to the previous poll for requiredStableCount consecutive
// polls; any change resets the counter. If this does not happen within the
// timeout, an error is returned.
func (c *DeepseekCollector) waitForAnswer() (string, error) {
	c.LogInfo("等待AI回答...")

	const (
		timeout             = 180 // maximum wait, in seconds
		requiredStableCount = 3   // consecutive unchanged polls that mean "done"
	)

	// Candidate answer containers, tried in order; DeepSeek uses the
	// ds-markdown class, the remaining entries are generic fallbacks.
	answerSelectors := []string{
		"div[class='ds-markdown']",
		".message-content",
		".response-text",
		"[class*='assistant'] [class*='content']",
		"[class*='ai'] [class*='message']",
		".chat-message.ai",
		".answer-content",
		"div[data-message-id]", // generic message-id selector
	}

	var (
		lastAnswer  string
		stableCount int
		isAnswering bool // set once the first answer content has been seen
	)
	startTime := time.Now()

	for time.Since(startTime).Seconds() < float64(timeout) {
		if answerHTML := c.latestAnswerHTML(answerSelectors); answerHTML != "" {
			if !isAnswering {
				c.LogInfo("检测到AI开始回答...")
				isAnswering = true
			}

			if answerHTML == lastAnswer {
				// Unchanged since the previous poll: streaming may be over.
				stableCount++
				c.LogInfo(fmt.Sprintf("答案稳定中... (%d/%d), 长度: %d", stableCount, requiredStableCount, len(answerHTML)))
				if stableCount >= requiredStableCount {
					c.LogInfo(fmt.Sprintf("✓ AI回答完成最终HTML长度: %d 字符", len(answerHTML)))
					return answerHTML, nil
				}
			} else {
				// Still streaming: remember the new content and start over.
				stableCount = 0
				lastAnswer = answerHTML
				c.LogInfo(fmt.Sprintf("检测到流式输出当前HTML长度: %d 字符", len(answerHTML)))
			}
		}

		c.SleepMs(1000) // poll once per second

		// Emit a progress line whenever the elapsed whole seconds hit a
		// multiple of ten.
		elapsed := int(time.Since(startTime).Seconds())
		if elapsed > 0 && elapsed%10 == 0 {
			c.LogInfo(fmt.Sprintf("等待AI回答中... 已等待 %d 秒", elapsed))
		}
	}
	return "", fmt.Errorf("等待答案超时(%d秒)", timeout)
}

// latestAnswerHTML returns the whitespace-trimmed HTML of the last element
// matched by the first selector whose last match is visible and has
// non-empty HTML. It returns "" when no selector yields such an element.
func (c *DeepseekCollector) latestAnswerHTML(selectors []string) string {
	for _, selector := range selectors {
		elems, err := c.Page.Elements(selector)
		if err != nil || len(elems) == 0 {
			continue
		}

		// The most recent answer is the last match on the page.
		last := elems[len(elems)-1]
		// A visibility-check error is treated the same as "not visible".
		if visible, _ := last.Visible(); !visible {
			continue
		}

		// Take the raw HTML as-is; only surrounding whitespace is trimmed.
		html, err := last.HTML()
		if err != nil || html == "" {
			continue
		}
		html = strings.TrimSpace(html)
		c.LogInfo(fmt.Sprintf("找到答案容器: %s, HTML长度: %d", selector, len(html)))
		return html
	}
	return ""
}
// SafeElement looks up the first element matching selector without treating
// "not found" as an error.
//
// It returns (element, nil) when a match exists, (nil, nil) when nothing
// matches, and (nil, err) when the lookup itself fails. Callers must
// therefore check the element for nil even when err is nil.
func (c *DeepseekCollector) SafeElement(selector string) (*rod.Element, error) {
	// Page.Has already returns the matched element, so use it directly
	// instead of querying a second time: a second Page.Element call costs an
	// extra round trip and may block if the element vanishes in between.
	exists, el, err := c.Page.Has(selector)
	if err != nil {
		return nil, err
	}
	if !exists {
		return nil, nil
	}
	return el, nil
}