diff --git a/config/config.yaml b/config/config.yaml index 92eee31..c66ba00 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -53,3 +53,21 @@ tools: enabled: true DingTalkBot: enabled: true + + +default_prompt: + img_recognize: + system_prompt: + '你是一个具备图像理解与用户意图分析能力的智能助手。当用户提供一张图片时,请完成以下任务: + 1.图像内容识别: + 识别并描述图片中的主要对象、场景、文字(如存在)、颜色、布局等基本信息。 + 如果图片中包含表格、图表、二维码、标志(Logo)、菜单、票据等内容,请特别指出。 + 2. 关键信息提取: + 提取出图片中对用户可能有用的关键信息(例如金额、日期、标题、编号、联系信息、商品名称等)。 + 若图片为文档类(如合同、发票、收据),请结构化输出关键字段(如客户名称、金额、开票日期等)。 + 3.用户需求预测: + 根据图片内容和常见使用场景,推测用户可能想要执行的操作或提出的问题。 + 例如:是否需要翻译图片中的文字?是否需要提取表格数据?是否需要分析图表趋势?是否需要识别某个标志的含义? + 输出你预测的 2~3 个用户可能的需求,并简要说明理由。 + ' + user_prompt: '识别图片内容, 以markdown格式输出' \ No newline at end of file diff --git a/internal/biz/do/ctx.go b/internal/biz/do/ctx.go index 3915b83..0164da4 100644 --- a/internal/biz/do/ctx.go +++ b/internal/biz/do/ctx.go @@ -110,15 +110,21 @@ func (d *Do) getImgData() (err error) { if len(imgs) == 0 { return } - if err = pkg.ValidateImageURL(d.Ctx.Req.Img); err != nil { - return err - } + for k, img := range imgs { baseErr := "获取第" + strconv.Itoa(k+1) + "张图片失败:" - entitys.ResLog(d.Ctx.Ch, "", "获取第"+strconv.Itoa(k+1)+"张图片") + entitys.ResLog(d.Ctx.Ch, "img_get_start", "正在获取第"+strconv.Itoa(k+1)+"张图片") + if err = pkg.ValidateImageURL(img); err != nil { + entitys.ResLog(d.Ctx.Ch, "", baseErr+":expected image content") + continue + } req := l_request.Request{ Method: "GET", Url: img, + Headers: map[string]string{ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Accept": "image/webp,image/apng,image/*,*/*;q=0.8", + }, } res, _err := req.Send() if _err != nil { @@ -135,6 +141,7 @@ func (d *Do) getImgData() (err error) { } d.Ctx.ImgByte = append(d.Ctx.ImgByte, res.Content) d.Ctx.ImgUrls = append(d.Ctx.ImgUrls, img) + entitys.ResLog(d.Ctx.Ch, "img_get_end", "第"+strconv.Itoa(k+1)+"张图片获取成功") } return diff --git a/internal/biz/do/handle.go b/internal/biz/do/handle.go index a137f05..73e304e 100644 --- a/internal/biz/do/handle.go +++ b/internal/biz/do/handle.go @@ -44,15 +44,15 @@ func NewHandle( } func (r *Handle) Recognize(ctx context.Context, requireData *entitys.RequireData) (err error) { - entitys.ResLog(requireData.Ch, "", "准备意图识别") + entitys.ResLog(requireData.Ch, "recognize_start", "准备意图识别") //意图识别 recognizeMsg, err := r.Ollama.IntentRecognize(ctx, requireData) if err != nil { return } - entitys.ResLog(requireData.Ch, "", recognizeMsg) - entitys.ResLog(requireData.Ch, "", "意图识别结束") + entitys.ResLog(requireData.Ch, "recognize", recognizeMsg) + entitys.ResLog(requireData.Ch, "recognize_end", "意图识别结束") var match entitys.Match if err = json.Unmarshal([]byte(recognizeMsg), &match); err != nil { diff --git a/internal/biz/llm_service/ollama.go b/internal/biz/llm_service/ollama.go index 070d884..417baa7 100644 --- a/internal/biz/llm_service/ollama.go +++ b/internal/biz/llm_service/ollama.go @@ -98,20 +98,20 @@ func (r *OllamaService) RecognizeWithImg(ctx context.Context, requireData *entit if requireData.ImgByte == nil { return } - entitys.ResLog(requireData.Ch, "", "图片识别中。。。") + entitys.ResLog(requireData.Ch, "recognize_img_start", "图片识别中...") desc, err = r.client.Generation(ctx, &api.GenerateRequest{ Model: r.config.Ollama.VlModel, Stream: new(bool), - System: "完整提取出图片中的文字以及重要信息,并对用户的需求进行预测", - Prompt: "完整提取出图片中的文字以及重要信息,并对用户的需求进行预测", //requireData.Req.Text, + System: r.config.DefaultPrompt.ImgRecognize.SystemPrompt, + Prompt: r.config.DefaultPrompt.ImgRecognize.UserPrompt, Images: requireData.ImgByte, KeepAlive: &api.Duration{Duration: 3600 * time.Second}, }) if err != nil { return } - entitys.ResLog(requireData.Ch, "", "图片识别完成,识别内容:"+desc.Response) + entitys.ResLog(requireData.Ch, "recognize_img_end", "图片识别完成,识别内容:"+desc.Response) return } diff --git a/internal/biz/router.go b/internal/biz/router.go index 1478ff5..287d87b 100644 --- a/internal/biz/router.go +++ b/internal/biz/router.go @@ -40,7 +40,7 @@ func (r *AiRouterBiz) RouteWithSocket(c *websocket.Conn, req *entitys.ChatSockRe return } - //初始化通道/上下文 + //意图识别 if err = r.handle.Recognize(ctx, dos.Ctx); err != nil { log.Errorf("意图识别失败: %s", err.Error()) return diff --git a/internal/config/config.go b/internal/config/config.go index 8cc2dae..f0c80da 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -9,16 +9,26 @@ import ( // Config 应用配置 type Config struct { - Server ServerConfig `mapstructure:"server"` - Ollama OllamaConfig `mapstructure:"ollama"` - Sys SysConfig `mapstructure:"sys"` - Tools ToolsConfig `mapstructure:"tools"` - Logging LoggingConfig `mapstructure:"logging"` - Redis Redis `mapstructure:"redis"` - DB DB `mapstructure:"db"` + Server ServerConfig `mapstructure:"server"` + Ollama OllamaConfig `mapstructure:"ollama"` + Sys SysConfig `mapstructure:"sys"` + Tools ToolsConfig `mapstructure:"tools"` + Logging LoggingConfig `mapstructure:"logging"` + Redis Redis `mapstructure:"redis"` + DB DB `mapstructure:"db"` + DefaultPrompt SysPrompt `mapstructure:"default_prompt"` // LLM *LLM `mapstructure:"llm"` } +type SysPrompt struct { + ImgRecognize DefaultPrompt `mapstructure:"img_recognize"` +} + +type DefaultPrompt struct { + SystemPrompt string `mapstructure:"system_prompt"` + UserPrompt string `mapstructure:"user_prompt"` +} + type LLM struct { Model string `mapstructure:"model"` } diff --git a/internal/pkg/func.go b/internal/pkg/func.go index 27ce2a3..4e6481a 100644 --- a/internal/pkg/func.go +++ b/internal/pkg/func.go @@ -35,22 +35,22 @@ func ValidateImageURL(rawURL string) error { // 1. 基础格式验证 parsed, err := url.Parse(rawURL) if err != nil { - return fmt.Errorf("invalid URL format: %v", err) + return fmt.Errorf("未知的图片格式: %v", err) } // 2. 检查协议是否为 http/https if parsed.Scheme != "http" && parsed.Scheme != "https" { - return errors.New("URL must use http or https protocol") + return errors.New("必须是http/https结构") } // 3. 检查是否有空的主机名 if parsed.Host == "" { - return errors.New("URL missing host") + return errors.New("未知的url地址") } - // 4. 检查路径是否为空(可选) + // 4. 检查路径是否为空 if strings.TrimSpace(parsed.Path) == "" { - return errors.New("URL path is empty") + return errors.New("url为空") } return nil