结构优化与图片识别增强

2025-11-11 18:30:43 +08:00 · 2025-11-11 18:30:43 +08:00 · 92218ceb4d
parent 7076d6a918
commit 92218ceb4d
7 changed files with 59 additions and 24 deletions
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -53,3 +53,21 @@ tools:
    enabled: true
  DingTalkBot:
    enabled: true
+
+
+default_prompt:
+  img_recognize:
+    system_prompt:
+      '你是一个具备图像理解与用户意图分析能力的智能助手。当用户提供一张图片时，请完成以下任务：
+      1.图像内容识别：
+       识别并描述图片中的主要对象、场景、文字（如存在）、颜色、布局等基本信息。
+       如果图片中包含表格、图表、二维码、标志（Logo）、菜单、票据等内容，请特别指出。
+      2. 关键信息提取：
+       提取出图片中对用户可能有用的关键信息（例如金额、日期、标题、编号、联系信息、商品名称等）。
+       若图片为文档类（如合同、发票、收据），请结构化输出关键字段（如客户名称、金额、开票日期等）。
+      3.用户需求预测：
+       根据图片内容和常见使用场景，推测用户可能想要执行的操作或提出的问题。
+       例如：是否需要翻译图片中的文字？是否需要提取表格数据？是否需要分析图表趋势？是否需要识别某个标志的含义？
+       输出你预测的 2~3 个用户可能的需求，并简要说明理由。
+      '
+    user_prompt: '识别图片内容, 以markdown格式输出'
--- a/internal/biz/do/ctx.go
+++ b/internal/biz/do/ctx.go
@@ -110,15 +110,21 @@ func (d *Do) getImgData() (err error) {
 	if len(imgs) == 0 {
 		return
 	}
-	if err = pkg.ValidateImageURL(d.Ctx.Req.Img); err != nil {
-		return err
-	}
+
 	for k, img := range imgs {
 		baseErr := "获取第" + strconv.Itoa(k+1) + "张图片失败："
-		entitys.ResLog(d.Ctx.Ch, "", "获取第"+strconv.Itoa(k+1)+"张图片")
+		entitys.ResLog(d.Ctx.Ch, "img_get_start", "正在获取第"+strconv.Itoa(k+1)+"张图片")
+		if err = pkg.ValidateImageURL(img); err != nil {
+			entitys.ResLog(d.Ctx.Ch, "", baseErr+"：expected image content")
+			continue
+		}
 		req := l_request.Request{
 			Method: "GET",
 			Url:    img,
+			Headers: map[string]string{
+				"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+				"Accept":     "image/webp,image/apng,image/*,*/*;q=0.8",
+			},
 		}
 		res, _err := req.Send()
 		if _err != nil {
@@ -135,6 +141,7 @@ func (d *Do) getImgData() (err error) {
 		}
 		d.Ctx.ImgByte = append(d.Ctx.ImgByte, res.Content)
 		d.Ctx.ImgUrls = append(d.Ctx.ImgUrls, img)
+		entitys.ResLog(d.Ctx.Ch, "img_get_end", "第"+strconv.Itoa(k+1)+"张图片获取成功")
 	}

 	return
--- a/internal/biz/do/handle.go
+++ b/internal/biz/do/handle.go
@@ -44,15 +44,15 @@ func NewHandle(
 }

 func (r *Handle) Recognize(ctx context.Context, requireData *entitys.RequireData) (err error) {
-	entitys.ResLog(requireData.Ch, "", "准备意图识别")
+	entitys.ResLog(requireData.Ch, "recognize_start", "准备意图识别")

 	//意图识别
 	recognizeMsg, err := r.Ollama.IntentRecognize(ctx, requireData)
 	if err != nil {
 		return
 	}
-	entitys.ResLog(requireData.Ch, "", recognizeMsg)
-	entitys.ResLog(requireData.Ch, "", "意图识别结束")
+	entitys.ResLog(requireData.Ch, "recognize", recognizeMsg)
+	entitys.ResLog(requireData.Ch, "recognize_end", "意图识别结束")

 	var match entitys.Match
 	if err = json.Unmarshal([]byte(recognizeMsg), &match); err != nil {
--- a/internal/biz/llm_service/ollama.go
+++ b/internal/biz/llm_service/ollama.go
@@ -98,20 +98,20 @@ func (r *OllamaService) RecognizeWithImg(ctx context.Context, requireData *entit
 	if requireData.ImgByte == nil {
 		return
 	}
-	entitys.ResLog(requireData.Ch, "", "图片识别中。。。")
+	entitys.ResLog(requireData.Ch, "recognize_img_start", "图片识别中...")

 	desc, err = r.client.Generation(ctx, &api.GenerateRequest{
 		Model:     r.config.Ollama.VlModel,
 		Stream:    new(bool),
-		System:    "完整提取出图片中的文字以及重要信息,并对用户的需求进行预测",
-		Prompt:    "完整提取出图片中的文字以及重要信息,并对用户的需求进行预测", //requireData.Req.Text,
+		System:    r.config.DefaultPrompt.ImgRecognize.SystemPrompt,
+		Prompt:    r.config.DefaultPrompt.ImgRecognize.UserPrompt,
 		Images:    requireData.ImgByte,
 		KeepAlive: &api.Duration{Duration: 3600 * time.Second},
 	})
 	if err != nil {
 		return
 	}
-	entitys.ResLog(requireData.Ch, "", "图片识别完成，识别内容："+desc.Response)
+	entitys.ResLog(requireData.Ch, "recognize_img_end", "图片识别完成，识别内容："+desc.Response)
 	return
 }

--- a/internal/biz/router.go
+++ b/internal/biz/router.go
@@ -40,7 +40,7 @@ func (r *AiRouterBiz) RouteWithSocket(c *websocket.Conn, req *entitys.ChatSockRe
 		return
 	}

-	//初始化通道/上下文
+	//意图识别
 	if err = r.handle.Recognize(ctx, dos.Ctx); err != nil {
 		log.Errorf("意图识别失败: %s", err.Error())
 		return
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -9,16 +9,26 @@ import (

 // Config 应用配置
 type Config struct {
-	Server  ServerConfig  `mapstructure:"server"`
-	Ollama  OllamaConfig  `mapstructure:"ollama"`
-	Sys     SysConfig     `mapstructure:"sys"`
-	Tools   ToolsConfig   `mapstructure:"tools"`
-	Logging LoggingConfig `mapstructure:"logging"`
-	Redis   Redis         `mapstructure:"redis"`
-	DB      DB            `mapstructure:"db"`
+	Server        ServerConfig  `mapstructure:"server"`
+	Ollama        OllamaConfig  `mapstructure:"ollama"`
+	Sys           SysConfig     `mapstructure:"sys"`
+	Tools         ToolsConfig   `mapstructure:"tools"`
+	Logging       LoggingConfig `mapstructure:"logging"`
+	Redis         Redis         `mapstructure:"redis"`
+	DB            DB            `mapstructure:"db"`
+	DefaultPrompt SysPrompt     `mapstructure:"default_prompt"`
 	//	LLM     *LLM          `mapstructure:"llm"`
 }

+type SysPrompt struct {
+	ImgRecognize DefaultPrompt `mapstructure:"img_recognize"`
+}
+
+type DefaultPrompt struct {
+	SystemPrompt string `mapstructure:"system_prompt"`
+	UserPrompt   string `mapstructure:"user_prompt"`
+}
+
 type LLM struct {
 	Model string `mapstructure:"model"`
 }
--- a/internal/pkg/func.go
+++ b/internal/pkg/func.go
@@ -35,22 +35,22 @@ func ValidateImageURL(rawURL string) error {
 	// 1. 基础格式验证
 	parsed, err := url.Parse(rawURL)
 	if err != nil {
-		return fmt.Errorf("invalid URL format: %v", err)
+		return fmt.Errorf("未知的图片格式: %v", err)
 	}

 	// 2. 检查协议是否为 http/https
 	if parsed.Scheme != "http" && parsed.Scheme != "https" {
-		return errors.New("URL must use http or https protocol")
+		return errors.New("必须是http/https结构")
 	}

 	// 3. 检查是否有空的主机名
 	if parsed.Host == "" {
-		return errors.New("URL missing host")
+		return errors.New("未知的url地址")
 	}

-	// 4. 检查路径是否为空（可选）
+	// 4. 检查路径是否为空
 	if strings.TrimSpace(parsed.Path) == "" {
-		return errors.New("URL path is empty")
+		return errors.New("url为空")
 	}

 	return nil