# ai-lightrag/app/core/ingest.py

import base64
import logging
import httpx
from io import BytesIO
from app.config import settings

def _guess_image_mime_type(image_data: bytes) -> str:
    """Return the MIME type implied by the image's leading magic bytes.

    Falls back to ``image/jpeg`` for anything unrecognised, which is the
    type this module has always advertised.
    """
    if image_data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image_data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if image_data[:4] == b"RIFF" and image_data[8:12] == b"WEBP":
        return "image/webp"
    if image_data.startswith(b"BM"):
        return "image/bmp"
    if image_data.startswith((b"II*\x00", b"MM\x00*")):
        return "image/tiff"
    return "image/jpeg"


async def vl_image_caption_func(
    image_data: bytes,
    prompt: str = "请详细描述这张图片",
    *,
    max_tokens: int = 300,
    timeout: float = 30.0,
) -> str:
    """
    Generate a textual description of an image with the configured VL model.

    The image is sent as a base64 data URL in an OpenAI-style
    ``chat/completions`` request to ``settings.VL_BINDING_HOST``.

    Args:
        image_data: Raw encoded image bytes (the file content, not pixels).
        prompt: Instruction sent to the model alongside the image.
        max_tokens: Upper bound on the length of the generated description.
        timeout: Timeout in seconds applied to the HTTP client.

    Returns:
        A bracketed marker string. One of:
        ``[Image Description: ...]`` on success,
        ``[Image Processing Skipped: ...]`` when no VL host is configured,
        ``[Image Processing Failed: ...]`` when the image is empty or the
        request/response handling raised.

    This function is best-effort: failures are logged and reported through
    the returned marker rather than raised.
    """
    if not settings.VL_BINDING_HOST:
        return "[Image Processing Skipped: No VL Model Configured]"

    if not image_data:
        # Nothing to describe; skip the pointless round-trip to the model.
        logging.warning("VL Caption skipped: empty image data")
        return "[Image Processing Failed: empty image data]"

    try:
        # 1. Encode the image as base64 and label it with its real format.
        base64_image = base64.b64encode(image_data).decode('utf-8')
        mime_type = _guess_image_mime_type(image_data)

        # 2. Build an OpenAI-format vision request.
        # Strip a trailing slash so a base URL configured as ".../v1/" does
        # not produce ".../v1//chat/completions".
        base_url = str(settings.VL_BINDING_HOST).rstrip("/")
        url = f"{base_url}/chat/completions"

        headers = {"Content-Type": "application/json"}
        # Only authenticate when a key is configured; otherwise the header
        # would literally read "Bearer None" / "Bearer ".
        if settings.VL_KEY:
            headers["Authorization"] = f"Bearer {settings.VL_KEY}"

        payload = {
            "model": settings.VL_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": max_tokens,
        }

        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(url, headers=headers, json=payload)
            response.raise_for_status()
            result = response.json()
            description = result['choices'][0]['message']['content']
            return f"[Image Description: {description}]"

    except Exception as e:
        # Deliberately broad: captioning must never abort document ingestion.
        # logging.exception keeps the traceback, which matters because some
        # transport errors stringify to an empty message.
        logging.exception("VL Caption failed: %s", e)
        return f"[Image Processing Failed: {str(e)}]"

async def process_pdf_with_images(file_bytes: bytes, *, caption_images: bool = False) -> str:
    """
    Parse a PDF, extract the text of every page and optionally caption images.

    Args:
        file_bytes: Raw content of the PDF file.
        caption_images: When true and ``settings.VL_BINDING_HOST`` is set,
            every image on each page is sent to ``vl_image_caption_func``
            and its caption is appended after the page text. Defaults to
            ``False``, in which case only text is extracted.

    Returns:
        One string made of ``--- Page N Text ---`` sections (1-based page
        numbers), each followed by that page's
        ``--- Page N Image M ---`` caption sections when captioning is on.

    Raises:
        Whatever ``pypdf`` raises for unreadable input; parsing errors are
        not caught here.

    NOTE: PDF parsing is synchronous and runs directly inside this coroutine.
    """
    import pypdf

    reader = pypdf.PdfReader(BytesIO(file_bytes))

    # Decided once up front; the setting is only consulted when the caller
    # actually asked for captions.
    captioning_enabled = bool(caption_images and settings.VL_BINDING_HOST)

    # Collect sections and join once instead of growing a string with "+=".
    sections = []
    for page_num, page in enumerate(reader.pages, start=1):
        # 1. Extract text. Guard against a None result so the literal text
        # "None" never ends up in the ingested content.
        page_text = page.extract_text() or ""
        sections.append(f"--- Page {page_num} Text ---\n{page_text}\n\n")

        # 2. Extract and caption images (opt-in).
        if not captioning_enabled:
            continue

        for image_num, image_file_object in enumerate(page.images, start=1):
            try:
                caption = await vl_image_caption_func(image_file_object.data)
            except Exception as e:
                # Best-effort: one bad image must not abort the whole document.
                logging.warning(
                    "Failed to process image %d on page %d: %s",
                    image_num,
                    page_num,
                    e,
                )
                continue
            sections.append(
                f"--- Page {page_num} Image {image_num} ---\n{caption}\n\n"
            )

    return "".join(sections)