ai-lightrag/app/core/ingest.py

105 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import base64
import logging
import httpx
from io import BytesIO
from app.config import settings
def _detect_image_mime(image_data: bytes) -> str:
    """Guess an image MIME type from the leading magic bytes of *image_data*.

    Falls back to ``image/jpeg`` (the value that used to be hard-coded) when
    the signature is not recognised, so unknown formats behave as before.
    """
    if image_data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image_data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    # WebP is a RIFF container whose form type (bytes 8..11) is "WEBP".
    if image_data[:4] == b"RIFF" and image_data[8:12] == b"WEBP":
        return "image/webp"
    if image_data.startswith((b"II*\x00", b"MM\x00*")):
        return "image/tiff"
    if image_data.startswith(b"BM"):
        return "image/bmp"
    return "image/jpeg"


async def vl_image_caption_func(image_data: bytes, prompt: str = "请详细描述这张图片") -> str:
    """Generate a textual description of an image with a vision-language model.

    Two wire protocols are supported, selected by ``settings.VL_BINDING``:
    ``"ollama"`` posts to ``<host>/api/generate``; any other value is treated
    as an OpenAI-compatible (OpenAI / vLLM) endpoint and posts to
    ``<host>/chat/completions``.

    Args:
        image_data: Raw bytes of the image to describe.
        prompt: Instruction sent to the model together with the image.

    Returns:
        ``"[Image Description: ...]"`` on success;
        ``"[Image Processing Skipped: No VL Model Configured]"`` when
        ``settings.VL_BINDING_HOST`` is empty; or
        ``"[Image Processing Failed: ...]"`` when building the request,
        sending it, or reading the response raises. Those exceptions are
        logged and reported through the return value instead of propagating.
    """
    if not settings.VL_BINDING_HOST:
        return "[Image Processing Skipped: No VL Model Configured]"

    try:
        # Both protocols expect the image as a Base64 string.
        base64_image = base64.b64encode(image_data).decode("utf-8")
        # Tolerate a trailing slash in the configured host so the joined URL
        # never contains "//" before the endpoint path.
        base_url = settings.VL_BINDING_HOST.rstrip("/")

        async with httpx.AsyncClient(timeout=30.0) as client:
            if settings.VL_BINDING == "ollama":
                # Ollama protocol: single-shot, non-streaming generation.
                url = f"{base_url}/api/generate"
                payload = {
                    "model": settings.VL_MODEL,
                    "prompt": prompt,
                    "images": [base64_image],
                    "stream": False,
                }
                response = await client.post(url, json=payload)
                response.raise_for_status()
                result = response.json()
                description = result.get("response", "")
            else:
                # OpenAI / vLLM protocol: chat completion with an inline
                # data-URL image part.
                url = f"{base_url}/chat/completions"
                headers = {"Content-Type": "application/json"}
                # Only authenticate when a key is configured; otherwise the
                # header would carry the literal text "Bearer None".
                if settings.VL_KEY:
                    headers["Authorization"] = f"Bearer {settings.VL_KEY}"
                # Advertise the real image format rather than always JPEG.
                mime_type = _detect_image_mime(image_data)
                payload = {
                    "model": settings.VL_MODEL,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{mime_type};base64,{base64_image}"
                                    },
                                },
                            ],
                        }
                    ],
                    "max_tokens": 300,
                }
                response = await client.post(url, headers=headers, json=payload)
                response.raise_for_status()
                result = response.json()
                description = result["choices"][0]["message"]["content"]

        return f"[Image Description: {description}]"
    except Exception as e:
        # Best effort: captioning must never break document ingestion, so the
        # failure is logged and reported inline in the returned text.
        logging.error("VL Caption failed: %s", e)
        return f"[Image Processing Failed: {e}]"
async def _caption_page_images(page, page_number: int) -> list:
    """Return caption sections for every image on *page* (best effort).

    *page_number* is 1-based and is used both in the section headers and in
    log messages. Images that cannot be read, validated or captioned are
    logged and skipped.
    """
    from PIL import Image

    sections = []
    try:
        for image_number, image_file_object in enumerate(page.images, start=1):
            try:
                image_data = image_file_object.data
                # Cheap integrity check: raises if Pillow cannot parse the data.
                Image.open(BytesIO(image_data)).verify()
                caption = await vl_image_caption_func(image_data)
                sections.append(
                    f"--- Page {page_number} Image {image_number} ---\n{caption}\n\n"
                )
            except Exception as e:
                logging.warning(
                    "Failed to process image %d on page %d: %s",
                    image_number,
                    page_number,
                    e,
                )
    except Exception as e:
        # Iterating ``page.images`` can itself raise (e.g. for an image stream
        # the PDF library cannot decode). Give up on the remaining images of
        # this page only, instead of aborting the whole document.
        logging.warning("Failed to enumerate images on page %d: %s", page_number, e)
    return sections


async def process_pdf_with_images(file_bytes: bytes) -> str:
    """Parse a PDF, extracting page text and captioning embedded images.

    For each page a ``--- Page N Text ---`` section is emitted. When
    ``settings.VL_BINDING_HOST`` is configured, each image on the page is
    additionally sent to ``vl_image_caption_func`` and its caption is emitted
    as a ``--- Page N Image M ---`` section (N and M are 1-based).

    Args:
        file_bytes: Raw content of the PDF file.

    Returns:
        All text and caption sections concatenated in page order.
    """
    import pypdf

    reader = pypdf.PdfReader(BytesIO(file_bytes))
    # Collect pieces and join once instead of repeatedly growing a string.
    sections = []

    for page_number, page in enumerate(reader.pages, start=1):
        # 1. Page text.
        page_text = page.extract_text()
        sections.append(f"--- Page {page_number} Text ---\n{page_text}\n\n")

        # 2. Page images, only when a vision-language model is configured.
        if not settings.VL_BINDING_HOST:
            continue
        sections.extend(await _caption_page_images(page, page_number))

    return "".join(sections)