89 lines
3.1 KiB
Python
89 lines
3.1 KiB
Python
import base64
|
||
import logging
|
||
import httpx
|
||
from io import BytesIO
|
||
from app.config import settings
|
||
|
||
# Leading-byte signatures for the image formats most commonly embedded in
# documents, mapped to the MIME type advertised in the data URL.
_IMAGE_SIGNATURES = (
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"\x00\x00\x00\x0cjP  \r\n\x87\n", "image/jp2"),
    (b"II*\x00", "image/tiff"),
    (b"MM\x00*", "image/tiff"),
    (b"BM", "image/bmp"),
)


def _guess_image_mime(image_data: bytes) -> str:
    """Return a MIME type for *image_data* based on its leading magic bytes.

    Falls back to ``image/jpeg`` (the value that was previously hard-coded)
    when the signature is not recognised.
    """
    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if image_data[:4] == b"RIFF" and image_data[8:12] == b"WEBP":
        return "image/webp"
    for signature, mime in _IMAGE_SIGNATURES:
        if image_data.startswith(signature):
            return mime
    return "image/jpeg"


async def vl_image_caption_func(
    image_data: bytes,
    prompt: str = "请详细描述这张图片",
    *,
    max_tokens: int = 300,
    timeout: float = 30.0,
) -> str:
    """Generate a textual description of an image with a VL model.

    The image is sent as a base64 data URL to an OpenAI-style
    ``/chat/completions`` endpoint rooted at ``settings.VL_BINDING_HOST``
    (vLLM exposes an OpenAI-compatible vision API).

    This function is best-effort: it never raises. Every failure is logged
    and reported through the returned string instead.

    Args:
        image_data: Raw encoded image bytes (JPEG, PNG, ...).
        prompt: Instruction sent to the model together with the image.
        max_tokens: Upper bound on the length of the generated description.
        timeout: HTTP timeout in seconds for the request.

    Returns:
        ``"[Image Description: ...]"`` on success,
        ``"[Image Processing Skipped: No VL Model Configured]"`` when
        ``settings.VL_BINDING_HOST`` is not set, or
        ``"[Image Processing Failed: ...]"`` when anything goes wrong.
    """
    if not settings.VL_BINDING_HOST:
        return "[Image Processing Skipped: No VL Model Configured]"

    try:
        if not image_data:
            # Fail fast instead of posting an empty data URL to the server.
            raise ValueError("empty image data")

        # 1. Encode the image as base64 and label it with its real MIME type.
        base64_image = base64.b64encode(image_data).decode("utf-8")
        mime_type = _guess_image_mime(image_data)

        # 2. Build the OpenAI-format request.
        # Strip a trailing slash so the path does not become "//chat/completions".
        base_url = str(settings.VL_BINDING_HOST).rstrip("/")
        url = f"{base_url}/chat/completions"

        headers = {"Content-Type": "application/json"}
        if settings.VL_KEY:
            # Only authenticate when a key is configured; otherwise the
            # header would read "Bearer None" / "Bearer ".
            headers["Authorization"] = f"Bearer {settings.VL_KEY}"

        payload = {
            "model": settings.VL_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                },
            ],
            "max_tokens": max_tokens,
        }

        # 3. Call the model and extract the first choice's text.
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(url, headers=headers, json=payload)
            response.raise_for_status()
            result = response.json()
            description = result["choices"][0]["message"]["content"]
            return f"[Image Description: {description}]"

    except Exception as e:
        # Deliberately broad: captioning must never break the caller's
        # document pipeline. The traceback is kept in the log.
        logging.exception("VL Caption failed: %s", e)
        return f"[Image Processing Failed: {str(e)}]"
|
||
|
||
async def process_pdf_with_images(file_bytes: bytes, *, caption_images: bool = False) -> str:
    """Parse a PDF, extract its text and optionally caption embedded images.

    Every page contributes a ``--- Page N Text ---`` section. When image
    captioning is active, each embedded image additionally contributes a
    ``--- Page N Image M ---`` section containing the result of
    ``vl_image_caption_func``.

    Args:
        file_bytes: Raw bytes of the PDF document.
        caption_images: Opt-in switch for image captioning. It defaults to
            ``False`` because captioning was previously hard-disabled
            (``if False and ...``); captions are only produced when this is
            ``True`` *and* ``settings.VL_BINDING_HOST`` is configured.

    Returns:
        The concatenated page (and image) sections as one string.

    Raises:
        Whatever ``pypdf.PdfReader`` / ``extract_text`` raise for a PDF that
        cannot be read; image-related failures are logged and skipped.
    """
    import pypdf

    reader = pypdf.PdfReader(BytesIO(file_bytes))

    # Short-circuits on the flag, so settings are not consulted when the
    # caller did not ask for captions.
    captioning_enabled = bool(caption_images and settings.VL_BINDING_HOST)

    # Collect sections and join once instead of repeated string "+=".
    sections = []

    for page_num, page in enumerate(reader.pages, start=1):
        # 1. Extract the page text.
        page_text = page.extract_text() or ""
        sections.append(f"--- Page {page_num} Text ---\n{page_text}\n\n")

        # 2. Extract and caption the page images (opt-in).
        if not captioning_enabled:
            continue

        try:
            # Materialise up front: decoding embedded images can itself fail
            # (e.g. unsupported filter), and that must not lose the text.
            page_images = list(page.images)
        except Exception as e:
            logging.warning("Failed to extract images on page %d: %s", page_num, e)
            continue

        for count, image_file_object in enumerate(page_images, start=1):
            try:
                image_data = image_file_object.data
                caption = await vl_image_caption_func(image_data)
                sections.append(
                    f"--- Page {page_num} Image {count} ---\n{caption}\n\n"
                )
            except Exception as e:
                # Best-effort: skip the broken image. Indices are 1-based to
                # match the section headers in the output.
                logging.warning(
                    "Failed to process image %d on page %d: %s", count, page_num, e
                )

    return "".join(sections)
|