import base64
import logging
from io import BytesIO

import httpx

from app.config import settings

# (magic-byte prefix, MIME type) pairs for formats identifiable by a fixed prefix.
_IMAGE_SIGNATURES = (
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"BM", "image/bmp"),
    (b"II*\x00", "image/tiff"),
    (b"MM\x00*", "image/tiff"),
    (b"\x00\x00\x00\x0cjP  \r\n\x87\n", "image/jp2"),
)

# Fallback used when the bytes match no known signature; this is the MIME type
# that was previously hard-coded for every image.
_DEFAULT_IMAGE_MIME = "image/jpeg"


def _detect_image_mime(image_data: bytes) -> str:
    """Return the MIME type implied by the leading magic bytes of *image_data*.

    Falls back to ``image/jpeg`` when the format is not recognised.
    """
    for signature, mime in _IMAGE_SIGNATURES:
        if image_data.startswith(signature):
            return mime
    # WebP is a RIFF container: "RIFF" + 4-byte size + "WEBP".
    if image_data[:4] == b"RIFF" and image_data[8:12] == b"WEBP":
        return "image/webp"
    return _DEFAULT_IMAGE_MIME


async def vl_image_caption_func(image_data: bytes, prompt: str = "请详细描述这张图片") -> str:
    """Generate a description of an image with the configured VL model.

    The image is sent as a base64 data URL to
    ``{settings.VL_BINDING_HOST}/chat/completions`` using the OpenAI chat
    completions request format.

    Args:
        image_data: Raw bytes of the image.
        prompt: Text instruction sent alongside the image.

    Returns:
        ``"[Image Description: ...]"`` on success,
        ``"[Image Processing Skipped: No VL Model Configured]"`` when
        ``settings.VL_BINDING_HOST`` is falsy, or
        ``"[Image Processing Failed: ...]"`` when anything goes wrong.
        This function never raises; failures are logged and reported in
        the returned string.
    """
    if not settings.VL_BINDING_HOST:
        return "[Image Processing Skipped: No VL Model Configured]"

    try:
        # 1. Encode the image as base64 and label it with its actual format
        #    rather than always claiming JPEG.
        base64_image = base64.b64encode(image_data).decode("utf-8")
        mime_type = _detect_image_mime(image_data)

        # 2. Build the OpenAI-format request.
        #    Strip trailing slashes so a host configured as ".../v1/" does not
        #    produce ".../v1//chat/completions".
        url = f"{str(settings.VL_BINDING_HOST).rstrip('/')}/chat/completions"

        headers = {"Content-Type": "application/json"}
        # Only send credentials when a key is configured, instead of the
        # literal header value "Bearer None".
        if settings.VL_KEY:
            headers["Authorization"] = f"Bearer {settings.VL_KEY}"

        payload = {
            "model": settings.VL_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                },
            ],
            "max_tokens": 300,
        }

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(url, headers=headers, json=payload)
            response.raise_for_status()
            result = response.json()
            description = result["choices"][0]["message"]["content"]
            return f"[Image Description: {description}]"

    except Exception as e:
        # Deliberate best-effort boundary: a captioning failure must not abort
        # the caller, so the error is logged and folded into the return value.
        logging.error("VL Caption failed: %s", e)
        return f"[Image Processing Failed: {e}]"


async def process_pdf_with_images(file_bytes: bytes) -> str:
    """Parse a PDF, extracting page text and captioning embedded images.

    For every page a ``--- Page N Text ---`` section is emitted. When
    ``settings.VL_BINDING_HOST`` is set, each image on the page is passed to
    ``vl_image_caption_func`` and a ``--- Page N Image M ---`` section with
    the caption follows. Page and image numbers are 1-based.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        The concatenated text and caption sections for the whole document.
    """
    import pypdf

    sections = []
    reader = pypdf.PdfReader(BytesIO(file_bytes))

    for page_no, page in enumerate(reader.pages, start=1):
        # 1. Extract the page text.
        page_text = page.extract_text()
        sections.append(f"--- Page {page_no} Text ---\n{page_text}\n\n")

        # 2. Extract and caption images, only when a VL model is configured.
        if not settings.VL_BINDING_HOST:
            continue

        # NOTE(review): errors raised while iterating ``page.images`` itself
        # are not covered by the try below and will propagate — confirm
        # whether that is intended.
        for image_no, image_file_object in enumerate(page.images, start=1):
            try:
                image_data = image_file_object.data
                caption = await vl_image_caption_func(image_data)
                sections.append(
                    f"--- Page {page_no} Image {image_no} ---\n{caption}\n\n"
                )
            except Exception as e:
                # Skip the broken image but keep processing the document.
                # Numbers are 1-based to match the section headers above.
                logging.warning(
                    "Failed to process image %d on page %d: %s",
                    image_no,
                    page_no,
                    e,
                )

    return "".join(sections)