# Source path: l_ai_knowledge/docker/Dockerfile.docreader

# =========================
# Build stage
# =========================
# Installs Python dependencies, fetches the OCR models and generates the
# protobuf code; the runtime stage copies the results out of this stage.
FROM python:3.10.18-bookworm AS builder
# Point apt at the Tsinghua (TUNA) mirror: rewrite both the main and the
# security archive hosts in the deb822 sources file in a single sed pass.
RUN sed -i \
        -e 's@http://deb.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        -e 's@http://security.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        /etc/apt/sources.list.d/debian.sources
WORKDIR /app
# Install build-time system dependencies (compiler, development headers and the
# download/unpack tools used by later steps in this stage).
# --no-install-recommends keeps apt from pulling in optional packages, and the
# package index is removed in the same layer so it is not stored in the image.
# Packages are listed alphabetically to keep diffs small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        antiword \
        curl \
        gcc \
        libffi-dev \
        libgl1 \
        libglib2.0-0 \
        libjpeg-dev \
        libpq-dev \
        python3-dev \
        unzip \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*
# Install the protoc compiler (v3.19.4) from the upstream release archive into
# /usr/local (binary ends up at /usr/local/bin/protoc).
# The release asset is selected from the machine architecture so the build also
# works on arm64 hosts; unknown architectures abort the build instead of
# silently installing an unusable binary.
# curl -f makes an HTTP error fail the step rather than saving an error page
# as the zip file.
RUN set -eu; \
    case "$(uname -m)" in \
        x86_64) protoc_arch=x86_64 ;; \
        aarch64|arm64) protoc_arch=aarch_64 ;; \
        *) echo "Unsupported architecture: $(uname -m)" >&2; exit 1 ;; \
    esac; \
    protoc_zip="protoc-3.19.4-linux-${protoc_arch}.zip"; \
    curl -fsSLO "https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/${protoc_zip}"; \
    unzip "${protoc_zip}" -d /usr/local; \
    chmod +x /usr/local/bin/protoc; \
    rm "${protoc_zip}"
# Copy only the dependency manifest first, so the install layer below stays
# cached until requirements.txt itself changes.
COPY services/docreader/requirements.txt ./requirements.txt
# Install the Python dependencies from the Tsinghua PyPI mirror. The pip cache
# is purged beforehand and disabled for the install itself.
RUN pip cache purge \
    && pip install \
        --no-cache-dir \
        --requirement requirements.txt \
        --index-url https://pypi.tuna.tsinghua.edu.cn/simple
# Pre-download the PP-OCRv5 detection and recognition models into
# /root/.paddlex/official_models: fetch both tarballs, unpack them in place,
# then delete the tarballs within the same layer.
RUN set -e; \
    models_dir=/root/.paddlex/official_models; \
    models_url=https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0.0; \
    mkdir -p "${models_dir}"; \
    for model in PP-OCRv5_server_det_infer PP-OCRv5_server_rec_infer; do \
        wget "${models_url}/${model}.tar" -O "${models_dir}/${model}.tar"; \
    done; \
    for model in PP-OCRv5_server_det_infer PP-OCRv5_server_rec_infer; do \
        tar -xf "${models_dir}/${model}.tar" -C "${models_dir}/"; \
    done; \
    rm -rf "${models_dir}/PP-OCRv5_server_det_infer.tar" "${models_dir}/PP-OCRv5_server_rec_infer.tar"
# Sanity check: list the model directory; the build stops here if it is missing.
RUN ls -la /root/.paddlex/official_models
# Copy the helper scripts and the service source code.
COPY services/docreader/scripts/ /app/scripts/
COPY services/docreader/src/ /app/src/
# Generate the protobuf code by running the project's generation script.
# NOTE(review): presumably the script writes its output under /app/src, which
# is the directory the runtime stage copies — confirm against the script.
RUN chmod +x /app/scripts/generate_proto.sh \
    && bash /app/scripts/generate_proto.sh
# =========================
# Runtime stage
# =========================
# Final image: runtime system libraries plus the artifacts copied from the
# build stage.
FROM python:3.10.18-bookworm AS runner
# Point apt at the Tsinghua (TUNA) mirror: rewrite both the main and the
# security archive hosts in the deb822 sources file in a single sed pass.
RUN sed -i \
        -e 's@http://deb.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        -e 's@http://security.debian.org@https://mirrors.tuna.tsinghua.edu.cn@g' \
        /etc/apt/sources.list.d/debian.sources
WORKDIR /app
# Install runtime system dependencies. LibreOffice comes from the distribution
# package here; supervisor is the process manager started by CMD.
# NOTE(review): the X11/cairo/cups/GL libraries look like LibreOffice and
# rendering dependencies — confirm before trimming the list.
# Packages are listed alphabetically; the apt index is removed in the same layer.
RUN apt-get update && apt-get install -y \
        antiword \
        dpkg \
        gnupg \
        libcairo2 \
        libcups2 \
        libdbus-glib-1-2 \
        libfontconfig1 \
        libgl1 \
        libglib2.0-0 \
        libglu1-mesa \
        libjpeg62-turbo \
        libpq5 \
        libreoffice \
        libsm6 \
        libxinerama1 \
        supervisor \
        tar \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*
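# # Download and install LibreOffice per CPU architecture (disabled: the
# # distribution's libreoffice package is installed through apt-get instead)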
# RUN mkdir -p /tmp/libreoffice && cd /tmp/libreoffice && \
# if [ "$(uname -m)" = "x86_64" ]; then \
# wget https://mirrors.tuna.tsinghua.edu.cn/libreoffice/libreoffice/stable/25.2.5/deb/x86_64/LibreOffice_25.2.5_Linux_x86-64_deb.tar.gz && \
# tar -xzf LibreOffice_25.2.5_Linux_x86-64_deb.tar.gz && \
# cd LibreOffice_25.2.5.2_Linux_x86-64_deb/DEBS && dpkg -i *.deb; \
# elif [ "$(uname -m)" = "aarch64" ] || [ "$(uname -m)" = "arm64" ]; then \
# wget https://mirrors.aliyun.com/libreoffice/testing/25.8.0/deb/aarch64/LibreOffice_25.8.0.3_Linux_aarch64_deb.tar.gz && \
# tar -xzf LibreOffice_25.8.0.3_Linux_aarch64_deb.tar.gz && \
# cd LibreOffice_25.8.0.3_Linux_aarch64_deb/DEBS && dpkg -i *.deb; \
# else \
# echo "Unsupported architecture: $(uname -m)" && exit 1; \
# fi && \
# cd / && rm -rf /tmp/libreoffice
# Set the LibreOffice environment variable (disabled)
# RUN echo 'export LIBREOFFICE_PATH=/opt/libreoffice25.2/program/soffice' >> /etc/environment;
# Bring over the artifacts produced by the build stage:
#   - application source (including whatever the proto generation step wrote
#     under /app/src)
COPY --from=builder /app/src /app/src
#   - pre-downloaded PP-OCRv5 models
COPY --from=builder /root/.paddlex/official_models /root/.paddlex/official_models
#   - installed Python packages
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
#   - executables under /usr/local/bin (e.g. the protoc binary unpacked there)
COPY --from=builder /usr/local/bin /usr/local/bin
# Install the Playwright WebKit browser and the system packages it needs.
# install-deps drives apt-get, so both commands run in one layer and the apt
# package index is removed in that same layer to keep it out of the image.
RUN python -m playwright install webkit \
    && python -m playwright install-deps webkit \
    && rm -rf /var/lib/apt/lists/*
# Make the service sources importable as top-level modules.
ENV PYTHONPATH=/app/src
# Run the project's download_deps module from inside the source directory,
# then restore /app as the working directory for the rest of the image.
# NOTE(review): presumably this pre-fetches assets needed at runtime — confirm
# against the download_deps module.
WORKDIR /app/src
RUN python -m download_deps
WORKDIR /app
# Install the supervisor configuration. COPY creates any missing directories
# in the destination path, so no separate mkdir layer is required.
COPY services/docreader/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
# Document the gRPC port of the service (EXPOSE does not publish the port by
# itself; it must still be mapped when the container is run).
EXPOSE 50051
# Start the service through supervisor, using the configuration copied into
# the image.
# NOTE(review): supervisord must stay in the foreground to keep the container
# alive — confirm that supervisord.conf sets nodaemon=true.
CMD ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]