scbank-sync/scbank_collector.py

344 lines
15 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import time
import random
import os
import sys
from datetime import datetime
from playwright.sync_api import sync_playwright
from chaojiying import ChaojiyingClient
class SCBankCollector:
    """Automate login to the SCBank order-management site and export orders.

    The collector drives a visible (non-headless) browser through Playwright:
    it opens the delivery-orders page, fills in the credentials read from
    ``config.txt``, optionally solves the login captcha through the Chaojiying
    recognition service, switches to the "待发货" (pending shipment) tab and
    downloads the pending-shipment export into the ``data`` directory.
    """

    # URL glob that the page reaches once login has succeeded.
    HOME_URL_GLOB = "**/homePage**"
    # Number of automatic captcha/login attempts before asking the user.
    MAX_LOGIN_RETRIES = 3
    # Probability of reporting a presumably wrong captcha answer back to
    # Chaojiying so the recognition fee is refunded.
    # NOTE(review): the earlier inline comment said 50%, but the code used
    # 0.8; the coded value is kept here -- confirm the intended rate.
    REPORT_ERROR_PROBABILITY = 0.8
    # Launch flag that hides the "controlled by automation" Blink feature.
    BROWSER_ARGS = ("--disable-blink-features=AutomationControlled",)

    def __init__(self):
        """Read the configuration and prepare (but do not start) the browser.

        Exits the process via ``_load_config`` when ``config.txt`` is missing
        or unreadable. Creates the ``data`` output directory if needed.
        """
        self.target_url = "https://jf.scbank.cn:8085/#/orderManagement/deliveryOrders"
        # Timestamp taken once at start-up; it becomes part of the exported
        # file name (data/shipping_order_YYYYMMDD_HHMMSS.xls).
        self.ts_str = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Settings come from the external config file.
        self.config = self._load_config()
        self.username = self.config.get("scbank_username", "")
        self.password = self.config.get("scbank_password", "")

        # The Chaojiying client is only created when both credentials exist;
        # otherwise the user has to type the captcha manually.
        cjy_user = self.config.get("chaojiying_username", "")
        cjy_pass = self.config.get("chaojiying_password", "")
        cjy_softid = self.config.get("chaojiying_softid", "96001")
        self.cjy_client = (
            ChaojiyingClient(cjy_user, cjy_pass, cjy_softid)
            if cjy_user and cjy_pass
            else None
        )

        # Make sure the download directory exists.
        os.makedirs("data", exist_ok=True)

        self.playwright = None
        self.browser = None
        self.page = None

    def _load_config(self):
        """Parse ``config.txt`` (``key=value`` lines) from the working directory.

        Blank lines and lines starting with ``#`` are skipped. Only the first
        ``=`` separates key and value, so values may themselves contain ``=``.
        The file is decoded as ``utf-8-sig`` so that a byte-order mark written
        by some Windows editors does not become part of the first key.

        Returns:
            dict with the keys ``scbank_username``, ``scbank_password``,
            ``chaojiying_username``, ``chaojiying_password`` and
            ``chaojiying_softid``; missing entries fall back to ``""``
            (``"96001"`` for the soft id).

        Exits the process with status 1 when the file is missing or cannot be
        read.
        """
        config_file = "config.txt"
        # Defaults, keyed by the Chinese names used in the config file.
        config_data = {
            "商城账号": "",
            "商城密码": "",
            "超级鹰账号": "",
            "超级鹰密码": "",
            "超级鹰软件ID": "96001",
        }
        if not os.path.exists(config_file):
            print(f"[ERROR] 未找到配置文件 {config_file},请确保该文件与程序在同一目录下。")
            sys.exit(1)
        try:
            with open(config_file, "r", encoding="utf-8-sig") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    # Skip comments and empty lines.
                    if not line or line.startswith("#"):
                        continue
                    key, separator, value = line.partition("=")
                    if separator:
                        config_data[key.strip()] = value.strip()
        except (OSError, UnicodeError) as e:
            print(f"[ERROR] 读取配置文件失败: {e}")
            sys.exit(1)

        # Map the Chinese config keys to the internal key names.
        return {
            "scbank_username": config_data.get("商城账号", ""),
            "scbank_password": config_data.get("商城密码", ""),
            "chaojiying_username": config_data.get("超级鹰账号", ""),
            "chaojiying_password": config_data.get("超级鹰密码", ""),
            "chaojiying_softid": config_data.get("超级鹰软件ID", "96001"),
        }

    def log(self, msg):
        """Print *msg* prefixed with the current wall-clock time (HH:MM:SS)."""
        print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")

    def start_browser(self):
        """Start Playwright and open a page in a visible browser window.

        Tries the locally installed Chrome, then Edge, then Playwright's
        bundled Chromium. On success ``self.playwright``, ``self.browser`` and
        ``self.page`` are set.

        Raises:
            Exception: re-raises the launch error when no browser could be
                started; the Playwright driver is stopped first.
        """
        self.log("启动浏览器...")
        p = sync_playwright().start()
        # Keep the driver handle so it is not lost and can be stopped later.
        self.playwright = p
        launch_args = list(self.BROWSER_ARGS)

        # Prefer a locally installed browser (Chrome or Edge).
        browser = None
        for channel in ("chrome", "msedge"):
            try:
                self.log(f"尝试启动本地 {channel}...")
                browser = p.chromium.launch(
                    channel=channel,
                    headless=False,
                    args=launch_args,
                )
                self.log(f"成功启动 {channel}")
                break
            except Exception:
                self.log(f"启动 {channel} 失败,尝试下一个...")

        # Fall back to the bundled Chromium (if it has been installed).
        if browser is None:
            self.log("未找到本地 Chrome 或 Edge尝试使用内置 Chromium...")
            try:
                browser = p.chromium.launch(
                    headless=False,
                    args=launch_args,
                )
            except Exception as e:
                self.log(f"[FATAL] 无法启动任何浏览器: {e}")
                self.log("请确保已安装 Google Chrome 或 Microsoft Edge 浏览器。")
                # Nothing was launched: release the driver before giving up.
                p.stop()
                self.playwright = None
                raise

        self.browser = browser
        context = self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            accept_downloads=True,  # downloads must be allowed for the export
        )
        self.page = context.new_page()

    def _auto_solve_captcha(self):
        """Recognise the login captcha, type it in and click the login button.

        Returns:
            ``(True, pic_id)`` when a code was recognised and submitted, where
            ``pic_id`` is the id returned by Chaojiying (needed to report a
            wrong answer); ``(False, None)`` when no client is configured,
            recognition failed, or any step raised.
        """
        if not self.cjy_client:
            self.log("[WARN] 未配置超级鹰账号,跳过自动识别验证码。请手动输入验证码。")
            return False, None
        try:
            self.log("尝试获取验证码图片...")
            # Selector of the captcha image on the login page.
            captcha_img_selector = 'img.code-image'
            # Wait until the image is present.
            self.page.wait_for_selector(captcha_img_selector, timeout=5000)
            # An element screenshot yields the captcha as binary image data.
            image_bytes = self.page.locator(captcha_img_selector).screenshot()

            self.log("正在调用超级鹰进行识别...")
            # Code type 1902: 4-6 mixed letters and digits.
            result = self.cjy_client.solve_captcha(image_bytes, codetype=1902)
            if result and result.get('err_no') == 0:
                code = result.get('pic_str')
                pic_id = result.get('pic_id')
                self.log(f"✅ 验证码识别成功: {code}")
                # Enter the recognised code and submit the form.
                self.page.fill('input[name="code"]', code)
                self.log("尝试自动点击登录...")
                self.page.locator('button.login-btn').click()
                return True, pic_id

            # Show the raw service response to help diagnose the failure.
            self.log(f"{result}")
            err_str = result.get('err_str', '未知错误') if result else '返回为空'
            self.log(f"❌ 验证码识别失败: {err_str}")
            return False, None
        except Exception as e:
            self.log(f"自动处理验证码发生异常: {e}")
            return False, None

    def _open_login_page(self):
        """Open the target URL and pre-fill the credentials when available.

        Every failure is logged and swallowed so that the login step still
        runs (the page may already be logged in or may have changed).
        """
        self.log(f"正在打开页面: {self.target_url}")
        try:
            self.page.goto(self.target_url)
        except Exception as e:
            # Best effort: keep going, but do not hide the reason.
            self.log(f"打开页面失败: {e}")
            return

        # The site redirects to the login page; wait for its input fields.
        try:
            self.log("等待登录页面加载...")
            self.page.wait_for_selector('input[name="username"]', timeout=10000)
            if self.username and self.password:
                self.log(f"正在自动填入账号: {self.username}")
                self.page.fill('input[name="username"]', self.username)
                self.page.fill('input[name="password"]', self.password)
                self.log("账号密码已填入")
            else:
                self.log("[WARN] 配置文件中未提供商城账号密码,请手动输入")
        except Exception as e:
            self.log(f"自动填入账号密码失败 (可能已登录或页面结构变化): {e}")

    def _handle_failed_captcha_attempt(self, pic_id):
        """Log a failed automatic login and optionally request a refund.

        Args:
            pic_id: Chaojiying id of the captcha that was submitted; when
                falsy no refund report is sent.
        """
        self.log("登录等待超时或失败,可能验证码错误。")
        # The captcha was probably misread: report it for a refund with the
        # configured probability.
        if pic_id and random.random() < self.REPORT_ERROR_PROBABILITY:
            self.log(f"触发报错返分机制 (pic_id: {pic_id})...")
            try:
                self.cjy_client.report_error(pic_id)
                self.log("报错返分请求已发送")
            except Exception as report_exc:
                self.log(f"报错返分请求异常: {report_exc}")

        # Surface the page's own error toast (el-message) when there is one.
        try:
            error_msg = self.page.locator('.el-message__content').inner_text(timeout=1000)
            self.log(f"页面提示: {error_msg}")
        except Exception:
            # No toast visible: purely informational, so ignore.
            pass

        # The captcha refreshes by itself after a failed login; just give the
        # new image a moment to load.
        time.sleep(1)

    def _login(self):
        """Wait until the browser reaches the home page.

        Without a Chaojiying client the user logs in manually and the method
        waits without a time limit. With a client, up to
        ``MAX_LOGIN_RETRIES`` automatic attempts are made before falling back
        to an unlimited manual wait.

        Returns:
            True when login was detected, False when waiting failed.
        """
        if not self.cjy_client:
            self.log(">>> 未配置超级鹰,请在浏览器中手动完成登录操作 (输入验证码并点击登录) <<<")
            try:
                # timeout=0 disables the timeout: wait as long as it takes.
                self.page.wait_for_url(self.HOME_URL_GLOB, timeout=0)
            except Exception as e:
                self.log(f"登录等待超时或失败: {e}")
                return False
            self.log("✅ 检测到登录成功!")
            time.sleep(1)
            return True

        for attempt in range(1, self.MAX_LOGIN_RETRIES + 1):
            self.log(f"--- 第 {attempt} 次尝试登录 ---")
            success, pic_id = self._auto_solve_captcha()
            if not success:
                self.log("验证码识别请求失败,等待后重试...")
                time.sleep(1)
                continue
            try:
                # Short wait: a wrong captcha shows its error quickly.
                self.page.wait_for_url(self.HOME_URL_GLOB, timeout=3000)
            except Exception:
                self._handle_failed_captcha_attempt(pic_id)
                continue
            self.log("✅ 检测到登录成功!")
            time.sleep(1)
            return True

        self.log(f"[WARN] 达到最大自动登录重试次数 ({self.MAX_LOGIN_RETRIES}次)。请在浏览器中手动输入验证码并完成登录!")
        try:
            # Fall back to manual mode and wait without limit.
            self.page.wait_for_url(self.HOME_URL_GLOB, timeout=0)
        except Exception as e:
            self.log(f"手动登录等待失败: {e}")
            return False
        self.log("✅ 检测到手动登录成功!")
        return True

    def run(self):
        """Run the whole job: start browser, log in, filter, export.

        All exceptions are caught and logged; the browser is intentionally
        not closed by this method.
        """
        try:
            self.start_browser()

            # 1. Log in.
            self._open_login_page()
            if not self._login():
                return

            # 2. Force navigation to the target page.
            if "deliveryOrders" not in self.page.url:
                self.log(f"跳转至订单管理页面: {self.target_url}")
                self.page.goto(self.target_url)
                self.page.wait_for_load_state("domcontentloaded")
                time.sleep(0.5)

            # 3. Select the status tab.
            self._filter_status()

            # 4. Trigger the bulk export download.
            self._download_excel()
            self.log("采集任务完成。")
        except Exception as e:
            self.log(f"[FATAL] 脚本异常: {e}")
        finally:
            # The browser is deliberately left running after the job.
            # NOTE(review): a Playwright-launched browser may be terminated
            # when this process exits -- confirm the window really stays open.
            self.log("浏览器保持开启状态,请手动关闭。")

    def _filter_status(self):
        """Activate the "待发货" tab unless it is already selected.

        Failures are logged and not propagated.
        """
        self.log("正在点击“待发货”标签页")
        try:
            tabs = self.page.locator(".el-tabs__item:has-text('待发货')")
            if tabs.count() == 0:
                self.log(" [WARN] 未找到“待发货”Tab")
                return

            # Use the first match so several matching tabs do not raise, and
            # tolerate a missing class attribute (get_attribute -> None).
            tab = tabs.first
            css_classes = tab.get_attribute("class") or ""
            if "is-active" in css_classes:
                self.log("“待发货”标签页已经是选中状态")
                return

            tab.click()
            self.log("已点击“待发货”标签页")
            time.sleep(1)
        except Exception as e:
            self.log(f"筛选操作失败: {e}")

    def _download_excel(self):
        """Open the bulk-shipping dialog and download the export file.

        The file is saved as ``data/shipping_order_<timestamp>.xls``.
        Failures are logged and not propagated.
        """
        try:
            self.log("准备触发批量发货...")
            # 1. Click the bulk-shipping button (located by its text).
            batch_ship_btn = self.page.locator("button:has-text('批量发货')")
            if batch_ship_btn.count() == 0:
                self.log("[WARN] 未找到 '批量发货' 按钮")
                return
            batch_ship_btn.first.click()
            self.log("已点击 '批量发货' 按钮,等待弹窗加载...")
            time.sleep(2)  # give the dialog and its buttons time to render

            # 2. Click the export button and capture the download.
            self.log("尝试寻找并点击 '导出待发货订单' 按钮...")
            # Text matching finds the button even inside nested markup.
            export_btn = self.page.locator("button:has-text('导出待发货订单')")
            if export_btn.count() == 0:
                self.log("[ERROR] 弹窗中未找到 '导出待发货订单' 按钮,可能是因为无待发货订单或者页面结构变更")
                # Close the dialog with ESC so it does not block the page.
                self.page.keyboard.press("Escape")
                return

            # Start listening for the download before clicking.
            self.log("开始监听文件下载...")
            with self.page.expect_download(timeout=60000) as download_info:
                export_btn.first.click()
                self.log("已点击 '导出待发货订单'")
            download = download_info.value

            # Store the file in the data directory.
            file_name = f"shipping_order_{self.ts_str}.xls"
            save_path = os.path.join("data", file_name)
            self.log("正在保存文件...")
            download.save_as(save_path)
            self.log(f"✅ 文件下载成功: {save_path}")

            # Close the dialog with ESC.
            time.sleep(1)
            self.page.keyboard.press("Escape")
        except Exception as e:
            self.log(f"执行批量导出失败: {e}")
if __name__ == "__main__":
    # Script entry point: build the collector (which reads config.txt) and
    # run the full login-and-export job.
    SCBankCollector().run()