# scbank-sync/scbank_collector.py
# (Repository viewer listing: 344 lines, 15 KiB, Python.)

2026-03-11 17:32:13 +08:00
import json
import time
import random
import os
2026-03-19 17:09:37 +08:00
import sys
2026-03-11 17:32:13 +08:00
from datetime import datetime
from playwright.sync_api import sync_playwright
2026-03-19 17:09:37 +08:00
from chaojiying import ChaojiyingClient
2026-03-11 17:32:13 +08:00
class SCBankCollector:
    """Drive a browser through the SCBank order back office and export orders.

    The collector opens the delivery-order page, logs in (optionally solving
    the image captcha through the Chaojiying service), switches to the
    "待发货" tab and saves the exported spreadsheet under ``data/``.

    Credentials are read from ``config.txt`` in the current working directory.
    """

    # Number of automatic captcha/login attempts before falling back to manual.
    MAX_LOGIN_RETRIES = 3

    # Probability of calling the captcha "report error" endpoint after a failed
    # login attempt.
    # NOTE(review): the original inline comment said 50% while the code used
    # 0.8. The runtime value is kept here; confirm which one is intended.
    CAPTCHA_REPORT_PROBABILITY = 0.8

    def __init__(self):
        """Load the configuration, build the captcha client and prepare ``data/``.

        Exits the process (via ``_load_config``) when ``config.txt`` is missing
        or unreadable.
        """
        self.target_url = "https://jf.scbank.cn:8085/#/orderManagement/deliveryOrders"
        # Captured once per run; used in the exported file name.
        self.ts_str = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Externalised configuration (account + captcha service credentials).
        self.config = self._load_config()
        self.username = self.config.get("scbank_username", "")
        self.password = self.config.get("scbank_password", "")

        # The captcha client is only created when both credentials are present;
        # otherwise the user has to type the captcha manually.
        cjy_user = self.config.get("chaojiying_username", "")
        cjy_pass = self.config.get("chaojiying_password", "")
        cjy_softid = self.config.get("chaojiying_softid", "96001")
        if cjy_user and cjy_pass:
            self.cjy_client = ChaojiyingClient(cjy_user, cjy_pass, cjy_softid)
        else:
            self.cjy_client = None

        # Make sure the output directory exists (no error if it already does).
        os.makedirs("data", exist_ok=True)

        self._playwright = None
        self.browser = None
        self.page = None

    def _load_config(self):
        """Parse ``config.txt`` (``key=value`` lines) into the internal config dict.

        Blank lines and lines starting with ``#`` are ignored. Only the first
        ``=`` splits key from value, so values may themselves contain ``=``.

        Returns:
            dict with the keys ``scbank_username``, ``scbank_password``,
            ``chaojiying_username``, ``chaojiying_password`` and
            ``chaojiying_softid``.

        Exits the process with status 1 if the file is missing or unreadable.
        """
        config_file = "config.txt"
        config_data = {
            "商城账号": "",
            "商城密码": "",
            "超级鹰账号": "",
            "超级鹰密码": "",
            "超级鹰软件ID": "96001",
        }
        if not os.path.exists(config_file):
            print(f"[ERROR] 未找到配置文件 {config_file},请确保该文件与程序在同一目录下。")
            sys.exit(1)
        try:
            # "utf-8-sig" strips a leading BOM if present; without it the BOM
            # becomes part of the first key and that setting is silently lost.
            with open(config_file, "r", encoding="utf-8-sig") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    # Skip comments and empty lines.
                    if not line or line.startswith("#"):
                        continue
                    if "=" in line:
                        key, val = line.split("=", 1)
                        config_data[key.strip()] = val.strip()
        except (OSError, UnicodeError) as e:
            print(f"[ERROR] 读取配置文件失败: {e}")
            sys.exit(1)

        # Map the Chinese keys of the file to the keys used internally.
        return {
            "scbank_username": config_data.get("商城账号", ""),
            "scbank_password": config_data.get("商城密码", ""),
            "chaojiying_username": config_data.get("超级鹰账号", ""),
            "chaojiying_password": config_data.get("超级鹰密码", ""),
            "chaojiying_softid": config_data.get("超级鹰软件ID", "96001"),
        }

    def log(self, msg):
        """Print ``msg`` prefixed with the current wall-clock time."""
        print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")

    def start_browser(self):
        """Launch a headed browser and open a fresh page in ``self.page``.

        Tries the locally installed Chrome, then Edge, then the Chromium bundled
        with Playwright.

        Raises:
            Exception: re-raised from Playwright when no browser can be
                launched; the Playwright driver is stopped first.
        """
        self.log("启动浏览器...")
        p = sync_playwright().start()
        # Keep the handle so the driver is not just an orphaned local.
        self._playwright = p
        launch_args = ["--disable-blink-features=AutomationControlled"]

        # Prefer a locally installed browser (Chrome or Edge).
        browser = None
        for channel in ("chrome", "msedge"):
            try:
                self.log(f"尝试启动本地 {channel}...")
                browser = p.chromium.launch(
                    channel=channel,
                    headless=False,
                    args=launch_args,
                )
                self.log(f"成功启动 {channel}")
                break
            except Exception:
                self.log(f"启动 {channel} 失败,尝试下一个...")

        # Fall back to the bundled Chromium (if it has been installed).
        if not browser:
            self.log("未找到本地 Chrome 或 Edge尝试使用内置 Chromium...")
            try:
                browser = p.chromium.launch(headless=False, args=launch_args)
            except Exception as e:
                self.log(f"[FATAL] 无法启动任何浏览器: {e}")
                self.log("请确保已安装 Google Chrome 或 Microsoft Edge 浏览器。")
                # Nothing was launched: release the driver before propagating.
                self._playwright = None
                p.stop()
                raise

        self.browser = browser
        context = self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            accept_downloads=True,  # downloads must be allowed for the export step
        )
        self.page = context.new_page()

    def _auto_solve_captcha(self):
        """Recognise the login captcha, fill it in and click the login button.

        Returns:
            tuple ``(ok, pic_id)``: ``ok`` is True when a code was recognised,
            typed and the login button clicked; ``pic_id`` is the identifier
            returned by the captcha service (None on failure).
        """
        if not self.cjy_client:
            self.log("[WARN] 未配置超级鹰账号,跳过自动识别验证码。请手动输入验证码。")
            return False, None

        try:
            self.log("尝试获取验证码图片...")
            # Selector of the captcha image on the login page.
            captcha_img_selector = 'img.code-image'
            self.page.wait_for_selector(captcha_img_selector, timeout=5000)
            # A screenshot of the element yields the image bytes directly.
            image_bytes = self.page.locator(captcha_img_selector).screenshot()

            self.log("正在调用超级鹰进行识别...")
            # Code type 1902 is passed through to the captcha service.
            result = self.cjy_client.solve_captcha(image_bytes, codetype=1902)
            if result and result.get('err_no') == 0:
                code = result.get('pic_str')
                pic_id = result.get('pic_id')
                self.log(f"✅ 验证码识别成功: {code}")
                self.page.fill('input[name="code"]', code)
                self.log("尝试自动点击登录...")
                self.page.locator('button.login-btn').click()
                return True, pic_id

            # Dump the raw service response to help diagnose the failure.
            self.log(result)
            err_str = result.get('err_str', '未知错误') if result else '返回为空'
            self.log(f"❌ 验证码识别失败: {err_str}")
            return False, None
        except Exception as e:
            self.log(f"自动处理验证码发生异常: {e}")
            return False, None

    def _open_login_page(self):
        """Navigate to the target URL and pre-fill the credentials if possible.

        Best effort: every failure is logged and the caller continues with the
        login loop regardless.
        """
        try:
            self.page.goto(self.target_url)
        except Exception as e:
            # Previously swallowed silently; log it so the cause is visible.
            self.log(f"打开页面失败: {e}")
            return

        try:
            self.log("等待登录页面加载...")
            self.page.wait_for_selector('input[name="username"]', timeout=10000)
            if self.username and self.password:
                self.log(f"正在自动填入账号: {self.username}")
                self.page.fill('input[name="username"]', self.username)
                self.page.fill('input[name="password"]', self.password)
                self.log("账号密码已填入")
            else:
                self.log("[WARN] 配置文件中未提供商城账号密码,请手动输入")
        except Exception as e:
            self.log(f"自动填入账号密码失败 (可能已登录或页面结构变化): {e}")

    def _report_captcha_error(self, pic_id):
        """Randomly report a wrongly recognised captcha to the captcha service.

        Does nothing when ``pic_id`` is falsy. Failures of the report call are
        logged and never propagated.
        """
        if not pic_id or random.random() >= self.CAPTCHA_REPORT_PROBABILITY:
            return
        self.log(f"触发报错返分机制 (pic_id: {pic_id})...")
        try:
            self.cjy_client.report_error(pic_id)
            self.log("报错返分请求已发送")
        except Exception as report_err:
            self.log(f"报错返分请求异常: {report_err}")

    def _login(self):
        """Wait until the browser reaches the home page.

        With a captcha client configured, up to ``MAX_LOGIN_RETRIES`` automatic
        attempts are made, followed by an unlimited wait for a manual login.
        Without one, the method waits (without timeout) for a manual login.

        Returns:
            True once a URL matching ``**/homePage**`` is reached, False if the
            wait itself failed.
        """
        use_solver = bool(self.cjy_client)
        attempts = 0
        while attempts < self.MAX_LOGIN_RETRIES:
            pic_id = None
            if use_solver:
                self.log(f"--- 第 {attempts + 1} 次尝试登录 ---")
                solved, pic_id = self._auto_solve_captcha()
                if not solved:
                    self.log("验证码识别请求失败,等待后重试...")
                    time.sleep(1)
                    attempts += 1
                    continue
            else:
                self.log(">>> 未配置超级鹰,请在浏览器中手动完成登录操作 (输入验证码并点击登录) <<<")

            try:
                # Short timeout in automatic mode (a wrong captcha is rejected
                # quickly); timeout=0 disables the timeout for manual login.
                self.page.wait_for_url("**/homePage**", timeout=3000 if use_solver else 0)
            except Exception as e:
                if not use_solver:
                    self.log(f"登录等待超时或失败: {e}")
                    return False

                self.log("登录等待超时或失败,可能验证码错误。")
                self._report_captcha_error(pic_id)
                # Surface the page's own error toast (el-message), if any.
                try:
                    error_msg = self.page.locator('.el-message__content').inner_text(timeout=1000)
                    self.log(f"页面提示: {error_msg}")
                except Exception:
                    pass  # no toast shown; nothing to report
                # The captcha refreshes by itself after a failed login; give
                # the new image a moment to load.
                time.sleep(1)
                attempts += 1
            else:
                self.log("✅ 检测到登录成功!")
                time.sleep(1)
                return True

        self.log(f"[WARN] 达到最大自动登录重试次数 ({self.MAX_LOGIN_RETRIES}次)。请在浏览器中手动输入验证码并完成登录!")
        try:
            # Manual fallback: wait without timeout until login succeeds.
            self.page.wait_for_url("**/homePage**", timeout=0)
            self.log("✅ 检测到手动登录成功!")
        except Exception as e:
            self.log(f"手动登录等待失败: {e}")
            return False
        return True

    def run(self):
        """Run the whole collection flow: browser, login, filter, export.

        Exceptions are logged, not propagated. The browser is deliberately not
        closed by this method.
        """
        try:
            self.start_browser()

            # 1. Login
            self.log(f"正在打开页面: {self.target_url}")
            self._open_login_page()
            if not self._login():
                return

            # 2. Force navigation to the target page
            if "deliveryOrders" not in self.page.url:
                self.log(f"跳转至订单管理页面: {self.target_url}")
                self.page.goto(self.target_url)
                self.page.wait_for_load_state("domcontentloaded")
                time.sleep(0.5)

            # 3. Filter by status
            self._filter_status()

            # 4. Batch export / download
            self._download_excel()

            self.log("采集任务完成。")
        except Exception as e:
            self.log(f"[FATAL] 脚本异常: {e}")
        finally:
            # The browser is intentionally left open after collection.
            # NOTE(review): confirm the browser really survives once this
            # process exits and the Playwright driver goes away with it.
            self.log("浏览器保持开启状态,请手动关闭。")

    def _filter_status(self):
        """Activate the "待发货" tab unless it is already selected.

        Failures are logged and swallowed so the export step still runs.
        """
        self.log("正在点击“待发货”标签页")
        try:
            tab_selector = ".el-tabs__item:has-text('待发货')"
            tabs = self.page.locator(tab_selector)
            if tabs.count() == 0:
                self.log(" [WARN] 未找到“待发货”Tab")
                return

            # Act on the first match (as the export step does) so several
            # matching elements do not abort the step.
            tab = tabs.first
            # get_attribute may return None when there is no class attribute.
            css_classes = tab.get_attribute("class") or ""
            if "is-active" in css_classes:
                self.log("“待发货”标签页已经是选中状态")
            else:
                tab.click()
                self.log("已点击“待发货”标签页")
                time.sleep(1)
        except Exception as e:
            self.log(f"筛选操作失败: {e}")

    def _download_excel(self):
        """Open the batch-shipping dialog and download the pending-order export.

        The file is saved as ``data/shipping_order_<ts_str>.xls``. All failures
        are logged and swallowed.
        """
        try:
            self.log("准备触发批量发货...")
            # 1. Click the batch-shipping button (located by its text).
            batch_ship_btn = self.page.locator("button:has-text('批量发货')")
            if batch_ship_btn.count() == 0:
                self.log("[WARN] 未找到 '批量发货' 按钮")
                return
            batch_ship_btn.first.click()
            self.log("已点击 '批量发货' 按钮,等待弹窗加载...")
            time.sleep(2)  # let the dialog and its buttons render

            # 2. Click the export button and capture the resulting download.
            self.log("尝试寻找并点击 '导出待发货订单' 按钮...")
            export_btn = self.page.locator("button:has-text('导出待发货订单')")
            if export_btn.count() == 0:
                self.log("[ERROR] 弹窗中未找到 '导出待发货订单' 按钮,可能是因为无待发货订单或者页面结构变更")
                # Close the dialog with ESC so it does not block the page.
                self.page.keyboard.press("Escape")
                return

            self.log("开始监听文件下载...")
            with self.page.expect_download(timeout=60000) as download_info:
                export_btn.first.click()
                self.log("已点击 '导出待发货订单'")

            download = download_info.value
            # Save the file into the data directory.
            file_name = f"shipping_order_{self.ts_str}.xls"
            save_path = os.path.join("data", file_name)
            self.log(f"正在保存文件...")
            download.save_as(save_path)
            self.log(f"✅ 文件下载成功: {save_path}")

            # Close the dialog with ESC.
            time.sleep(1)
            self.page.keyboard.press("Escape")
        except Exception as e:
            self.log(f"执行批量导出失败: {e}")


if __name__ == "__main__":
    # Script entry point: build the collector and run the full flow once.
    SCBankCollector().run()