import json
import random
import time

from playwright.sync_api import sync_playwright

# Page that lists the delivery orders to be scraped.
TARGET_URL = "https://jf.scbank.cn:8085/#/orderManagement/deliveryOrders"
# Captured payloads are appended here, one JSON document per line.
OUTPUT_FILE = "scbank_data.jsonl"

# Marker the injected hook puts in front of every payload it logs to the
# browser console. It must stay in sync with the literal in _HOOK_SCRIPT.
_INTERCEPT_PREFIX = "__INTERCEPTED__:"

# Init script that wraps JSON.parse inside the page. Every parsed object is
# re-serialized; payloads mentioning "IsQX" are skipped, and payloads that
# either carry a non-empty `rows`/`list` array or mention "orderNo" and are
# longer than 300 characters are echoed to the console behind the marker.
# The embedded JS comments are the original author's notes and are left as-is
# because they are part of the injected string.
_HOOK_SCRIPT = """
const originalParse = JSON.parse;
JSON.parse = function(text, reviver) {
    const data = originalParse(text, reviver);
    try {
        if (data && typeof data === 'object') {
            const str = JSON.stringify(data);
            // 【过滤噪音】排除包含 IsQX 的干扰接口 (通常用于权限检查)
            if (str.includes('IsQX') || (data.body && data.body.IsQX)) {
                return data;
            }
            // 1. 列表接口 /selectOrderlist -> 通常返回 { rows: [...], total: ... }
            const isList = (data.rows && Array.isArray(data.rows) && data.rows.length > 0) ||
                           (data.list && Array.isArray(data.list) && data.list.length > 0);
            // 2. 详情接口 /getorderdetail -> 通常返回 { orderNo: '...', ... }
            // 为了防止漏掉,只要包含 orderNo 且长度足够大(排除简单的状态返回),都抓取
            // 一个完整的订单详情通常包含收货人、地址、商品列表等,长度肯定超过 300 字符
            const isDetail = str.includes('orderNo') && str.length > 300;
            if (isList || isDetail) {
                console.log('__INTERCEPTED__:' + str);
            }
        }
    } catch (e) {}
    return data;
}
"""


def process_details(page):
    """Click every visible "详情" (details) button on the current page.

    Each click is followed by a random 1.5-2.5 s pause (the injected
    JSON.parse hook captures the response in the meantime), then Escape is
    pressed to dismiss the dialog. A failure on one button is reported and
    the loop moves on; a failure to find any button is reported and the
    function returns.

    Args:
        page: The Playwright page showing the order list.
    """
    # ElementUI text button whose label contains "详情".
    detail_selector = "button.el-button--text:has-text('详情')"
    try:
        # timeout=0 disables the timeout: the operator may still be
        # interacting with the page, so wait for as long as it takes.
        page.wait_for_selector(detail_selector, timeout=0)

        # Original author's note: an ElementUI table with fixed columns is
        # rendered twice, so only the visible copies of the button are kept.
        visible_buttons = [
            btn for btn in page.locator(detail_selector).all() if btn.is_visible()
        ]
        print(f"[当前页] 发现 {len(visible_buttons)} 个可见详情按钮")

        for btn in visible_buttons:
            try:
                btn.click()
                # Give the detail request time to finish; randomized pacing.
                time.sleep(random.uniform(1.5, 2.5))
                # Dismiss the detail dialog with the Escape key.
                page.keyboard.press("Escape")
                time.sleep(0.5)
            except Exception as e:
                print(f" [警告] 详情操作失败: {e}")
    except Exception as e:
        print(f"[提示] 当前页无详情按钮或加载超时: {e}")


def _describe_payload(data):
    """Return the log label for a captured payload (list / detail / generic)."""
    if isinstance(data, dict) and ("rows" in data or "list" in data):
        return "【列表】"
    if "orderNo" in str(data):
        return "【详情】"
    return "数据"


def _handle_console(msg):
    """Persist a console message emitted by the injected hook.

    Messages that do not start with the intercept marker are ignored. The
    payload is appended to OUTPUT_FILE as one JSON line. Any failure is
    reported and swallowed so that one bad message cannot stop the crawl.
    """
    try:
        text = msg.text
        if not text.startswith(_INTERCEPT_PREFIX):
            return
        # Strip only the leading marker; the payload itself may legitimately
        # contain the same character sequence.
        data = json.loads(text[len(_INTERCEPT_PREFIX):])
        desc = _describe_payload(data)
        with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
            f.write(json.dumps(data, ensure_ascii=False) + "\n")
        print(f"[捕获]{desc} 长度: {len(str(data))}")
    except Exception as e:
        print(f"[警告] 处理拦截数据失败: {e}")


def _select_pending_tab(page):
    """Activate the "待发货" tab unless it is already active (best effort)."""
    print("[操作] 正在点击“待发货”标签页")
    try:
        tabs = page.locator(".el-tabs__item:has-text('待发货')")
        if tabs.count() == 0:
            print(" [警告] 未找到“待发货”标签页,跳过筛选")
            return
        # Narrow to one element so several matches cannot trigger a
        # strict-mode error on the calls below.
        tab = tabs.first
        # get_attribute returns None when the attribute is absent.
        if "is-active" in (tab.get_attribute("class") or ""):
            print(" [状态] “待发货”标签页已经是选中状态")
            return
        tab.click()
        print("[状态] 已点击“待发货”标签页")
        time.sleep(1)  # let the tab switch animation finish
    except Exception as e:
        print(f" [错误] 筛选操作失败 (非致命,继续尝试抓取): {e}")


def _open_target_after_login(page):
    """Wait for the manual login, then open the target page and pick the tab.

    Failures are reported and swallowed; the caller continues regardless,
    as the original script did.
    """
    try:
        # Reaching a URL ending in /homePage is treated as a successful
        # login. timeout=0: wait indefinitely for the human operator.
        page.wait_for_url("**/homePage", timeout=0)
        print("[状态] 检测到登录成功!")

        if "deliveryOrders" not in page.url:
            print(f"[跳转] 正在前往目标页面: {TARGET_URL}")
            page.goto(TARGET_URL)
            page.wait_for_load_state("domcontentloaded")
            time.sleep(0.5)  # let the page components render
        else:
            print("[状态] 已在目标页面")

        _select_pending_tab(page)
    except Exception as e:
        print(f"[错误] 等待登录或初始化失败: {e}")


def _crawl_all_pages(page):
    """Process the details of each list page, following the next-page button."""
    page_num = 1
    while True:
        print(f"\n========== 正在处理第 {page_num} 页 ==========")
        process_details(page)

        print("[翻页] 检查下一页...")
        try:
            # Only visible next-page buttons are considered, and the first
            # one is used, so duplicate or hidden pagination bars cannot
            # cause a strict-mode error or a click on a hidden element.
            candidates = page.locator(".btn-next:visible")
            if candidates.count() == 0:
                print("[结束] 未找到下一页按钮 (.btn-next)")
                break
            next_btn = candidates.first
            if next_btn.is_disabled():
                print("[结束] 下一页按钮已禁用,采集完成。")
                break
            next_btn.click()
            page_num += 1
            print(f"[操作] 翻页 -> 第 {page_num} 页")
            time.sleep(3)  # wait for the next page of the list to load
        except Exception as e:
            print(f"[错误] 翻页失败: {e}")
            break


def run():
    """Run the interactive scraping session.

    Launches a headed Chromium window, installs the JSON.parse hook and the
    console listener, waits for the operator to log in manually, opens the
    delivery-orders page, selects the "待发货" tab and then walks through
    every list page clicking each detail button. Captured payloads are
    appended to OUTPUT_FILE. The browser is closed even if the run fails.
    """
    print("[系统] 启动精准采集脚本...")
    print(f"[系统] 数据将保存至: {OUTPUT_FILE}")

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=False,
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
            )
            page = context.new_page()

            # The hook must be registered before navigation so that it is
            # installed in every document the page loads.
            page.add_init_script(_HOOK_SCRIPT)
            page.on("console", _handle_console)

            print("[操作] 正在打开页面,请手动登录...")
            try:
                page.goto(TARGET_URL)
            except Exception as e:
                # Navigation problems here are not fatal: the operator can
                # still finish the login in the open window.
                print(f"[警告] 打开页面时出错 (继续等待登录): {e}")

            print(">>> 请在浏览器中完成登录操作 <<<")
            _open_target_after_login(page)
            _crawl_all_pages(page)

            print(f"\n[系统] 任务完成。数据已保存至 {OUTPUT_FILE}")
            time.sleep(5)
        finally:
            browser.close()


if __name__ == "__main__":
    run()