移除无需文件
This commit is contained in:
parent
6d0cd9834c
commit
219cb8ae37
|
|
@ -3,4 +3,5 @@ data/
|
||||||
output/
|
output/
|
||||||
.trae/
|
.trae/
|
||||||
实物系统API文档.pdf
|
实物系统API文档.pdf
|
||||||
*.pyc
|
*.pyc
|
||||||
|
__pycache__/
|
||||||
1872
order_data.json
1872
order_data.json
File diff suppressed because it is too large
Load Diff
|
|
@ -1,47 +0,0 @@
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
def process_jsonl_to_json(input_path: str, output_path: str) -> None:
    """Aggregate order records from a JSONL capture into a single JSON file.

    Every non-blank line of ``input_path`` is parsed as JSON and its ``body``
    value is inspected. A body that has a ``pageDataList`` key contributes
    each order in that list; any other body is treated as one order. Orders
    are stored under their ``orderNo``; a later record with the same
    ``orderNo`` replaces the earlier one, and records without a truthy
    ``orderNo`` are dropped.

    Lines that are not valid JSON are reported and skipped. Lines whose
    payload, ``body``, ``pageDataList`` or list entries do not have the
    expected object/list shape are skipped instead of aborting the run.

    Args:
        input_path: Path of the UTF-8 JSONL file to read.
        output_path: Path of the UTF-8 JSON file to write (overwritten).
    """
    order_dict = {}

    with open(input_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            # Keep the try body limited to the only call that can raise
            # JSONDecodeError.
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"第{line_num}行解析失败: {e}")
                continue

            # Valid JSON is not necessarily an object with an object body
            # (e.g. `{"body": null}` or a top-level array).
            body = data.get('body') if isinstance(data, dict) else None
            if not isinstance(body, dict):
                continue

            # Two body layouts: a list of orders under pageDataList, or the
            # body itself being one order.
            if 'pageDataList' in body:
                candidates = body['pageDataList']
                if not isinstance(candidates, list):
                    continue
            else:
                candidates = [body]

            for order in candidates:
                if not isinstance(order, dict):
                    continue
                order_no = order.get('orderNo')
                if order_no:
                    order_dict[order_no] = order

    # Write the aggregated mapping as pretty-printed JSON.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(order_dict, f, ensure_ascii=False, indent=2)

    print(f"处理完成!共提取{len(order_dict)}个订单,已保存到{output_path}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    import sys

    # Default locations, used when no command-line arguments are given.
    input_file = "/mnt/d/code/project/python/scbank/scbank_data.jsonl"
    output_file = "/mnt/d/code/project/python/scbank/order_data.json"

    # Optional positional overrides: [input_jsonl [output_json]].
    cli_args = sys.argv[1:]
    if len(cli_args) >= 1:
        input_file = cli_args[0]
    if len(cli_args) >= 2:
        output_file = cli_args[1]

    process_jsonl_to_json(input_file, output_file)
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
||||||
import time
|
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
|
|
||||||
# URL of the delivery-orders page; run() passes it to page.goto().
TARGET_URL = "https://jf.scbank.cn:8085/#/orderManagement/deliveryOrders"
|
|
||||||
|
|
||||||
def run():
    """Open the order page, wait for a manual login, then dump the page.

    Launches a headed Chromium instance, navigates to ``TARGET_URL`` and
    blocks (no timeout) until the page URL matches ``**/homePage``. After a
    fixed 10-second pause it writes ``page.content()`` to
    ``scbank_page.html`` and a full-page screenshot to ``scbank_page.png``
    in the current working directory.

    Failures while opening the page or while capturing are printed rather
    than raised, and the browser is closed in every case.
    """
    print("[系统] 启动页面结构分析工具...")

    with sync_playwright() as p:
        # 1. Launch the browser. Headed, because the login is done by hand.
        # NOTE(review): the Blink flag presumably makes the session look less
        # like an automated one - confirm it is still required.
        browser = p.chromium.launch(
            headless=False,
            args=["--disable-blink-features=AutomationControlled"]
        )
        try:
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = context.new_page()

            # 2. Open the page. A navigation error is not fatal because the
            # operator can still drive the visible browser, but it is
            # reported instead of being silently swallowed, and
            # KeyboardInterrupt is no longer caught here.
            print("[操作] 正在打开页面,请手动登录...")
            try:
                page.goto(TARGET_URL)
            except Exception as e:
                print(f"[警告] 打开页面时出错,可在浏览器中手动重试: {e}")

            print(">>> 请在浏览器中完成登录操作 <<<")
            print(">>> 登录成功并看到订单列表后,脚本将自动保存页面结构供分析 <<<")

            try:
                # Reaching the home page is treated as a successful login;
                # timeout=0 disables the timeout so the operator can take as
                # long as needed.
                page.wait_for_url("**/homePage", timeout=0)
                print("[状态] 检测到登录成功!")

                # Fixed pause to give the table time to render.
                print("[状态] 等待表格渲染 (10秒)...")
                time.sleep(10)

                # 3. Save the page markup.
                print("[分析] 正在保存页面 HTML...")
                html = page.content()
                with open("scbank_page.html", "w", encoding="utf-8") as f:
                    f.write(html)
                print("[成功] HTML 已保存至 scbank_page.html")

                # 4. Save a screenshot to help locate elements visually.
                print("[分析] 正在保存页面截图...")
                page.screenshot(path="scbank_page.png", full_page=True)
                print("[成功] 截图已保存至 scbank_page.png")

            except Exception as e:
                print(f"[错误] 分析过程出错: {e}")
        finally:
            # Always release the browser, even on an unexpected error.
            browser.close()
|
|
||||||
|
|
||||||
# Start the page-structure capture only when executed as a script, not on import.
if __name__ == "__main__":
    run()
|
|
||||||
214
scbank_hook.py
214
scbank_hook.py
|
|
@ -1,214 +0,0 @@
|
||||||
import json
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
|
|
||||||
# 目标网址
|
|
||||||
TARGET_URL = "https://jf.scbank.cn:8085/#/orderManagement/deliveryOrders"
|
|
||||||
# File that run()'s console handler appends captured payloads to, one JSON document per line.
OUTPUT_FILE = "scbank_data.jsonl"
|
|
||||||
|
|
||||||
def process_details(page):
    """Click every visible "详情" (details) button on the current page.

    The buttons are matched with the selector
    ``button.el-button--text:has-text('详情')``. The function first waits,
    with the timeout disabled, for at least one match, then clicks each
    visible match in turn, pauses a random 1.5-2.5 seconds, presses Escape
    and pauses another 0.5 seconds.

    A failure on one button is printed and the remaining buttons are still
    processed; a failure while waiting for or listing the buttons is printed
    and ends the call. Nothing is returned.

    Args:
        page: Page-like object providing ``wait_for_selector``, ``locator``
            and ``keyboard.press``.
    """
    detail_selector = "button.el-button--text:has-text('详情')"

    def _open_and_dismiss(button):
        # Open one detail view, give it time to load, then dismiss it with
        # Escape. Errors are reported per button so one bad row does not
        # stop the rest.
        try:
            button.click()
            time.sleep(random.uniform(1.5, 2.5))
            page.keyboard.press("Escape")
            time.sleep(0.5)
        except Exception as e:
            print(f" [警告] 详情操作失败: {e}")

    try:
        # timeout=0 disables the timeout: this step waits on the operator.
        page.wait_for_selector(detail_selector, timeout=0)

        # Only visible matches are clicked; the original author notes that
        # the table may render duplicate copies of the buttons.
        visible_buttons = []
        for candidate in page.locator(detail_selector).all():
            if candidate.is_visible():
                visible_buttons.append(candidate)

        print(f"[当前页] 发现 {len(visible_buttons)} 个可见详情按钮")

        for button in visible_buttons:
            _open_and_dismiss(button)

    except Exception as e:
        print(f"[提示] 当前页无详情按钮或加载超时: {e}")
|
|
||||||
|
|
||||||
def run():
    """Capture order list/detail payloads from the delivery-orders page.

    Steps:
      1. Launch a headed Chromium instance and install an init script that
         wraps ``JSON.parse`` in the page. Parsed objects that look like a
         list response or an order detail are echoed to the browser console
         with an ``__INTERCEPTED__:`` prefix.
      2. A console listener appends each intercepted payload to
         ``OUTPUT_FILE`` as one JSON document per line.
      3. Wait (no timeout) for a manual login, detected by the URL matching
         ``**/homePage``, then open ``TARGET_URL`` and select the "待发货" tab.
      4. For each result page call ``process_details`` and click
         ``.btn-next`` until that button is missing or disabled.

    Login/initialisation failures are printed and the pagination loop is
    still attempted. The browser is closed in every case.
    """
    print("[系统] 启动精准采集脚本...")
    print(f"[系统] 数据将保存至: {OUTPUT_FILE}")

    # Marker the injected script puts in front of every captured payload.
    intercept_prefix = "__INTERCEPTED__:"

    with sync_playwright() as p:
        # 1. Launch the browser. Headed, because the login is done by hand.
        browser = p.chromium.launch(
            headless=False,
            args=["--disable-blink-features=AutomationControlled"]
        )
        try:
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = context.new_page()

            # =====================================================
            # 2. Hook injection: wrap JSON.parse inside the page
            # =====================================================
            page.add_init_script("""
                const originalParse = JSON.parse;
                JSON.parse = function(text, reviver) {
                    const data = originalParse(text, reviver);
                    try {
                        if (data && typeof data === 'object') {
                            const str = JSON.stringify(data);

                            // 【过滤噪音】排除包含 IsQX 的干扰接口 (通常用于权限检查)
                            if (str.includes('IsQX') || (data.body && data.body.IsQX)) {
                                return data;
                            }

                            // 1. 列表接口 /selectOrderlist -> 通常返回 { rows: [...], total: ... }
                            const isList = (data.rows && Array.isArray(data.rows) && data.rows.length > 0) ||
                                (data.list && Array.isArray(data.list) && data.list.length > 0);

                            // 2. 详情接口 /getorderdetail -> 通常返回 { orderNo: '...', ... }
                            // 为了防止漏掉,只要包含 orderNo 且长度足够大(排除简单的状态返回),都抓取
                            // 一个完整的订单详情通常包含收货人、地址、商品列表等,长度肯定超过 300 字符
                            const isDetail = str.includes('orderNo') && str.length > 300;

                            if (isList || isDetail) {
                                console.log('__INTERCEPTED__:' + str);
                            }
                        }
                    } catch (e) {}
                    return data;
                }
            """)

            # =====================================================
            # 3. Console listener: persist intercepted payloads
            # =====================================================
            def handle_console(msg):
                text = msg.text
                if not text.startswith(intercept_prefix):
                    return

                # Strip only the leading marker. str.replace() would also
                # delete any occurrence of the marker inside the payload.
                json_str = text[len(intercept_prefix):]
                try:
                    data = json.loads(json_str)

                    # Coarse label used only for the progress message.
                    desc = "数据"
                    if 'rows' in data or 'list' in data:
                        desc = "【列表】"
                    elif 'orderNo' in str(data):
                        desc = "【详情】"

                    # Append the payload as one JSON line.
                    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
                        f.write(json.dumps(data, ensure_ascii=False) + "\n")

                    print(f"[捕获]{desc} 长度: {len(str(data))}")
                except Exception as e:
                    # Still best-effort (one bad payload must not stop the
                    # capture), but the failure is reported instead of being
                    # silently dropped.
                    print(f"[警告] 捕获数据处理失败: {e}")

            page.on("console", handle_console)

            # =====================================================
            # 4. Business flow
            # =====================================================
            print("[操作] 正在打开页面,请手动登录...")
            try:
                page.goto(TARGET_URL)
            except Exception as e:
                # Not fatal: the operator can still drive the visible browser.
                print(f"[警告] 打开页面时出错,可在浏览器中手动重试: {e}")

            print(">>> 请在浏览器中完成登录操作 <<<")

            try:
                # Reaching the home page is treated as a successful login;
                # timeout=0 disables the timeout.
                page.wait_for_url("**/homePage", timeout=0)
                print("[状态] 检测到登录成功!")

                # 4.1 Go to the target page unless we are already there.
                if "deliveryOrders" not in page.url:
                    print(f"[跳转] 正在前往目标页面: {TARGET_URL}")
                    page.goto(TARGET_URL)
                    page.wait_for_load_state("domcontentloaded")
                    time.sleep(0.5)  # let the page components render
                else:
                    print("[状态] 已在目标页面")

                # 4.2 Select the "待发货" tab.
                print("[操作] 正在点击“待发货”标签页")
                try:
                    tab_selector = ".el-tabs__item:has-text('待发货')"
                    tab = page.locator(tab_selector)

                    if tab.count() > 0:
                        # get_attribute() returns None when the attribute is
                        # absent; fall back to "" so the membership test
                        # cannot raise.
                        tab_classes = tab.get_attribute("class") or ""
                        if "is-active" in tab_classes:
                            print(" [状态] “待发货”标签页已经是选中状态")
                        else:
                            tab.click()
                            print("[状态] 已点击“待发货”标签页")
                            time.sleep(1)  # wait for the tab switch animation
                    else:
                        print(" [警告] 未找到“待发货”标签页,跳过筛选")

                except Exception as e:
                    print(f" [错误] 筛选操作失败 (非致命,继续尝试抓取): {e}")

            except Exception as e:
                print(f"[错误] 等待登录或初始化失败: {e}")

            # Pagination loop.
            page_num = 1
            while True:
                print(f"\n========== 正在处理第 {page_num} 页 ==========")

                # Open every detail view on this page; the hook records them.
                process_details(page)

                print("[翻页] 检查下一页...")
                next_btn = page.locator(".btn-next")

                # Stop when there is no next-page button at all...
                if next_btn.count() == 0:
                    print("[结束] 未找到下一页按钮 (.btn-next)")
                    break

                # ...or when it is disabled.
                if next_btn.is_disabled():
                    print("[结束] 下一页按钮已禁用,采集完成。")
                    break

                try:
                    next_btn.click()
                    page_num += 1
                    print(f"[操作] 翻页 -> 第 {page_num} 页")
                    time.sleep(3)  # wait for the new page of the list to load
                except Exception as e:
                    print(f"[错误] 翻页失败: {e}")
                    break

            print(f"\n[系统] 任务完成。数据已保存至 {OUTPUT_FILE}")
            time.sleep(5)
        finally:
            # Always release the browser, even on an unexpected error.
            browser.close()
|
|
||||||
|
|
||||||
# Start the capture only when executed as a script, not on import.
if __name__ == "__main__":
    run()
|
|
||||||
179
scbank_page.html
179
scbank_page.html
File diff suppressed because one or more lines are too long
BIN
scbank_page.png
BIN
scbank_page.png
Binary file not shown.
|
Before Width: | Height: | Size: 162 KiB |
Loading…
Reference in New Issue