Completely drop huggingface and switch to modelscope

This commit is contained in:
renzhiyuan 2025-10-13 16:25:30 +08:00
parent 3abfe72e19
commit e9b12df3a7
1 changed file with 27 additions and 35 deletions


@@ -1,8 +1,7 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
+from modelscope import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import (
-    BertTokenizer,
-    BertForSequenceClassification,
     Trainer,
     TrainingArguments,
     EarlyStoppingCallback
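
Note on the swap: modelscope's Auto* classes mirror the transformers API but download weights from the ModelScope hub rather than the Hugging Face hub, and they return ordinary transformers objects, so the rest of the script (Trainer, save_pretrained) is unaffected. A minimal sketch of the new loading path, assuming the mirror id google-bert/bert-base-chinese resolves on ModelScope as this commit expects (num_labels=5 is a placeholder):

    # Load through modelscope, then use like any transformers model/tokenizer.
    from modelscope import AutoTokenizer, AutoModelForSequenceClassification

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
    model = AutoModelForSequenceClassification.from_pretrained(
        "google-bert/bert-base-chinese", num_labels=5  # placeholder label count
    )
    enc = tokenizer("山东省济南市莱芜区", return_tensors="pt", truncation=True, max_length=64)
    print(model(**enc).logits.shape)  # torch.Size([1, 5])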
@@ -15,10 +14,9 @@ import re  # for regex-based cleaning
 from sklearn.preprocessing import LabelEncoder
 import joblib

-# 1. Parameter configuration (centralized)
+# 1. Parameter configuration
 class Config:
     MODEL_NAME = "bert-base-chinese"
-    Train_CSV = "order_address.csv"
     MAX_LENGTH = 64
     BATCH_SIZE = 32
     NUM_EPOCHS = 5
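
The rest of Config is outside the hunk, but later lines reference LEARNING_RATE, WARMUP_STEPS, WEIGHT_DECAY, FP16, OUTPUT_DIR and SAVE_DIR. A plausible reconstruction for orientation only; every value below is an assumption, not taken from the commit:

    import torch

    class Config:
        MODEL_NAME = "bert-base-chinese"
        MAX_LENGTH = 64
        BATCH_SIZE = 32
        NUM_EPOCHS = 5
        LEARNING_RATE = 2e-5        # assumed: a common BERT fine-tuning default
        WARMUP_STEPS = 500          # assumed
        WEIGHT_DECAY = 0.01         # assumed
        OUTPUT_DIR = "./results"    # assumed
        SAVE_DIR = "./saved_model"  # assumed
        FP16 = torch.cuda.is_available()  # assumed; the visible code derives DEVICE from FP16
        DEVICE = "cuda" if FP16 else "cpu"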
@@ -32,7 +30,7 @@ class Config:
     DEVICE = "cuda" if FP16 else "cpu"

-# 2. Data loading & preprocessing (with error handling and logging)
+# 2. Data loading & preprocessing
 def load_data(file_path):
     try:
         df = pd.read_csv(file_path)
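
load_data is cut off at the hunk boundary; a minimal sketch consistent with the visible try/pd.read_csv and with the column names the script assumes elsewhere ("sentence" and "label", per the TextDataset defaults) — the dropna and error handling are assumptions:

    import pandas as pd

    def load_data(file_path):
        try:
            df = pd.read_csv(file_path)
            df = df.dropna(subset=["sentence", "label"])  # assumed cleanup step
            print(f"✅ Loaded {len(df)} rows from {file_path}")
            return df
        except FileNotFoundError:
            raise SystemExit(f"❌ Data file not found: {file_path}")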
@@ -47,17 +45,15 @@ def load_data(file_path):
 # New: data-cleaning function - keep Chinese characters only
 def clean_chinese_text(text):
     """
-    Clean the text, keeping Chinese characters only
+    Clean the text, keeping Chinese characters, English letters and digits; strip whitespace and special symbols
     """
     if not isinstance(text, str):
         return ""
-    # Regex matching all Chinese characters (incl. Chinese punctuation): [^\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]
-    # For a stricter Han-only filter, use: [\u4e00-\u9fa5]
-    cleaned_text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
+    # Keep Chinese (\u4e00-\u9fa5), English letters (a-zA-Z) and digits (0-9); strip everything else (whitespace, punctuation, ...)
+    cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text)
     return cleaned_text.strip()

-# 3. Optimized Dataset (with in-memory caching and batch support)
+# 3. Dataset
 class TextDataset(Dataset):
     def __init__(self, dataframe, tokenizer, text_col="sentence", label_col="label"):
         self.data = dataframe
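
The widened character class keeps Han characters, ASCII letters and digits, so house/unit numbers now survive cleaning instead of being stripped along with the punctuation:

    >>> clean_chinese_text("山东省济南市莱芜区 碧桂园天樾 4-2-2502")
    '山东省济南市莱芜区碧桂园天樾422502'
    >>> clean_chinese_text(None)  # non-strings come back empty
    ''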
@@ -86,24 +82,20 @@ class TextDataset(Dataset):
         }

-# 4. Model initialization (with device move)
+# 4. Model initialization
 def init_model(num_labels):
-    tokenizer = BertTokenizer.from_pretrained(Config.MODEL_NAME)
-    model = BertForSequenceClassification.from_pretrained(
-        Config.MODEL_NAME,
-        num_labels=num_labels,
-        ignore_mismatched_sizes=True  # optional
-    ).to(Config.DEVICE)
+    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
+    model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-chinese", num_labels=num_labels)
     return tokenizer, model

-# 5. Training configuration (with early stopping and gradient accumulation)
+# 5. Training configuration
 def get_training_args():
     return TrainingArguments(
-        output_dir=Config.OUTPUT_DIR,
-        num_train_epochs=Config.NUM_EPOCHS,
-        per_device_train_batch_size=Config.BATCH_SIZE,
-        per_device_eval_batch_size=Config.BATCH_SIZE * 2,  # eval can use a larger batch
+        output_dir=Config.OUTPUT_DIR,  # output directory
+        num_train_epochs=Config.NUM_EPOCHS,  # number of epochs; moderate training raises accuracy, but overtraining can lower it again via noise (mislabeled samples) in the data; remedies: regularization, early stopping, data augmentation; also watch for exploding gradients
+        per_device_train_batch_size=Config.BATCH_SIZE,  # samples per forward pass per device; e.g. with per_device_train_batch_size=32 on 2 GPUs, each GPU processes 32 samples independently; total_batch_size = per_device_train_batch_size × num_GPUs × gradient_accumulation_steps
+        per_device_eval_batch_size=Config.BATCH_SIZE * 2,
         learning_rate=Config.LEARNING_RATE,
         warmup_steps=Config.WARMUP_STEPS,
         weight_decay=Config.WEIGHT_DECAY,
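
Two notes on this hunk. The new init_model no longer calls .to(Config.DEVICE); that should be safe because transformers' Trainer moves the model to its selected device during initialization. And the total-batch-size formula from the new comment works out as below (the GPU count of 2 is illustrative only):

    per_device_train_batch_size = 32  # Config.BATCH_SIZE
    num_gpus = 2                      # illustrative
    gradient_accumulation_steps = 2   # set further down in get_training_args()
    total = per_device_train_batch_size * num_gpus * gradient_accumulation_steps
    print(total)  # 128 samples contribute to each optimizer step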
@@ -117,13 +109,13 @@ def get_training_args():
         metric_for_best_model="eval_loss",
         greater_is_better=False,
         fp16=Config.FP16,
-        gradient_accumulation_steps=2,  # simulates a larger batch
+        gradient_accumulation_steps=2,
         report_to="none",  # disable wandb and other reporting
         seed=42
     )

-# 6. Optimized inference function (with batch support)
+# 6. Inference / test function
 @torch.no_grad()
 def batch_predict(texts, model, tokenizer, label_map, top_k=1, batch_size=16):
     model.eval()
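
batch_predict's body is also cut off by the hunk boundary. A sketch of what a batched top-k prediction loop of this shape typically looks like; the tokenizer kwargs and the assumption that label_map maps integer ids to label strings are mine, not the commit's:

    import torch

    @torch.no_grad()
    def batch_predict(texts, model, tokenizer, label_map, top_k=1, batch_size=16):
        model.eval()
        device = next(model.parameters()).device
        results = []
        for i in range(0, len(texts), batch_size):
            enc = tokenizer(texts[i:i + batch_size], padding=True, truncation=True,
                            max_length=Config.MAX_LENGTH, return_tensors="pt").to(device)
            probs = torch.softmax(model(**enc).logits, dim=-1)
            top_p, top_i = probs.topk(top_k, dim=-1)
            for p_row, i_row in zip(top_p.tolist(), top_i.tolist()):
                results.append([(label_map[idx], round(p, 4)) for idx, p in zip(i_row, p_row)])
        return results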
@@ -158,7 +150,7 @@ def batch_predict(texts, model, tokenizer, label_map, top_k=1, batch_size=16):
 # Main flow
 if __name__ == "__main__":
     # 1. Load data
-    df = load_data(Config.Train_CSV)
+    df = load_data("order_address.csv")

     # 2. Data cleaning - keep Chinese only
     print("🧼 Starting text cleaning...")
@@ -174,26 +166,26 @@ if __name__ == "__main__":
     print(f"Label map sample: {label_map}")

     # Save the label encoder (for use at inference time)
-    joblib.dump(label_encoder, "cate/label_encoder.pkl")
+    joblib.dump(label_encoder, "label_encoder.pkl")
     print(f"✅ Label mapping done | classes: {len(label_map)}")

-    # 4. Split the dataset (using the label_id column)
+    # 4. Split the dataset
     train_df, test_df = train_test_split(
-        df, test_size=0.2, random_state=42, stratify=df["label_id"]  # note: label_id here
+        df, test_size=0.2, random_state=42, stratify=df["label_id"]
     )

-    # 5. Initialize the model (with the number of numeric labels)
+    # 5. Initialize the model
     num_labels = len(label_map)
     tokenizer, model = init_model(num_labels)

     # 6. Prepare the datasets (using the label_id column)
-    train_dataset = TextDataset(train_df, tokenizer, label_col="label_id")  # specify label_col
+    train_dataset = TextDataset(train_df, tokenizer, label_col="label_id")
     test_dataset = TextDataset(test_df, tokenizer, label_col="label_id")

-    # 7. Training configuration (unchanged)
+    # 7. Training configuration
     training_args = get_training_args()

-    # 8. Trainer (unchanged)
+    # 8. Trainer
     trainer = Trainer(
         model=model,
         args=training_args,
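
The label-encoding step that produces label_id and label_map sits between hunks; a reconstruction consistent with the LabelEncoder import, the joblib.dump call and the label_map lookups (the exact variable shapes are assumptions):

    label_encoder = LabelEncoder()
    df["label_id"] = label_encoder.fit_transform(df["label"])  # string labels -> 0..N-1
    label_map = dict(enumerate(label_encoder.classes_))        # id -> original label string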
@@ -202,12 +194,12 @@ if __name__ == "__main__":
         callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
     )

-    # 9. Train and save (unchanged)
+    # 9. Train and save
     trainer.train()
     model.save_pretrained(Config.SAVE_DIR)
     tokenizer.save_pretrained(Config.SAVE_DIR)

     # 12. Test inference
-    test_samples = ["不二家棒棒糖", "iPhone 15", "无线鼠标"]
+    test_samples = ["山东省济南市莱芜区碧桂园天樾422502", "广东省广州市花都区狮岭镇山前旅游大道18号机车检修段", "江苏省苏州市吴中区吴中区木渎镇枫瑞路85号诺德·长枫雅苑北区10栋-303"]

     # Clean the test samples first
     cleaned_samples = [clean_chinese_text(s) for s in test_samples]
     predictions = batch_predict(cleaned_samples, model, tokenizer, label_map)
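
Because save_pretrained writes a standard checkpoint directory and the LabelEncoder is pickled separately, a later inference script can restore everything from local disk without touching either hub. A sketch, with paths taken from the code above:

    import joblib
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    model = AutoModelForSequenceClassification.from_pretrained(Config.SAVE_DIR)
    tokenizer = AutoTokenizer.from_pretrained(Config.SAVE_DIR)
    label_encoder = joblib.load("label_encoder.pkl")
    label_map = dict(enumerate(label_encoder.classes_))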