diff --git a/bert_train.py b/bert_train.py
index 8f0c653..fc52751 100644
--- a/bert_train.py
+++ b/bert_train.py
@@ -1,8 +1,7 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
+from modelscope import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import (
-    BertTokenizer,
-    BertForSequenceClassification,
     Trainer,
     TrainingArguments,
     EarlyStoppingCallback
@@ -15,10 +14,9 @@ import re  # for regex-based text cleaning
 from sklearn.preprocessing import LabelEncoder
 import joblib
 
-# 1. Parameter configuration (centralized)
+# 1. Parameter configuration
 class Config:
     MODEL_NAME = "bert-base-chinese"
-    Train_CSV = "order_address.csv"
     MAX_LENGTH = 64
     BATCH_SIZE = 32
     NUM_EPOCHS = 5
@@ -32,7 +30,7 @@ class Config:
     DEVICE = "cuda" if FP16 else "cpu"
 
 
-# 2. Data loading and preprocessing (with exception handling and logging)
+# 2. Data loading and preprocessing
 def load_data(file_path):
     try:
         df = pd.read_csv(file_path)
@@ -47,17 +45,15 @@ def load_data(file_path):
 # New: data cleaning function - keep Chinese characters only
 def clean_chinese_text(text):
     """
-    Clean text, keeping Chinese characters only
+    Clean text, keeping Chinese characters, English letters and digits; strip spaces and special symbols
     """
     if not isinstance(text, str):
         return ""
-    # Regex matching all Chinese characters (including Chinese punctuation): [^\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]
-    # To keep Han characters only, use: [\u4e00-\u9fa5]
-    cleaned_text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
+    # Keep Chinese (\u4e00-\u9fa5), English letters (a-zA-Z) and digits (0-9); drop everything else (spaces, punctuation, etc.)
+    cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text)
     return cleaned_text.strip()
 
-
-# 3. Optimized Dataset (with in-memory caching and batch support)
+# 3. Dataset
 class TextDataset(Dataset):
     def __init__(self, dataframe, tokenizer, text_col="sentence", label_col="label"):
         self.data = dataframe
@@ -86,24 +82,20 @@ class TextDataset(Dataset):
         }
 
 
-# 4. Model initialization (with device placement)
+# 4. Model initialization
 def init_model(num_labels):
-    tokenizer = BertTokenizer.from_pretrained(Config.MODEL_NAME)
-    model = BertForSequenceClassification.from_pretrained(
-        Config.MODEL_NAME,
-        num_labels=num_labels,
-        ignore_mismatched_sizes=True  # optional
-    ).to(Config.DEVICE)
+    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
+    model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-chinese", num_labels=num_labels)
     return tokenizer, model
 
 
-# 5. Training configuration (with early stopping and gradient accumulation)
+# 5. Training configuration
 def get_training_args():
     return TrainingArguments(
-        output_dir=Config.OUTPUT_DIR,
-        num_train_epochs=Config.NUM_EPOCHS,
-        per_device_train_batch_size=Config.BATCH_SIZE,
-        per_device_eval_batch_size=Config.BATCH_SIZE * 2,  # a larger batch is fine for evaluation
+        output_dir=Config.OUTPUT_DIR,  # output directory
+        num_train_epochs=Config.NUM_EPOCHS,  # number of training epochs; moderate training improves accuracy, but too many epochs can fit the noise (mislabeled data) in the training set and hurt accuracy; mitigations: regularization, early stopping, data augmentation; also watch for exploding gradients
+        per_device_train_batch_size=Config.BATCH_SIZE,  # samples processed per forward pass per device; e.g. with per_device_train_batch_size=32 and 2 GPUs, each GPU independently processes 32 samples; total_batch_size = per_device_train_batch_size × num_GPUs × gradient_accumulation_steps
+        per_device_eval_batch_size=Config.BATCH_SIZE * 2,
        learning_rate=Config.LEARNING_RATE,
         warmup_steps=Config.WARMUP_STEPS,
         weight_decay=Config.WEIGHT_DECAY,
@@ -117,13 +109,13 @@ def get_training_args():
         metric_for_best_model="eval_loss",
         greater_is_better=False,
         fp16=Config.FP16,
-        gradient_accumulation_steps=2,  # simulate a larger batch
+        gradient_accumulation_steps=2,
         report_to="none",  # disable wandb and other reporting
         seed=42
     )
 
 
-# 6. Optimized inference function (with batch support)
+# 6. Inference / test function
 @torch.no_grad()
 def batch_predict(texts, model, tokenizer, label_map, top_k=1, batch_size=16):
     model.eval()
@@ -158,7 +150,7 @@ def batch_predict(texts, model, tokenizer, label_map, top_k=1, batch_size=16):
 # Main pipeline
 if __name__ == "__main__":
     # 1. Load data
-    df = load_data(Config.Train_CSV)
+    df = load_data("order_address.csv")
 
     # 2. Clean data - keep Chinese only
     print("🧼 开始清洗文本数据...")
@@ -174,26 +166,26 @@ if __name__ == "__main__":
     print(f"标签映射示例: {label_map}")
 
     # Save the label encoder (for use at inference time)
-    joblib.dump(label_encoder, "cate/label_encoder.pkl")
+    joblib.dump(label_encoder, "label_encoder.pkl")
     print(f"✅ 标签映射完成 | 类别数: {len(label_map)}")
 
-    # 4. Split the dataset (using the label_id column)
+    # 4. Split the dataset
     train_df, test_df = train_test_split(
-        df, test_size=0.2, random_state=42, stratify=df["label_id"]  # note: use label_id here
+        df, test_size=0.2, random_state=42, stratify=df["label_id"]
     )
 
-    # 5. Initialize the model (using the number of numeric labels)
+    # 5. Initialize the model
     num_labels = len(label_map)
     tokenizer, model = init_model(num_labels)
 
     # 6. Prepare datasets (using the label_id column)
-    train_dataset = TextDataset(train_df, tokenizer, label_col="label_id")  # specify label_col
+    train_dataset = TextDataset(train_df, tokenizer, label_col="label_id")
     test_dataset = TextDataset(test_df, tokenizer, label_col="label_id")
 
-    # 7. Training configuration (unchanged)
+    # 7. Training configuration
     training_args = get_training_args()
 
-    # 8. Trainer (unchanged)
+    # 8. Trainer
     trainer = Trainer(
         model=model,
         args=training_args,
@@ -202,12 +194,12 @@ if __name__ == "__main__":
         callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
     )
 
-    # 9. Train and save (unchanged)
+    # 9. Train and save
     trainer.train()
     model.save_pretrained(Config.SAVE_DIR)
     tokenizer.save_pretrained(Config.SAVE_DIR)
     # 12. Test inference
-    test_samples = ["不二家棒棒糖", "iPhone 15", "无线鼠标"]
+    test_samples = ["山东省济南市莱芜区碧桂园天樾422502", "广东省广州市花都区狮岭镇山前旅游大道18号机车检修段", "江苏省苏州市吴中区吴中区木渎镇枫瑞路85号诺德·长枫雅苑北区10栋-303"]
     # Clean the test samples first
     cleaned_samples = [clean_chinese_text(s) for s in test_samples]
     predictions = batch_predict(cleaned_samples, model, tokenizer, label_map)
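
After training, this patch leaves three artifacts on disk: the fine-tuned model and tokenizer under Config.SAVE_DIR, and the fitted LabelEncoder in label_encoder.pkl. The sketch below is a minimal standalone inference script showing how those artifacts are meant to be used together; it is not part of the patch. It assumes the model was saved to a local directory "./saved_model" (the real Config.SAVE_DIR value is not shown in this diff), that the checkpoint is loaded back through transformers' AutoClasses (modelscope's mirrored AutoClasses should behave the same), and it reuses the patch's cleaning regex and MAX_LENGTH = 64.

import re

import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

SAVE_DIR = "./saved_model"  # assumption: substitute the actual Config.SAVE_DIR
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def clean_text(text):
    # Same rule as clean_chinese_text(): keep Chinese, English letters and digits only
    return re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text) if isinstance(text, str) else ""


tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
model = AutoModelForSequenceClassification.from_pretrained(SAVE_DIR).to(DEVICE).eval()
label_encoder = joblib.load("label_encoder.pkl")  # saved by the training script


@torch.no_grad()
def predict(addresses):
    texts = [clean_text(t) for t in addresses]
    enc = tokenizer(texts, padding=True, truncation=True, max_length=64, return_tensors="pt").to(DEVICE)
    probs = torch.softmax(model(**enc).logits, dim=-1)
    ids = probs.argmax(dim=-1).cpu().numpy()
    # inverse_transform maps the numeric label_id back to the original string label
    return list(zip(label_encoder.inverse_transform(ids), probs.max(dim=-1).values.cpu().tolist()))


print(predict(["山东省济南市莱芜区碧桂园天樾422502"]))

On the training side, note that with per_device_train_batch_size=32 and gradient_accumulation_steps=2 as configured in this patch, the effective batch size is 32 × number of GPUs × 2, i.e. 64 on a single GPU.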