Completely drop huggingface and switch to modelscope
parent 3abfe72e19
commit e9b12df3a7
@@ -1,8 +1,7 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
+from modelscope import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import (
-    BertTokenizer,
-    BertForSequenceClassification,
     Trainer,
     TrainingArguments,
     EarlyStoppingCallback
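The new imports pull AutoTokenizer and AutoModelForSequenceClassification from modelscope instead of transformers, so the bert-base-chinese checkpoint is resolved on the ModelScope Hub rather than Hugging Face. A minimal sketch of fetching the weights up front, assuming a recent modelscope release that exposes snapshot_download at the top level:

from modelscope import AutoTokenizer, AutoModelForSequenceClassification, snapshot_download

# Optionally pre-download the checkpoint from the ModelScope Hub (from_pretrained would also trigger the download).
local_dir = snapshot_download("google-bert/bert-base-chinese")

tokenizer = AutoTokenizer.from_pretrained(local_dir)
model = AutoModelForSequenceClassification.from_pretrained(local_dir, num_labels=10)  # num_labels=10 is illustrative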
@@ -15,10 +14,9 @@ import re  # for regex-based cleaning
 from sklearn.preprocessing import LabelEncoder
 import joblib

-# 1. Parameter configuration (managed in one place)
+# 1. Parameter configuration
 class Config:
     MODEL_NAME = "bert-base-chinese"
-    Train_CSV = "order_address.csv"
     MAX_LENGTH = 64
     BATCH_SIZE = 32
     NUM_EPOCHS = 5
@@ -32,7 +30,7 @@ class Config:
     DEVICE = "cuda" if FP16 else "cpu"


-# 2. Data loading and preprocessing (with exception handling and logging)
+# 2. Data loading and preprocessing
 def load_data(file_path):
     try:
         df = pd.read_csv(file_path)
@@ -47,17 +45,15 @@ def load_data(file_path):
 # Added: data-cleaning function - keep only Chinese characters
 def clean_chinese_text(text):
     """
-    Clean the text, keeping Chinese characters only
+    Clean the text, keeping Chinese characters, English letters and digits; strip spaces and special symbols
     """
     if not isinstance(text, str):
         return ""
-    # Use a regex that matches all Chinese characters (including Chinese punctuation): [^\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]
-    # For a stricter match that keeps Han characters only, use: [\u4e00-\u9fa5]
-    cleaned_text = re.sub(r'[^\u4e00-\u9fa5]', '', text)
+    # Keep Chinese (\u4e00-\u9fa5), English letters (a-zA-Z) and digits (0-9); drop everything else (spaces, punctuation, etc.)
+    cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text)
     return cleaned_text.strip()

-
-# 3. Optimized Dataset (with in-memory caching and batch support)
+# 3. Dataset
 class TextDataset(Dataset):
     def __init__(self, dataframe, tokenizer, text_col="sentence", label_col="label"):
         self.data = dataframe
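The practical effect of the regex change shows up on mixed-script input. Using "iPhone 15" (one of the old test samples in this file) and one of the new address samples, the two patterns behave as follows:

import re

text = "iPhone 15"
old = re.sub(r'[^\u4e00-\u9fa5]', '', text)           # '' - every non-Chinese character is dropped
new = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text)  # 'iPhone15' - letters and digits survive, the space is removed

addr = "山东省济南市莱芜区碧桂园天樾422502"
old_addr = re.sub(r'[^\u4e00-\u9fa5]', '', addr)           # the unit number '422502' is stripped
new_addr = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', addr)  # the address stays intact, digits included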
@@ -86,24 +82,20 @@ class TextDataset(Dataset):
         }


-# 4. Model initialization (with device placement)
+# 4. Model initialization
 def init_model(num_labels):
-    tokenizer = BertTokenizer.from_pretrained(Config.MODEL_NAME)
-    model = BertForSequenceClassification.from_pretrained(
-        Config.MODEL_NAME,
-        num_labels=num_labels,
-        ignore_mismatched_sizes=True  # optional
-    ).to(Config.DEVICE)
+    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
+    model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-chinese", num_labels=num_labels)
     return tokenizer, model


-# 5. Training configuration (with early stopping and gradient accumulation)
+# 5. Training configuration
 def get_training_args():
     return TrainingArguments(
-        output_dir=Config.OUTPUT_DIR,
-        num_train_epochs=Config.NUM_EPOCHS,
-        per_device_train_batch_size=Config.BATCH_SIZE,
-        per_device_eval_batch_size=Config.BATCH_SIZE * 2,  # evaluation can use a larger batch
+        output_dir=Config.OUTPUT_DIR,  # output directory
+        num_train_epochs=Config.NUM_EPOCHS,  # number of epochs; moderate training improves accuracy, but too many epochs can hurt it because of noise (mislabeled data) in the training set - remedies: regularization, early stopping, data augmentation; also beware exploding gradients
+        per_device_train_batch_size=Config.BATCH_SIZE,  # samples per forward pass on each device; e.g. with per_device_train_batch_size=32 and 2 GPUs, each GPU independently processes 32 samples, and total_batch_size = per_device_train_batch_size x number of GPUs x gradient_accumulation_steps
+        per_device_eval_batch_size=Config.BATCH_SIZE * 2,
         learning_rate=Config.LEARNING_RATE,
         warmup_steps=Config.WARMUP_STEPS,
         weight_decay=Config.WEIGHT_DECAY,
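Plugging this script's own values into the total-batch-size formula from the comment above (Config.BATCH_SIZE = 32, gradient_accumulation_steps = 2; a single GPU is an assumption, since the hardware is not specified in the diff):

per_device_train_batch_size = 32  # Config.BATCH_SIZE
num_gpus = 1                      # assumption: one GPU
gradient_accumulation_steps = 2   # as set in get_training_args()

# total_batch_size = per_device_train_batch_size x num_gpus x gradient_accumulation_steps
total_batch_size = per_device_train_batch_size * num_gpus * gradient_accumulation_steps
print(total_batch_size)  # 64 - the optimizer effectively updates on 64 samples per step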
@@ -117,13 +109,13 @@ def get_training_args():
         metric_for_best_model="eval_loss",
         greater_is_better=False,
         fp16=Config.FP16,
-        gradient_accumulation_steps=2,  # simulates a larger batch
+        gradient_accumulation_steps=2,
         report_to="none",  # disable wandb and other reporting
         seed=42
     )


-# 6. Optimized inference function (with batching support)
+# 6. Inference / test function
 @torch.no_grad()
 def batch_predict(texts, model, tokenizer, label_map, top_k=1, batch_size=16):
     model.eval()
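The body of batch_predict lies outside the changed hunks, so only its signature appears in this diff. For orientation, a minimal batched top-k implementation consistent with that signature could look like the sketch below; the max_length of 64 (mirroring Config.MAX_LENGTH) and the id-to-label dict shape of label_map are assumptions:

import torch

@torch.no_grad()
def batch_predict(texts, model, tokenizer, label_map, top_k=1, batch_size=16):
    model.eval()
    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # tokenize the whole mini-batch and move it to the model's device
        enc = tokenizer(batch, padding=True, truncation=True, max_length=64, return_tensors="pt").to(model.device)
        probs = torch.softmax(model(**enc).logits, dim=-1)
        top_p, top_i = probs.topk(top_k, dim=-1)
        for p_row, i_row in zip(top_p, top_i):
            # map each predicted class id back to its label name
            results.append([(label_map[i.item()], p.item()) for p, i in zip(p_row, i_row)])
    return results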
@@ -158,7 +150,7 @@ def batch_predict(texts, model, tokenizer, label_map, top_k=1, batch_size=16):
 # Main flow
 if __name__ == "__main__":
     # 1. Load the data
-    df = load_data(Config.Train_CSV)
+    df = load_data("order_address.csv")

     # 2. Clean the data - keep Chinese only
     print("🧼 开始清洗文本数据...")
@@ -174,26 +166,26 @@ if __name__ == "__main__":
     print(f"标签映射示例: {label_map}")

     # Save the label encoder (used later at inference time)
-    joblib.dump(label_encoder, "cate/label_encoder.pkl")
+    joblib.dump(label_encoder, "label_encoder.pkl")
     print(f"✅ 标签映射完成 | 类别数: {len(label_map)}")

-    # 4. Split the dataset (using the label_id column)
+    # 4. Split the dataset
     train_df, test_df = train_test_split(
-        df, test_size=0.2, random_state=42, stratify=df["label_id"]  # note: label_id is used here
+        df, test_size=0.2, random_state=42, stratify=df["label_id"]
     )

-    # 5. Initialize the model (using the number of numeric labels)
+    # 5. Initialize the model
     num_labels = len(label_map)
     tokenizer, model = init_model(num_labels)

     # 6. Prepare the datasets (using the label_id column)
-    train_dataset = TextDataset(train_df, tokenizer, label_col="label_id")  # specify label_col
+    train_dataset = TextDataset(train_df, tokenizer, label_col="label_id")
     test_dataset = TextDataset(test_df, tokenizer, label_col="label_id")

-    # 7. Training configuration (unchanged)
+    # 7. Training configuration
     training_args = get_training_args()

-    # 8. Trainer (unchanged)
+    # 8. Trainer
     trainer = Trainer(
         model=model,
         args=training_args,
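The construction of label_map itself sits outside these hunks; a common way to derive it from the LabelEncoder that gets dumped above would be the following (this exact construction is an assumption, not shown in the diff):

from sklearn.preprocessing import LabelEncoder
import joblib

label_encoder = LabelEncoder()
label_ids = label_encoder.fit_transform(["food", "electronics", "food"])  # hypothetical labels, only to make the sketch runnable

# id -> original label name, the shape batch_predict needs to decode predictions
label_map = dict(enumerate(label_encoder.classes_))  # {0: 'electronics', 1: 'food'}

joblib.dump(label_encoder, "label_encoder.pkl")  # matches the new save path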
@@ -202,12 +194,12 @@ if __name__ == "__main__":
         callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
     )

-    # 9. Train and save (unchanged)
+    # 9. Train and save
     trainer.train()
     model.save_pretrained(Config.SAVE_DIR)
     tokenizer.save_pretrained(Config.SAVE_DIR)
     # 12. Test inference
-    test_samples = ["不二家棒棒糖", "iPhone 15", "无线鼠标"]
+    test_samples = ["山东省济南市莱芜区碧桂园天樾422502", "广东省广州市花都区狮岭镇山前旅游大道18号机车检修段", "江苏省苏州市吴中区吴中区木渎镇枫瑞路85号诺德·长枫雅苑北区10栋-303"]
     # Clean the test samples first
     cleaned_samples = [clean_chinese_text(s) for s in test_samples]
     predictions = batch_predict(cleaned_samples, model, tokenizer, label_map)
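Because the tokenizer, model and label encoder are all persisted above, a later inference-only script can restore them roughly as follows. This is a sketch: the literal save directory stands in for Config.SAVE_DIR, the training script is assumed to be importable as train, and loading a local directory through modelscope's Auto classes is assumed to behave like the transformers equivalents:

import joblib
from modelscope import AutoTokenizer, AutoModelForSequenceClassification
from train import clean_chinese_text, batch_predict  # assumption: the training script's module name

save_dir = "saved_model"  # assumption: whatever Config.SAVE_DIR points at
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)

label_encoder = joblib.load("label_encoder.pkl")
label_map = dict(enumerate(label_encoder.classes_))

samples = [clean_chinese_text("广东省广州市花都区狮岭镇山前旅游大道18号机车检修段")]
print(batch_predict(samples, model, tokenizer, label_map))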