Below is a step-by-step walkthrough, with example code, of Chinese sentiment analysis using BERT:
1. Environment Setup
Install the required libraries (note that the PyPI package for scikit-learn is `scikit-learn`, not `sklearn`):
```bash
pip install transformers torch pandas scikit-learn
```
2. Data Preparation
Prepare a CSV data file (example format):
```csv
text,label
"这部电影太精彩了!",1
"糟糕的观影体验,不推荐。",0
...
```
3. Implementation
```python
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import pandas as pd

# Hyperparameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
MODEL_NAME = 'bert-base-chinese'  # Chinese pre-trained model

# Load the data
df = pd.read_csv('sentiment_data.csv')
texts = df.text.values
labels = df.label.values

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create the data loaders
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Load the model
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2  # change to match your number of sentiment classes
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Optimizer (use torch.optim.AdamW; the copy in transformers is deprecated)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{EPOCHS}')
    print(f'Train loss: {avg_train_loss:.4f}')

    # Evaluation after each epoch
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f'Validation Accuracy: {accuracy:.4f}')

# Save the model weights
torch.save(model.state_dict(), 'bert_chinese_sentiment.pth')

# Use the model for prediction
def predict_sentiment(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    probabilities = torch.softmax(outputs.logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return predicted_label, probabilities

# Example prediction
text = "这个产品物超所值,非常满意!"
label, probs = predict_sentiment(text)
print(f"Text: {text}")
print(f"Predicted label: {'Positive' if label == 1 else 'Negative'}")
print(f"Probabilities: {probs.cpu().numpy()}")
```
4. Key Notes
Model choice:
- bert-base-chinese is the official Chinese BERT base model.
- Alternative Chinese models: hfl/chinese-bert-wwm-ext (from HIT) and hfl/chinese-roberta-wwm-ext.

Data handling:
- Chinese BERT tokenizes at the character level, so no extra word segmentation is needed.
- Take care with special symbols and whitespace.
- Set the maximum length according to your data distribution (typically 128-512; a length-inspection sketch follows this list).

Training tips (combined into the refined loop sketched after this list):
- Train on a GPU (Colab or a machine with an NVIDIA GPU is recommended).
- Add early stopping.
- Learning rates between 2e-5 and 5e-5 are a good range.
- Add gradient clipping (torch.nn.utils.clip_grad_norm_).

Extensions (see the metrics sketch below):
- Multi-class classification (change the num_labels parameter).
- Confusion-matrix evaluation.
- F1-score and other evaluation metrics.
- A learning-rate scheduler.
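To make the max-length advice concrete, here is a minimal sketch that inspects tokenized lengths before settling on MAX_LEN; it reuses `texts` and `tokenizer` from the code in section 3.
```python
import numpy as np

# Token count per example, including the [CLS] and [SEP] special tokens
lengths = [len(tokenizer.encode(str(t), add_special_tokens=True)) for t in texts]

# If e.g. the 95th percentile is well under 128, MAX_LEN = 128 is plenty
print('median/p90/p95/p99:', np.percentile(lengths, [50, 90, 95, 99]))
```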
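The training tips above can be folded into one refined loop. The sketch below assumes the `model`, `optimizer`, `train_loader`, `val_loader`, `device`, and `EPOCHS` defined in section 3; it adds a linear warmup/decay scheduler (`get_linear_schedule_with_warmup` from transformers), gradient clipping, and a simple early-stopping check. The 10% warmup ratio and patience of 2 are arbitrary example values.
```python
from transformers import get_linear_schedule_with_warmup

num_training_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # 10% warmup (example value)
    num_training_steps=num_training_steps,
)

best_val_loss = float('inf')
patience, bad_epochs = 2, 0  # early-stopping patience (example value)

for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        outputs.loss.backward()
        # Clip gradients to stabilize fine-tuning
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule each batch

    # Average validation loss, used as the early-stopping signal
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            out = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device),
            )
            val_loss += out.loss.item()
    val_loss /= len(val_loader)
    print(f'Epoch {epoch+1}: val loss {val_loss:.4f}')

    if val_loss < best_val_loss:
        best_val_loss, bad_epochs = val_loss, 0
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            print('Early stopping')
            break
```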
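For the evaluation extensions, scikit-learn already ships the usual metrics; a minimal sketch, again reusing `model`, `val_loader`, and `device` from section 3:
```python
from sklearn.metrics import classification_report, confusion_matrix

model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['labels'].tolist())

# Per-class precision/recall/F1, plus the confusion matrix
print(classification_report(all_labels, all_preds, digits=4))
print(confusion_matrix(all_labels, all_preds))
```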
5. Troubleshooting
Out of memory:
- Reduce the batch size.
- Use mixed-precision training (sketched after this list).
- Try a smaller model (such as BERT-Tiny).

Overfitting:
- Increase the dropout probability.
- Use data augmentation (synonym replacement, etc.).

Improving results:
- Try different pre-trained models.
- Tune the learning rate and number of epochs.
- Clean and expand the training data.
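Here is a minimal sketch of the mixed-precision option (CUDA only), using PyTorch's torch.cuda.amp utilities with the `model`, `optimizer`, `train_loader`, and `device` from section 3:
```python
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()  # scales the loss so fp16 gradients don't underflow

model.train()
for batch in train_loader:
    optimizer.zero_grad()
    # The forward pass runs in float16 where it is safe to do so
    with autocast():
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
    scaler.scale(outputs.loss).backward()  # backprop on the scaled loss
    scaler.step(optimizer)                 # unscales grads, then steps
    scaler.update()
```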
Start by testing the pipeline on a small amount of data, then scale up to the full dataset (one way to generate such a sample is sketched below). In real applications, adjust the classification strategy and evaluation metrics to the specific business requirements.
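For that smoke test, you can write a tiny synthetic CSV before pointing the script at real data; in this sketch the rows are made-up placeholders and the file name matches what section 3 reads.
```python
import pandas as pd

# A few hand-written examples, repeated so the train/val split is non-trivial
sample = pd.DataFrame({
    'text': ["这部电影太精彩了!", "糟糕的观影体验,不推荐。",
             "服务态度很好,点赞。", "质量太差,退货了。"],
    'label': [1, 0, 1, 0],
})
sample = pd.concat([sample] * 8, ignore_index=True)  # 32 rows
sample.to_csv('sentiment_data.csv', index=False)
```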
Source: 老客数据一点号