First Implementation
First, implement the data loading. The dataset (ChnSentiCorp hotel reviews) comes from the project: https://github.com/InsaneLife/ChineseNLPCorpus
Reference code:
class MyDataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
        # Load the hotel-review CSV and drop rows with missing values
        self.data = pd.read_csv("./ChnSentiCorp_htl_all.csv")
        self.data = self.data.dropna()

    def __getitem__(self, index):
        # Return one (review text, label) pair
        return self.data.iloc[index]["review"], self.data.iloc[index]["label"]

    def __len__(self):
        return len(self.data)

dataset = MyDataset()
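A quick sanity check helps confirm the dataset loads and indexes correctly (a minimal sketch, assuming the CSV sits in the working directory):

print(len(dataset))    # number of (review, label) pairs after dropna()
for i in range(3):
    print(dataset[i])  # each item is a (text, label) tuple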
Next, split and preprocess the data. The model is hfl/rbt3, a 3-layer Chinese RoBERTa. The dataset is first split 9:1 into a training set and a validation set. The collate_func function standardizes each batch: it tokenizes the texts and attaches the labels. Both subsets are then wrapped in a DataLoader.
Reference code:
# Split the data 9:1 into train / validation subsets
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])

tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

# Collate: turn a list of (text, label) pairs into one batch of model inputs
def collate_func(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(str(item[0]))  # ensure the text is a string
        labels.append(item[1])
    inputs = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    inputs["labels"] = torch.tensor(labels)
    return inputs

# DataLoaders batch the data for training and evaluation.
# batch_size=32 is the batch size; shuffle=True shuffles the training order.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
# The validation set does not need shuffling
validloader = DataLoader(validset, batch_size=32, shuffle=False, collate_fn=collate_func)
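To verify the collate function, it can help to peek at one batch (a quick sketch; the field names follow from the tokenizer output above):

batch = next(iter(trainloader))
print(batch["input_ids"].shape)       # torch.Size([32, 128])
print(batch["attention_mask"].shape)  # torch.Size([32, 128])
print(batch["labels"].shape)          # torch.Size([32])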
Load the model and the optimizer; lr is the learning rate. Code:
# Create the model and the optimizer
# (a local copy works too, e.g. from_pretrained("E:/AI/rbt3"))
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
if torch.cuda.is_available():
    model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
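hfl/rbt3 ships without a classification head, so from_pretrained attaches a randomly initialized one (hence the "newly initialized" warning in the run log below). The number of classes can also be made explicit; a sketch:

model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3", num_labels=2)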
Define the training and evaluation functions:
# Evaluate the model: accuracy on the validation set
def evaluate():
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1)
            acc_num += (pred == batch["labels"].long()).float().sum()
    return acc_num / len(validset)

# Train the model
def train(epoch=3, log_step=100):
    global_step = 0
    for ep in range(epoch):
        print(f"epoch: {ep}")
        model.train()
        for batch in trainloader:
            # Move the batch to the GPU if CUDA is available
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {outputs.loss.item()}")
            global_step += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")
Start training, then test with a sample input. Code:
# Start training
train()

# Test the model
sen = "酒店很差"  # "The hotel is terrible"
id2_label = {0: "差评", 1: "好评"}  # 0: negative review, 1: positive review
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输出: {sen}\n模型预测结果: {id2_label[pred.item()]}")
The complete code:
# Text classification example

# Imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from torch.utils.data import Dataset, random_split, DataLoader
import torch

# Create the Dataset
class MyDataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.data = pd.read_csv("./ChnSentiCorp_htl_all.csv")
        self.data = self.data.dropna()

    def __getitem__(self, index):
        return self.data.iloc[index]["review"], self.data.iloc[index]["label"]

    def __len__(self):
        return len(self.data)

dataset = MyDataset()

# Quick test:
# for i in range(5):
#     print(dataset[i])

# Split the dataset: the hotel-review data is randomly divided into
#   trainset: 90% of the data, used for training
#   validset: 10% of the data, used to validate the model
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])

# Tokenizer (a local copy works too, e.g. "E:/AI/rbt3")
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

# Collate: turn a list of (text, label) pairs into one batch of model inputs
def collate_func(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(str(item[0]))  # ensure the text is a string
        labels.append(item[1])
    inputs = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    inputs["labels"] = torch.tensor(labels)
    return inputs

# DataLoaders batch the data for training and evaluation.
# batch_size=32 is the batch size; shuffle=True shuffles the training order.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validloader = DataLoader(validset, batch_size=32, shuffle=False, collate_fn=collate_func)

# Quick test:
# print(next(enumerate(trainloader))[1])

# Create the model and the optimizer
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
if torch.cuda.is_available():
    model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Evaluate the model: accuracy on the validation set
def evaluate():
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1)
            acc_num += (pred == batch["labels"].long()).float().sum()
    return acc_num / len(validset)

# Train the model
def train(epoch=3, log_step=100):
    global_step = 0
    for ep in range(epoch):
        print(f"epoch: {ep}")
        model.train()
        for batch in trainloader:
            # Move the batch to the GPU if CUDA is available
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {outputs.loss.item()}")
            global_step += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")

# Start training
train()

# Test the model
sen = "酒店很差"  # "The hotel is terrible"
id2_label = {0: "差评", 1: "好评"}  # 0: negative review, 1: positive review
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输出: {sen}\n模型预测结果: {id2_label[pred.item()]}")
Running it produces:
(transformers) root@5df85838fd20:~/lanyun-tmp/transformers-code/03-model# python classification_demo.py
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
epoch: 0
ep: 0, global_step: 0, loss: 0.674994945526123
ep: 0, global_step: 100, loss: 0.39935100078582764
ep: 0, global_step: 200, loss: 0.15889674425125122
ep: 0, acc: 0.8981958627700806
epoch: 1
ep: 1, global_step: 300, loss: 0.22040846943855286
ep: 1, global_step: 400, loss: 0.09449265897274017
ep: 1, acc: 0.8994845151901245
epoch: 2
ep: 2, global_step: 500, loss: 0.11801067739725113
ep: 2, global_step: 600, loss: 0.23855355381965637
ep: 2, acc: 0.8981958627700806
输出: 酒店很差
模型预测结果: 差评
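Once the accuracy looks acceptable, the fine-tuned weights can be persisted for later reuse (a minimal sketch; the output directory name is arbitrary):

model.save_pretrained("./rbt3-hotel-sentiment")
tokenizer.save_pretrained("./rbt3-hotel-sentiment")
# Reload later with:
# AutoModelForSequenceClassification.from_pretrained("./rbt3-hotel-sentiment")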