First Implementation

First, implement the data loading. The data comes from this project: https://github.com/InsaneLife/ChineseNLPCorpus
Reference code:

# Dataset wrapping the ChnSentiCorp hotel-review CSV
class MyDataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.data = pd.read_csv("./ChnSentiCorp_htl_all.csv")
        self.data = self.data.dropna()  # drop rows with missing review or label

    def __getitem__(self, index):
        return self.data.iloc[index]["review"], self.data.iloc[index]["label"]

    def __len__(self):
        return len(self.data)

dataset = MyDataset()
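
As a quick sanity check, you can print the dataset size and the first sample (the full code at the end keeps an equivalent commented-out test loop):

# Quick sanity check of the Dataset
print(len(dataset))  # number of rows after dropna()
print(dataset[0])    # a (review text, label) tuple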

Next, split and preprocess the data. The model is hfl/rbt3. The dataset is first split 9:1 into a training set and a validation set. The collate_func function standardizes each batch: it tokenizes the texts and attaches the labels. Both subsets are then wrapped in DataLoader instances.
Reference code:

# Split the data 9:1 into training and validation sets
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")
# Batch preprocessing: tokenize the texts and attach the labels
def collate_func(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(str(item[0]))  # ensure the text is a string
        labels.append(item[1])
    inputs = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    inputs["labels"] = torch.tensor(labels, dtype=torch.long)  # classification labels must be integers
    return inputs


# Create the DataLoaders
# A DataLoader turns the dataset into batches suitable for model training and evaluation.
# batch_size=32 is the number of samples per batch; shuffle=True randomizes the order
# for training. The validation data does not need shuffling.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validsetloader = DataLoader(
    validset, batch_size=32, shuffle=False, collate_fn=collate_func
)
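
To confirm that collate_func produces what the model expects, it can help to pull one batch and inspect the tensor shapes (a small check, analogous to the commented-out test read kept in the full code):

# Fetch one batch and inspect the tensors built by collate_func
batch = next(iter(trainloader))
print(batch.keys())              # input_ids, token_type_ids, attention_mask, labels
print(batch["input_ids"].shape)  # torch.Size([32, 128]): batch_size x max_length
print(batch["labels"].shape)     # torch.Size([32])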

Load the model and the optimizer; lr is the learning rate. Code:

# Create the model and optimizer
# (alternatively, load from a local path such as "E:/AI/rbt3")
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
if torch.cuda.is_available():
    model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
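
A side note: plain Adam works here, but AdamW (Adam with decoupled weight decay) is the more common choice for fine-tuning BERT-style models. A minimal alternative sketch, where the weight_decay value is just an example:

# Alternative: AdamW with decoupled weight decay (hyperparameters are examples)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)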

Define the evaluation and training functions:


# Evaluate the model: accuracy over the validation set
def evaluate():
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validsetloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1)
            acc_num += (pred == batch["labels"].long()).float().sum()
    return (acc_num / len(validset)).item()
    
# Train the model
def train(epoch=3, log_step=100):
    global_step = 0
    for ep in range(epoch):
        print(f"epoch: {ep}")
        model.train()
        for batch in trainloader:
            # Move the batch to the GPU if CUDA is available
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(
                    f"ep: {ep}, global_step: {global_step}, loss: {outputs.loss.item()}"
                )
            global_step += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")

Start training, then test the model on an input sentence. Code:

# Start training
train()
# Test the model on a single sentence
sen = "酒店很差"  # "The hotel is terrible"
id2_label = {0: "差评", 1: "好评"}  # 0 = negative review, 1 = positive review
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    if torch.cuda.is_available():  # guard so CPU-only machines also work
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输出: {sen}\n模型预测结果: {id2_label[pred.item()]}")

The complete code for reference:

# Text classification example
# Imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from torch.utils.data import Dataset, random_split, DataLoader
import torch

# Create the Dataset
class MyDataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.data = pd.read_csv("./ChnSentiCorp_htl_all.csv")
        self.data = self.data.dropna()  # drop rows with missing review or label

    def __getitem__(self, index):
        return self.data.iloc[index]["review"], self.data.iloc[index]["label"]

    def __len__(self):
        return len(self.data)

dataset = MyDataset()
# Test output
# for i in range(5):
#     print(dataset[i])

# Split the dataset
# Randomly split dataset (all hotel-review data) into two parts:
# trainset: 90% of the data, used for training.
# validset: 10% of the data, used for validation.
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])

# Tokenizer (alternatively, load from a local path such as "E:/AI/rbt3")
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

# Batch preprocessing: tokenize the texts and attach the labels
def collate_func(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(str(item[0]))  # ensure the text is a string
        labels.append(item[1])
    inputs = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    inputs["labels"] = torch.tensor(labels, dtype=torch.long)  # classification labels must be integers
    return inputs

# Create the DataLoaders
# A DataLoader turns the dataset into batches suitable for model training and evaluation.
# batch_size=32 is the number of samples per batch; shuffle=True randomizes the order
# for training. The validation data does not need shuffling.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validsetloader = DataLoader(
    validset, batch_size=32, shuffle=False, collate_fn=collate_func
)
# Test read
# print(next(enumerate(trainloader))[1])

# Create the model and optimizer
# (alternatively, load from a local path such as "E:/AI/rbt3")
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
if torch.cuda.is_available():
    model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Evaluate the model: accuracy over the validation set
def evaluate():
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validsetloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1)
            acc_num += (pred == batch["labels"].long()).float().sum()
    return (acc_num / len(validset)).item()

# Train the model
def train(epoch=3, log_step=100):
    global_step = 0
    for ep in range(epoch):
        print(f"epoch: {ep}")
        model.train()
        for batch in trainloader:
            # Move the batch to the GPU if CUDA is available
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(
                    f"ep: {ep}, global_step: {global_step}, loss: {outputs.loss.item()}"
                )
            global_step += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")
        
# Start training
train()

# Test the model on a single sentence
sen = "酒店很差"  # "The hotel is terrible"
id2_label = {0: "差评", 1: "好评"}  # 0 = negative review, 1 = positive review
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    if torch.cuda.is_available():  # guard so CPU-only machines also work
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输出: {sen}\n模型预测结果: {id2_label[pred.item()]}")

The run output:

(transformers) root@5df85838fd20:~/lanyun-tmp/transformers-code/03-model# python classification_demo.py 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
epoch: 0
ep: 0, global_step: 0, loss: 0.674994945526123
ep: 0, global_step: 100, loss: 0.39935100078582764
ep: 0, global_step: 200, loss: 0.15889674425125122
ep: 0, acc: 0.8981958627700806
epoch: 1
ep: 1, global_step: 300, loss: 0.22040846943855286
ep: 1, global_step: 400, loss: 0.09449265897274017
ep: 1, acc: 0.8994845151901245
epoch: 2
ep: 2, global_step: 500, loss: 0.11801067739725113
ep: 2, global_step: 600, loss: 0.23855355381965637
ep: 2, acc: 0.8981958627700806
输出: 酒店很差
模型预测结果: 差评
(transformers) root@5df85838fd20:~/lanyun-tmp/transformers-code/03-model# 