LSTM News Headline Classification
Loading the Dataset
The dataset is a disaster news headline dataset found online; five disaster categories were selected and merged into a new dataset. It looks roughly like this:

[Figure: sample rows of the dataset]
label is the category label.
content is the news headline text.
A download link for the dataset:
Dataset download link
Next, the text labels are converted into numeric labels, which are easier to work with.
The processed data looks like this:

[Figure: data with the numeric label column added]
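The conversion step itself is not shown here; below is one possible sketch with pandas, where the raw file name ./data/news_raw.csv and the column layout are assumptions for illustration:

import pandas as pd

# hypothetical raw file: columns 'label' (text label) and 'content' (headline)
df = pd.read_csv('./data/news_raw.csv')
label2id = {name: i for i, name in enumerate(sorted(df['label'].unique()))}
df['pos'] = df['label'].map(label2id)      # numeric label, appended as the third column
df.to_csv('./data/news.csv', index=False)  # final columns: label, content, pos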
Once the dataset is ready, torchtext is used to load it:
import random
import torch
import pkuseg
# Field/TabularDataset/BucketIterator are the legacy torchtext API
# (torchtext <= 0.8: torchtext.data; 0.9-0.11: torchtext.legacy.data)
from torchtext.data import Field, TabularDataset, BucketIterator

SEED = 1          # assumed values; the original does not show them
batch_size = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.manual_seed(SEED)
seg = pkuseg.pkuseg()  # Chinese word segmenter

def tokenizer(text):
    return seg.cut(text)

TEXT = Field(sequential=True, tokenize=tokenizer, fix_length=35)  # pad/cut to 35 tokens
POS = Field(sequential=False, use_vocab=False)                    # numeric class label
# CSV columns in order: label (text label, dropped), content, pos
FIELD = [('label', None), ('content', TEXT), ('pos', POS)]
df = TabularDataset(
    path='./data/news.csv', format='csv',
    fields=FIELD, skip_header=True)
# Note: the original passed vectors='glove.6B.50d' here, but those are English
# vectors and are never copied into the model's nn.Embedding, so they are dropped.
TEXT.build_vocab(df, min_freq=3)
# seeding and passing the (None) return value is a common torchtext idiom for a reproducible split
train, valid = df.split(split_ratio=0.7, random_state=random.seed(SEED))
train_iter, valid_iter = BucketIterator.splits(
    (train, valid),
    batch_sizes=(batch_size, batch_size),
    device=device,
    sort_key=lambda x: len(x.content),
    sort_within_batch=False,
    repeat=False
)
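A quick sanity check of what the iterators yield (the shapes follow from fix_length=35 and the batch_size set above):

batch = next(iter(train_iter))
print(batch.content.shape)  # torch.Size([35, batch_size]) -- (length, batch), since the Field is not batch_first
print(batch.pos.shape)      # torch.Size([batch_size])
print(len(TEXT.vocab))      # vocabulary size, including <unk> and <pad>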
Model Definition
The model uses an LSTM, which PyTorch already provides as a built-in module (nn.LSTM):
import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, emb_len, emb_dim, out_dim):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(emb_len, emb_dim)
        # 2-layer bidirectional LSTM; hidden state size is out_dim per direction
        self.lstm = nn.LSTM(emb_dim, out_dim, batch_first=True, dropout=0.5,
                            bidirectional=True, num_layers=2)
        self.linear = nn.Sequential(
            nn.Linear(2 * out_dim, 64),  # 2 * out_dim: forward + backward states
            nn.ReLU(),
            nn.Linear(64, 5)             # 5 disaster classes
        )

    def forward(self, x):
        # input arrives as (length, batch_size)
        out = self.embedding(x)
        # (length, batch_size, emb) -> (batch_size, length, emb) for batch_first=True
        out = torch.transpose(out, 0, 1)
        out, (h, c) = self.lstm(out)
        # h: (num_layers * 2, batch, out_dim); h[-2]/h[-1] are the last layer's
        # forward/backward final states, concatenated to (batch, 2 * out_dim)
        out = torch.cat((h[-2, :, :], h[-1, :, :]), dim=1)
        out = self.linear(out)
        return out
bidirectional=True makes the LSTM run in both directions, so in forward() the final hidden states of the two directions have to be concatenated before the linear layers.
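To see why h[-2] and h[-1] are the right slices, here is a standalone shape check using the same LSTM settings as the model:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=64, hidden_size=128, num_layers=2,
               bidirectional=True, batch_first=True)
x = torch.randn(8, 35, 64)                # (batch, length, emb)
out, (h, c) = lstm(x)
print(h.shape)                            # torch.Size([4, 8, 128]) = (num_layers * 2, batch, hidden)
final = torch.cat((h[-2], h[-1]), dim=1)  # last layer's forward + backward final states
print(final.shape)                        # torch.Size([8, 256]) = (batch, 2 * hidden)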
Training
With the data and model ready, set up the optimizer, loss function, and learning-rate scheduler:
import torch.optim as optim
import torch.nn.functional as F

learning_rate = 1e-3  # assumed value; the original does not show it
model = LSTM(len(TEXT.vocab), 64, 128).to(device)  # emb_dim=64, hidden=128
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = F.cross_entropy
# halve the learning rate every 20 epochs (1e-3 -> 5e-4 -> 2.5e-4 -> ...)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)
# ----------------------------------- Training loop --------------------------------------
epochs = 100
stop = 20   # early-stopping patience: epochs allowed without improvement
cnt = 0
best_valid_acc = float('-inf')
model_save_path = './model/torchtext.pkl'
for epoch in range(epochs):
    loss_one_epoch = 0.0
    correct_num = 0.0
    total_num = 0.0
    model.train()
    for i, batch in enumerate(train_iter):
        pos, content = batch.pos, batch.content
        # forward, backward, weight update
        optimizer.zero_grad()
        pred = model(content)
        loss = criterion(pred, pos)
        loss.backward()
        optimizer.step()
        # accumulate statistics
        total_num += pos.size(0)
        # count how many predictions match the labels
        correct_num += (torch.argmax(pred, dim=1) == pos).sum().float().item()
        loss_one_epoch += loss.item()
    loss_avg = loss_one_epoch / len(train_iter)
    print("Train: Epoch[{:0>3}/{:0>3}]  Loss: {:.4f} Acc:{:.2%}".
          format(epoch + 1, epochs, loss_avg, correct_num / total_num))
    # --------------------------------------- Validation ------------------------------
    total_num = 0.0
    correct_num = 0.0
    model.eval()
    with torch.no_grad():  # no gradients needed for evaluation
        for i, batch in enumerate(valid_iter):
            pos, content = batch.pos, batch.content
            pred = model(content)
            # accumulate statistics
            total_num += pos.size(0)
            # count how many predictions match the labels
            correct_num += (torch.argmax(pred, dim=1) == pos).sum().float().item()
    # learning-rate schedule
    scheduler.step()
    print('valid Acc:{:.2%}'.format(correct_num / total_num))
    # after each epoch, save the model if validation accuracy improved
    if (correct_num / total_num) > best_valid_acc:
        print("New best model, saving")
        best_valid_acc = (correct_num / total_num)
        torch.save(model.state_dict(), model_save_path)
        cnt = 0
    else:
        cnt = cnt + 1
        if cnt > stop:
            # early stopping
            print("No improvement for {} epochs, stopping training".format(stop))
            print("Best validation accuracy: {:.2%}".format(best_valid_acc))
            break
Training uses both early stopping and a learning-rate schedule (StepLR).
The final training results:

[Figure: training log]

Training accuracy ends up very high, but validation accuracy is noticeably lower, so the model is probably overfitting somewhat.
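To reuse the saved weights later, here is a minimal inference sketch assembled from the pieces above (the predict helper is illustrative, not part of the original code):

model = LSTM(len(TEXT.vocab), 64, 128).to(device)
model.load_state_dict(torch.load(model_save_path, map_location=device))
model.eval()

def predict(title):
    tokens = tokenizer(title)[:35]
    tokens += ['<pad>'] * (35 - len(tokens))       # pad to fix_length, as TEXT does
    ids = [TEXT.vocab.stoi[t] for t in tokens]     # unseen tokens map to <unk>
    x = torch.tensor(ids).unsqueeze(1).to(device)  # (length, 1), matching the model's input
    with torch.no_grad():
        return torch.argmax(model(x), dim=1).item()  # numeric class id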
The complete code is on GitHub:
Code link