在 BERT 的基础上增加一个分类层:取 [CLS] 位置的输出向量,再接一个 softmax 全连接层。
代码实现:
# Keep only index 0 along the sequence axis of the encoder output
# (the layer is named 'CLS-token').
cls_vector = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)

# Softmax classification head with one unit per class.
classifier = Dense(
    units=num_classes,
    activation='softmax',
    kernel_initializer=bert.initializer,
)
output = classifier(cls_vector)

# End-to-end model: encoder inputs -> class probabilities.
model = keras.models.Model(bert.model.input, output)
接下来,就可以利用 BERT 输出的 [CLS] 向量来训练分类任务了。
完整代码如下:
#! -*- coding:utf-8 -*-
#FinWoBERT:中文金融领域增强预训练模型
'''
康明. 深度学习预训练语言模型(案例篇) ——中文金融文本情绪分类研究[M]. 北京: 清华大学出版社, 2022.
Ming Kang. Pretraining Language Models in Deep Learning: A Case Study of Chinese Sentiment Classification for Financial Text.
Beijing: Tsinghua University Press, 2022.
'''
import os, json
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam,extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import jieba_fast as jieba
# Initialise jieba explicitly at import time.
jieba.initialize()
# Number of output classes; equals the number of entries in `labels` below.
num_classes = 3
# Passed as `maxlen` to tokenizer.encode in the data generator.
maxlen = 512
# Batch size handed to the train / validation data generators.
batch_size = 32
# BERT (WoBERT) configuration
# path = "/Users/sssdjj/bert_source/"
config_path = 'data/chinese_wobert_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'data/chinese_wobert_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'data/chinese_wobert_L-12_H-768_A-12/vocab.txt'
# Label text -> integer class id (other / bullish / bearish).
labels = {"其他":0,"利多":1,"利空":2}
# Stop-word list; stays empty because the loading code below is disabled.
stop_words = []
# Load stop words (currently disabled)
# with open("data/cn_stopwords.txt") as f:
#     for i in f:
#         stop_words.append(i.strip())
def load_data(filename, label_map=None):
    """Load labelled samples from a ``label|||text`` file.

    Each line is stripped and split on ``'|||'``. Lines that do not split
    into exactly two fields (blank lines, missing or extra separators) are
    skipped silently.

    Args:
        filename: Path of the UTF-8 encoded data file.
        label_map: Mapping from label text to integer class id. Defaults to
            the module-level ``labels`` table.

    Returns:
        A list of ``(text, label_id)`` tuples in file order.

    Raises:
        KeyError: If a well-formed line carries a label that is not a key
            of ``label_map``.
    """
    if label_map is None:
        label_map = labels
    samples = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            # Split once and reuse the result for both the check and unpacking.
            fields = line.strip().split('|||')
            if len(fields) != 2:
                continue
            label, text = fields
            # Stop-word removal is intentionally disabled:
            # for word in stop_words:
            #     text = str(text).replace(word, " ")
            samples.append((text, label_map[label]))
    return samples
# Directory prefix for the datasets and the custom vocabulary file.
path = "data/"

# Load the training and validation splits.
train_data = load_data(path+'train.txt')
valid_data = load_data(path+'test.txt')

# Register the custom user dictionary with jieba.
# (Original note: word.txt is the base vocabulary; word_zhengf.txt adds
# positive/negative sentiment words.)
jieba.load_userdict(path+"word_zhengf_buzai_vocab.txt")


def _jieba_pre_tokenize(s):
    """Segment *s* with jieba (HMM disabled); used as the pre_tokenize hook."""
    return jieba.cut(s, HMM=False)


# Build the tokenizer on top of the WoBERT vocabulary.
tokenizer = Tokenizer(
    dict_path,
    do_lower_case=True,
    pre_tokenize=_jieba_pre_tokenize,
)
class data_generator(DataGenerator):
    """Data generator yielding ``([token_ids, segment_ids], labels)`` batches."""

    def __iter__(self, random=False):
        """Iterate over ``self.sample(random)`` and emit one batch at a time.

        A batch is emitted when ``self.batch_size`` samples have been
        collected or when the sample stream signals its last item.
        """
        token_rows, segment_rows, label_rows = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            token_rows.append(token_ids)
            segment_rows.append(segment_ids)
            # Each label is wrapped in a list so it forms one row of the batch.
            label_rows.append([label])

            batch_full = len(token_rows) == self.batch_size
            if not (batch_full or is_end):
                continue

            yield [
                sequence_padding(token_rows),
                sequence_padding(segment_rows),
            ], sequence_padding(label_rows)
            # Start collecting the next batch.
            token_rows, segment_rows, label_rows = [], [], []
# Encode every line of the custom vocabulary into token ids for
# `compound_tokens`. The slice [1:-1] drops the first and last id of each
# encoding (presumably the [CLS]/[SEP] markers -- confirm against Tokenizer).
# The file is read explicitly as UTF-8, consistent with load_data, instead of
# relying on the platform default encoding; the same file is also fed to
# jieba.load_userdict.
with open(path+"word_zhengf_buzai_vocab.txt", encoding='utf-8') as f:
    ctokens = [tokenizer.encode(line.strip())[0][1:-1] for line in f]
# Build the transformer wrapper (not the bare Keras model) from the WoBERT
# checkpoint, passing the encoded custom vocabulary as compound tokens.
bert = build_transformer_model(
    config_path,
    checkpoint_path,
    return_keras_model=False,
    compound_tokens=ctokens,
)

# Keep only index 0 along the sequence axis of the encoder output
# (the layer is named 'CLS-token').
cls_vector = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)

# Softmax classification head with one unit per class.
classifier = Dense(
    units=num_classes,
    activation='softmax',
    kernel_initializer=bert.initializer,
)
output = classifier(cls_vector)

# End-to-end model: encoder inputs -> class probabilities.
model = keras.models.Model(bert.model.input, output)
model.summary()

# AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
# Use a sufficiently small learning rate.
_optimizer = Adam(learning_rate=1e-6)
model.compile(
    optimizer=_optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Wrap the datasets in batch generators.
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
def norm_index(y_true, y_pred):
    """Compute classification metrics for the given labels and predictions.

    Args:
        y_true: Ground-truth class ids.
        y_pred: Predicted class ids.

    Returns:
        An 8-tuple ``(acc, macro_prec, micro_prec, macro_recall,
        micro_recall, macro_f1, micro_f1, cm)`` where ``cm`` is the result
        of ``confusion_matrix(y_true, y_pred)``.
    """
    averages = ('macro', 'micro')

    acc = accuracy_score(y_true, y_pred)
    macro_prec, micro_prec = [
        precision_score(y_true, y_pred, average=avg) for avg in averages
    ]
    macro_recall, micro_recall = [
        recall_score(y_true, y_pred, average=avg) for avg in averages
    ]
    macro_f1, micro_f1 = [
        f1_score(y_true, y_pred, average=avg) for avg in averages
    ]
    cm = confusion_matrix(y_true, y_pred)

    return (
        acc,
        macro_prec, micro_prec,
        macro_recall, micro_recall,
        macro_f1, micro_f1,
        cm,
    )
def evaluate(data):
    """Run the model over every batch in ``data`` and score the predictions.

    Args:
        data: Iterable yielding ``(inputs, labels)`` batches, where
            ``labels`` can be indexed as ``labels[:, 0]``.

    Returns:
        The metrics tuple produced by ``norm_index(true_list, pred_list)``.
    """
    pred_list, true_list = [], []
    for x_true, y_true in data:
        # Predicted class = index of the largest score along axis 1.
        y_pred = model.predict(x_true).argmax(axis=1)
        pred_list.extend(y_pred)
        # Labels arrive as a column; take column 0 to flatten them.
        true_list.extend(y_true[:, 0])
    return norm_index(true_list, pred_list)
class Evaluator(keras.callbacks.Callback):
    """Callback that evaluates on the validation set after every epoch.

    Tracks the best validation accuracy seen so far and saves the model
    weights whenever it improves; metrics are printed every epoch.
    """

    def __init__(self):
        # Let the Keras base class set up its own attributes first.
        super(Evaluator, self).__init__()
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate, checkpoint on improvement, and print the metrics."""
        (
            val_acc,
            macro_prec, micro_prec,
            macro_recall, micro_recall,
            macro_f1, micro_f1,
            cm,
        ) = evaluate(valid_generator)

        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            weights_path = 'train/best_model_sentiment.weights'
            # Create the output directory if it is missing so a finished
            # epoch is not lost to a failed save.
            weights_dir = os.path.dirname(weights_path)
            if weights_dir and not os.path.isdir(weights_dir):
                os.makedirs(weights_dir)
            model.save_weights(weights_path)

        print(
            u'val_acc: %.15f, best_val_acc: %.15f,loss:%s\n' %
            (val_acc, self.best_val_acc, logs)
        )
        print(
            u'macro_prec: %.15f, micro_prec: %.15f\n' %
            (macro_prec, micro_prec)
        )
        print(
            u'macro_recall: %.15f, micro_recall: %.15f\n' %
            (macro_recall, micro_recall)
        )
        print(
            u'macro_f1: %.15f, micro_f1: %.15f\n' %
            (macro_f1, micro_f1)
        )
        print(cm)
if __name__ == '__main__':
    # Run as a script: train, evaluating and checkpointing after each epoch.
    evaluator = Evaluator()
    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=100,
        callbacks=[evaluator],
    )
else:
    # Imported as a module: restore the best weights from the same path the
    # Evaluator callback saves to (it writes into the 'train/' directory).
    model.load_weights('train/best_model_sentiment.weights')
为了提升模型在金融领域的效果,还可以在预训练阶段加入金融领域特有的语料库(即领域增强预训练):
关键技术:
在领域语料上继续预训练时,需要克服灾难性遗忘(catastrophic forgetting),即模型在学习新领域知识的同时遗忘原有的通用知识。
几个文章可以深入阅读下:
Yuqing Zhao, Divya Saxena, Jiannong Cao. Revisiting Parameter Reuse to Overcome Catastrophic Forgetting in Neural Networks.
arXiv:2207.11005v1 [cs.LG], 2022.
Matteo Boschini, Lorenzo Bonicelli, Angelo Porrello, et al. Transfer without Forgetting // Computer Vision – ECCV 2022: 17th European Conference,
Tel Aviv, Israel, October 23–27, 2022, Proceedings, Part I. Cham: Springer, 2022.
Yabin Wang, Zhiwu Huang, Xiaopeng Hong. S-Prompts Learning with Pre-trained Transformers: An Occam's Razor for Domain Incremental Learning.
arXiv:2207.12819v1 [cs.CV], 2022.
另外,为了增强可解释性,预训练的语料库要和分类任务保持一致。
最后,为了增强模型的健壮性,还可以加入对抗训练(基于对抗样本,例如 FGSM;注意这与生成对抗网络 GAN 并不相同):
Ian J. Goodfellow, Jonathon Shlens, Christian Szegedy. Explaining and Harnessing Adversarial Examples. arXiv:1412.6572v3 [stat.ML], 2015.
Christian Szegedy, Wojciech Zaremba, Ilya Sutskever, et al. Intriguing Properties of Neural Networks. arXiv:1312.6199v4 [cs.CV], 2014.
TensorFlow. Adversarial example using FGSM. https://tensorflow.google.cn/tutorials/generative/adversarial_fgsm, 2021.
Nathan Inkawhich. Adversarial Example Generation. https://pytorch.org/tutorials/beginner/fgsm_tutorial.html, 2021.
样本生成用的是该文章的方法:
另外,为了防止过拟合,在输出层可以加入L1正则化!