在推荐模型中,有很多因素会影响id之外的特性是否有用:
- 上下文的重要性:如果用户的首选项跨上下文和时间相对稳定,那么上下文特性可能不会提供太多好处。然而,如果用户偏好是高度上下文相关的,那么添加上下文将显著改善模型。例如,在决定是推荐一个短片还是一部电影时,一周的哪一天可能是一个重要的特征:用户在一周中可能只有时间看短内容,但可以在周末放松并享受一部完整长度的电影。类似地,查询时间戳可能在建模流行动态中扮演重要角色:一部电影可能在周围非常流行
- 数据稀疏:如果数据稀疏,使用非id特征可能是关键。由于对给定用户或项目的观察很少,模型可能难以估计每个用户或每个项目的良好表示。为了建立一个准确的模型,其他的特征,如项目类别、描述和图像,必须被用来帮助模型泛化训练数据以外的数据。这在冷启动的情况下尤其重要,因为在冷启动的情况下,一些项目或用户的数据相对较少。
import os
import tempfile
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
特征数据
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")
ratings = ratings.map(lambda x: {
"movie_title": x["movie_title"],
"user_id": x["user_id"],
"timestamp": x["timestamp"],
})
movies = movies.map(lambda x: x["movie_title"])
特征词汇表
timestamps = np.concatenate(list(ratings.map(lambda x: x["timestamp"]).batch(100)))
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()
timestamp_buckets = np.linspace(
min_timestamp, max_timestamp, num=1000,
)
unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(
lambda x: x["user_id"]))))
定义模型
user model
代码中改动:增加是否使用时间戳这个特征的选择。
class UserModel(tf.keras.Model):
def __init__(self, use_timestamps):
super().__init__()
self._use_timestamps = use_timestamps
self.user_embedding = tf.keras.Sequential([
tf.keras.layers.StringLookup(
vocabulary=unique_user_ids, mask_token=None),
tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
])
if use_timestamps:
self.timestamp_embedding = tf.keras.Sequential([
tf.keras.layers.Discretization(timestamp_buckets.tolist()),
tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
])
self.normalized_timestamp = tf.keras.layers.Normalization(
axis=None
)
self.normalized_timestamp.adapt(timestamps)
def call(self, inputs):
if not self._use_timestamps:
return self.user_embedding(inputs["user_id"])
return tf.concat([
self.user_embedding(inputs["user_id"]),
self.timestamp_embedding(inputs["timestamp"]),
tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
], axis=1)
movie model
class MovieModel(tf.keras.Model):
def __init__(self):
super().__init__()
max_tokens = 10_000
self.title_embedding = tf.keras.Sequential([
tf.keras.layers.StringLookup(
vocabulary=unique_movie_titles, mask_token=None),
tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
])
self.title_vectorizer = tf.keras.layers.TextVectorization(
max_tokens=max_tokens)
self.title_text_embedding = tf.keras.Sequential([
self.title_vectorizer,
tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
tf.keras.layers.GlobalAveragePooling1D(),
])
self.title_vectorizer.adapt(movies)
def call(self, titles):
return tf.concat([
self.title_embedding(titles),
self.title_text_embedding(titles),
], axis=1)
组合模型
class MovielensModel(tfrs.models.Model):
def __init__(self, use_timestamps):
super().__init__()
self.query_model = tf.keras.Sequential([
UserModel(use_timestamps),
tf.keras.layers.Dense(32)
])
self.candidate_model = tf.keras.Sequential([
MovieModel(),
tf.keras.layers.Dense(32)
])
self.task = tfrs.tasks.Retrieval(
metrics=tfrs.metrics.FactorizedTopK(
candidates=movies.batch(128).map(self.candidate_model),
),
)
def compute_loss(self, features, training=False):
# We only pass the user id and timestamp features into the query model. This
# is to ensure that the training inputs would have the same keys as the
# query inputs. Otherwise the discrepancy in input structure would cause an
# error when loading the query model after saving it.
query_embeddings = self.query_model({
"user_id": features["user_id"],
"timestamp": features["timestamp"],
})
movie_embeddings = self.candidate_model(features["movie_title"])
return self.task(query_embeddings, movie_embeddings)
实验
准备数据集
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)
cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()
Baseline :没有时间戳特性
model = MovielensModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(cached_train, epochs=3)
train_accuracy = model.evaluate(
cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
利用时间特征捕捉时间动态
model = MovielensModel(use_timestamps=True)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(cached_train, epochs=3)
train_accuracy = model.evaluate(
cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")