Wide and Deep#

Note

Deep部分同Embedding+MLP，Wide部分负责记忆

结构#

jupyter

左侧是wide部分，右侧是deep部分。

wide部分：直接把输入层连接到输出层，作用是让模型有较强的记忆力。

deep部分：典型的embedding + mlp结构，作用是让模型有较强的泛化能力。

所谓“记忆能力”，即模型直接学习物品或特征的“共现频率”，并把它们直接作为推荐依据。比如“喜欢A电影的用户也喜欢B电影”这样的规则。

这类规则有两个特点：1.数量非常多；2.非常具体，没必要和其他特征交叉。

把wide部分和deep部分结合起来，Wide&Deep模型就能同时拥有记忆能力和泛化能力。

数据预处理#

import tensorflow as tf
from tensorflow import keras
import rec

# Load the MovieLens train/test datasets through the project-local `rec` helper.
train_dataset, test_dataset = rec.load_movielens()

# Fetch the MovieLens data in tabular form (presumably a pandas DataFrame,
# judging by the rendered output) and display it: the bare `df` expression on
# the last line is what the notebook renders.
df = rec.get_movielens_df()
df

	movieId	userId	rating	timestamp	label	releaseYear	movieGenre1	movieGenre2	movieGenre3	movieRatingCount	...	userRatingCount	userAvgReleaseYear	userReleaseYearStddev	userAvgRating	userRatingStddev	userGenre1	userGenre2	userGenre3	userGenre4	userGenre5
0	1	15555	3.0	900953740	0	1995	Adventure	Animation	Children	10759	...	92	1992	8.98	3.86	0.74	Drama	Comedy	Thriller	Action	Crime
1	1	25912	3.5	1111631768	1	1995	Adventure	Animation	Children	10759	...	21	1988	14.09	3.48	1.28	Action	Comedy	Romance	Adventure	Thriller
2	1	29912	3.0	866820360	0	1995	Adventure	Animation	Children	10759	...	4	1995	0.50	3.00	0.00	NaN	NaN	NaN	NaN	NaN
3	10	17686	0.5	1195555011	0	1995	Action	Adventure	Thriller	6330	...	35	1992	8.35	2.97	1.48	Comedy	Drama	Adventure	Action	Thriller
4	104	20158	4.0	1155357691	1	1996	Comedy	NaN	NaN	3954	...	81	1991	8.70	3.60	0.72	Thriller	Drama	Action	Crime	Adventure
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
88822	968	26865	3.0	854092232	0	1968	Horror	Sci-Fi	Thriller	1824	...	94	1991	12.23	3.35	0.85	Drama	Thriller	Comedy	Crime	Romance
88823	968	8507	2.0	974709061	0	1968	Horror	Sci-Fi	Thriller	1824	...	5	1994	0.89	2.00	1.00	NaN	NaN	NaN	NaN	NaN
88824	969	16689	5.0	857854044	1	1951	Adventure	Comedy	Romance	2380	...	97	1992	9.95	3.53	0.82	Drama	Comedy	Crime	Romance	Thriller
88825	969	26460	2.0	1250279576	0	1951	Adventure	Comedy	Romance	2380	...	55	1990	11.78	2.73	1.42	Thriller	Crime	Drama	Comedy	Sci-Fi
88826	970	3033	2.0	1272394603	0	1953	Adventure	Comedy	Crime	98	...	100	1985	17.64	3.67	0.89	Drama	Romance	Comedy	Thriller	Crime

88827 rows × 27 columns

Deep部分#

Deep部分的特征处理方式与上一节（Embedding+MLP）相同。

# Vocabulary of movie genres shared by every genre-valued feature.
genre_vocab = ['Film-Noir', 'Action', 'Adventure', 'Horror', 'Romance', 'War',
               'Comedy', 'Western', 'Documentary', 'Sci-Fi', 'Drama', 'Thriller',
               'Crime', 'Fantasy', 'Animation', 'IMAX', 'Mystery', 'Children', 'Musical']

# Genre-valued feature name -> vocabulary; every key maps to the same list.
GENRE_FEATURES = dict.fromkeys(
    ['userGenre1', 'userGenre2', 'userGenre3', 'userGenre4', 'userGenre5',
     'movieGenre1', 'movieGenre2', 'movieGenre3'],
    genre_vocab,
)

# Each genre feature becomes a vocabulary-list categorical column that is then
# wrapped in a 10-dimensional embedding column.
categorical_columns = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=genre_key, vocabulary_list=genre_values),
        dimension=10)
    for genre_key, genre_values in GENRE_FEATURES.items()
]

# Movie id: identity categorical column with 1001 buckets, embedded into 10 dims.
# `movie_col` is also reused by the wide part's crossed feature.
movie_col = tf.feature_column.categorical_column_with_identity(
    key='movieId', num_buckets=1001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, dimension=10)

# User id: identity categorical column with 30001 buckets, embedded into 10 dims.
user_col = tf.feature_column.categorical_column_with_identity(
    key='userId', num_buckets=30001)
user_emb_col = tf.feature_column.embedding_column(user_col, dimension=10)

# Id embeddings follow the genre embeddings, movie first and then user.
categorical_columns.extend([movie_emb_col, user_emb_col])

# Plain numeric features, one numeric column per key.
numerical_columns = [
    tf.feature_column.numeric_column(numeric_key)
    for numeric_key in ('releaseYear', 'movieRatingCount', 'movieAvgRating',
                        'movieRatingStddev', 'userRatingCount', 'userAvgRating',
                        'userRatingStddev')
]

Wide部分#

Wide部分使用两个特征（movieId 与 userRatedMovie1）的交叉作为输入。

# (feature name, dtype) for every model input, in the order the Input layers
# are created.
_INPUT_DTYPES = (
    # numeric statistics and release year
    ('movieAvgRating', 'float32'),
    ('movieRatingStddev', 'float32'),
    ('movieRatingCount', 'int32'),
    ('userAvgRating', 'float32'),
    ('userRatingStddev', 'float32'),
    ('userRatingCount', 'int32'),
    ('releaseYear', 'int32'),
    # integer ids
    ('movieId', 'int32'),
    ('userId', 'int32'),
    ('userRatedMovie1', 'int32'),
    # genre strings
    ('userGenre1', 'string'),
    ('userGenre2', 'string'),
    ('userGenre3', 'string'),
    ('userGenre4', 'string'),
    ('userGenre5', 'string'),
    ('movieGenre1', 'string'),
    ('movieGenre2', 'string'),
    ('movieGenre3', 'string'),
)

# One scalar (shape=()) Keras Input per feature, keyed and named by the feature.
inputs = {
    feature_name: tf.keras.layers.Input(
        name=feature_name, shape=(), dtype=feature_dtype)
    for feature_name, feature_dtype in _INPUT_DTYPES
}

# Identity categorical column over 'userRatedMovie1' with 1001 buckets.
rated_movie = tf.feature_column.categorical_column_with_identity(
    key='userRatedMovie1', num_buckets=1001)

# Wide-part input: cross movie_col with rated_movie, hash the cross into
# 10000 buckets, and expose the result as an indicator column.
movie_x_rated = tf.feature_column.crossed_column(
    [movie_col, rated_movie], hash_bucket_size=10000)
crossed_feature = tf.feature_column.indicator_column(movie_x_rated)

定义模型#

使用keras的函数式API进行定义。

# Wide & Deep architecture, assembled with the Keras functional API.

# Deep tower: numeric + embedding columns, then two 128-unit ReLU layers.
deep_columns = numerical_columns + categorical_columns
deep = tf.keras.layers.DenseFeatures(deep_columns)(inputs)
for hidden_units in (128, 128):
    deep = tf.keras.layers.Dense(hidden_units, activation='relu')(deep)

# Wide tower: the crossed feature goes straight to the output stage, with no
# hidden layers in between.
wide = tf.keras.layers.DenseFeatures(crossed_feature)(inputs)

# Join both towers and map them to a single sigmoid output.
both = tf.keras.layers.concatenate([deep, wide])
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(both)
model = tf.keras.Model(inputs, output_layer)

训练#

# Metrics tracked during training: accuracy, ROC-AUC, then PR-AUC.
# The ROC metric is created before the PR one, matching the `auc` / `auc_1`
# order seen in the training log.
training_metrics = [
    'accuracy',
    tf.keras.metrics.AUC(curve='ROC'),
    tf.keras.metrics.AUC(curve='PR'),
]

# Binary cross-entropy loss with the Adam optimizer.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=training_metrics)

# Train for 5 epochs on the training split. This call stays the last
# expression so the notebook still displays the returned History object.
model.fit(train_dataset, epochs=5)

Epoch 1/5

/Users/facer/opt/anaconda3/lib/python3.8/site-packages/keras/engine/functional.py:582: UserWarning: Input dict contained keys ['rating', 'timestamp', 'userRatedMovie2', 'userRatedMovie3', 'userRatedMovie4', 'userRatedMovie5', 'userAvgReleaseYear', 'userReleaseYearStddev'] which did not match any model input. They will be ignored by the model.
  warnings.warn(

7403/7403 [==============================] - 24s 3ms/step - loss: 0.7510 - accuracy: 0.6077 - auc: 0.6272 - auc_1: 0.6638
Epoch 2/5
7403/7403 [==============================] - 20s 3ms/step - loss: 0.6049 - accuracy: 0.6767 - auc: 0.7304 - auc_1: 0.7556
Epoch 3/5
7403/7403 [==============================] - 21s 3ms/step - loss: 0.5482 - accuracy: 0.7214 - auc: 0.7897 - auc_1: 0.8113
Epoch 4/5
7403/7403 [==============================] - 20s 3ms/step - loss: 0.5051 - accuracy: 0.7546 - auc: 0.8270 - auc_1: 0.8471
Epoch 5/5
7403/7403 [==============================] - 20s 3ms/step - loss: 0.4816 - accuracy: 0.7691 - auc: 0.8452 - auc_1: 0.8668

<keras.callbacks.History at 0x7fb47ea620a0>

推荐系统手册

Wide and Deep

Contents