Wide and Deep#

Note

Deep部分同Embedding+MLP,Wide部分负责记忆

结构#

jupyter

左侧是wide部分,右侧是deep部分。

wide部分:直接把输入层连接到输出层,作用是让模型有较强的记忆力。

deep部分:典型的embedding + mlp结构,作用是让模型有较强的泛化能力。

所谓“记忆能力”,即模型直接学习物品或特征的“共现频率”,并把它们直接作为推荐依据,比如“喜欢A电影的用户也喜欢B电影”这样的规则。

这类规则有两个特点:1.数量非常多;2.非常具体,没必要和其他特征交叉。

把Wide部分和Deep部分组合起来,Wide&Deep模型就能同时拥有记忆能力和泛化能力。

数据预处理#

import tensorflow as tf
from tensorflow import keras
import rec

# Load the MovieLens data: a train/test pair for the model and a DataFrame for inspection.
# NOTE(review): the exact dataset types returned by `rec` are not visible here — confirm in `rec`.
train_dataset, test_dataset = rec.load_movielens()
df = rec.get_movielens_df()
# Bare expression so the notebook renders the DataFrame as the cell output.
df
movieId userId rating timestamp label releaseYear movieGenre1 movieGenre2 movieGenre3 movieRatingCount ... userRatingCount userAvgReleaseYear userReleaseYearStddev userAvgRating userRatingStddev userGenre1 userGenre2 userGenre3 userGenre4 userGenre5
0 1 15555 3.0 900953740 0 1995 Adventure Animation Children 10759 ... 92 1992 8.98 3.86 0.74 Drama Comedy Thriller Action Crime
1 1 25912 3.5 1111631768 1 1995 Adventure Animation Children 10759 ... 21 1988 14.09 3.48 1.28 Action Comedy Romance Adventure Thriller
2 1 29912 3.0 866820360 0 1995 Adventure Animation Children 10759 ... 4 1995 0.50 3.00 0.00 NaN NaN NaN NaN NaN
3 10 17686 0.5 1195555011 0 1995 Action Adventure Thriller 6330 ... 35 1992 8.35 2.97 1.48 Comedy Drama Adventure Action Thriller
4 104 20158 4.0 1155357691 1 1996 Comedy NaN NaN 3954 ... 81 1991 8.70 3.60 0.72 Thriller Drama Action Crime Adventure
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
88822 968 26865 3.0 854092232 0 1968 Horror Sci-Fi Thriller 1824 ... 94 1991 12.23 3.35 0.85 Drama Thriller Comedy Crime Romance
88823 968 8507 2.0 974709061 0 1968 Horror Sci-Fi Thriller 1824 ... 5 1994 0.89 2.00 1.00 NaN NaN NaN NaN NaN
88824 969 16689 5.0 857854044 1 1951 Adventure Comedy Romance 2380 ... 97 1992 9.95 3.53 0.82 Drama Comedy Crime Romance Thriller
88825 969 26460 2.0 1250279576 0 1951 Adventure Comedy Romance 2380 ... 55 1990 11.78 2.73 1.42 Thriller Crime Drama Comedy Sci-Fi
88826 970 3033 2.0 1272394603 0 1953 Adventure Comedy Crime 98 ... 100 1985 17.64 3.67 0.89 Drama Romance Comedy Thriller Crime

88827 rows × 27 columns

Deep部分#

就像上一节那样处理

# Vocabulary of movie genres, shared by every genre-valued feature.
genre_vocab = [
    'Film-Noir', 'Action', 'Adventure', 'Horror', 'Romance', 'War',
    'Comedy', 'Western', 'Documentary', 'Sci-Fi', 'Drama', 'Thriller',
    'Crime', 'Fantasy', 'Animation', 'IMAX', 'Mystery', 'Children', 'Musical',
]
# Genre feature keys; each one maps to the same shared vocabulary list.
GENRE_FEATURES = dict.fromkeys(
    [
        'userGenre1',
        'userGenre2',
        'userGenre3',
        'userGenre4',
        'userGenre5',
        'movieGenre1',
        'movieGenre2',
        'movieGenre3',
    ],
    genre_vocab,
)

# Each genre string feature becomes a vocabulary-based categorical column,
# which is then wrapped in a 10-dimensional embedding column.
categorical_columns = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature, vocabulary_list=vocab),
        10)
    for feature, vocab in GENRE_FEATURES.items()
]

# Integer id features: identity categorical columns, each with a 10-dimensional embedding.
movie_col = tf.feature_column.categorical_column_with_identity(
    key='movieId', num_buckets=1001)
user_col = tf.feature_column.categorical_column_with_identity(
    key='userId', num_buckets=30001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, 10)
user_emb_col = tf.feature_column.embedding_column(user_col, 10)
categorical_columns += [movie_emb_col, user_emb_col]

# All numerical features are fed to the deep part as plain numeric columns.
numerical_columns = [
    tf.feature_column.numeric_column(name)
    for name in (
        'releaseYear',
        'movieRatingCount',
        'movieAvgRating',
        'movieRatingStddev',
        'userRatingCount',
        'userAvgRating',
        'userRatingStddev',
    )
]

Wide部分#

Wide部分使用两个特征(movieId 和 userRatedMovie1)的交叉作为输入。

# Dtype of every scalar feature the Keras model accepts, keyed by feature name.
INPUT_DTYPES = {
    'movieAvgRating': 'float32',
    'movieRatingStddev': 'float32',
    'movieRatingCount': 'int32',
    'userAvgRating': 'float32',
    'userRatingStddev': 'float32',
    'userRatingCount': 'int32',
    'releaseYear': 'int32',

    'movieId': 'int32',
    'userId': 'int32',
    'userRatedMovie1': 'int32',

    'userGenre1': 'string',
    'userGenre2': 'string',
    'userGenre3': 'string',
    'userGenre4': 'string',
    'userGenre5': 'string',
    'movieGenre1': 'string',
    'movieGenre2': 'string',
    'movieGenre3': 'string',
}

# One scalar (shape=()) Input per feature; the layer name always equals the dict key.
inputs = {
    name: tf.keras.layers.Input(name=name, shape=(), dtype=dtype)
    for name, dtype in INPUT_DTYPES.items()
}

# Identity categorical column over the 'userRatedMovie1' feature.
rated_movie = tf.feature_column.categorical_column_with_identity(
    key='userRatedMovie1', num_buckets=1001)

# Wide-part input: movie_col crossed with rated_movie, hashed into 10000 buckets
# and exposed as an indicator column.
movie_cross = tf.feature_column.crossed_column(
    [movie_col, rated_movie], hash_bucket_size=10000)
crossed_feature = tf.feature_column.indicator_column(movie_cross)

定义模型#

使用keras的函数式API进行定义。

# Wide & Deep architecture, assembled with the Keras functional API.
layers = tf.keras.layers

# Deep part: all numerical and embedding columns, followed by two 128-unit ReLU layers.
deep_features = layers.DenseFeatures(numerical_columns + categorical_columns)
deep = deep_features(inputs)
for _ in range(2):
    deep = layers.Dense(128, activation='relu')(deep)

# Wide part: the crossed feature goes straight through, with no hidden layers.
wide = layers.DenseFeatures(crossed_feature)(inputs)

# Concatenate both parts and score with a single sigmoid unit.
both = layers.concatenate([deep, wide])
output_layer = layers.Dense(1, activation='sigmoid')(both)
model = tf.keras.Model(inputs, output_layer)

训练#

# Evaluation metrics: accuracy plus the area under the ROC and PR curves.
# The ROC metric is created first, keeping the original creation order.
roc_auc = tf.keras.metrics.AUC(curve='ROC')
pr_auc = tf.keras.metrics.AUC(curve='PR')

# Configure the loss function, optimizer and metrics.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', roc_auc, pr_auc],
)

# Train for 5 epochs; kept as the last expression so the notebook shows the History object.
model.fit(train_dataset, epochs=5)
Epoch 1/5
/Users/facer/opt/anaconda3/lib/python3.8/site-packages/keras/engine/functional.py:582: UserWarning: Input dict contained keys ['rating', 'timestamp', 'userRatedMovie2', 'userRatedMovie3', 'userRatedMovie4', 'userRatedMovie5', 'userAvgReleaseYear', 'userReleaseYearStddev'] which did not match any model input. They will be ignored by the model.
  warnings.warn(
7403/7403 [==============================] - 24s 3ms/step - loss: 0.7510 - accuracy: 0.6077 - auc: 0.6272 - auc_1: 0.6638
Epoch 2/5
7403/7403 [==============================] - 20s 3ms/step - loss: 0.6049 - accuracy: 0.6767 - auc: 0.7304 - auc_1: 0.7556
Epoch 3/5
7403/7403 [==============================] - 21s 3ms/step - loss: 0.5482 - accuracy: 0.7214 - auc: 0.7897 - auc_1: 0.8113
Epoch 4/5
7403/7403 [==============================] - 20s 3ms/step - loss: 0.5051 - accuracy: 0.7546 - auc: 0.8270 - auc_1: 0.8471
Epoch 5/5
7403/7403 [==============================] - 20s 3ms/step - loss: 0.4816 - accuracy: 0.7691 - auc: 0.8452 - auc_1: 0.8668
<keras.callbacks.History at 0x7fb47ea620a0>