Examples
Last updated
Was this helpful?
Last updated
Was this helpful?
The MovieLens data has been used for personalized tag recommendation,which contains 668, 953 tag applications of users on movies. Here is a small fraction of data include only sparse field.
This example shows how to use DSSM
to solve a matching task. You can get the demo data movielens_sample.txt and run the following codes.
import pandas as pd
from deepctr.inputs import SparseFeat, VarLenSparseFeat
from preprocess import gen_data_set, gen_model_input
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.models import Model
from deepmatch.models import *
if __name__ == "__main__":
data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]
SEQ_LEN = 50
negsample = 3
# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`
features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
feature_max_idx = {}
for feature in features:
lbe = LabelEncoder()
data[feature] = lbe.fit_transform(data[feature]) + 1
feature_max_idx[feature] = data[feature].max() + 1
user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')
item_profile = data[["movie_id"]].drop_duplicates('movie_id')
user_profile.set_index("user_id", inplace=True)
user_item_list = data.groupby("user_id")['movie_id'].apply(list)
train_set, test_set = gen_data_set(data, negsample)
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)
# 2.count #unique features for each sparse field and generate feature config for sequence feature
embedding_dim = 16
user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
SparseFeat("age", feature_max_idx['age'], embedding_dim),
SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
]
item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]
# 3.Define Model and train
model = DSSM(user_feature_columns, item_feature_columns) # FM(user_feature_columns,item_feature_columns)
model.compile(optimizer='adagrad', loss="binary_crossentropy")
history = model.fit(train_model_input, train_label, # train_label,
batch_size=256, epochs=1, verbose=1, validation_split=0.0, )
# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profile['movie_id'].values, "movie_idx": item_profile['movie_id'].values}
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)
print(user_embs.shape)
print(item_embs.shape)
# 5. [Optional] ANN search by faiss and evaluate the result
# test_true_label = {line[0]:[line[2]] for line in test_set}
#
# import numpy as np
# import faiss
# from tqdm import tqdm
# from deepmatch.utils import recall_N
#
# index = faiss.IndexFlatIP(embedding_dim)
# # faiss.normalize_L2(item_embs)
# index.add(item_embs)
# # faiss.normalize_L2(user_embs)
# D, I = index.search(user_embs, 50)
# s = []
# hit = 0
# for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
# try:
# pred = [item_profile['movie_id'].values[x] for x in I[i]]
# filter_item = None
# recall_score = recall_N(test_true_label[uid], pred, N=50)
# s.append(recall_score)
# if test_true_label[uid] in pred:
# hit += 1
# except:
# print(i)
# print("recall", np.mean(s))
# print("hr", hit / len(test_user_model_input['user_id']))
The MovieLens data has been used for personalized tag recommendation,which contains 668, 953 tag applications of users on movies. Here is a small fraction of data include only sparse field.
This example shows how to use YoutubeDNN
to solve a matching task. You can get the demo data movielens_sample.txt and run the following codes.
import pandas as pd
from deepctr.inputs import SparseFeat, VarLenSparseFeat
from preprocess import gen_data_set, gen_model_input
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss
if __name__ == "__main__":
data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]
SEQ_LEN = 50
negsample = 0
# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`
features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
feature_max_idx = {}
for feature in features:
lbe = LabelEncoder()
data[feature] = lbe.fit_transform(data[feature]) + 1
feature_max_idx[feature] = data[feature].max() + 1
user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')
item_profile = data[["movie_id"]].drop_duplicates('movie_id')
user_profile.set_index("user_id", inplace=True)
user_item_list = data.groupby("user_id")['movie_id'].apply(list)
train_set, test_set = gen_data_set(data, negsample)
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)
# 2.count #unique features for each sparse field and generate feature config for sequence feature
embedding_dim = 16
user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
SparseFeat("age", feature_max_idx['age'], embedding_dim),
SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
]
item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]
# 3.Define Model and train
K.set_learning_phase(True)
model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, user_dnn_hidden_units=(64, 16))
# model = MIND(user_feature_columns,item_feature_columns,dynamic_k=True,p=1,k_max=2,num_sampled=5,user_dnn_hidden_units=(64,16),init_std=0.001)
model.compile(optimizer="adagrad", loss=sampledsoftmaxloss) # "binary_crossentropy")
history = model.fit(train_model_input, train_label, # train_label,
batch_size=256, epochs=1, verbose=1, validation_split=0.0, )
# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profile['movie_id'].values, "movie_idx": item_profile['movie_id'].values}
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :] i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)
print(user_embs.shape)
print(item_embs.shape)
# 5. [Optional] ANN search by faiss and evaluate the result
# test_true_label = {line[0]:[line[2]] for line in test_set}
#
# import numpy as np
# import faiss
# from tqdm import tqdm
# from deepmatch.utils import recall_N
#
# index = faiss.IndexFlatIP(embedding_dim)
# # faiss.normalize_L2(item_embs)
# index.add(item_embs)
# # faiss.normalize_L2(user_embs)
# D, I = index.search(user_embs, 50)
# s = []
# hit = 0
# for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
# try:
# pred = [item_profile['movie_id'].values[x] for x in I[i]]
# filter_item = None
# recall_score = recall_N(test_true_label[uid], pred, N=50)
# s.append(recall_score)
# if test_true_label[uid] in pred:
# hit += 1
# except:
# print(i)
# print("recall", np.mean(s))
# print("hr", hit / len(test_user_model_input['user_id']))