FM Implementation with TensorFlow
The previous article, "From matrix factorization to FM: the evolution, how FM is used for recall and ranking, and implementation notes", covered the history of the FM algorithm: from collaborative filtering (CF) to matrix factorization (MF), then to the linear model LR, and finally how FM is used for recall and fine ranking. This article focuses on the code implementation of FM, using the MovieLens dataset as the running example.
The MovieLens dataset (ml-100k) contains 100,000 rating records from 943 users on 1,682 items. The data is used here in the form <user, item, rating>.
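Just to see the raw format before wrapping it up in a utility class, here is a quick peek at the files with pandas (assuming the standard ua.base / ua.test split that ships with ml-100k):

import pandas as pd

# Peek at the raw ml-100k files; each line is user \t item \t rating \t timestamp.
cols = ['user', 'item', 'rating', 'timestamp']
train = pd.read_csv('ml-100k/ua.base', delimiter='\t', names=cols)
test = pd.read_csv('ml-100k/ua.test', delimiter='\t', names=cols)
print(train.shape, test.shape)
print(train.head())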
I am running a tf2.x environment here, but the code is written in tf1.x style, because 1.x is still what we mostly use at work. So how do you use 1.x functionality inside a tf2.x environment?
tf.compat.v1.disable_eager_execution()
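With eager execution disabled, 1.x-style graphs, placeholders, sessions and savers keep working through the tf.compat.v1 namespace. A minimal sanity check of the pattern (illustrative only, not part of the original code):

import tensorflow as tf
tf.compat.v1.disable_eager_execution()

# Build a tiny 1.x-style graph and run it in a session.
a = tf.compat.v1.placeholder(tf.float32, shape=[None])
b = tf.reduce_sum(a)
with tf.compat.v1.Session() as sess:
    print(sess.run(b, feed_dict={a: [1.0, 2.0, 3.0]}))  # 6.0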
Another point to note is that the model needs to be saved at the end of every epoch:
for epoch in range(epochs):
    ...
    ...
    # save the model
    self.saver.save(self.sess, '{}/tf_with_1x'.format(self.modelpath))
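Note that the checkpoint prefix is the same every epoch, so each save overwrites the previous one. If you want to keep one checkpoint per epoch, Saver.save also accepts a global_step argument (a small variation on the call above, not in the original code):

# Variation: tag each checkpoint with the epoch number so earlier epochs are kept
# (tf.compat.v1.train.Saver keeps the 5 most recent checkpoints by default).
self.saver.save(self.sess, '{}/tf_with_1x'.format(self.modelpath), global_step=epoch)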
OK, on to the implementation. First comes a utility class covering three pieces of functionality; the snippets assume the imports shown just below sit at the top of the file.
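These imports are reconstructed from the calls the snippets make; the scipy import matches the csr.csr_matrix usage below (on newer scipy you can call scipy.sparse.csr_matrix directly):

from collections import defaultdict
from itertools import count

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.sparse import csr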
Loading the data
def load_dataset(self, train_path, test_path, mode):
    cols = ['user', 'item', 'rating', 'timestamp']
    train = pd.read_csv(train_path, delimiter='\t', names=cols)
    test = pd.read_csv(test_path, delimiter='\t', names=cols)
    print(train.user.values)
    # one-hot encode user and item ids into a sparse design matrix;
    # the test set reuses the train index and the train column count
    X_train, ix = self.vectorize_dic({'users': train.user.values, 'items': train.item.values})
    X_test, ix = self.vectorize_dic({'users': test.user.values, 'items': test.item.values}, ix, X_train.shape[1])
    y1 = train.rating.values
    y_train = np.zeros((len(y1), 1))
    y2 = test.rating.values
    y_test = np.zeros((len(y2), 1))
    if mode == 'regression':
        y_train = y1.copy()
        y_test = y2.copy()
    elif mode == 'classification':
        # binarize: rating 5 -> +1, everything below -> -1
        y_train[np.where(y1 == 5)] = 1
        y_train[np.where(y1 < 5)] = -1
        y_test[np.where(y2 == 5)] = 1
        y_test[np.where(y2 < 5)] = -1
    return X_train, y_train, X_test, y_test
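A quick usage sketch (Util is just a placeholder name of mine, since the utility class is never named in the snippets):

# Hypothetical usage: load the standard ua split in regression mode.
util = Util()
X_train, y_train, X_test, y_test = util.load_dataset('ml-100k/ua.base', 'ml-100k/ua.test', 'regression')
print(X_train.shape)  # (n_train_rows, n_user_columns + n_item_columns)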
Building a scipy CSR matrix
def vectorize_dic(self, dic, ix=None, p=None):
    '''
    Creates a scipy csr matrix from a list of lists (each inner list is a set of
    values corresponding to a feature)
    :param dic: dictionary of feature lists. Keys are the names of the features
    :param ix: index generator (default None)
    :param p: dimension of the feature space (number of columns in the sparse matrix)
    :return:
    '''
    if ix is None:
        d = count(0)
        ix = defaultdict(lambda: next(d))
    # number of samples
    n = len(list(dic.values())[0])
    # number of feature fields
    g = len(list(dic.keys()))
    # total number of entries after flattening the matrix
    nz = n * g

    col_ix = np.empty(nz, dtype=int)
    i = 0
    for k, lis in dic.items():
        # starting at offset i, with stride g
        col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]
        i += 1

    # np.repeat(np.arange(0, 10), 3)
    # array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9])
    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)
    ixx = np.where(col_ix < p)
    # on CSR compression, see: https://cloud.tencent.com/developer/article/1099721
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
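To make the one-hot encoding concrete, a tiny worked example: two samples with users [1, 2] and items [10, 10]. Each distinct user and item gets its own column, so the result is a 2x3 matrix:

# Toy call: 2 samples, 2 feature fields (users and items).
X, ix = util.vectorize_dic({'users': [1, 2], 'items': [10, 10]})
print(X.todense())
# [[1. 0. 1.]
#  [0. 1. 1.]]
# columns: user=1, user=2, item=10 (item=10 is shared by both rows)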
Returning batches of data
def get_batcher(self, X_, y_=None, batch_size=None):
    n_samples = X_.shape[0]
    if batch_size is None:
        batch_size = n_samples
    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        ret_y = None
        if y_ is not None:
            ret_y = y_[i:i + batch_size]
        yield (ret_x, ret_y)
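The batcher is a plain generator, so it can be iterated directly. A quick illustration with toy arrays (util is the same placeholder instance as above):

# Toy illustration: 5 samples with batch_size=2 yield batches of 2, 2 and 1 rows.
X_toy = np.arange(10).reshape(5, 2)
y_toy = np.arange(5)
for bx, by in util.get_batcher(X_toy, y_toy, batch_size=2):
    print(bx.shape, by)
# (2, 2) [0 1]
# (2, 2) [2 3]
# (1, 2) [4]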
Next comes the FM model itself, which consists of the following parts:
Loading the data
def load_data(self):
    self.X_train, self.y_train, self.X_test, self.y_test = self.util.load_dataset(self.trainPath, self.testPath, self.mode)
    self.X_train = self.X_train.todense()
    self.X_test = self.X_test.todense()
    print('Train data shape: ', self.X_train.shape)
    print(self.X_train[:3])
    print('Test data shape: ', self.X_test.shape)
    print(self.X_test[:3])
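One caveat: todense() materializes the full one-hot design matrix in memory, roughly n_samples x (n_users + n_items) floats. That is fine for ml-100k, but for larger data a common alternative is to keep the CSR matrix and densify one batch at a time. A sketch of such a batcher (my assumption, not the original code):

# Sketch: keep X_ as a scipy CSR matrix and densify only the current slice.
def get_sparse_batcher(self, X_, y_=None, batch_size=None):
    n_samples = X_.shape[0]
    if batch_size is None:
        batch_size = n_samples
    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound].toarray()  # dense only for this batch
        ret_y = y_[i:upper_bound] if y_ is not None else None
        yield (ret_x, ret_y)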
Building the model
def build_model(self):
    self.row_num, self.col_num = self.X_train.shape
    # design matrix
    self.X = tf.compat.v1.placeholder('float', shape=[None, self.col_num])
    # target vector
    self.y = tf.compat.v1.placeholder('float', shape=[None, 1])
    # bias and first-order weights
    w0 = tf.Variable(tf.zeros([1]))
    W = tf.Variable(tf.zeros([self.col_num]))
    # initialize the latent factor matrix
    self.V = tf.Variable(tf.random.normal([self.k, self.col_num], stddev=0.01))
    # prediction variable (overwritten below)
    y_hat = tf.Variable(tf.zeros([self.row_num, 1]))
    # linear part
    linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, self.X), 1, keepdims=True))
    # pairwise interaction part; see https://mp.weixin.qq.com/s/mJpNwEDGqS7u-vtZ54zV6A for the derivation
    pair_interaction = (tf.multiply(0.5,
                                    tf.reduce_sum(
                                        tf.subtract(
                                            tf.pow(tf.matmul(self.X, tf.transpose(self.V)), 2),
                                            tf.matmul(tf.pow(self.X, 2), tf.transpose(tf.pow(self.V, 2)))
                                        ),
                                        1, keepdims=True)))
    self.y_hat = tf.add(linear_terms, pair_interaction)
    # L2 regularization strengths (set to 0.0 here, i.e. effectively disabled)
    # lambda_w = tf.constant(0.001, name='lambda_w')
    # lambda_v = tf.constant(0.001, name='lambda_v')
    lambda_w = tf.constant(0.00, name='lambda_w')
    lambda_v = tf.constant(0.00, name='lambda_v')
    l2_norm = tf.add(
        tf.reduce_sum(tf.multiply(lambda_w, tf.pow(W, 2))),
        tf.reduce_sum(tf.multiply(lambda_v, tf.pow(self.V, 2)))
    )
    if self.mode == 'regression':
        self.error = tf.reduce_mean(tf.square(tf.subtract(self.y, self.y_hat)))
        self.loss = tf.add(self.error, l2_norm)
    elif self.mode == 'classification':
        print(self.y.get_shape().as_list())
        print(self.y_hat.get_shape().as_list())
        # note: tf.nn.sigmoid_cross_entropy_with_logits expects labels in {0, 1};
        # the -1/+1 labels produced in load_dataset are kept here to stay faithful to the original code
        self.error = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y, logits=self.y_hat))
        self.loss = tf.add(self.error, l2_norm)
        print(self.loss.get_shape().as_list())
        print(l2_norm.get_shape().as_list())
    # self.optimizer = tf.train.AdamOptimizer(beta1=0.9, beta2=0.5).minimize(self.loss)
    self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(self.loss)
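The pair_interaction term above is the standard FM rewrite of the pairwise sum; this is the identity that the linked derivation walks through:

\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n} v_{i,f} x_i\right)^2 - \sum_{i=1}^{n} v_{i,f}^{2} x_i^{2}\right]

In the code, tf.matmul(self.X, tf.transpose(self.V)) computes the inner sums for all k factors at once, so the whole interaction collapses into two matrix multiplications, a subtraction, and a row-wise reduce_sum.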
Training the model
def train(self):
    epochs = 5
    batch_size = 256
    self.init = tf.compat.v1.global_variables_initializer()
    self.sess = tf.compat.v1.Session()
    self.saver = tf.compat.v1.train.Saver()
    # run after the graph has been built
    self.sess.run(self.init)
    for epoch in range(epochs):
        perm = np.random.permutation(self.X_train.shape[0])
        cnt = 0
        for batchX, batchY in self.util.get_batcher(self.X_train[perm], self.y_train[perm], batch_size):
            _, loss = self.sess.run((self.optimizer, self.loss),
                                    feed_dict={self.X: batchX.reshape(-1, self.col_num),
                                               self.y: batchY.reshape(-1, 1)})
            if cnt == 1:
                print('Epoch: %d, Loss: %.3f' % (epoch + 1, loss))
            cnt += 1
        # save the model
        self.saver.save(self.sess, '{}/tf_with_1x'.format(self.modelpath))
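Putting the pieces together, a hypothetical driver; the FM class name and its constructor arguments are placeholders of mine, not part of the original code:

# Hypothetical end-to-end driver; adapt names and paths to your own code.
if __name__ == '__main__':
    fm = FM(trainPath='ml-100k/ua.base', testPath='ml-100k/ua.test',
            mode='regression', k=10, modelpath='./model')
    fm.load_data()
    fm.build_model()
    fm.train()
    fm.evaluate()  # shown below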
Evaluating the model
def evaluate(self):
    # load the saved model
    with tf.compat.v1.Session() as sess:
        sess.run(self.init)
        self.saver.restore(sess, '{}/tf_with_1x'.format(self.modelpath))
        print('Model loaded successfully ...')
        if self.mode == 'regression':
            errors = []
            for batchX, batchY in self.util.get_batcher(self.X_test, self.y_test):
                errors.append(sess.run(self.error, feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)}))
            RMSE = np.sqrt(np.array(errors).mean())
            print('RMSE: ', RMSE)
        elif self.mode == 'classification':
            pred = np.zeros((len(self.X_test), 1))
            for batchX, batchY in self.util.get_batcher(self.X_test, self.y_test):
                logits = sess.run(self.y_hat, feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)})
                y_hat = self.util.sigmoid(logits)
                pred[np.where(y_hat > 0.5)] = 1
                pred[np.where(y_hat < 0.5)] = -1
            print('Accuracy: ', np.mean(self.y_test == pred))
        sess.close()
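The classification branch calls self.util.sigmoid, which is not shown in the utility class above. If you need to add it, a one-line numpy version does the job (my addition, not in the original code):

# Assumed helper on the utility class: element-wise sigmoid on numpy arrays.
def sigmoid(self, x):
    return 1.0 / (1.0 + np.exp(-x))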