FM Implementation with TensorFlow
The previous article, "From matrix factorization to FM: the evolution, how FM is used for recall and ranking, and implementation notes", covered the history of the FM algorithm: from collaborative filtering (CF) to matrix factorization (MF), then to the linear model LR, and finally how FM is used for recall and fine ranking. This article focuses on the code implementation of FM, using the MovieLens dataset as the running example.
The MovieLens dataset (ml-100k) contains 100,000 rating records from 943 users on 1,682 items. The data is used here in the form <user, item, rating>.
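Just to see the raw format before wrapping it up in a utility class, here is a quick peek at the files with pandas (assuming the standard ua.base / ua.test split that ships with ml-100k):

import pandas as pd

# Peek at the raw ml-100k files; each line is user \t item \t rating \t timestamp.
cols = ['user', 'item', 'rating', 'timestamp']
train = pd.read_csv('ml-100k/ua.base', delimiter='\t', names=cols)
test = pd.read_csv('ml-100k/ua.test', delimiter='\t', names=cols)
print(train.shape, test.shape)
print(train.head())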
I am running a tf2.x environment here, but the code is written in tf1.x style, because 1.x is still what we mostly use at work. So how do you use 1.x functionality inside a tf2.x environment?
tf.compat.v1.disable_eager_execution()
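With eager execution disabled, 1.x-style graphs, placeholders, sessions and savers keep working through the tf.compat.v1 namespace. A minimal sanity check of the pattern (illustrative only, not part of the original code):

import tensorflow as tf
tf.compat.v1.disable_eager_execution()

# Build a tiny 1.x-style graph and run it in a session.
a = tf.compat.v1.placeholder(tf.float32, shape=[None])
b = tf.reduce_sum(a)
with tf.compat.v1.Session() as sess:
    print(sess.run(b, feed_dict={a: [1.0, 2.0, 3.0]}))  # 6.0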
Another point to note is that the model needs to be saved at the end of every epoch:
for epoch in range(epochs):
    ...
    ...
    # save the model
    self.saver.save(self.sess, '{}/tf_with_1x'.format(self.modelpath))
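Note that the checkpoint prefix is the same every epoch, so each save overwrites the previous one. If you want to keep one checkpoint per epoch, Saver.save also accepts a global_step argument (a small variation on the call above, not in the original code):

# Variation: tag each checkpoint with the epoch number so earlier epochs are kept
# (tf.compat.v1.train.Saver keeps the 5 most recent checkpoints by default).
self.saver.save(self.sess, '{}/tf_with_1x'.format(self.modelpath), global_step=epoch)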
OK, on to the implementation. First comes a utility class covering three pieces of functionality; the snippets assume the imports shown just below sit at the top of the file.
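These imports are reconstructed from the calls the snippets make; the scipy import matches the csr.csr_matrix usage below (on newer scipy you can call scipy.sparse.csr_matrix directly):

from collections import defaultdict
from itertools import count

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.sparse import csr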
Loading the data
def load_dataset(self, train_path, test_path, mode):
    cols = ['user', 'item', 'rating', 'timestamp']
    train = pd.read_csv(train_path, delimiter='\t', names=cols)
    test = pd.read_csv(test_path, delimiter='\t', names=cols)
    print(train.user.values)
    # one-hot encode user and item ids into a sparse design matrix;
    # the test set reuses the train index and the train column count
    X_train, ix = self.vectorize_dic({'users': train.user.values, 'items': train.item.values})
    X_test, ix = self.vectorize_dic({'users': test.user.values, 'items': test.item.values}, ix, X_train.shape[1])
    y1 = train.rating.values
    y_train = np.zeros((len(y1), 1))
    y2 = test.rating.values
    y_test = np.zeros((len(y2), 1))
    if mode == 'regression':
        y_train = y1.copy()
        y_test = y2.copy()
    elif mode == 'classification':
        # binarize: rating 5 -> +1, everything below -> -1
        y_train[np.where(y1 == 5)] = 1
        y_train[np.where(y1 < 5)] = -1
        y_test[np.where(y2 == 5)] = 1
        y_test[np.where(y2 < 5)] = -1
    return X_train, y_train, X_test, y_test
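A quick usage sketch (Util is just a placeholder name of mine, since the utility class is never named in the snippets):

# Hypothetical usage: load the standard ua split in regression mode.
util = Util()
X_train, y_train, X_test, y_test = util.load_dataset('ml-100k/ua.base', 'ml-100k/ua.test', 'regression')
print(X_train.shape)  # (n_train_rows, n_user_columns + n_item_columns)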
Building a scipy CSR matrix
def vectorize_dic(self, dic, ix=None, p=None):
    '''
    Creates a scipy csr matrix from a list of lists (each inner list is a set of
    values corresponding to a feature)
    :param dic: dictionary of feature lists. Keys are the names of the features
    :param ix: index generator (default None)
    :param p: dimension of the feature space (number of columns in the sparse matrix)
    :return:
    '''
    if ix is None:
        d = count(0)
        ix = defaultdict(lambda: next(d))
    # number of samples
    n = len(list(dic.values())[0])
    # number of feature fields
    g = len(list(dic.keys()))
    # total number of entries after flattening the matrix
    nz = n * g

    col_ix = np.empty(nz, dtype=int)
    i = 0
    for k, lis in dic.items():
        # starting at offset i, with stride g
        col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]
        i += 1

    # np.repeat(np.arange(0, 10), 3)
    # array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9])
    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)
    ixx = np.where(col_ix < p)
    # on CSR compression, see: https://cloud.tencent.com/developer/article/1099721
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
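To make the one-hot encoding concrete, a tiny worked example: two samples with users [1, 2] and items [10, 10]. Each distinct user and item gets its own column, so the result is a 2x3 matrix:

# Toy call: 2 samples, 2 feature fields (users and items).
X, ix = util.vectorize_dic({'users': [1, 2], 'items': [10, 10]})
print(X.todense())
# [[1. 0. 1.]
#  [0. 1. 1.]]
# columns: user=1, user=2, item=10 (item=10 is shared by both rows)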
Returning batches of data
def get_batcher(self, X_, y_=None, batch_size=None):
    n_samples = X_.shape[0]
    if batch_size is None:
        batch_size = n_samples
    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        ret_y = None
        if y_ is not None:
            ret_y = y_[i:i + batch_size]
        yield (ret_x, ret_y)
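The batcher is a plain generator, so it can be iterated directly. A quick illustration with toy arrays (util is the same placeholder instance as above):

# Toy illustration: 5 samples with batch_size=2 yield batches of 2, 2 and 1 rows.
X_toy = np.arange(10).reshape(5, 2)
y_toy = np.arange(5)
for bx, by in util.get_batcher(X_toy, y_toy, batch_size=2):
    print(bx.shape, by)
# (2, 2) [0 1]
# (2, 2) [2 3]
# (1, 2) [4]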
Next comes the FM model itself, which consists of the following parts:
Loading the data
def load_data(self):
    self.X_train, self.y_train, self.X_test, self.y_test = self.util.load_dataset(self.trainPath, self.testPath, self.mode)
    self.X_train = self.X_train.todense()
    self.X_test = self.X_test.todense()
    print('Train data shape: ', self.X_train.shape)
    print(self.X_train[:3])
    print('Test data shape: ', self.X_test.shape)
    print(self.X_test[:3])
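One caveat: todense() materializes the full one-hot design matrix in memory, roughly n_samples x (n_users + n_items) floats. That is fine for ml-100k, but for larger data a common alternative is to keep the CSR matrix and densify one batch at a time. A sketch of such a batcher (my assumption, not the original code):

# Sketch: keep X_ as a scipy CSR matrix and densify only the current slice.
def get_sparse_batcher(self, X_, y_=None, batch_size=None):
    n_samples = X_.shape[0]
    if batch_size is None:
        batch_size = n_samples
    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound].toarray()  # dense only for this batch
        ret_y = y_[i:upper_bound] if y_ is not None else None
        yield (ret_x, ret_y)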
Building the model
def build_model(self):
    self.row_num, self.col_num = self.X_train.shape
    # design matrix
    self.X = tf.compat.v1.placeholder('float', shape=[None, self.col_num])
    # target vector
    self.y = tf.compat.v1.placeholder('float', shape=[None, 1])
    # bias and first-order weights
    w0 = tf.Variable(tf.zeros([1]))
    W = tf.Variable(tf.zeros([self.col_num]))
    # initialize the latent factor matrix
    self.V = tf.Variable(tf.random.normal([self.k, self.col_num], stddev=0.01))
    # prediction variable (overwritten below)
    y_hat = tf.Variable(tf.zeros([self.row_num, 1]))
    # linear part
    linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, self.X), 1, keepdims=True))
    # pairwise interaction part; see https://mp.weixin.qq.com/s/mJpNwEDGqS7u-vtZ54zV6A for the derivation
    pair_interaction = (tf.multiply(0.5,
                                    tf.reduce_sum(
                                        tf.subtract(
                                            tf.pow(tf.matmul(self.X, tf.transpose(self.V)), 2),
                                            tf.matmul(tf.pow(self.X, 2), tf.transpose(tf.pow(self.V, 2)))
                                        ),
                                        1, keepdims=True)))
    self.y_hat = tf.add(linear_terms, pair_interaction)
    # L2 regularization strengths (set to 0.0 here, i.e. effectively disabled)
    # lambda_w = tf.constant(0.001, name='lambda_w')
    # lambda_v = tf.constant(0.001, name='lambda_v')
    lambda_w = tf.constant(0.00, name='lambda_w')
    lambda_v = tf.constant(0.00, name='lambda_v')
    l2_norm = tf.add(
        tf.reduce_sum(tf.multiply(lambda_w, tf.pow(W, 2))),
        tf.reduce_sum(tf.multiply(lambda_v, tf.pow(self.V, 2)))
    )
    if self.mode == 'regression':
        self.error = tf.reduce_mean(tf.square(tf.subtract(self.y, self.y_hat)))
        self.loss = tf.add(self.error, l2_norm)
    elif self.mode == 'classification':
        print(self.y.get_shape().as_list())
        print(self.y_hat.get_shape().as_list())
        # note: tf.nn.sigmoid_cross_entropy_with_logits expects labels in {0, 1};
        # the -1/+1 labels produced in load_dataset are kept here to stay faithful to the original code
        self.error = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y, logits=self.y_hat))
        self.loss = tf.add(self.error, l2_norm)
        print(self.loss.get_shape().as_list())
        print(l2_norm.get_shape().as_list())
    # self.optimizer = tf.train.AdamOptimizer(beta1=0.9, beta2=0.5).minimize(self.loss)
    self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(self.loss)
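The pair_interaction term above is the standard FM rewrite of the pairwise sum; this is the identity that the linked derivation walks through:

\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n} v_{i,f} x_i\right)^2 - \sum_{i=1}^{n} v_{i,f}^{2} x_i^{2}\right]

In the code, tf.matmul(self.X, tf.transpose(self.V)) computes the inner sums for all k factors at once, so the whole interaction collapses into two matrix multiplications, a subtraction, and a row-wise reduce_sum.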
Training the model
def train(self):
    epochs = 5
    batch_size = 256
    self.init = tf.compat.v1.global_variables_initializer()
    self.sess = tf.compat.v1.Session()
    self.saver = tf.compat.v1.train.Saver()
    # run after the graph has been built
    self.sess.run(self.init)
    for epoch in range(epochs):
        perm = np.random.permutation(self.X_train.shape[0])
        cnt = 0
        for batchX, batchY in self.util.get_batcher(self.X_train[perm], self.y_train[perm], batch_size):
            _, loss = self.sess.run((self.optimizer, self.loss),
                                    feed_dict={self.X: batchX.reshape(-1, self.col_num),
                                               self.y: batchY.reshape(-1, 1)})
            if cnt == 1:
                print('Epoch: %d, Loss: %.3f' % (epoch + 1, loss))
            cnt += 1
        # save the model
        self.saver.save(self.sess, '{}/tf_with_1x'.format(self.modelpath))
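Putting the pieces together, a hypothetical driver; the FM class name and its constructor arguments are placeholders of mine, not part of the original code:

# Hypothetical end-to-end driver; adapt names and paths to your own code.
if __name__ == '__main__':
    fm = FM(trainPath='ml-100k/ua.base', testPath='ml-100k/ua.test',
            mode='regression', k=10, modelpath='./model')
    fm.load_data()
    fm.build_model()
    fm.train()
    fm.evaluate()  # shown below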
Evaluating the model
def evaluate(self):
    # load the saved model
    with tf.compat.v1.Session() as sess:
        sess.run(self.init)
        self.saver.restore(sess, '{}/tf_with_1x'.format(self.modelpath))
        print('Model loaded successfully ...')
        if self.mode == 'regression':
            errors = []
            for batchX, batchY in self.util.get_batcher(self.X_test, self.y_test):
                errors.append(sess.run(self.error, feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)}))
            RMSE = np.sqrt(np.array(errors).mean())
            print('RMSE: ', RMSE)
        elif self.mode == 'classification':
            pred = np.zeros((len(self.X_test), 1))
            for batchX, batchY in self.util.get_batcher(self.X_test, self.y_test):
                logits = sess.run(self.y_hat, feed_dict={self.X: batchX.reshape(-1, self.col_num), self.y: batchY.reshape(-1, 1)})
                y_hat = self.util.sigmoid(logits)
                pred[np.where(y_hat > 0.5)] = 1
                pred[np.where(y_hat < 0.5)] = -1
            print('Accuracy: ', np.mean(self.y_test == pred))
        sess.close()
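The classification branch calls self.util.sigmoid, which is not shown in the utility class above. If you need to add it, a one-line numpy version does the job (my addition, not in the original code):

# Assumed helper on the utility class: element-wise sigmoid on numpy arrays.
def sigmoid(self, x):
    return 1.0 / (1.0 + np.exp(-x))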