|
| 1 | +''' |
| 2 | +Created on Dec 9, 2015 |
| 3 | +
|
| 4 | +@author: donghyunz |
| 5 | +''' |
| 6 | +import argparse |
| 7 | +import sys |
| 8 | +from data_manager import Data_Factory |
| 9 | + |
| 10 | +parser = argparse.ArgumentParser() |
| 11 | + |
| 12 | +# Option for pre-processing data |
| 13 | +parser.add_argument("-c", "--do_preprocess", type=bool, |
| 14 | + help="True or False to preprocess raw data for ConvMF (default = False)", default=False) |
| 15 | +parser.add_argument("-r", "--raw_rating_data_path", type=str, |
| 16 | + help="Path to raw rating data. data format - user id::item id::rating") |
| 17 | +parser.add_argument("-i", "--raw_item_document_data_path", type=str, |
| 18 | + help="Path to raw item document data. item document consists of multiple text. data format - item id::text1|text2...") |
| 19 | +parser.add_argument("-m", "--min_rating", type=int, |
| 20 | + help="Users who have less than \"min_rating\" ratings will be removed (default = 1)", default=1) |
| 21 | +parser.add_argument("-l", "--max_length_document", type=int, |
| 22 | + help="Maximum length of document of each item (default = 300)", default=300) |
| 23 | +parser.add_argument("-f", "--max_df", type=float, |
| 24 | + help="Threshold to ignore terms that have a document frequency higher than the given value (default = 0.5)", default=0.5) |
| 25 | +parser.add_argument("-s", "--vocab_size", type=int, |
| 26 | + help="Size of vocabulary (default = 8000)", default=8000) |
| 27 | +parser.add_argument("-t", "--split_ratio", type=float, |
| 28 | + help="Ratio: 1-ratio, ratio/2 and ratio/2 of the entire dataset (R) will be training, valid and test set, respectively (default = 0.2)", default=0.2) |
| 29 | + |
| 30 | +# Option for pre-processing data and running ConvMF |
| 31 | +parser.add_argument("-d", "--data_path", type=str, |
| 32 | + help="Path to training, valid and test data sets") |
| 33 | +parser.add_argument("-a", "--aux_path", type=str, help="Path to R, D_all sets") |
| 34 | + |
| 35 | +# Option for running ConvMF |
| 36 | +parser.add_argument("-o", "--res_dir", type=str, |
| 37 | + help="Path to ConvMF's result") |
| 38 | +parser.add_argument("-e", "--emb_dim", type=int, |
| 39 | + help="Size of latent dimension for word vectors (default: 200)", default=200) |
| 40 | +parser.add_argument("-p", "--pretrain_w2v", type=str, |
| 41 | + help="Path to pretrain word embedding model to initialize word vectors") |
| 42 | +parser.add_argument("-g", "--give_item_weight", type=bool, |
| 43 | + help="True or False to give item weight of ConvMF (default = False)", default=True) |
| 44 | +parser.add_argument("-k", "--dimension", type=int, |
| 45 | + help="Size of latent dimension for users and items (default: 50)", default=50) |
| 46 | +parser.add_argument("-u", "--lambda_u", type=float, |
| 47 | + help="Value of user regularizer") |
| 48 | +parser.add_argument("-v", "--lambda_v", type=float, |
| 49 | + help="Value of item regularizer") |
| 50 | +parser.add_argument("-n", "--max_iter", type=int, |
| 51 | + help="Value of max iteration (default: 200)", default=200) |
| 52 | +parser.add_argument("-w", "--num_kernel_per_ws", type=int, |
| 53 | + help="Number of kernels per window size for CNN module (default: 100)", default=100) |
| 54 | + |
args = parser.parse_args()
do_preprocess = args.do_preprocess
data_path = args.data_path
aux_path = args.aux_path
# data_path and aux_path are required in both modes.
if data_path is None:
    sys.exit("Argument missing - data_path is required")
if aux_path is None:
    sys.exit("Argument missing - aux_path is required")

data_factory = Data_Factory()

if do_preprocess:
    # --- Preprocessing mode: build R / D_all from raw files ---
    path_rating = args.raw_rating_data_path
    path_itemtext = args.raw_item_document_data_path
    # Both raw inputs are mandatory here; fail fast with a clear message
    # instead of crashing later inside Data_Factory.preprocess.
    if path_rating is None:
        sys.exit("Argument missing - raw_rating_data_path is required for preprocessing")
    if path_itemtext is None:
        sys.exit("Argument missing - raw_item_document_data_path is required for preprocessing")
    min_rating = args.min_rating
    max_length = args.max_length_document
    max_df = args.max_df
    vocab_size = args.vocab_size
    split_ratio = args.split_ratio

    print("=================================Preprocess Option Setting=================================")
    print("\tsaving preprocessed aux path - %s" % aux_path)
    print("\tsaving preprocessed data path - %s" % data_path)
    print("\trating data path - %s" % path_rating)
    print("\tdocument data path - %s" % path_itemtext)
    print("\tmin_rating: %d\n\tmax_length_document: %d\n\tmax_df: %.1f\n\tvocab_size: %d\n\tsplit_ratio: %.1f"
          % (min_rating, max_length, max_df, vocab_size, split_ratio))
    print("===========================================================================================")

    R, D_all = data_factory.preprocess(
        path_rating, path_itemtext, min_rating, max_length, max_df, vocab_size)
    data_factory.save(aux_path, R, D_all)
    data_factory.generate_train_valid_test_file_from_R(
        data_path, R, split_ratio)
else:
    # --- Training mode: load preprocessed data and run ConvMF ---
    res_dir = args.res_dir
    emb_dim = args.emb_dim
    pretrain_w2v = args.pretrain_w2v
    dimension = args.dimension
    lambda_u = args.lambda_u
    lambda_v = args.lambda_v
    max_iter = args.max_iter
    num_kernel_per_ws = args.num_kernel_per_ws
    give_item_weight = args.give_item_weight

    # These three have no defaults and are required for training.
    if res_dir is None:
        sys.exit("Argument missing - res_dir is required")
    if lambda_u is None:
        sys.exit("Argument missing - lambda_u is required")
    if lambda_v is None:
        sys.exit("Argument missing - lambda_v is required")

    print("===================================ConvMF Option Setting===================================")
    print("\taux path - %s" % aux_path)
    print("\tdata path - %s" % data_path)
    print("\tresult path - %s" % res_dir)
    print("\tpretrained w2v data path - %s" % pretrain_w2v)
    print("\tdimension: %d\n\tlambda_u: %.4f\n\tlambda_v: %.4f\n\tmax_iter: %d\n\tnum_kernel_per_ws: %d"
          % (dimension, lambda_u, lambda_v, max_iter, num_kernel_per_ws))
    print("===========================================================================================")

    R, D_all = data_factory.load(aux_path)
    CNN_X = D_all['X_sequence']
    # +1 to reserve index 0 (presumably padding in the CNN module — TODO confirm).
    vocab_size = len(D_all['X_vocab']) + 1

    print("\tJay::vocab_size is %d" % vocab_size)
    print("\tJay::cnn_x is %d" % len(CNN_X))

    from models import ConvMF

    if pretrain_w2v is None:
        init_W = None
    else:
        # Build the word-embedding initialization matrix from pretrained word2vec.
        init_W = data_factory.read_pretrained_word2vec(
            pretrain_w2v, D_all['X_vocab'], emb_dim)

    train_user = data_factory.read_rating(data_path + '/train_user.dat')
    train_item = data_factory.read_rating(data_path + '/train_item.dat')
    valid_user = data_factory.read_rating(data_path + '/valid_user.dat')
    test_user = data_factory.read_rating(data_path + '/test_user.dat')

    ConvMF(max_iter=max_iter, res_dir=res_dir,
           lambda_u=lambda_u, lambda_v=lambda_v, dimension=dimension, vocab_size=vocab_size, init_W=init_W,
           give_item_weight=give_item_weight, CNN_X=CNN_X, emb_dim=emb_dim, num_kernel_per_ws=num_kernel_per_ws,
           train_user=train_user, train_item=train_item, valid_user=valid_user, test_user=test_user, R=R)
0 commit comments