commit message

66ac04a3 · 高雅喆 · 66ac04a3 · 66ac04a3 · 66ac04a3 · 66ac04a3
Commit 66ac04a3 authored Aug 06, 2018 by 高雅喆
17 changed files
--- a/.DS_Store
+++ b/.DS_Store
--- a/FfmEncoder.py
+++ b/FfmEncoder.py
+import hashlib, math, os, subprocess
+from multiprocessing import Process
+
+
+def hashstr(str, nr_bins=1e+6):
+    """Map a string to a stable integer bucket in [1, int(nr_bins) - 1].
+
+    The string is UTF-8 encoded, hashed with MD5, and the hexadecimal digest
+    is reduced modulo ``int(nr_bins) - 1``; the result is shifted by one so
+    that bucket 0 is never returned.
+
+    Args:
+        str: text to hash (the name shadows the builtin but is kept because
+            it is part of the signature).
+        nr_bins: size of the hash space; truncated with ``int()``.
+
+    Returns:
+        An int bucket index, starting at 1.
+    """
+    hex_digest = hashlib.md5(str.encode('utf8')).hexdigest()
+    bucket_count = int(nr_bins) - 1
+    return int(hex_digest, 16) % bucket_count + 1
+
+
+class FfmEncoder():
+    """Encode DataFrame rows as text lines of the form
+    ``<label> <field_idx>:<hashed_feature>:1 ...``.
+
+    The frame is split into ``nthread`` contiguous slices; each slice is
+    written to ``<path>_tmp_<i>`` by a separate process, and the parts are
+    then concatenated into ``path`` and removed.
+    """
+
+    def __init__(self, field_names, label_name, nthread=1):
+        """Store the encoder configuration.
+
+        Args:
+            field_names: column names to encode, in output field order.
+            label_name: name of the column holding the label.
+            nthread: number of worker processes (and temporary part files).
+        """
+        self.field_names = field_names
+        self.nthread = nthread
+        self.label = label_name
+
+    def gen_feats(self, row):
+        """Return one ``'<field>-<value>'`` key per configured field.
+
+        Args:
+            row: any mapping-like object indexable by field name.
+
+        Returns:
+            A list of strings ordered like ``self.field_names``.
+        """
+        return [field + '-' + str(row[field]) for field in self.field_names]
+
+    def gen_hashed_fm_feats(self, feats, nr_bins=1e+6):
+        """Turn ``(field_idx, feature_key)`` pairs into ``'idx:hash:1'`` tokens.
+
+        Args:
+            feats: iterable of ``(field_idx, feature_key)`` pairs.
+            nr_bins: hash-space size forwarded to ``hashstr``; the default
+                matches the previously hard-coded value.
+
+        Returns:
+            A list of formatted tokens, one per input pair.
+        """
+        return ['{0}:{1}:1'.format(field, hashstr(feat, nr_bins))
+                for (field, feat) in feats]
+
+    def convert(self, df, path, i):
+        """Encode the ``i``-th slice of ``df`` into ``<path>_tmp_<i>``.
+
+        Args:
+            df: the full DataFrame; only rows of slice ``i`` are written.
+            path: final output path, used as prefix for the part file.
+            i: zero-based slice index, expected in ``range(self.nthread)``.
+        """
+        # int() keeps the slice bounds integral even where math.ceil
+        # returns a float (Python 2).
+        lines_per_thread = int(math.ceil(float(df.shape[0]) / self.nthread))
+        start = i * lines_per_thread
+        sub_df = df.iloc[start: start + lines_per_thread]
+        tmp_path = path + '_tmp_{0}'.format(i)
+        with open(tmp_path, 'w') as f:
+            for _, row in sub_df.iterrows():
+                # enumerate() supplies the field index; a separate name is
+                # no longer needed, so the slice index ``i`` is not shadowed.
+                feats = self.gen_hashed_fm_feats(enumerate(self.gen_feats(row)))
+                f.write(str(int(row[self.label])) + ' ' + ' '.join(feats) + '\n')
+
+    def parallel_convert(self, df, path):
+        """Run ``convert`` for every slice in its own process and wait.
+
+        Raises:
+            RuntimeError: if any worker exits with a non-zero code, so a
+                crashed worker cannot silently produce truncated output.
+        """
+        processes = []
+        for i in range(self.nthread):
+            p = Process(target=self.convert, args=(df, path, i))
+            p.start()
+            processes.append(p)
+        for p in processes:
+            p.join()
+        failed = [idx for idx, p in enumerate(processes) if p.exitcode != 0]
+        if failed:
+            raise RuntimeError(
+                'convert worker(s) {0} exited abnormally'.format(failed))
+
+    def delete(self, path):
+        """Remove every ``<path>_tmp_<i>`` part file."""
+        for i in range(self.nthread):
+            os.remove(path + '_tmp_{0}'.format(i))
+
+    def cat(self, path):
+        """Concatenate the part files, in slice order, into ``path``.
+
+        Any existing file at ``path`` is replaced. The copy is done in
+        Python rather than through a shell ``cat`` command, so paths with
+        spaces or shell metacharacters work and no external binary is
+        required.
+        """
+        if os.path.exists(path):
+            os.remove(path)
+        with open(path, 'wb') as out:
+            for i in range(self.nthread):
+                with open(path + '_tmp_{0}'.format(i), 'rb') as part:
+                    # Copy in 1 MiB chunks to keep memory use bounded.
+                    for chunk in iter(lambda: part.read(1 << 20), b''):
+                        out.write(chunk)
+
+    def transform(self, df, path):
+        """Encode ``df`` into ``path``: convert in parallel, merge, clean up."""
+        print('converting data......')
+        self.parallel_convert(df, path)
+        self.cat(path)
+        self.delete(path)
+
+import pandas as pd
+
+# Load the raw log and keep only the columns needed for encoding.
+# NOTE(review): this runs at import time. FfmEncoder starts worker processes,
+# and the multiprocessing guidelines ask for an `if __name__ == '__main__':`
+# guard on platforms that use the spawn start method - consider adding one.
+test = pd.read_csv("./train_all.csv")
+test = test[["device_id","cid","user_view","time","label"]]
+test["time"] = pd.to_datetime(test['time'])
+# Derive hour-of-day and day-of-week with the vectorised .dt accessor rather
+# than a per-row Python loop that looks each timestamp up by index label.
+test["hour"] = test["time"].dt.hour
+test["weekday"] = test["time"].dt.weekday
+# Columns encoded as fields, in output order.
+filed_names = ["device_id", "cid", "user_view","hour","weekday"]
+fe = FfmEncoder(filed_names,label_name='label',nthread=1)
+fe.transform(test, 'train_all.ffm')
--- a/model.out
+++ b/model.out
--- a/output.txt
+++ b/output.txt
--- a/run_demo_ctr.py
+++ b/run_demo_ctr.py
+import xlearn as xl
+
+# ---- Training -------------------------------------------------------------
+# Create a field-aware factorization machine and attach the training and
+# validation data sets.
+ffm_model = xl.create_ffm()
+ffm_model.setTrain("./train_last.ffm")
+ffm_model.setValidate("./test_last.ffm")
+
+# Hyper-parameters:
+#   task   - binary classification
+#   lr     - learning rate (0.2)
+#   lambda - regularization strength (0.002)
+#   metric - evaluation metric (accuracy)
+param = dict([
+    ('task', 'binary'),
+    ('lr', 0.2),
+    ('lambda', 0.002),
+    ('metric', 'acc'),
+])
+
+# Train; the fitted model is written to model.out.
+ffm_model.fit(param, './model.out')
+
+# ---- Prediction -----------------------------------------------------------
+# Attach the test set and request sigmoid-transformed (0-1) outputs.
+ffm_model.setTest("./test_imp.ffm")
+ffm_model.setSigmoid()
+
+# Predict with the stored model; results are written to output.txt.
+ffm_model.predict("./model.out", "./output.txt")
--- a/small_click.csv
+++ b/small_click.csv
--- a/test_clk.ffm
+++ b/test_clk.ffm
--- a/test_clk.ffm.bin
+++ b/test_clk.ffm.bin
--- a/test_imp.ffm
+++ b/test_imp.ffm
--- a/test_imp.ffm.bin
+++ b/test_imp.ffm.bin
--- a/test_last.ffm
+++ b/test_last.ffm
--- a/test_last.ffm.bin
+++ b/test_last.ffm.bin
--- a/train_all.csv
+++ b/train_all.csv
--- a/train_all.ffm
+++ b/train_all.ffm
--- a/train_last.ffm
+++ b/train_last.ffm
--- a/train_last.ffm.bin
+++ b/train_last.ffm.bin
--- a/train_shuf.ffm
+++ b/train_shuf.ffm