rm *

4f49d54e · 高雅喆 · 0fd31641 · 0fd31641 · 0fd31641 · 0fd31641
Commit 4f49d54e authored Aug 06, 2018 by 高雅喆
13 changed files
--- a/FfmEncoder.py
+++ b/FfmEncoder.py
-import hashlib, math, os, subprocess
-from multiprocessing import Process
-
-
-def hashstr(str, nr_bins=1e+6):
-    return int(hashlib.md5(str.encode('utf8')).hexdigest(), 16) % (int(nr_bins) - 1) + 1
-
-
-class FfmEncoder():
-    def __init__(self, field_names, label_name, nthread=1):
-        self.field_names = field_names
-        self.nthread = nthread
-        self.label = label_name
-
-    def gen_feats(self, row):
-        feats = []
-        for field in self.field_names:
-            value = row[field]
-            key = field + '-' + str(value)
-            feats.append(key)
-        return feats
-
-    def gen_hashed_fm_feats(self, feats):
-        feats = ['{0}:{1}:1'.format(field, hashstr(feat, 1e+6)) for (field, feat) in feats]
-        return feats
-
-    def convert(self, df, path, i):
-        lines_per_thread = math.ceil(float(df.shape[0]) / self.nthread)
-        sub_df = df.iloc[i * lines_per_thread: (i + 1) * lines_per_thread]
-        tmp_path = path + '_tmp_{0}'.format(i)
-        with open(tmp_path, 'w') as f:
-            for index,row in sub_df.iterrows():
-                feats = []
-                for i, feat in enumerate(self.gen_feats(row)):
-                    feats.append((i, feat))
-                feats = self.gen_hashed_fm_feats(feats)
-                f.write(str(int(row[self.label])) + ' ' + ' '.join(feats) + '\n')
-
-    def parallel_convert(self, df, path):
-        processes = []
-        for i in range(self.nthread):
-            p = Process(target=self.convert, args=(df, path, i))
-            p.start()
-            processes.append(p)
-        for p in processes:
-            p.join()
-
-    def delete(self, path):
-        for i in range(self.nthread):
-            os.remove(path + '_tmp_{0}'.format(i))
-
-    def cat(self, path):
-        if os.path.exists(path):
-            os.remove(path)
-        for i in range(self.nthread):
-            cmd = 'cat {svm}_tmp_{idx} >> {svm}'.format(svm=path, idx=i)
-            p = subprocess.Popen(cmd, shell=True)
-            p.communicate()
-
-    def transform(self, df, path):
-        print('converting data......')
-        self.parallel_convert(df, path)
-        self.cat(path)
-        self.delete(path)
-
-import pandas as pd
-
-test = pd.read_csv("./train_all.csv")
-test = test[["device_id","cid","user_view","time","label"]]
-test["time"] = pd.to_datetime(test['time'])
-test["hour"] = [test["time"][i].hour for i in range(test.shape[0])]
-test["weekday"] = [test["time"][i].weekday() for i in range(test.shape[0])]
-filed_names = ["device_id", "cid", "user_view","hour","weekday"]
-fe = FfmEncoder(filed_names,label_name='label',nthread=1)
-fe.transform(test, 'train_all.ffm')
--- a/output.txt
+++ b/output.txt
--- a/run_demo_ctr.py
+++ b/run_demo_ctr.py
-import xlearn as xl
-
-# Training task
-ffm_model = xl.create_ffm() # Use field-aware factorization machine
-ffm_model.setTrain("./train_last.ffm")  # Training data
-ffm_model.setValidate("./test_last.ffm")  # Validation data
-
-# param:
-#  0. binary classification
-#  1. learning rate: 0.2
-#  2. regular lambda: 0.002
-#  3. evaluation metric: accuracy
-param = {'task':'binary', 'lr':0.2, 
-         'lambda':0.002, 'metric':'acc'}
-
-# Start to train
-# The trained model will be stored in model.out
-ffm_model.fit(param, './model.out')
-
-# Prediction task
-ffm_model.setTest("./test_imp.ffm")  # Test data
-ffm_model.setSigmoid()  # Convert output to 0-1
-
-# Start to predict
-# The output result will be stored in output.txt
-ffm_model.predict("./model.out", "./output.txt")
--- a/test_clk.ffm
+++ b/test_clk.ffm
--- a/test_clk.ffm.bin
+++ b/test_clk.ffm.bin
--- a/test_imp.ffm
+++ b/test_imp.ffm
--- a/test_imp.ffm.bin
+++ b/test_imp.ffm.bin
--- a/test_last.ffm
+++ b/test_last.ffm
--- a/test_last.ffm.bin
+++ b/test_last.ffm.bin
--- a/train_all.ffm
+++ b/train_all.ffm
--- a/train_last.ffm
+++ b/train_last.ffm
--- a/train_last.ffm.bin
+++ b/train_last.ffm.bin
--- a/train_shuf.ffm
+++ b/train_shuf.ffm