Commit 69d4072a authored by 高雅喆's avatar 高雅喆

commit message

parent 01c09e33
File added
import hashlib
import math
import os
import shutil
import subprocess
from multiprocessing import Process
def hashstr(str, nr_bins=1e+6):
    """Hash a string into a stable integer bucket.

    The UTF-8 bytes of ``str`` are digested with MD5, the hex digest is read
    as one big integer, and that integer is reduced modulo
    ``int(nr_bins) - 1`` and shifted by one, so the result always lies in
    ``[1, int(nr_bins) - 1]`` (bucket 0 is never produced).

    Args:
        str: Text to hash (the name shadows the builtin; kept for
            compatibility with keyword callers).
        nr_bins: Number of bins; may be a float such as ``1e+6``.

    Returns:
        int: Bucket index for ``str``.
    """
    digest_hex = hashlib.md5(str.encode('utf8')).hexdigest()
    modulus = int(nr_bins) - 1
    return int(digest_hex, 16) % modulus + 1
class FfmEncoder():
    """Write DataFrame rows as ``<label> <field>:<feature>:1 ...`` text lines.

    Each selected column becomes one field (numbered by its position in
    ``field_names``); the column's value is combined with the column name,
    hashed with the module-level ``hashstr`` helper, and emitted with a
    constant value of 1. The frame is split into ``nthread`` contiguous
    chunks that are encoded by separate worker processes into temporary
    files, which are then concatenated into the final output file.
    """

    def __init__(self, field_names, label_name, nthread=1):
        """Store the encoder configuration.

        Args:
            field_names: Column names to encode, in field order.
            label_name: Name of the column holding the label.
            nthread: Number of worker processes (and chunks) to use.
        """
        self.field_names = field_names
        self.nthread = nthread
        self.label = label_name

    def gen_feats(self, row):
        """Return one ``'<field>-<value>'`` string per configured field.

        Args:
            row: Mapping-like object indexable by field name (for example a
                pandas row).

        Returns:
            list[str]: Feature keys in ``field_names`` order.
        """
        return [field + '-' + str(row[field]) for field in self.field_names]

    def gen_hashed_fm_feats(self, feats):
        """Turn ``(field, feature)`` pairs into ``'field:hash:1'`` tokens.

        Args:
            feats: Iterable of ``(field, feature_string)`` pairs.

        Returns:
            list[str]: One token per pair; the feature string is hashed with
            ``hashstr(feat, 1e+6)``.
        """
        return ['{0}:{1}:1'.format(field, hashstr(feat, 1e+6))
                for (field, feat) in feats]

    def convert(self, df, path, i):
        """Encode chunk ``i`` of ``df`` into ``<path>_tmp_<i>``.

        Args:
            df: DataFrame containing the label column and all field columns.
            path: Final output path; used as the prefix of the chunk file.
            i: Zero-based chunk index, ``0 <= i < nthread``.
        """
        # int() keeps the slice bounds integral even where math.ceil
        # returns a float.
        lines_per_thread = int(math.ceil(float(df.shape[0]) / self.nthread))
        sub_df = df.iloc[i * lines_per_thread: (i + 1) * lines_per_thread]
        tmp_path = path + '_tmp_{0}'.format(i)
        with open(tmp_path, 'w') as f:
            for _, row in sub_df.iterrows():
                # enumerate() supplies the field index without reusing ``i``,
                # which is the chunk index in this scope.
                feats = self.gen_hashed_fm_feats(
                    list(enumerate(self.gen_feats(row))))
                f.write(str(int(row[self.label])) + ' ' + ' '.join(feats) + '\n')

    def parallel_convert(self, df, path):
        """Run ``convert`` for every chunk in its own process and wait.

        Args:
            df: DataFrame to encode.
            path: Final output path (prefix of the chunk files).

        Raises:
            RuntimeError: If any worker process exits with a non-zero code,
                so that a partial result is never merged silently.
        """
        processes = []
        for i in range(self.nthread):
            p = Process(target=self.convert, args=(df, path, i))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        failed = [idx for idx, p in enumerate(processes) if p.exitcode != 0]
        if failed:
            raise RuntimeError(
                'conversion worker(s) {0} exited abnormally'.format(failed))

    def delete(self, path):
        """Remove the ``<path>_tmp_<i>`` chunk files for every chunk index."""
        for i in range(self.nthread):
            os.remove(path + '_tmp_{0}'.format(i))

    def cat(self, path):
        """Concatenate all chunk files, in chunk order, into ``path``.

        Any existing file at ``path`` is replaced. The copy is done in-process
        on raw bytes rather than through a shell command, so paths containing
        spaces or shell metacharacters are handled correctly.

        Args:
            path: Final output path (prefix of the chunk files).
        """
        with open(path, 'wb') as merged:
            for i in range(self.nthread):
                with open(path + '_tmp_{0}'.format(i), 'rb') as part:
                    shutil.copyfileobj(part, merged)

    def transform(self, df, path):
        """Encode ``df`` to ``path``: convert in parallel, merge, clean up.

        Args:
            df: DataFrame containing the label column and all field columns.
            path: Output file path.
        """
        print('converting data......')
        self.parallel_convert(df, path)
        self.cat(path)
        self.delete(path)
import pandas as pd

# The guard is required because FfmEncoder.transform starts multiprocessing
# workers: under the "spawn" start method every child re-imports this module,
# and without the guard each child would re-run the whole conversion.
if __name__ == '__main__':
    test = pd.read_csv("./train_all.csv")
    # .copy() so the column assignments below operate on an independent frame.
    test = test[["device_id", "cid", "user_view", "time", "label"]].copy()
    test["time"] = pd.to_datetime(test['time'])
    # Vectorized datetime accessors instead of one label lookup per row.
    test["hour"] = test["time"].dt.hour
    test["weekday"] = test["time"].dt.weekday
    field_names = ["device_id", "cid", "user_view", "hour", "weekday"]
    fe = FfmEncoder(field_names, label_name='label', nthread=1)
    fe.transform(test, 'train_all.ffm')
File added
This diff is collapsed.
import xlearn as xl

# --- Training -------------------------------------------------------------
# Field-aware factorization machine, fed with the training and validation
# files.
ffm_model = xl.create_ffm()
ffm_model.setTrain("./train_last.ffm")
ffm_model.setValidate("./test_last.ffm")

# Hyper-parameters for the run.
param = {
    'task': 'binary',    # binary classification
    'lr': 0.2,           # learning rate
    'lambda': 0.002,     # regularization lambda
    'metric': 'acc',     # evaluation metric: accuracy
}

# Train; the fitted model is written to model.out.
ffm_model.fit(param, './model.out')

# --- Prediction -----------------------------------------------------------
ffm_model.setTest("./test_imp.ffm")
# Convert the output to the 0-1 range.
ffm_model.setSigmoid()
# Predict with the stored model; results are written to output.txt.
ffm_model.predict("./model.out", "./output.txt")
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment