Commit 4f49d54e authored by 高雅喆

rm *

parent 0fd31641
import hashlib, math, os, subprocess
from multiprocessing import Process
def hashstr(str, nr_bins=1e+6):
    """Map a string to a stable integer bucket in ``[1, int(nr_bins) - 1]``.

    The string is UTF-8 encoded and hashed with MD5; the digest, read as a
    base-16 integer, is reduced modulo ``int(nr_bins) - 1`` and shifted by
    one so that bucket 0 is never produced.
    """
    digest_hex = hashlib.md5(str.encode('utf8')).hexdigest()
    modulus = int(nr_bins) - 1
    return int(digest_hex, 16) % modulus + 1
class FfmEncoder():
    """Write DataFrame rows out as ``label field:feature:1`` text lines.

    For every row, each column named in ``field_names`` becomes one token
    ``<field index>:<hashstr("<column>-<value>", 1e+6)>:1``; the line starts
    with the integer value of the label column. The rows are split into
    ``nthread`` contiguous chunks, each chunk is written to its own
    temporary file by a separate process, and the temporary files are then
    concatenated into the final output file.
    """

    def __init__(self, field_names, label_name, nthread=1):
        """Store the configuration.

        Args:
            field_names: Column names to encode, in output order.
            label_name: Name of the column holding the label.
            nthread: Number of worker processes (and temporary files).
        """
        self.field_names = field_names
        self.nthread = nthread
        self.label = label_name

    @staticmethod
    def _tmp_path(path, i):
        """Return the temporary file name used for chunk ``i`` of ``path``."""
        return path + '_tmp_{0}'.format(i)

    def gen_feats(self, row):
        """Return one ``"<column>-<value>"`` key per configured field."""
        return [field + '-' + str(row[field]) for field in self.field_names]

    def gen_hashed_fm_feats(self, feats):
        """Turn ``(field, key)`` pairs into ``"field:hash:1"`` tokens."""
        return ['{0}:{1}:1'.format(field, hashstr(feat, 1e+6))
                for (field, feat) in feats]

    def convert(self, df, path, i):
        """Encode chunk ``i`` of ``df`` into its temporary file.

        Args:
            df: Source DataFrame; must contain the field and label columns.
            path: Final output path; the chunk goes to ``path + '_tmp_<i>'``.
            i: Zero-based chunk index, ``0 <= i < nthread``.
        """
        # int() keeps the slice bounds integral even where math.ceil
        # returns a float.
        lines_per_thread = int(math.ceil(float(df.shape[0]) / self.nthread))
        sub_df = df.iloc[i * lines_per_thread: (i + 1) * lines_per_thread]
        with open(self._tmp_path(path, i), 'w') as f:
            for _, row in sub_df.iterrows():
                # The field id is the position of the column in field_names.
                # A distinct loop name avoids shadowing the chunk index `i`.
                indexed = [(field_idx, feat)
                           for field_idx, feat in enumerate(self.gen_feats(row))]
                tokens = self.gen_hashed_fm_feats(indexed)
                f.write(str(int(row[self.label])) + ' ' + ' '.join(tokens) + '\n')

    def parallel_convert(self, df, path):
        """Run ``convert`` for every chunk, one process per chunk.

        Raises:
            RuntimeError: If any worker process exits with a non-zero code,
                so that a partial chunk is never silently concatenated.
        """
        processes = []
        for i in range(self.nthread):
            p = Process(target=self.convert, args=(df, path, i))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        failed = [idx for idx, p in enumerate(processes) if p.exitcode != 0]
        if failed:
            raise RuntimeError(
                'convert failed for chunk(s): {0}'.format(failed))

    def delete(self, path):
        """Remove the temporary chunk files belonging to ``path``."""
        for i in range(self.nthread):
            os.remove(self._tmp_path(path, i))

    def cat(self, path):
        """Concatenate the chunk files, in chunk order, into ``path``.

        Any existing file at ``path`` is removed first. The copy is done in
        Python rather than through a shell ``cat`` command, so paths that
        contain spaces or shell metacharacters work and a missing chunk
        raises instead of being skipped.
        """
        import shutil  # local import: only needed here

        if os.path.exists(path):
            os.remove(path)
        with open(path, 'ab') as out:
            for i in range(self.nthread):
                with open(self._tmp_path(path, i), 'rb') as part:
                    shutil.copyfileobj(part, out)

    def transform(self, df, path):
        """Encode ``df`` into ``path``: convert chunks, merge, clean up."""
        print('converting data......')
        self.parallel_convert(df, path)
        self.cat(path)
        self.delete(path)
import pandas as pd
# Load the raw training data and keep only the columns used below.
test = pd.read_csv("./train_all.csv")
test = test[["device_id", "cid", "user_view", "time", "label"]]

# Parse the timestamp column, then derive hour-of-day and day-of-week from it.
# The vectorised .dt accessor replaces a per-row Python loop that looked each
# timestamp up by index label.
test["time"] = pd.to_datetime(test['time'])
test["hour"] = test["time"].dt.hour
test["weekday"] = test["time"].dt.weekday

# Columns to encode as fields, in output order.
filed_names = ["device_id", "cid", "user_view", "hour", "weekday"]
fe = FfmEncoder(filed_names, label_name='label', nthread=1)
fe.transform(test, 'train_all.ffm')
This diff is collapsed.
import xlearn as xl
# ---- Training ----
# Create a field-aware factorization machine model and register the
# training and validation data files.
ffm_model = xl.create_ffm()
ffm_model.setTrain("./train_last.ffm")
ffm_model.setValidate("./test_last.ffm")

# Hyper-parameters:
#   task   - binary classification
#   lr     - learning rate, 0.2
#   lambda - regularisation lambda, 0.002
#   metric - evaluation metric, accuracy
param = {
    'task': 'binary',
    'lr': 0.2,
    'lambda': 0.002,
    'metric': 'acc',
}

# Train; the fitted model is stored in model.out.
ffm_model.fit(param, './model.out')

# ---- Prediction ----
# Register the test data and convert the output to the 0-1 range.
ffm_model.setTest("./test_imp.ffm")
ffm_model.setSigmoid()

# Predict with the stored model; results are written to output.txt.
ffm_model.predict("./model.out", "./output.txt")
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment