Commit de8f872a authored by 王志伟
parents f27d8217 971a0d0e
......@@ -114,10 +114,10 @@ class multiFFMFormatPandas:
x = 0
while True:
if x + step < data.__len__():
data_list.append(data.loc[x:x + step])
x = x + step + 1
data_list.append(data.iloc[x:x + step])
x = x + step
else:
data_list.append(data.loc[x:data.__len__()])
data_list.append(data.iloc[x:data.__len__()])
break
return data_list
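
# --- standalone sketch (not part of the commit) of the chunking change above ---
# pandas .loc slicing is label-based and end-inclusive, so data.loc[x:x + step] yields
# step + 1 rows and the old code had to advance x by step + 1; .iloc is position-based
# and end-exclusive, so data.iloc[x:x + step] yields exactly step rows and also works
# when the index is not a clean 0..N-1 RangeIndex. A minimal version:
import pandas as pd

def split_data(data, step):
    # cut a DataFrame into consecutive, non-overlapping chunks of `step` rows
    data_list = []
    x = 0
    while x < len(data):
        data_list.append(data.iloc[x:x + step])
        x += step
    return data_list

chunks = split_data(pd.DataFrame({"a": range(10)}), 3)
print([len(c) for c in chunks])  # [3, 3, 3, 1]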
......@@ -147,7 +147,7 @@ def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id " \
"from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_train_data e left join user_feature_clean u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id " \
"where e.stat_date >= '{}'".format(start)
df = con_sql(db, sql)
......@@ -208,7 +208,7 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id,e.cid_id " \
"from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_pre_data e left join user_feature_clean u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id"
df = con_sql(db, sql)
df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
......
......@@ -19,6 +19,11 @@ rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/2018*
echo "data2ffm"
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/data2ffm.py > ${DATA_PATH}/infer.log
# rough duplicate-feature check on column 5 of tr.csv, echoed below as a "Bayes Error Rate"
all_sample=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$2$3$4}' | sort | uniq | wc -l`))
uniq_feat=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$4}' | sort | uniq -u | wc -l`))
repe_feat=$((all_sample-uniq_feat))
echo "Bayes Error Rate: $((repe_feat*100/all_sample))%"
echo "split data"
split -l $((`wc -l < ${DATA_PATH}/tr.csv`/15)) ${DATA_PATH}/tr.csv -d -a 4 ${DATA_PATH}/tr/tr_ --additional-suffix=.csv
split -l $((`wc -l < ${DATA_PATH}/va.csv`/5)) ${DATA_PATH}/va.csv -d -a 4 ${DATA_PATH}/va/va_ --additional-suffix=.csv
......
......@@ -68,12 +68,20 @@ def sort_app():
"job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道", "智联招聘"}
}
df["app_list"] = df["app_list"].apply(json_format)
n = df.shape[0]
df["sum"] = 0
for i in category.keys():
df[i] = df["app_list"].apply(lambda x: 1 if len(x & category[i]) > 0 else 0)
print(i)
print(df[i].value_counts())
df["sum"] = df["sum"]+df[i]
# print(i)
# print(df.loc[df[i]==1].shape[0]/n)
df = df.drop("app_list",axis=1)
# for i in df["sum"].unique():
# print(i)
# a = df.loc[df["sum"] == i].shape[0]/n
# print(a)
yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
print(df.shape)
n = 200000
......
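
# --- standalone sketch (not from the commit) of the set-intersection flags built in sort_app() above ---
# each category column is 1 when the user's installed-app set overlaps that category's app set;
# "微信" below is just a placeholder app name invented for the example.
import pandas as pd

category = {"job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道"}}
df = pd.DataFrame({"app_list": [{"拉勾", "微信"}, {"微信"}]})

for name, apps in category.items():
    # flag is 1 when the intersection of the two sets is non-empty
    df[name] = df["app_list"].apply(lambda x, apps=apps: int(len(x & apps) > 0))

print(df["job"].tolist())  # [1, 0]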
import pandas as pd
import pymysql
def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        print("exception occurred:", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df
def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select manufacturer,channel from user_feature"
    df = con_sql(db, sql)
    n = df.shape[0]
    manufacturer = df[0].unique()
    manufacturer_map = {}
    print("manufacturer unique")
    print(len(manufacturer))
    for i in manufacturer:
        manufacturer_map[i] = df.loc[df[0] == i].shape[0] / n
    print(sorted(manufacturer_map.items(), key=lambda x: x[1]))
    channel = df[1].unique()
    channel_map = {}
    print("channel unique")
    print(len(channel))
    for i in channel:
        channel_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(channel_map.items(), key=lambda x: x[1]))
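
# --- standalone sketch (not from the commit): the frequency loops in exp() above can be
# written with value_counts(normalize=True); shown for the same positional columns
# (0 = manufacturer, 1 = channel) that con_sql() returns. Relies on the pandas import
# at the top of this file; the commented usage values are invented.
def exp_value_counts(df):
    print(df[0].value_counts(normalize=True).sort_values())  # manufacturer share, ascending
    print(df[1].value_counts(normalize=True).sort_values())  # channel share, ascending

# exp_value_counts(pd.DataFrame({0: ["xiaomi", "huawei", "xiaomi"], 1: ["AppStore", "store_a", "AppStore"]}))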
def clean():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select device_id,device_type,manufacturer,channel,city_id from user_feature"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "device_id", 1: "device_type", 2: "manufacturer", 3: "channel", 4: "city_id"})
    n = df.shape[0]
    manufacturer = df["manufacturer"].unique()
    for i in manufacturer:
        if df.loc[df["manufacturer"] == i].shape[0] / n < 0.0005:
            df.loc[df["manufacturer"] == i, ["manufacturer"]] = "other"
    channel = df["channel"].unique()
    for i in channel:
        if df.loc[df["channel"] == i].shape[0] / n < 0.0001:
            df.loc[df["channel"] == i, ["channel"]] = "other"
    from sqlalchemy import create_engine
    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
    n = 200000
    for i in range(0, df.shape[0], n):
        print(i)
        if i == 0:
            temp = df.loc[0:n]
        elif i + n > df.shape[0]:
            temp = df.loc[i + 1:]
        else:
            temp = df.loc[i + 1:i + n]
        pd.io.sql.to_sql(temp, "user_feature_clean", yconnect, schema='jerry_test', if_exists='append', index=False)
    print("insert done")

if __name__ == "__main__":
    clean()
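
# --- standalone sketch (not from the commit) of the chunked insert used in clean() above ---
# df.loc[i + 1:i + n] is end-inclusive and only matches row positions while the frame keeps
# its default RangeIndex; an equivalent, index-independent version uses end-exclusive iloc.
# The engine URL and table name in the usage comment are placeholders.
def insert_in_chunks(df, table, engine, chunk=200000):
    # write `chunk` rows per statement so a large frame is not pushed in one go
    for start in range(0, df.shape[0], chunk):
        df.iloc[start:start + chunk].to_sql(table, engine, if_exists="append", index=False)

# usage:
# engine = create_engine("mysql+pymysql://user:password@host:4000/some_db?charset=utf8")
# insert_in_chunks(df, "user_feature_clean", engine)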
......@@ -99,6 +99,7 @@ class multiFFMFormatPandas:
        result_map = {}
        for i in data_list:
            result_map.update(i.get())
        pool.close()
        pool.join()
......@@ -114,10 +115,10 @@ class multiFFMFormatPandas:
x = 0
while True:
if x + step < data.__len__():
data_list.append(data.loc[x:x + step])
x = x + step + 1
data_list.append(data.iloc[x:x + step])
x = x + step
else:
data_list.append(data.loc[x:data.__len__()])
data_list.append(data.iloc[x:data.__len__()])
break
return data_list
......@@ -147,7 +148,7 @@ def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id " \
"from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_train_data e left join user_feature_clean u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id " \
"where e.stat_date >= '{}'".format(start)
df = con_sql(db, sql)
......@@ -174,6 +175,7 @@ def get_data():
manufacturer = list(set(df["manufacturer"].values.tolist()))
channel = list(set(df["channel"].values.tolist()))
return df,validate_date,ucity_id,ccity_name,manufacturer,channel
......@@ -197,8 +199,8 @@ def transform(a,validate_date):
train = train.drop("stat_date",axis=1)
test = df[df["stat_date"] == validate_date]
test = test.drop("stat_date",axis=1)
# print("train shape")
# print(train.shape)
print("train shape")
print(train.shape)
train.to_csv(path + "tr.csv", sep="\t", index=False)
test.to_csv(path + "va.csv", sep="\t", index=False)
......@@ -209,7 +211,7 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id,e.cid_id " \
"from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_pre_data e left join user_feature_clean u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id"
df = con_sql(db, sql)
df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
......@@ -218,23 +220,23 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
print("before filter:")
print(df.shape)
print(df.loc[df["device_id"]=="358035085192742"].shape)
df = df[df["ucity_id"].isin(ucity_id)]
print("after ucity filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["ccity_name"].isin(ccity_name)]
print("after ccity_name filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["manufacturer"].isin(manufacturer)]
print("after manufacturer filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["channel"].isin(channel)]
print("after channel filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df["cid_id"] = df["cid_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str")
df["top"] = df["top"].astype("str")
......@@ -245,9 +247,10 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
[df["device_id"].values.tolist(), df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
df["y"].values.tolist(), df["z"].values.tolist()], sep=",")
df = df.drop(["z","label","device_id","cid_id"], axis=1).fillna(0.0)
print(df.head(2))
df = model.transform(df,n=160000, processes=22)
df = pd.DataFrame(df)
print("before transform")
print(df.shape)
temp_series = model.transform(df,n=160000, processes=22)
df = pd.DataFrame(temp_series)
print("after transform")
print(df.shape)
df["label"] = df[0].apply(lambda x: x.split(",")[0])
......@@ -286,8 +289,8 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
if __name__ == "__main__":
path = "/home/gmuser/ffm/"
a = time.time()
df, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
model = transform(df, validate_date)
temp, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
model = transform(temp, validate_date)
get_predict_set(ucity_id,model,ccity_name,manufacturer,channel)
b = time.time()
print("cost(分钟)")
......
import time
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
conf = SparkConf().setMaster("spark://10.30.181.88:7077").setAppName("My app")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
# print a counter every 5 seconds; the Spark application stays registered with the master while this runs
for i in range(1, 100):
    print(i)
    time.sleep(5)
\ No newline at end of file
import pandas as pd
import pymysql
def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        print("exception occurred:", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df
def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select native_queue from esmm_device_diary_queue where device_id = '358035085192742'"
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchone()[0]
    native = tuple(result.split(","))
    print("total")
    print(len(native))
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select diary_id,level1_ids,level2_ids,level3_ids from diary_feat where diary_id in {}".format(native)
    df = con_sql(db, sql)
    n = df.shape[0]
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1]))
    two = df[2].unique()
    two_map = {}
    print("dividing line")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1]))
def click():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select d.cid_id,f.level1_ids,f.level2_ids from data_feed_click d left join diary_feat f " \
          "on d.cid_id = f.diary_id where d.device_id = '358035085192742' " \
          "and (d.cid_type = 'diary' or d.cid_type = 'diary_video') and d.stat_date > '2018-12-20'"
    df = con_sql(db, sql)
    n = df.shape[0]
    print(n)
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1], reverse=True))
    two = df[2].unique()
    two_map = {}
    print("dividing line")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1], reverse=True))

if __name__ == "__main__":
    click()