Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
0eacdc06
Commit
0eacdc06
authored
Aug 28, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加日记视频集合是空的判断
parent
9ef83045
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
46 additions
and
35 deletions
+46
-35
diaryQueueUpdate.py
diaryQueueUpdate.py
+42
-31
prepareData.py
prepareData.py
+3
-3
processData.py
processData.py
+1
-1
No files found.
diaryQueueUpdate.py
View file @
0eacdc06
...
...
@@ -21,10 +21,13 @@ def get_video_id():
result
=
cursor
.
fetchall
()
df
=
pd
.
DataFrame
(
list
(
result
))
print
(
"videio_id 预览"
)
print
(
df
.
head
(
2
))
video_id
=
df
[
0
]
.
values
.
tolist
()
print
(
df
.
head
(
1
))
db
.
close
()
return
video_id
if
df
.
empty
:
return
False
else
:
video_id
=
df
[
0
]
.
values
.
tolist
()
return
video_id
# 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致
...
...
@@ -109,42 +112,50 @@ def get_score(queue_arg):
def
update_dairy_queue
(
score_df
,
predict_score_df
,
total_video_id
):
diary_id
=
score_df
[
"cid"
]
.
values
.
tolist
()
video_id
=
list
(
set
(
diary_id
)
&
set
(
total_video_id
))
if
len
(
video_id
)
>
0
:
not_video
=
list
(
set
(
diary_id
)
-
set
(
video_id
))
# 为了相加时cid能够匹配,先把cid变成索引
not_video_df
=
score_df
.
loc
[
score_df
[
"cid"
]
.
isin
(
not_video
)]
.
set_index
([
"cid"
])
not_video_predict_df
=
predict_score_df
.
loc
[
predict_score_df
[
"cid"
]
.
isin
(
not_video
)]
.
set_index
([
"cid"
])
not_video_df
[
"score"
]
=
not_video_df
[
"score"
]
+
not_video_predict_df
[
"score"
]
not_video_df
=
not_video_df
.
sort_values
(
by
=
"score"
,
ascending
=
False
)
video_df
=
score_df
.
loc
[
score_df
[
"cid"
]
.
isin
(
video_id
)]
.
set_index
([
"cid"
])
video_predict_df
=
predict_score_df
.
loc
[
predict_score_df
[
"cid"
]
.
isin
(
video_id
)]
.
set_index
([
"cid"
])
video_df
[
"score"
]
=
video_df
[
"score"
]
+
video_predict_df
[
"score"
]
video_df
=
video_df
.
sort_values
(
by
=
"score"
,
ascending
=
False
)
not_video_id
=
not_video_df
.
index
.
tolist
()
video_id
=
video_df
.
index
.
tolist
()
new_queue
=
not_video_id
i
=
1
for
j
in
video_id
:
new_queue
.
insert
(
i
,
j
)
i
+=
5
# print("分数合并成功")
return
new_queue
# 如果没有视频日记
if
total_video_id
:
video_id
=
list
(
set
(
diary_id
)
&
set
(
total_video_id
))
if
len
(
video_id
)
>
0
:
not_video
=
list
(
set
(
diary_id
)
-
set
(
video_id
))
# 为了相加时cid能够匹配,先把cid变成索引
not_video_df
=
score_df
.
loc
[
score_df
[
"cid"
]
.
isin
(
not_video
)]
.
set_index
([
"cid"
])
not_video_predict_df
=
predict_score_df
.
loc
[
predict_score_df
[
"cid"
]
.
isin
(
not_video
)]
.
set_index
([
"cid"
])
not_video_df
[
"score"
]
=
not_video_df
[
"score"
]
+
not_video_predict_df
[
"score"
]
not_video_df
=
not_video_df
.
sort_values
(
by
=
"score"
,
ascending
=
False
)
video_df
=
score_df
.
loc
[
score_df
[
"cid"
]
.
isin
(
video_id
)]
.
set_index
([
"cid"
])
video_predict_df
=
predict_score_df
.
loc
[
predict_score_df
[
"cid"
]
.
isin
(
video_id
)]
.
set_index
([
"cid"
])
video_df
[
"score"
]
=
video_df
[
"score"
]
+
video_predict_df
[
"score"
]
video_df
=
video_df
.
sort_values
(
by
=
"score"
,
ascending
=
False
)
not_video_id
=
not_video_df
.
index
.
tolist
()
video_id
=
video_df
.
index
.
tolist
()
new_queue
=
not_video_id
i
=
1
for
j
in
video_id
:
new_queue
.
insert
(
i
,
j
)
i
+=
5
# print("分数合并成功")
return
new_queue
# 如果取交集后没有视频日记
else
:
score_df
=
score_df
.
set_index
([
"cid"
])
predict_score_df
=
predict_score_df
.
set_index
([
"cid"
])
score_df
[
"score"
]
=
score_df
[
"score"
]
+
predict_score_df
[
"score"
]
score_df
=
score_df
.
sort_values
(
by
=
"score"
,
ascending
=
False
)
# print("分数合并成功1")
return
score_df
.
index
.
tolist
()
# 如果total_video_id是空
else
:
score_df
=
score_df
.
set_index
([
"cid"
])
predict_score_df
=
predict_score_df
.
set_index
([
"cid"
])
score_df
[
"score"
]
=
score_df
[
"score"
]
+
predict_score_df
[
"score"
]
score_df
[
"score"
]
=
score_df
[
"score"
]
+
predict_score_df
[
"score"
]
score_df
=
score_df
.
sort_values
(
by
=
"score"
,
ascending
=
False
)
# print("分数合并成功1")
return
score_df
.
index
.
tolist
()
def
update_sql_dairy_queue
(
queue_name
,
diary_id
,
device_id
,
city_id
):
db
=
pymysql
.
connect
(
host
=
'rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com'
,
port
=
3306
,
user
=
'doris'
,
passwd
=
'o5gbA27hXHHm'
,
db
=
'doris_prod'
)
...
...
prepareData.py
View file @
0eacdc06
...
...
@@ -6,12 +6,12 @@ import time
def
fetch_data
(
start_date
,
end_date
):
# 获取点击表里的device_id
sql
=
"select distinct device_id from data_feed_click"
sql
=
"select distinct device_id from data_feed_click
2
"
click_device_id
=
con_sql
(
sql
)[
0
]
.
values
.
tolist
()
print
(
"成功获取点击表里的device_id"
)
# 获取点击表里的数据
sql
=
"select cid,device_id,time,stat_date from data_feed_click "
\
sql
=
"select cid,device_id,time,stat_date from data_feed_click
2
"
\
"where stat_date >= '{0}' and stat_date <= '{1}'"
.
format
(
start_date
,
end_date
)
click
=
con_sql
(
sql
)
click
=
click
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time_date"
,
3
:
"stat_date"
})
...
...
@@ -22,7 +22,7 @@ def fetch_data(start_date, end_date):
click
=
click
.
drop
(
"time_date"
,
axis
=
1
)
# 获取曝光表里的数据
sql
=
"select cid,device_id,time,stat_date from data_feed_exposure "
\
sql
=
"select cid,device_id,time,stat_date from data_feed_exposure
2
"
\
"where stat_date >= '{0}' and stat_date <= '{1}'"
.
format
(
start_date
,
end_date
)
start
=
time
.
time
()
exposure
=
con_sql
(
sql
)
...
...
processData.py
View file @
0eacdc06
...
...
@@ -48,7 +48,7 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date):
data
[
"hour"
]
=
data
[
"hour"
]
.
astype
(
"category"
)
data
[
"minute"
]
=
data
[
"minute"
]
.
astype
(
"category"
)
# 持久化候选cid
# 持久化候选cid
,选预测候选集时用这个过滤
data_set_cid
=
data
[
"cid"
]
.
unique
()
cid_df
=
pd
.
DataFrame
()
cid_df
[
'cid'
]
=
data_set_cid
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment