ML / ffm-baseline / Commits / a408229b

Commit a408229b authored Sep 27, 2019 by 张彦钊 (parent 16054827)

    add

Showing 1 changed file with 21 additions and 99 deletions: hello.py (+21 / -99)
@@ -18,109 +18,31 @@ import redis
import datetime

# filter logging
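# gbk_decoder: Kafka key/value decoder used below; tries msgpack first and falls
# back to json. Returns None for empty input.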
def gbk_decoder(s):
    if s is None:
        return None
    try:
        data = msgpack.loads(s, encoding='utf-8')
        return data
    except:
        data = json.loads(s)
        return data
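# ctr: count how many cards in an exposure list are flagged with is_cpc == 1.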
def ctr(x):
    sum = 0
    for i in x:
        if i['is_cpc'] == 1:
            sum = sum + 1
    return sum
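# maidian ("埋点", event tracking): keep only click events that are dislike feedback
# ("feedback_type" "1" or "2") on a diary card in the home page's 精选 ("featured") tab.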
def maidian(x):
    try:
        data = json.loads(x[1])
        if 'type' in data and 'device' in data:
            if data['type'] == 'on_click_button' \
                    and data['params']['page_name'] == 'home' and data['params']['tab_name'] == '精选' \
                    and data['params']['button_name'] == 'user_feedback_type' \
                    and data['params']['extra_param'][0]["card_content_type"] == "diary" \
                    and ("1" in data['params']['extra_param'][0]["feedback_type"]
                         or "2" in data['params']['extra_param'][0]["feedback_type"]):
                return True
            else:
                return False
        else:
            return False
    except Exception as e:
        print("filter fail")
        print(e)
def get_data(x):
    try:
        device_id = x[1]['device']['device_id']
        diary_id = x[1]['params']['extra_param'][0]["card_id"]
        return device_id, diary_id
    except Exception as e:
        print("get_data fail")
        send_email("get_data", "get_data", e)
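# write_redis: for the disliked diaries, look up their tag_type '3' tags over the
# MySQL protocol, collect other online diaries sharing those tags, and append them
# to the device's "<device_id>_dislike_diary" Redis key (new keys expire after 7 days).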
def write_redis(device_id, cid_list):
    try:
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
                             db='eagle')
        sql = "select b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b " \
              "on a.tag_id = b.id where b.tag_type = '3' and a.diary_id in {}".format(tuple(cid_list))
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        tags = list(set([i[0] for i in result]))
        if tags is not None:
            sql = "select a.id from src_mimas_prod_api_diary a left join src_mimas_prod_api_diary_tags b " \
                  "on a.id=b.diary_id left join src_zhengxing_api_tag c on b.tag_id=c.id " \
                  "where a.is_online = 1 and a.content_level >= '3' " \
                  "and c.id in {} and c.tag_type = '3'".format(tuple(tags))
            cursor.execute(sql)
            result = cursor.fetchall()
            if result is not None:
                cids = list(set([i[0] for i in result]))
                r = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN6@172.16.40.133:6379')
                key = str(device_id) + "_dislike_diary"
                if r.exists(key):
                    value = eval(r.get(key))
                    value.extend(cids)
                    cids = json.dumps(list(set(value)))
                    # NOTE: cids is already a JSON string here, so json.dumps(cids)
                    # encodes it a second time; this looks unintentional.
                    r.set(key, json.dumps(cids))
                else:
                    r.set(key, json.dumps(cids))
                    r.expire(key, 7 * 24 * 60 * 60)
    except Exception as e:
        print("insert redis fail")
        print(e)
def model(rdd):
    try:
        # NOTE: .na.drop().groupByKey() is applied to the (device_id, diary_id) tuple
        # returned by get_data(), which looks misplaced (these are DataFrame/RDD methods).
        rdd.filter(lambda x: maidian(x)).map(lambda x: get_data(x).na.drop().groupByKey()) \
            .map(lambda x: write_redis(x[0], x[1]))
    except Exception as e:
        print("fail")
        print(e)
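# Streaming job: consume the gm-maidian-data Kafka topic in 10-second batches
# and run model() on each batch.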
if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setMaster("spark://nvwa01:7077").setAppName("dislike_filter")
                      .set("spark.io.compression.codec", "lzf"))
    ssc = StreamingContext(sc, 10)
    sc.setLogLevel("WARN")
    kafkaParams = {"metadata.broker.list": "172.16.44.25:9092,172.16.44.31:9092,172.16.44.45:9092",
                   "group.id": "dislike",
                   "socket.timeout.ms": "600000",
                   "auto.offset.reset": "largest"}
    try:
        stream = KafkaUtils.createDirectStream(ssc, ["gm-maidian-data"], kafkaParams,
                                               keyDecoder=gbk_decoder, valueDecoder=gbk_decoder)
        transformstream = stream.transform(lambda x: model(x))
        transformstream.pprint()
        ssc.start()
        ssc.awaitTermination()
    except Exception as e:
        print(e)
        # send_email(sc.appName, sc.applicationId, e)
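    # Batch section (presumably exploratory): a separate SparkSession with Hive and
    # TiSpark settings samples exposure logs for partition_date 20190926 and counts
    # CPC cards per exposure list with ctr().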
    sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
        .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
        .set("spark.tispark.plan.allow_index_double_read", "false") \
        .set("spark.tispark.plan.allow_index_read", "true") \
        .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
        .set("spark.tispark.pd.addresses", "172.16.40.170:2379").set("spark.io.compression.codec", "lzf") \
        .set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec", "snappy")
    spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
    sql = "select params['exposure_cards'] from online.ml_community_precise_exposure_detail " \
          "where action = 'page_precise_exposure' and page_name = 'search_result_welfare' " \
          "AND partition_date='20190926' limit 20"
    rdd = spark.sql(sql).rdd.map(lambda x: x[0]).map(lambda x: eval(x[0])).map(lambda x: ctr(x))
    spark.createDataFrame(rdd).show(6)