Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
M
meta_base_code
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
黎涛
meta_base_code
Commits
040a0f2c
Commit
040a0f2c
authored
Oct 29, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
e0b47295
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
151 additions
and
19 deletions
+151
-19
__init__.py
api_crawler/__init__.py
+6
-0
api_crawler_test.py
api_crawler/api_crawler_test.py
+124
-0
meigou_data.py
task/meigou_data.py
+21
-19
No files found.
api_crawler/__init__.py
0 → 100644
View file @
040a0f2c
# -*- coding:UTF-8 -*-
# @Time : 2020/10/26 16:10
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
\ No newline at end of file
api_crawler/api_crawler_test.py
0 → 100644
View file @
040a0f2c
# -*- coding:UTF-8 -*-
# @Time : 2020/10/26 16:10
# @File : api_crawler_test.py
# @email : litao@igengmei.com
# @author : litao
import
requests
from
crawler.crawler_sys.utils.output_results
import
retry_get_url
import
json
,
random
,
urllib
import
requests
import
json
from
colorama
import
init
,
Fore
,
Back
init
(
autoreset
=
True
)
def test_search_diary(query=''):
    """Collect diary and tractate ids from the content search endpoint.

    Issues six requests (offsets 0, 100, ..., 500) for ``query`` and sorts
    the id of every returned item by its ``data_type`` field: 19 goes to the
    tractate list, 0 goes to the diary list, anything else is ignored.

    Args:
        query: Search keyword placed in the ``query`` URL parameter.

    Returns:
        A ``(diary_id_list, tractate_id_list)`` tuple. Each list holds the
        ids without duplicates, in first-seen order.
    """
    # Imported locally: the file only does ``import urllib``, which does not
    # guarantee that the ``urllib.parse`` submodule is loaded.
    from urllib.parse import quote

    # The keyword is percent-encoded so reserved characters such as '&' or
    # '#' cannot break the query string. The last parameter is spelled
    # ``&current_city_id`` (the previous literal had it mangled to
    # ``¤t_city_id``, which dropped the parameter).
    url = (
        'https://backend.igengmei.com/api/search/v6/content?query=%s&tab_type=1&is_first=0&size=100&order_by=0&area_ref=&max_price=100000&min_price=0&ai_tab_type=0&sort_type=0&app_name=com.wanmeizhensuo.zhensuo&version=7.35.0&platform=android&device_id=869412032478155&os_version=10&model=LYA-AL00&screen=1080x2265&lat=40.005167&lng=116.477603&channel=benzhan&manufacturer=HUAWEI&uuid=1277ae88-094c-4484-9357-1b3e4519521e&android_device_id=androidid_7255c3398845cdd5&current_city_id=beijing'
        % quote(query, safe='')
    )
    diary_id_list = []
    tractate_id_list = []

    for index in range(0, 6):
        # Each request asks for the next block of 100 results.
        response = retry_get_url(url, params={'offset': index * 100})
        dict_response = response.json()['data']['diaries']
        print("这是第%d次请求" % index)
        for data in dict_response:
            if data['data_type'] == 19:
                target = tractate_id_list
            elif data['data_type'] == 0:
                target = diary_id_list
            else:
                continue
            # Keep first-seen order while dropping repeated ids.
            if data['id'] not in target:
                target.append(data['id'])

    print("================================================")
    print(diary_id_list, tractate_id_list)
    return (diary_id_list, tractate_id_list)
if __name__ == '__main__':
    # Script entry point: run the search for every keyword below and dump
    # the collected ids to ``save.csv``, one row per keyword.
    query_list = [
        "双眼皮",
        "瘦脸针",
        "脱毛",
        "小气泡",
        "发型",
        "水光针",
        "热玛吉",
        "光子嫩肤",
        "测脸型",
        "吸脂",
        "玻尿酸",
        "鼻综合",
        "脸型适合什么发型",
        "果酸焕肤",
        "双眼皮2388",
        "植发",
        "线雕",
        "开内眼角",
        "祛斑",
        "美白针",
        "搜索项目、商品、医生",
        "测发型",
        "隆鼻",
        "菲洛嘉",
        "瘦腿针",
        "眼综合",
        "祛痘",
        "除皱",
        "超声刀",
        "如何根据脸型测发型",
        "皮秒",
        "隆胸",
        "脸型",
        "热拉提",
        "超皮秒",
        "手术瘦脸",
        "皮秒激光",
        "微针",
        "改善肤质",
        "丰胸(隆胸)",
        "厚唇改薄",
        "点阵激光",
        "鼻头",
        "韩式半永久纹眉",
        "玻尿酸注射",
        "下颌角",
        "牙齿矫正",
        "面部吸脂",
        "光子嫩肤88",
        "韩式半永久妆",
        "毛发移植",
        "热玛吉五代",
        "自体脂肪填充面部",
        "拍照测发型",
        "医选",
        "颧骨内推",
        "牙齿美白",
        "面部轮廓",
        "瘦脸针限时680",
        "鼻部综合",
        "自体脂肪填充",
        "武汉洪山月目医疗美容",
        "吸脂瘦脸",
        "黄金微针",
        "嗨体",
    ]
    res_list = []
    for query in query_list:
        print(query)
        try:
            diary_id_list, tractate_id_list = test_search_diary(query=query)
        except Exception as exc:
            # Best-effort crawl: a failing keyword is reported and skipped.
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C and
            # SystemExit able to stop the run.
            print("skip %s: %r" % (query, exc))
            continue
        res_list.append([query, diary_id_list, tractate_id_list])

    import pandas as pd

    res = pd.DataFrame(res_list)
    res.to_csv("save.csv", encoding='gb18030')
\ No newline at end of file
task/meigou_data.py
View file @
040a0f2c
...
...
@@ -75,7 +75,7 @@ spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJso
spark
.
sql
(
"CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'"
)
task_list
=
[]
task_days
=
3
task_days
=
120
for
t
in
range
(
1
,
task_days
):
day_num
=
0
-
t
now
=
(
datetime
.
datetime
.
now
()
+
datetime
.
timedelta
(
days
=
day_num
))
...
...
@@ -85,7 +85,7 @@ for t in range(1, task_days):
one_week_age_str
=
(
now
+
datetime
.
timedelta
(
days
=-
7
))
.
strftime
(
"
%
Y
%
m
%
d"
)
sql_search_ctr
=
r"""
SELECT
SELECT
t1.partition_date as partition_date
,active_type as active_type
,device_os_type as device_os_type
...
...
@@ -99,6 +99,7 @@ t1.partition_date as partition_date
,NVL(sum(two_click_pv),0) as two_click_pv--`有效二跳pv`
,NVL(sum(cpc_click_pv),0) as cpc_click_pv--`cpc卡片点击pv`
,NVL(sum(cpc_exp_pv),0) as cpc_exp_pv --`cpc卡片曝光pv`
FROM
(
SELECT partition_date
...
...
@@ -108,16 +109,16 @@ FROM
,device_id
,CASE WHEN substr(md5(device_id),-1) in ('0','1','2','3','4','5','6','7') THEN '灰度' ELSE '非灰' END AS grey_type
FROM online.ml_device_day_active_status
WHERE partition_date>={
start_day
}
AND partition_date<= {partition_da
y
}
WHERE partition_date>={
yesterday_str
}
AND partition_date<= {partition_da
te
}
AND active_type IN ('1','2','4')
)t1
JOIN
(--精准曝光
SELECT cl_id,partition_date,card_id,count(1) as exp_pv,count(CASE WHEN get_json_object(exposure_card, '$.is_cpc')=1 THEN 1 END) as cpc_exp_pv
FROM online.ml_community_precise_exposure_detail
WHERE partition_date>={
start_day
}
AND partition_date<= {partition_da
y
}
WHERE partition_date>={
yesterday_str
}
AND partition_date<= {partition_da
te
}
AND action in ('page_precise_exposure','home_choiceness_card_exposure') --7745版本action改为page_precise_exposure
AND page_name in('welfare_home')
AND tab_name in ('精选')
...
...
@@ -127,11 +128,11 @@ JOIN
)t2
on t1.device_id=t2.cl_id and t1.partition_date=t2.partition_date
LEFT JOIN
(--卡片点击
SELECT cl_id,partition_date,params['card_id'] as card_id,count(1) as click_pv,count(CASE WHEN params['is_cpc']=1 THEN 1 E
LSE 0 E
ND) as cpc_click_pv
(--卡片点击
SELECT cl_id,partition_date,params['card_id'] as card_id,count(1) as click_pv,count(CASE WHEN params['is_cpc']=1 THEN 1 END) as cpc_click_pv
FROM online.bl_hdfs_maidian_updates
WHERE partition_date>={
start_day
}
AND partition_date<= {partition_da
y
}
WHERE partition_date>={
yesterday_str
}
AND partition_date<= {partition_da
te
}
AND action='on_click_card'
AND params['tab_name']='精选'
AND params['page_name'] ='welfare_home'
...
...
@@ -145,8 +146,8 @@ LEFT JOIN
(--商祥二跳
SELECT cl_id,partition_date,params['service_id'] as service_id,count(1) as two_click_pv
FROM online.bl_hdfs_maidian_updates
WHERE partition_date>={
start_day
}
AND partition_date<= {partition_da
y
}
WHERE partition_date>={
yesterday_str
}
AND partition_date<= {partition_da
te
}
AND (referrer in ('welfare_home')
or (params['referrer_link'] like '
%
[
%
' and json_split(params['referrer_link'])[size(json_split(params['referrer_link']))-1] in ('welfare_home')))
AND ((action in ('welfare_multiattribute_click_add','welfare_multiattribute_click_buy') AND page_name = 'welfare_detail')
...
...
@@ -164,7 +165,7 @@ LEFT JOIN
UNION ALL
SELECT device_id
FROM ml.ml_d_ct_dv_devicespam_d --剔除刷量设备
WHERE partition_day={partition_da
y
}
WHERE partition_day={partition_da
te
}
)a
on t1.device_id=a.device_id
LEFT JOIN
...
...
@@ -175,20 +176,20 @@ LEFT JOIN
SELECT user_id,partition_date,
if(size(device_list) > 0, device_list [ 0 ], '') AS device_id
FROM online.ml_user_updates
WHERE partition_date>={
start_day
}
AND partition_date<= {partition_da
y
}
WHERE partition_date>={
yesterday_str
}
AND partition_date<= {partition_da
te
}
)t1
JOIN
( --医生账号
SELECT distinct user_id
FROM online.tl_hdfs_doctor_view
WHERE partition_date = {partition_da
y
}
WHERE partition_date = {partition_da
te
}
--马甲账号/模特用户
UNION ALL
SELECT user_id
FROM ml.ml_c_ct_ui_user_dimen_d
WHERE partition_day = {partition_da
y
}
WHERE partition_day = {partition_da
te
}
AND (is_puppet = 'true' or is_classifyuser = 'true')
UNION ALL
...
...
@@ -204,13 +205,13 @@ LEFT JOIN
SELECT user_id, v.device_id as device_id
FROM online.ml_user_history_detail
LATERAL VIEW EXPLODE(device_history_list) v AS device_id
WHERE partition_date = {partition_da
y
}
WHERE partition_date = {partition_da
te
}
)t1
JOIN
(
SELECT device_id
FROM online.ml_device_history_detail
WHERE partition_date = {partition_da
y
}
WHERE partition_date = {partition_da
te
}
AND is_login_doctor = '1'
)t2
ON t1.device_id = t2.device_id
...
...
@@ -224,6 +225,7 @@ and (b.device_id is null or b.device_id ='')
GROUP BY t1.partition_date
,grey_type,active_type,device_os_type
order by 1
"""
.
format
(
partition_day
=
today_str
,
start_day
=
yesterday_str
)
print
(
sql_search_ctr
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment