Commit 56c55b5a authored Aug 19, 2020 by litaolemo (project: backend / crawler)
update
parent b9ce66aa
Showing 5 changed files with 136 additions and 306 deletions:

  crawler_sys/tools/revise_table_data_to_table.py     +25     -0
  tasks/check_high_play_count_data_source_v_qq.py      +0    -92
  tasks/from_sparksql_to_mysql.py                     +111     -0
  tasks/update_DU_ATU_from_crawler_raw.py               +0     -0
  tasks/write_key_releaser_to_week_doc_weekly.py        +0   -214
crawler_sys/tools/revise_table_data_to_table.py (new file, 0 → 100644)
# -*- coding:UTF-8 -*-
# @Time : 2020/8/19 11:47
# @File : revise_table_data_to_table.py
# @email : litao@igengmei.com
# @author : litao
import pymysql


def con_sql(sql):
    # Fetch rows from a table in the database
    """
    :type sql : str
    :rtype : tuple
    """
    db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user',
                         passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6', db='jerry_prod')
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchall()
    db.close()
    return result


sql = "select * from xxx"
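A minimal usage sketch of con_sql as defined above. The table and column names here (crawler_raw, url, play_count) are placeholders for illustration, not names taken from this repository.

# Hypothetical usage of con_sql; table/column names are placeholders.
rows = con_sql("SELECT url, play_count FROM crawler_raw LIMIT 10")
for url, play_count in rows:
    # con_sql returns a tuple of row tuples via cursor.fetchall()
    print(url, play_count)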
tasks/check_high_play_count_data_source_v_qq.py (deleted, 100644 → 0)
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 17:52:53 2018

@author: fangyucheng
"""
from crawler_sys.site_crawler.crawler_v_qq import Crawler_v_qq
from crawler_sys.utils.output_results import output_result
from crawler_sys.utils import Metaorphosis as meta
from crawler_sys.utils.output_log import output_log

logging = output_log(page_category='video_page', program_info='tencent')


def tran_input_data_to_lst(file_name, file_category='csv'):
    if file_category == 'csv':
        video_info_lst = meta.csv_to_lst_whth_headline(file_name)
        url_lst = []
        for line in video_info_lst:
            try:
                if line['data_provider'] == 'CCR':
                    url_lst.append(line['url'])
            except:
                pass
        return url_lst
    elif file_category == 'file':
        url_lst = meta.str_file_to_lst(file_name)
        return url_lst


url_lst = tran_input_data_to_lst(file_name='R:/CCR/数据需求/短期临时需求/TX',
                                 file_category='file')

crawler = Crawler_v_qq()
get_video_page = crawler.video_page


def get_data_source(url_lst=url_lst,
                    output_to_file=False,
                    filepath=None,
                    output_to_es_raw=False,
                    output_to_es_register=False,
                    push_to_redis=False,
                    output_es_index=None,
                    output_doc_type=None):
    result_lst = []
    for url in url_lst:
        video_info = get_video_page(url=url)
        result_lst.append(video_info)
        logging.info('get_data at page %s' % url)
        if len(result_lst) >= 100:
            if output_es_index is not None and output_doc_type is not None:
                output_result(result_lst,
                              platform='腾讯视频',
                              output_to_file=output_to_file,
                              output_to_es_raw=output_to_es_raw,
                              output_to_es_register=output_to_es_register,
                              push_to_redis=push_to_redis,
                              es_index=output_es_index,
                              doc_type=output_doc_type)
                result_lst.clear()
            else:
                output_result(result_lst,
                              platform='腾讯视频',
                              output_to_file=output_to_file,
                              output_to_es_raw=output_to_es_raw,
                              output_to_es_register=output_to_es_register,
                              push_to_redis=push_to_redis)
                result_lst.clear()
    if len(result_lst) != 0:
        if output_es_index is not None and output_doc_type is not None:
            output_result(result_lst,
                          platform='腾讯视频',
                          output_to_file=output_to_file,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          push_to_redis=push_to_redis,
                          es_index=output_es_index,
                          doc_type=output_doc_type)
            result_lst.clear()
        else:
            output_result(result_lst,
                          platform='腾讯视频',
                          output_to_file=output_to_file,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          push_to_redis=push_to_redis)
            result_lst.clear()


if __name__ == '__main__':
    get_data_source(output_to_es_raw=True,
                    output_es_index='test2',
                    output_doc_type='fyc')
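The four output_result calls in get_data_source differ only in whether es_index and doc_type are passed. A minimal sketch of how that flush step could be factored into one helper, assuming only the output_result keyword arguments visible in the deleted file:

# Sketch only; relies on output_result as imported above.
def flush_results(result_lst, output_es_index=None, output_doc_type=None, **output_kwargs):
    # output_kwargs carries output_to_file / output_to_es_raw / output_to_es_register / push_to_redis
    if output_es_index is not None and output_doc_type is not None:
        output_result(result_lst, platform='腾讯视频',
                      es_index=output_es_index, doc_type=output_doc_type, **output_kwargs)
    else:
        output_result(result_lst, platform='腾讯视频', **output_kwargs)
    result_lst.clear()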
tasks/from_sparksql_to_mysql.py (new file, 0 → 100644)
# -*- coding:UTF-8 -*-
# @Time : 2020/8/19 11:53
# @File : from_sparksql_to_mysql.py
# @email : litao@igengmei.com
# @author : litao
import hashlib
import json
import pymysql
import xlwt, datetime
import redis
# from pyhive import hive
from maintenance.func_send_email_with_file import send_file_email
from typing import Dict, List
from elasticsearch_7 import Elasticsearch
from elasticsearch_7.helpers import scan
import sys
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lit
import pytispark.pytispark as pti

db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user',
                     passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6', db='jerry_prod')
cursor = db.cursor()


def con_sql(sql):
    # Fetch rows from a table in the database
    db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user',
                         passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6', db='jerry_prod')
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchall()
    db.close()
    return result


startTime = time.time()
sparkConf = SparkConf()
sparkConf.set("spark.sql.crossJoin.enabled", True)
sparkConf.set("spark.debug.maxToStringFields", "100")
sparkConf.set("spark.tispark.plan.allow_index_double_read", False)
sparkConf.set("spark.tispark.plan.allow_index_read", True)
sparkConf.set("spark.hive.mapred.supports.subdirectories", True)
sparkConf.set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", True)
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.set("mapreduce.output.fileoutputformat.compress", False)
sparkConf.set("mapreduce.map.output.compress", False)
sparkConf.set("prod.gold.jdbcuri",
              "jdbc:mysql://172.16.30.136/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true")
sparkConf.set("prod.mimas.jdbcuri",
              "jdbc:mysql://172.16.30.138/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true")
sparkConf.set("prod.gaia.jdbcuri",
              "jdbc:mysql://172.16.30.143/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true")
sparkConf.set("prod.tidb.jdbcuri",
              "jdbc:mysql://172.16.40.158:4000/eagle?user=st_user&password=aqpuBLYzEV7tML5RPsN1pntUzFy&rewriteBatchedStatements=true")
sparkConf.set("prod.jerry.jdbcuri",
              "jdbc:mysql://172.16.40.158:4000/jerry_prod?user=st_user&password=aqpuBLYzEV7tML5RPsN1pntUzFy&rewriteBatchedStatements=true")
sparkConf.set("prod.tispark.pd.addresses", "172.16.40.158:2379")
sparkConf.set("prod.tispark.pd.addresses", "172.16.40.170:4000")
sparkConf.set("prod.tidb.database", "jerry_prod")

spark = (SparkSession
         .builder
         .config(conf=sparkConf)
         .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
         .config("spark.tispark.pd.addresses", "172.16.40.170:2379")
         .appName("LR PYSPARK TEST")
         .enableHiveSupport()
         .getOrCreate())

spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'")

select_sql = """SELECT * FROM pm.tl_pm_contentpage_ctr"""
device_df = spark.sql(select_sql)
device_df.show(1, False)
sql_res = device_df.collect()
print("-----------------------------------------------------------------------------")
for res in sql_res:
    day_id = res.day_id
    device_os_type = res.device_os_type
    active_type = res.active_type
    grey_type = res.grey_type
    page_name = res.page_name
    content_pv = res.content_pv
    content_uv = res.content_uv
    wel_exp_pv = res.wel_exp_pv
    content_exp_pv = res.content_exp_pv
    wel_click_pv = res.wel_click_pv
    content_click_pv = res.content_click_pv
    slide_wel_click_pv = res.slide_wel_click_pv
    self_wel_click_pv = res.self_wel_click_pv
    partition_day = res.partition_day
    # md5 needs bytes; hexdigest() gives a string usable as the primary key
    pid = hashlib.md5((day_id + device_os_type + active_type
                       + grey_type + page_name).encode("utf-8")).hexdigest()
    sql = """INSERT INTO conent_detail_page_grayscale_ctr(day_id,device_os_type,
    active_type,grey_type,page_name,content_pv,content_uv,wel_exp_pv,content_exp_pv,wel_click_pv,content_click_pv,
    wel_click_pv,content_click_pv,slide_wel_click_pv,partition_day,pid
    ) VALUES('{day_id}','{device_os_type}',
    '{active_type}','{grey_type}','{page_name}',{content_pv},{content_uv},{wel_exp_pv},{content_exp_pv},{wel_click_pv},{content_click_pv},
    {wel_click_pv},{content_click_pv},{slide_wel_click_pv},'{partition_day}','{pid}')""".format(
        day_id=day_id, device_os_type=device_os_type, active_type=active_type,
        grey_type=grey_type, page_name=page_name, content_pv=content_pv,
        content_uv=content_uv, wel_exp_pv=wel_exp_pv, content_exp_pv=content_exp_pv,
        wel_click_pv=wel_click_pv, content_click_pv=content_click_pv,
        slide_wel_click_pv=slide_wel_click_pv, self_wel_click_pv=self_wel_click_pv,
        partition_day=partition_day, pid=pid)
    cursor.execute(sql)
    # cursor.executemany()
db.commit()
db.close()
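Because the INSERT above is assembled with str.format, a hedged alternative is pymysql's own parameter binding, which quotes and escapes values itself. The sketch below uses a shortened, illustrative column list; the real layout is whatever conent_detail_page_grayscale_ctr actually defines.

# Sketch only: %s placeholders are filled by pymysql, so values need no manual quoting.
insert_sql = (
    "INSERT INTO conent_detail_page_grayscale_ctr "
    "(day_id, device_os_type, active_type, page_name, content_pv, pid) "
    "VALUES (%s, %s, %s, %s, %s, %s)"
)
cursor.execute(insert_sql, (day_id, device_os_type, active_type, page_name, content_pv, pid))
db.commit()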
tasks/update_DU_ATU_from_crawler_raw.py (deleted, 100644 → 0; this diff is collapsed)
tasks/write_key_releaser_to_week_doc_weekly.py (deleted, 100644 → 0)
# -*- coding:utf-8 -*-
# @Time : 2019/8/14 18:01
# @Author : litao
import json
# import argparse
import datetime
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from func_find_week_num import find_week_belongs_to
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.utils import trans_format
from write_data_into_es.func_cal_doc_id import cal_doc_id

hosts = '192.168.17.11'
port = 80
user = 'zhouyujiang'
passwd = '8tM9JDN2LVxM'
http_auth = (user, passwd)
es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)

# parser = argparse.ArgumentParser()
# parser.add_argument('-w', '--week_str', type=str, default=None)


def week_start_day(week_year, week_no, week_day, week_day_start=1):
    year_week_start = find_first_day_for_given_start_weekday(week_year, week_day_start)
    week_start = year_week_start + datetime.timedelta(days=(week_no - 1) * 7)
    return week_start


def define_doc_type(week_year, week_no, week_day_start):
    """
    doc_type = 'daily-url-2018_w24_s2' means Tuesday is selected as the
    first day of each week, and it is year 2018's 24th week.
    In the isocalendar definition, Monday - weekday 1, Tuesday - weekday 2,
    ..., Saturday - weekday 6, Sunday - weekday 7.
    """
    doc_type_str = 'daily-url-%d_w%02d_s%d' % (week_year, week_no, week_day_start)
    return doc_type_str


def find_first_day_for_given_start_weekday(year, start_weekday):
    i = 0
    while i < 7:
        dayDi = datetime.date(year, 1, 1) + datetime.timedelta(days=i)
        if dayDi.weekday() == start_weekday:
            cal_day1D = dayDi - datetime.timedelta(days=1)
            break
        else:
            cal_day1D = None
        i += 1
    return cal_day1D


def get_target_releaser_video_info(platform,
                                   releaserUrl,
                                   log_file=None,
                                   output_to_es_raw=True,
                                   es_index=None,
                                   doc_type=None,
                                   releaser_page_num_max=100):
    if log_file is None:
        log_file = open('error.log', 'w')
    crawler = get_crawler(platform=platform)
    crawler_initialization = crawler()
    if platform == 'haokan':
        try:
            crawler_initialization.releaser_page(releaserUrl=releaserUrl,
                                                 releaser_page_num_max=releaser_page_num_max,
                                                 output_to_es_raw=True,
                                                 es_index=es_index,
                                                 doc_type=doc_type,
                                                 fetchFavoriteCommnt=True)
        except:
            print(releaserUrl, platform, file=log_file)
    else:
        try:
            crawler_initialization.releaser_page(releaserUrl=releaserUrl,
                                                 releaser_page_num_max=releaser_page_num_max,
                                                 output_to_es_raw=True,
                                                 es_index=es_index,
                                                 doc_type=doc_type)
        except:
            print(releaserUrl, platform, file=log_file)


def func_search_reUrl_from_target_index(platform, releaser):
    search_body = {
        "query": {
            "bool": {
                "filter": [
                    {"term": {"platform.keyword": platform}},
                    {"term": {"releaser.keyword": releaser}}
                ]
            }
        }
    }
    search_re = es.search(index='target_releasers', doc_type='doc', body=search_body)
    if search_re['hits']['total'] > 0:
        return search_re['hits']['hits'][0]['_source']['releaserUrl']
    else:
        print('Can not find:', platform, releaser)
        return None


def func_write_into_weekly_index_new_released(line_list,
                                              doc_type,
                                              index='short-video-weekly'):
    count = 0
    bulk_all_body = ''
    re_list = []
    for line in line_list:
        count = count + 1
        weekly_net_inc_play_count = line['play_count']
        weekly_net_inc_comment_count = line['comment_count']
        weekly_net_inc_favorite_count = line['favorite_count']
        weekly_cal_base = 'accumulate'
        timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
        line.update({'timestamp': timestamp,
                     'weekly_cal_base': weekly_cal_base,
                     'weekly_net_inc_favorite_count': weekly_net_inc_favorite_count,
                     'weekly_net_inc_comment_count': weekly_net_inc_comment_count,
                     'weekly_net_inc_play_count': weekly_net_inc_play_count})
        re_list.append(line)
        url = line['url']
        platform = line['platform']
        doc_id = cal_doc_id(platform, url=url, doc_id_type='all-time-url', data_dict=line)
        bulk_head = '{"index": {"_id":"%s"}}' % doc_id
        data_str = json.dumps(line, ensure_ascii=False)
        bulk_one_body = bulk_head + '\n' + data_str + '\n'
        bulk_all_body += bulk_one_body
        if count % 500 == 0:
            eror_dic = es.bulk(index=index, doc_type=doc_type,
                               body=bulk_all_body, request_timeout=200)
            bulk_all_body = ''
            if eror_dic['errors'] is True:
                print(eror_dic['items'])
                print(bulk_all_body)
            print(count)
    if bulk_all_body != '':
        eror_dic = es.bulk(body=bulk_all_body,
                           index=index,
                           doc_type=doc_type,
                           request_timeout=200)
        if eror_dic['errors'] is True:
            print(eror_dic)


todayT = datetime.datetime.now()
# todayT=datetime.datetime(2019,2,5)
week_day_start = 1
# if args.week_str is None:
seven_days_ago_T = todayT - datetime.timedelta(days=7)
week_year, week_no, week_day = find_week_belongs_to(seven_days_ago_T, week_day_start)
week_start = week_start_day(week_year, week_no, week_day)

re_s = week_start - datetime.timedelta(1)
re_s_dt = datetime.datetime.strptime(str(re_s), '%Y-%m-%d')
re_s_t = int(datetime.datetime.timestamp(re_s_dt) * 1000)
re_e = week_start + datetime.timedelta(6)
re_e_dt = datetime.datetime.strptime(str(re_e), '%Y-%m-%d')
re_e_t = int(datetime.datetime.timestamp(re_e_dt) * 1000)
# nowT_feihua = week_start + datetime.timedelta(days=6)

weekly_doc_type_name = define_doc_type(week_year, week_no, week_day_start=week_day_start)

key_releaser_body = {
    "query": {
        "bool": {
            "filter": [
                {"term": {"key_releaser.keyword": "True"}}
            ]
        }
    }
}
releaser_re = scan(client=es, index='target_releasers', doc_type='doc',
                   query=key_releaser_body, scroll='3m')
for re in releaser_re:
    releaser = re["_source"]['releaser']
    platform = re["_source"]['platform']
    if releaser is not None:
        re_list = []
        search_body = {
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"platform.keyword": platform}},
                        {"term": {"releaser.keyword": releaser}},
                        {"range": {"release_time": {"gte": re_s_t, "lt": re_e_t}}},
                        {"range": {"fetch_time": {"gte": re_s_t}}}
                    ]
                }
            }
        }
        scan_re = scan(client=es, index='short-video-all-time-url', doc_type='all-time-url',
                       query=search_body, scroll='3m')
        for one_scan in scan_re:
            re_list.append(one_scan['_source'])
        func_write_into_weekly_index_new_released(re_list, doc_type=weekly_doc_type_name)
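As a quick check of the naming convention described in define_doc_type's docstring (the values below match the docstring's own example, not production data):

# 'daily-url-%d_w%02d_s%d' with year 2018, week 24, week starting on Tuesday (weekday 2)
assert define_doc_type(2018, 24, 2) == 'daily-url-2018_w24_s2'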