backend / crawler · Commits · c3d3152f

Commit c3d3152f authored 4 years ago by litaolemo

update

parent d15fee19

Branches containing this commit: master, litao, mr/develop/xiaohongshu, soyang
No related merge requests found

Showing 2 changed files with 538 additions and 4 deletions:

  crawler_sys/scheduler/cal_ni_and_put_to_backend.py   +6  -4
  crawler_sys/site_crawler/crawler_zhihu.py            +532  -0

crawler_sys/scheduler/cal_ni_and_put_to_backend.py (view file @ c3d3152f)
@@ -24,8 +24,10 @@ import random
 # from mistune import Renderer, InlineGrammar, InlineLexer, Markdown, escape
 rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19)
-conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work', passwd='Gengmei1',
-                       db='mimas_test', charset='utf8')
+# conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work', passwd='Gengmei1',
+#                        db='mimas_test', charset='utf8')
+conn = pymysql.connect(host='172.16.30.138', port=3306, user='mimas', passwd='GJL3UJe1Ck9ggL6aKnZCq4cRvM',
+                       db='mimas_prod', charset='utf8mb4')
 cur = conn.cursor()

@@ -313,7 +315,7 @@ def write_data_into_mysql(res_data):
         sql_query = """insert into api_tractate
         (user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
         values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');""".format(
-            user_id=random.choice(user_id_list), content=data["content"], is_online=1, status=2, platform=3,
+            user_id=random.choice(user_id_list), content=data["content"], is_online=1, status=2, platform=15,
             content_level=data["level"],
             is_excellent=0, create_time=now_str,
             last_modified=now_str, user_del=0,

@@ -330,7 +332,7 @@ def write_data_into_mysql(res_data):
             if img_info[0] in data.get("content"):
                 image_url_source = 2
             else:
-                image_url_source = 1
+                image_url_source = 3
             try:
                 image_type = img_type.get(img_info[1]["format"].upper())
             except:
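For reference, the updated insert above still builds the SQL statement by interpolating values with str.format. Below is a minimal sketch of the same kind of insert written with pymysql's parameterized execute; the api_tractate table name and the literal 1/2/15 values come from the diff, while the reduced column list, the placeholder credentials, and the example variable values are illustrative assumptions, not code from this commit.

import pymysql

# Placeholder host/credentials, assuming the same mimas_prod target as the new connection above.
conn = pymysql.connect(host="<mysql-host>", port=3306, user="<user>", passwd="<password>",
                       db="mimas_prod", charset="utf8mb4")
cur = conn.cursor()

# Example values standing in for the crawled record fields.
user_id, content, content_level, title = 10086, "demo content", 3, "demo title"

# Parameterized insert: pymysql escapes each value itself, so quotes inside `content`
# cannot break the statement the way they can with str.format interpolation.
sql = ("insert into api_tractate "
       "(user_id, content, is_online, status, platform, content_level, title) "
       "values (%s, %s, %s, %s, %s, %s, %s)")
cur.execute(sql, (user_id, content, 1, 2, 15, content_level, title))
conn.commit()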
crawler_sys/site_crawler/crawler_zhihu.py (new file, 0 → 100644, view file @ c3d3152f)
# -*- coding:UTF-8 -*-
# @Time : 2020/8/3 13:35
# @File : crawler_zhihu.py
# @email : litao@igengmei.com
# @author : litao

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 14 20:13:21 2018
@author: fangyucheng
"""

import copy
import re
# import rsa
import time
import json
import urllib
import base64
import binascii
import datetime
import requests
from bs4 import BeautifulSoup
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url, output_result
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from crawler.crawler_sys.utils.util_logging import logged
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id
class Crawler_zhihu():
    def __init__(self, timeout=None, platform='weibo'):
        self.platform = "zhihu"
        self.session = requests.Session()
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
        for popk in pop_key_Lst:
            self.video_data.pop(popk)
    def get_single_article_page(self, article_id, keyword, proxies=0):
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            # "Cookie": "SINAGLOBAL=565010119549.1364.1559571258394; login_sid_t=85753e367d54782a25518436f329cfa0; cross_origin_proto=SSL; _s_tentry=www.baidu.com; Apache=5712976583220.359.1595386386561; ULV=1595386386575:2:1:1:5712976583220.359.1595386386561:1592884354178; UOR=,,login.sina.com.cn; SSOLoginState=1595829153; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZ46TE-isMWEvFjmXZnGFZ5JpX5KMhUgL.Fo2Re0zpShqfSoe2dJLoI7_e9gfadcvadcvad7tt; ALF=1627695088; SCF=AlrGNPCzM_VX3PzgxftYKkUv6Gj7FjmOVVbH8EpsTADeRxEeW-7_ipW8LVV7sGN-t7JJA-VwFKC2Ot0ZkHwHstE.; SUB=_2A25yJwQhDeRhGedG6FAQ9CjJzT-IHXVRVXLprDV8PUNbmtAKLRPmkW9NUVHbR2NjdmB2ZEtnFBK75m3CwwTzeqTJ; SUHB=08J6qQipU2qH8A; CARD-MAIN=cfec82595a1164dea323b2fb276c823f",
            "Host": "card.weibo.com",
            "Referer": "https://card.weibo.com/article/m/show/id/{0}?_wb_client_=1&open_source=weibo_search&luicode=10000011&lfid=100103type%3D21%26q%3D{1}%26t%3D0".format(article_id, urllib.parse.quote(keyword)),
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        }
        url = "https://card.weibo.com/article/m/aj/detail?id={0}&_t={1}".format(article_id, int(datetime.datetime.now().timestamp() * 1e3))
        try:
            requests_res = retry_get_url(url, headers=headers, proxies=proxies)
            res_json = requests_res.json()
            # print(res_json)
            data = res_json["data"]
            video_dic = {}
            video_dic["url"] = data["target_url"]
            video_dic["title"] = data["title"]
            video_dic["fetch_time"] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic["release_time"] = trans_strtime_to_timestamp(data["create_at"])
            video_dic["play_count"] = int(data["read_count"])
            video_dic["content"] = data["content"]
            video_dic["releaser"] = data["userinfo"].get('screen_name')
            video_dic["releaser_id"] = str(data["userinfo"].get('id'))
            video_dic["releaserUrl"] = data["userinfo"].get('url')
            video_dic["releaser_id_str"] = "weibo_" + str(video_dic["releaser_id"])
            video_dic["img_list"] = re.findall('img src="(.*?)"', data["content"])
            return video_dic
        except Exception as e:
            print("single data row formate error %s" % e)
    def get_serach_page_cookies(self, keyword):
        url = "https://www.zhihu.com/search?type=content&q=%s" % keyword
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_zap=20547721-b576-4409-95c1-000c6f20517b; d_c0=AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925; __gads=ID=bdc51df6433d4288:T=1562072932:S=ALNI_MbUwg2TeI33p4EnEYpHr8bAKBUiNQ; _ga=GA1.2.929365035.1592357886; _xsrf=9883cfd3-4ae9-409d-9150-4bed1c5fb89e; tst=r; SESSIONID=u14cNx7BBTdkwNnJUEmVvebNsxweydFrakHmXhCPpfw; JOID=W18XAkKDlnFa_djmdIv4qxEeF55u-KJGMbOu1DXBzBAygpaGO9lMWQ3z0-B_XpsC-EdjO4KDArW2i-V8Y16DNXM=; osd=UVscCkyJknpS89Lif4P2oRUVH5Bk_KlOP7mq3z3PxhQ5ipiMP9JEVwf32OhxVJ8J8ElpP4mLDL-ygO1yaVqIPX0=; q_c1=e59a45f95396455e871eb111bdd827e1|1596185954000|1562072927000; _gid=GA1.2.544062079.1596418493; capsion_ticket=2|1:0|10:1596418535|14:capsion_ticket|44:MmJhMzEyNzYzNzE5NDAyOTg3ZGQzNDFmYTFlYjJmMjE=|facc3f88969d538b60f0530ff9bbdb74aa1bb7012584b9dfd2a5f3a3c1fb9726; z_c0="2|1:0|10:1596418574|4:z_c0|92:Mi4xSDJLUUhRQUFBQUFBZ083dl9NYXNEeVlBQUFCZ0FsVk5EYmdVWUFDcDlBZjhBb0stY3RHTnhNS013YXItcko0VXFn|73520023927845cb04e21a4a1fbfae5d25088de4ffae91090d55cf7a5ba5b008; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1596184903,1596185679,1596418492,1596419419; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1596435670; KLBRSID=53650870f91603bc3193342a80cf198c|1596435676|1596435655',
            "referer": "https://www.zhihu.com/search?type=content&q=%E7%83%AD%E7%8E%9B%E5%90%89",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        requests_res = retry_get_url(url, headers=headers)
        print(requests_res.cookies.get_dict())
        return requests_res.cookies.get_dict()
    def search_article_page(self, keyword, search_pages_max=12,
                            output_to_es_raw=False,
                            output_to_es_register=False,
                            es_index=None,
                            doc_type=None, proxies_num=0):
        res_cookies_dict = self.get_serach_page_cookies(keyword=keyword)
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": "1.0_a0Oy67L8cXYxFgtBK8FBo6r0NCxxgBN009tBk4Lq2XFY",
            "referer": "https://www.zhihu.com/search?type=content&q={0}".format(urllib.parse.quote(keyword)),
        }
        cookies_dict = {
            "d_c0": '"AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"',
            "KLBRSID": None
        }
        cookies_dict.update(res_cookies_dict)
        url = "https://www.zhihu.com/api/v4/search_v3?t=general&q={0}&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0".format(urllib.parse.quote(keyword))
        offset = 0
        while offset <= search_pages_max * 20:
            offset += 20
            get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict)
            if get_page.status_code != 200:
                # retry once
                get_page = requests.get(url)
                if get_page.status_code != 200:
                    continue
            page_dict = get_page.json()
            url = page_dict["paging"]["next"]
            print(get_page.cookies.get_dict())
            cookies_dict.update(get_page.cookies.get_dict())
            headers_search.pop("x-zse-86", 0)
            res_list = []
            if page_dict.get("data"):
                continue
            for one_line in page_dict['data']:
                try:
                    article_type = one_line['knowledge_ad']
                    title = one_line['object']["body"]["title"]
                    url = one_line['object']["url"]
                    article_id = re.findall("(\d+)", one_line['scheme'])[0]
                    releaser = one_line['object']["body"]["authors"][0]["name"]
                    uid = one_line['object']["body"]["authors"][0]["url_token"]
                    # releaserUrl = "https://www.toutiao.com/c/user/%s/" % uid
                    # release_time = one_line['publish_time']
                    # release_time = int(int(release_time) * 1e3)
                    fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
                    # releaser_id = self.get_releaser_id(releaserUrl)
                    D0 = copy.deepcopy(self.video_data)
                    D0['title'] = title
                    # D0['abstract'] = abstract
                    # D0['url'] = url
                    # D0['play_count'] = play_count
                    # D0['comment_count'] = comment_count
                    # D0['favorite_count'] = favorite_count
                    D0['article_id'] = article_id
                    # D0['releaser'] = releaser
                    # D0['releaserUrl'] = releaserUrl
                    # D0['release_time'] = release_time
                    # D0['releaser_id_str'] = "toutiao_%s" % releaser_id
                    D0['fetch_time'] = fetch_time
                    D0['search_word'] = keyword
                    D0["type"] = "article"
                    try:
                        article_info = self.get_single_article_page(article_id, keyword, proxies=proxies_num)
                        D0.update(article_info)
                    except Exception as e:
                        print("method get_web_article_info error %s" % e)
                    print(D0)
                    res_list.append(D0)
                except KeyError:
                    # It's totally ok to drop the last return data value.
                    # The search api just return something seems related to search
                    continue
            else:
                break
            if len(res_list) >= 100:
                output_result(result_Lst=res_list,
                              platform=self.platform,
                              output_to_es_raw=output_to_es_raw,
                              output_to_es_register=output_to_es_register,
                              es_index=es_index,
                              doc_type=doc_type)
                res_list.clear()
        if res_list != []:
            output_result(result_Lst=res_list,
                          platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index,
                          doc_type=doc_type)
        return res_list
    def get_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
    def search_page(self, keyword, search_pages_max=30,
                    output_to_es_raw=False,
                    output_to_es_register=False,
                    es_index=None,
                    doc_type=None, proxies_num=0):
        self.search_article_page(keyword, search_pages_max=search_pages_max,
                                 output_to_es_raw=output_to_es_raw,
                                 output_to_es_register=output_to_es_register,
                                 es_index=es_index, doc_type=doc_type,
                                 proxies_num=proxies_num)
    def repost_page(self, weibo_id, user_name, password):
        total_page = 0
        result_lst = []
        cookie = self.manipulate_login(user_name=user_name, password=password)
        # cookie = self.test_cookie(get_cookie)
        if cookie is not None:
            current_time = int(time.time() * 1000)
            repost_url = 'https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id + '&max_id=0&page=1&__rnd=' + str(current_time)
            get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
            get_page.encoding = 'utf-8'
            try:
                page_dic = get_page.json()
                total_page = page_dic['data']['page']['totalpage']
                repost_info = page_dic['data']['html']
                repost_soup = BeautifulSoup(repost_info, 'html.parser')
                repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
                for line in repost_agg:
                    try:
                        one_repost = self.get_repost_info(line)
                        result_lst.append(one_repost)
                        print('get one repost')
                    except:
                        print('one repost data error')
                        print(one_repost)
            except:
                print("can't get repost data")
            time.sleep(6)
        if cookie is not None and total_page != 0:
            for page_num in range(1, total_page + 1):
                current_time = int(time.time() * 1000)
                repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id
                              + '&max_id=0&page=' + str(page_num) + '&__rnd=' + str(current_time))
                get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
                time.sleep(3)
                get_page.encoding = 'utf-8'
                try:
                    page_dic = get_page.json()
                    total_page = page_dic['data']['page']['totalpage']
                    repost_info = page_dic['data']['html']
                    repost_soup = BeautifulSoup(repost_info, 'html.parser')
                    repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
                    for line in repost_agg:
                        one_repost = self.get_repost_info(line)
                        result_lst.append(one_repost)
                        print('get one repost at %s' % page_num)
                        print(one_repost)
                except:
                    print("can't get repost data")
        if result_lst != []:
            return result_lst
        else:
            print("can't get repost data")
            return None
    def user_page(self, user_id, user_name, password):
        result_lst = []
        cookie_pool = open('cookie_pool', 'r', encoding='utf-8')
        for coo in cookie_pool:
            print(coo)
            cookie = json.loads(coo)
            # cookie = self.manipulate_login(user_name=user_name,password=password)
            # cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
            if cookie is not None:
                for page_num in range(1, 3):
                    first_url = ('https://weibo.com/u/' + user_id + '?visible=0&is_all=1&is_tag=0'
                                 '&profile_ftype=1&page=' + str(page_num) + '#feedtop')
                    get_page = requests.get(first_url, headers=self.headers, cookies=cookie)
                    get_page.encoding = 'utf-8'
                    page = get_page.text
                    soup = BeautifulSoup(page, 'html.parser')
                    sfa = soup.find_all('script')
                    find_content = ''
                    for line in sfa:
                        if 'Pl_Official_MyProfileFeed__' in str(line):
                            find_content = str(line)
                            find_content = find_content.replace('<script>FM.view(', '').replace(')</script>', '')
                    # print(find_content)
                    find_content_dic = json.loads(find_content)
                    content_for_soup = find_content_dic['html']
                    soup_content = BeautifulSoup(content_for_soup, 'html.parser')
                    weibo_lst = soup_content.find_all('div', {'action-type': 'feed_list_item'})
                    # time.sleep(15)
                    for line_count, line in enumerate(weibo_lst):
                        weibo_info = self.get_user_weibo_info(line, cookie)
                        weibo_info['user_id'] = user_id
                        weibo_info['user_url'] = 'https://weibo.com/' + user_id
                        result_lst.append(weibo_info)
                        print('get data at element page: %s pagebar: %s' % (page_num, line_count))
                    get_parameter = soup.find_all('script', {'type': 'text/javascript'})
                    for line in get_parameter:
                        if 'pid' in str(line) and 'oid' in str(line):
                            parameter_str = str(line)
                            parameter_str = parameter_str.replace('\r', '').replace('\n', '').replace("\'", '')
                            domain = re.findall('\d+', ''.join(re.findall("pid]=\d+", parameter_str)))[0]
                            special_id = re.findall('\d+', ''.join(re.findall("page_id]=\d+", parameter_str)))[0]
                    current_time = int(time.time() * 1000)
                    for pagebar in [0, 1]:
                        user_url = ('https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=' + domain
                                    + '&profile_ftype=1&is_all=1&pagebar=' + str(pagebar)
                                    + '&pl_name=Pl_Official_MyProfileFeed__22&id=' + special_id
                                    + '&script_uri=/' + user_id + '&feed_type=0&page=' + str(page_num)
                                    + '&pre_page=1' '&domain_op=' + domain + '&__rnd=' + str(current_time))
                        get_page = requests.get(user_url, headers=self.headers, cookies=cookie)
                        get_page.encoding = 'utf-8'
                        try:
                            page_dic = get_page.json()
                            user_weibo_str = page_dic['data']
                            user_weibo_soup = BeautifulSoup(user_weibo_str, 'html.parser')
                            user_weibo_agg = user_weibo_soup.find_all('div', {'action-type': 'feed_list_item'})
                            # time.sleep(15)
                            for line in user_weibo_agg:
                                try:
                                    weibo_info = self.get_user_weibo_info(line, cookie)
                                    weibo_info['user_id'] = user_id
                                    weibo_info['user_url'] = 'https://weibo.com/' + user_id
                                    result_lst.append(weibo_info)
                                    print('get data at ajax page page_num: %s pagebar: %s' % (page_num, pagebar))
                                except:
                                    print('one weibo_info error')
                        except:
                            print('page error at page_num: %s pagebar: %s' % (page_num, pagebar))
        if result_lst != []:
            return result_lst
        else:
            print("can't get repost data")
            return None
    @staticmethod
    def get_single_page(mid):
        url = "https://m.weibo.cn/status/%s" % mid
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_T_WM=68345544646; WEIBOCN_FROM=1110006030; MLOGIN=0; XSRF-TOKEN=fd1a69; M_WEIBOCN_PARAMS=oid%3D4523948446845543%26luicode%3D20000061%26lfid%3D4528703037509890%26uicode%3D20000061%26fid%3D4523948446845543",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "same-origin",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        page_res = retry_get_url(url, headers=headers, proxies=0)
        page_json_context = re.findall(r"render_data = (.*)\[0\]", page_res.text, flags=re.DOTALL)[0]
        page_json = json.loads(page_json_context)
        text = dehtml(page_json[0]["status"]["text"])
        repost_count = trans_play_count(page_json[0]["status"]["reposts_count"])
        comment_count = trans_play_count(page_json[0]["status"]["comments_count"])
        favorite_count = trans_play_count(page_json[0]["status"]["attitudes_count"])
        return text, repost_count, comment_count, favorite_count
    def get_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
    @staticmethod
    def get_img(data):
        img_list = []
        if data.get("pics"):
            for one in data.get("pics"):
                try:
                    img_list.append(one["large"]["url"])
                except Exception as e:
                    img_list.append(one["url"])
                    print("add img error %s" % e)
        return img_list
    def releaser_page(self, releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=10000,
                      es_index=None,
                      doc_type=None, proxies_num=None):
        print('Processing releaserUrl %s' % releaserUrl)
        result_Lst = []
        releaser_id = self.get_releaser_id(releaserUrl)
        # xsrf_token,url_extr = self.get_weibo_info(releaser_id)
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_T_WM=30976479190; XSRF-TOKEN=9e4bb8; WEIBOCN_FROM=1110006030; MLOGIN=0; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%25E8%25BF%25AA%25E4%25B8%25BD%25E7%2583%25AD%25E5%25B7%25B4%26fid%3D1076031669879400%26uicode%3D10000011",
            "mweibo-pwa": "1",
            # "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4",
            # "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
            # "x-xsrf-token": xsrf_token,
        }
        pagenum = 0
        has_more = True
        since_id = 0
        if releaser_id:
            while pagenum <= releaser_page_num_max and has_more:
                pagenum += 1
                time.sleep(0.5)
                "?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&type=uid&value=1669879400&containerid=1076031669879400&since_id=451822205602429"
                url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}".format(releaser_id, releaser_id, releaser_id, since_id)
                headers["referer"] = "https://m.weibo.cn/u/uid={0}&t=0".format(releaser_id)
                print('Page number: %d' % pagenum)
                try:
                    if proxies_num:
                        get_page = retry_get_url(url, headers=headers, timeout=self.timeout, proxies=proxies_num)
                    else:
                        get_page = retry_get_url(url, headers=headers, timeout=self.timeout)
                except:
                    get_page = None
                    has_more = False
                if get_page and get_page.status_code == 200:
                    try:
                        page_json = get_page.json()
                        total = page_json["data"]["cardlistInfo"]["total"]
                        if pagenum > total:
                            break
                        since_id = page_json["data"]["cardlistInfo"]["since_id"]
                        page_dic = page_json["data"].get("cards")
                    except Exception as e:
                        print("load data error %s" % e)
                        continue
                    if page_dic:
                        for one in page_dic:
                            try:
                                mblog = one.get("mblog")
                                mid = mblog.get("mid")
                                forward_text = ""
                                forward_user = ""
                                if one.get("source") == "绿洲":
                                    text_type = "绿洲"
                                elif mblog.get("retweeted_status"):
                                    text_type = "转发"
                                    forward_text = mblog.get("retweeted_status").get("raw_text")
                                    forward_user = mblog.get("retweeted_status").get("user").get("screen_name")
                                else:
                                    text_type = one.get("source")
                                if mblog.get("isLongText"):
                                    text, repost_count, comment_count, favorite_count = self.get_single_page(mid)
                                else:
                                    text = mblog["raw_text"]
                                res_dic = {
                                    "release_time": trans_strtime_to_timestamp(mblog["created_at"]),
                                    "fetch_time": int(datetime.datetime.now().timestamp() * 1e3),
                                    "url": one["scheme"],
                                    "releaser": mblog["user"]["screen_name"],
                                    "repost_count": trans_play_count(mblog["reposts_count"]),
                                    "comment_count": trans_play_count(mblog["comments_count"]),
                                    "favorite_count": trans_play_count(mblog["attitudes_count"]),
                                    "title": text.replace("\u200b", ""),
                                    "wb_type": text_type,
                                    "forward_user": forward_user,
                                    "forward_text": forward_text,
                                    "mid": mid,
                                    "releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
                                    "releaser_id_str": "weibo_%s" % releaser_id,
                                    "img_list": self.get_img(mblog),
                                    "platform": "weibo",
                                    # "doc_id":doc_id
                                }
                                res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic, doc_id_type="all-time-url")
                                yield res_dic
                            except Exception as e:
                                print(json.dumps(mblog))
                                print("row formate error %s" % e)
                                continue
    def get_releaser_follower_num(self, releaserUrl):
        pass
    def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
        count_false = 0
        for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
            video_time = res["release_time"]
            # print(res)
            if video_time:
                if start_time < video_time:
                    if video_time < end_time:
                        yield res
                else:
                    count_false += 1
                    if count_false > allow:
                        break
                    else:
                        yield res
if __name__ == '__main__':
    zhihu = Crawler_zhihu()
    # zhihu.get_serach_page_cookies("热玛吉")
    zhihu.search_page("热玛吉")
    # print(user_page)
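The __main__ block above only exercises the keyword search and returns the results without writing them anywhere. A hypothetical invocation that also pushes each batch into Elasticsearch through output_result is sketched below; the index and doc type names are placeholders, not values taken from this commit, and the call assumes this module's own imports resolve.

    zhihu = Crawler_zhihu()
    # Placeholder es_index/doc_type; search_article_page forwards these flags to output_result,
    # which decides whether the batch goes to the raw index, the register index, or both.
    zhihu.search_article_page("热玛吉",
                              search_pages_max=5,
                              output_to_es_raw=True,
                              es_index="crawler-data-raw",
                              doc_type="doc")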