Commit 929f9869 authored by haowang

separate zhihu spider

parent 709a4df1
import pymysql
import execjs
import os
import re
from datetime import datetime
from pymysql import escape_string
import sys
HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
class RefreshContent(object):
    def __init__(self):
        '''
        Initialize the database connection and load the JS signing rules.
        '''
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        # Prefer a local zhihu.js next to the script; fall back to the deployed path.
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except OSError:
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js)
    def refresh_content(self, table, pic_table, key_id):
        '''
        Replace image URLs in the stored content and write the result back.
        '''
        sql = """select {}, url, new_url from {}""".format(key_id, pic_table)
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        for row in rows:
            # Skip pictures that have not been re-uploaded yet.
            if row[2] is None:
                continue
            find_id = row[0]
            old_url = str(row[1]).replace("?", "#")
            sql = """select new_content from {} where {} = '{}' """.format(table, key_id, find_id)
            self.cur.execute(sql)
            result = self.cur.fetchall()
            content = result[0][0]
            # Match the <noscript><img ...> ... </figure> block that wraps the old URL.
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + old_url, "</figure>")
            escaped_content = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(row[2]) + "\"></noscript></figure>",
                                 escaped_content)
            new_content = escape_string(new_content)
            sql = """update {} set new_content = '{}' WHERE {} = '{}' """.format(table, new_content, key_id, find_id)
            self.cur.execute(sql)
            self.conn.commit()
    def answer_refresh_content(self):
        '''
        Replace image URLs and update answer content (older, answer-specific version).
        '''
        sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        for row in rows:
            if row[2] is None:
                continue
            find_id = row[0]
            old_url = str(row[1]).replace("?", "#")
            sql = """select new_content from zhihu_answer where zhihu_answer.answer_id = '{}' """.format(find_id)
            self.cur.execute(sql)
            result = self.cur.fetchall()
            content = result[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + old_url, "</figure>")
            escaped_content = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(row[2]) + "\"></noscript></figure>",
                                 escaped_content)
            new_content = escape_string(new_content)
            sql = """update zhihu_answer set new_content = '{}' WHERE answer_id = '{}' """.format(new_content,
                                                                                                  find_id)
            self.cur.execute(sql)
            self.conn.commit()

    def article_refresh_content(self):
        '''
        Replace image URLs and update article content (older, article-specific version).
        '''
        sql = """select article_id, url, new_url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        for row in rows:
            # Skip pictures without a re-uploaded URL, as the other refresh methods do.
            if row[2] is None:
                continue
            find_id = row[0]
            old_url = str(row[1]).replace("?", "#")
            sql = """select new_content from zhihu_article where zhihu_article.article_id = '{}' """.format(find_id)
            self.cur.execute(sql)
            result = self.cur.fetchall()
            content = result[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + old_url, "</figure>")
            escaped_content = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(row[2]) + "\"></noscript></figure>",
                                 escaped_content)
            new_content = escape_string(new_content)
            sql = """update zhihu_article set new_content = '{}' WHERE article_id = '{}' """.format(new_content,
                                                                                                    find_id)
            self.cur.execute(sql)
            self.conn.commit()
if __name__ == '__main__':
    # argv[1]: 0 refresh answers, 1 refresh articles, 2 refresh thoughts
    mark = int(sys.argv[1])
    print(datetime.now())
    refresh = RefreshContent()
    if mark == 0:
        refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id')
    elif mark == 1:
        refresh.refresh_content('zhihu_article', 'zhihu_article_picture_url', 'article_id')
    elif mark == 2:
        refresh.refresh_content('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id')
    print(datetime.now())
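# Usage sketch (the script's filename is not shown in this commit, so 'refresh_content.py' below is assumed):
#   python refresh_content.py 0   # rewrite answer content with re-uploaded image URLs
#   python refresh_content.py 1   # rewrite article content
#   python refresh_content.py 2   # rewrite thought content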
# import rsa
# import os, sys
# base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("/Users/xuwei")
# sys.path.append("/Users/xuwei/crawler")
# sys.path.append("/Users/xuwei/crawler/crawler_sys")
import pymysql
import hashlib
import requests
import execjs
import os
import re
import sys
from crawler_sys.utils.output_results import retry_get_url, retry_get_url_no_proxies
from datetime import datetime
HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
class Spider(object):
    def __init__(self):
        '''
        Initialize the database connection and load the JS signing rules.
        '''
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        # Prefer a local zhihu.js next to the script; fall back to the deployed path.
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except OSError:
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js)
    def get_serach_page_cookies(self):
        '''
        Refresh cookies by fetching the profile page.
        '''
        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
"cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
"referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
}
requests_res = retry_get_url(url, headers=headers)
return requests_res.cookies.get_dict()
    def parse_sigle_page(self, data_dict, mark):
        '''
        Insert the main content and its picture URLs, then walk the comments.
        '''
        if mark == 0:
            into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
                      data_dict["comment_count"], data_dict["content"])
        elif mark == 1:
            into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (
                data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
                data_dict["content"])
        self.cur.execute(into, values)
        self.conn.commit()
        # Page through root comments, 20 per request.
        offset = 0
        if data_dict["comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_root_comment(data_dict["id"], offset, mark)
                offset = offset + 20
        # Collect every <noscript><img src="..."> URL in the content body.
        patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
        pattern = re.compile(patt)
        result = pattern.findall(data_dict["content"])
        for results in result:
            if mark == 0:
                into = "insert into zhihu_answer_picture_url(answer_id, url) value(%s, %s)"
            elif mark == 1:
                into = "insert into zhihu_article_picture_url(article_id, url) value(%s, %s)"
            values = (data_dict["id"], results)
            self.cur.execute(into, values)
            self.conn.commit()
        return
    def search_page(self, mark, page_max, start_page=0):
        '''
        Main entry point.
        params:
            mark: 0 answers, 1 articles, 2 thoughts
        '''
        offset = start_page
        for i in range(page_max):
            if mark == 0:
                self.search_answer_article_page(offset, 0)
            elif mark == 1:
                self.search_answer_article_page(offset, 1)
            elif mark == 2:
                self.search_thought_page(offset)
            offset = offset + 20
        self.conn.close()
        return
    def search_answer_article_page(self, offset, mark, proxies_num=0):
        '''
        Request one page of answers or articles.
        '''
        offset = str(offset)
        if mark == 0:
            url = ANSWER_URL.format(offset)
        elif mark == 1:
            url = ARTICLE_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("article_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    if one_line["content"] is not None:
                        self.parse_sigle_page(one_line, mark)
                        print("finished_article " + offset)
                except KeyError:
                    # It is fine to drop the trailing items: the search API also
                    # returns entries that are merely related to the search.
                    continue
        else:
            print("article_data_error")
        return
    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
        '''
        Request one page of root comments for an answer or article.
        '''
        offset = str(offset)
        answerid = str(answerid)
        if mark == 0:
            url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
        elif mark == 1:
            url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("root_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.root_comment_data(one_line, answerid, mark)
                    print("finished_root " + offset)
                except KeyError:
                    continue
        else:
            print("root_data_error")
        # A full page of 20 comments means there may be another page.
        has_next = 0
        if page_dict.get("data") and len(page_dict['data']) == 20:
            has_next = 1
        return has_next
    def root_comment_data(self, data_dict, answerid, mark):
        '''
        Insert a root comment and walk its child comments.
        '''
        if mark == 0:
            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
        elif mark == 1:
            into = "insert into zhihu_article_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid,
                  data_dict["child_comment_count"], data_dict["featured"], data_dict["created_time"],
                  data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        offset = 0
        if data_dict["child_comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_child_comment(data_dict["id"], offset, mark)
                offset = offset + 20
        return
    def search_child_comment(self, root_comment_id, offset, mark=0, proxies_num=0):
        '''
        Request one page of child comments for a root comment.
        mark is accepted only so call sites can pass it through; the
        child-comment endpoints are shared by answers and articles.
        '''
        root_comment_id = str(root_comment_id)
        offsets = offset
        offset = str(offset)
        if offsets == 0:
            url = CHILD_COMMENT_START_URL.format(root_comment_id)
        else:
            url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("child_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.child_comment_data(one_line, root_comment_id)
                except KeyError:
                    continue
        has_next = 0
        if page_dict.get("data") and len(page_dict['data']) == 20:
            has_next = 1
        return has_next
    def child_comment_data(self, data_dict, root_comment_id):
        '''
        Insert a child comment.
        '''
        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
        # author_id takes the member id, matching the other insert methods.
        values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"],
                  data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"],
                  data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return
    def headers_handle(self, url):
        '''
        Build the signed request headers for an API url.
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
"x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
"x-api-version": "3.0.91",
"x-app-za": "OS=Web",
"x-requested-with": "fetch",
"x-zse-83": "3_2.0",
"x-zse-86": None,
"referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
}
cookies_dict = {
"d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
"KLBRSID": None
}
cookies_dict.update(res_cookies_dict)
f = "+".join(
["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
fmd5 = hashlib.new('md5', f.encode()).hexdigest()
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
return headers_search, cookies_dict
    def search_thought_page(self, offset, proxies_num=0):
        '''
        Request one page of thoughts (pins).
        '''
        offset = str(offset)
        url = THOUGHT_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("thought_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.parse_thought_sigle_page(one_line)
                    print("finished_thought " + offset)
                except KeyError:
                    # It is fine to drop the trailing items: the search API also
                    # returns entries that are merely related to the search.
                    continue
        else:
            print("thought_data_error")
        return
    def parse_thought_sigle_page(self, data_dict):
        '''
        Insert one thought: text segments go to zhihu_thought, images to zhihu_thought_picture_url.
        '''
        for one_dict in data_dict["content"]:
            if one_dict["type"] == "text":
                into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
                values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
            else:
                into = "insert into zhihu_thought_picture_url(thought_id, url) value(%s, %s)"
                values = (data_dict["id"], one_dict["url"])
            self.cur.execute(into, values)
            self.conn.commit()
        offset = 0
        if data_dict["comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_thought_comment(data_dict["id"], offset)
                offset = offset + 20
        return
    def search_thought_comment(self, answerid, offset, proxies_num=0):
        '''
        Request one page of comments for a thought.
        '''
        offset = str(offset)
        answerid = str(answerid)
        url = THOUGHT_COMMENT_URL.format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("thought_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.thought_comment_data(one_line, answerid)
                    print("finished_thought_comment " + offset)
                except KeyError:
                    continue
        else:
            print("thought_comment_data_error")
        has_next = 0
        if page_dict.get("data") and len(page_dict['data']) == 20:
            has_next = 1
        return has_next
    def thought_comment_data(self, data_dict, answerid):
        '''
        Insert one thought comment.
        '''
        into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid,
                  data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return
if __name__ == '__main__':
    # argv: mark (0 answers, 1 articles, 2 thoughts), number of pages, starting offset
    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
    print(datetime.now())
    spider = Spider()
    if mark in (0, 1, 2):
        spider.search_page(mark, max_page, start_page)
    print(datetime.now())
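# Usage sketch (the spider's filename is not shown in this commit, so 'zhihu_spider.py' below is assumed):
#   python zhihu_spider.py 0 5 0    # crawl 5 pages of answers starting at offset 0
#   python zhihu_spider.py 1 3 20   # crawl 3 pages of articles starting at offset 20
#   python zhihu_spider.py 2 2 0    # crawl 2 pages of thoughts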
 import os
+import sys
 import re
 import time
 import pymysql
@@ -10,8 +11,8 @@ from datetime import datetime
 from image_qiniu import upload_file, IMG_TYPE
-DATA_OS_PATH = '/Users/haowei/workspace/gm/crawler/image'
+DATA_OS_PATH = '/image'
-PROJECT_PATH = '/Users/haowei/workspace/gm/crawler'
+PROJECT_PATH = '/'
 class UploadImage(object):
@@ -43,7 +44,7 @@ class UploadImage(object):
         with open(self.JS_FILE_PATH, 'r', encoding='utf-8') as f:
             js = f.read()
         # print(js)
-        self.exec_js = execjs.compile(js)
+        self.exec_js = execjs.compile(js, )
     def get_serach_page_cookies(self):
         '''
@@ -133,11 +134,11 @@ class UploadImage(object):
             print('upload ..... error')
             return None
-    def picture_download_and_cut(self, path, new_path, table, key_id):
+    def picture_download_and_cut(self, path, new_path, table, key_id, offset=0, count=10):
         '''
         Download and crop article images
         '''
-        sql = """select {}, url from {}""".format(key_id, table)
+        sql = """select {}, url from {} where new_url == '' limit {}, {}""".format(key_id, table, offset, count)
         self.cur.execute(sql)
         tuple = self.cur.fetchall()
         self.conn.commit()
@@ -186,7 +187,15 @@ class UploadImage(object):
 if __name__ == '__main__':
+    mark = int(sys.argv[1]) or 0
+    offset = int(sys.argv[2]) or 0
+    count = int(sys.argv[3]) or 10
     print(datetime.now())
     a = UploadImage()
-    a.picture_download_and_cut_process()
+    if mark == 0:
+        a.picture_download_and_cut(a.ANSWER_PICTURE_PATH, a.ANSWER_PICTURE_CUT_PATH, 'zhihu_answer_picture_url', 'answer_id', offset, count)
+    if mark == 1:
+        a.picture_download_and_cut(a.ARTICLE_PICTURE_PATH, a.ARTICLE_PICTURE_CUT_PATH, 'zhihu_article_picture_url', 'article_id', offset, count)
+    if mark == 2:
+        a.picture_download_and_cut(a.THOUGHT_PICTURE_PATH, a.THOUGHT_PICTURE_CUT_PATH, 'zhihu_thought_picture_url', 'thought_id', offset, count)
     print(datetime.now())