Commit d5e464a2 authored by 向万

zhihu spider and data structure

parent 772fd8b7
import os
import re
import hashlib

import cv2
import execjs
import pymysql
import requests
from datetime import datetime
from pymysql import escape_string

from crawler_sys.utils.output_results import retry_get_url, retry_get_url_no_proxies
from gm_upload.gm_upload.upload import upload_file
from gm_upload.gm_upload.consts import IMG_TYPE
class Crawler_zhihu():
    def __init__(self):
        self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
                                    passwd='Gengmei1',
                                    db='mimas_dev', charset='utf8')
        self.cur = self.conn.cursor()
        # Recreate the staging tables from scratch on every run.
        self.cur.execute("drop table if exists zhihu_answer")
        sql = """create table zhihu_answer(title char(40),
                                           content text(59999),
                                           id int,
                                           created_time int,
                                           comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_article")
        sql = """create table zhihu_article(title char(40),
                                            content text(59999),
                                            id int,
                                            created_time int,
                                            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_answer_root_comment")
        sql = """create table zhihu_answer_root_comment(root_comment_id int,
                                                        author_name char(40),
                                                        content text(59999),
                                                        answerid int,
                                                        child_comment_count int,
                                                        featured char(5),
                                                        created_time int,
                                                        author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_child_comment")
        sql = """create table zhihu_child_comment(root_comment_id int,
                                                  author_name char(40),
                                                  content text(59999),
                                                  reply_name char(40),
                                                  child_comment_id int,
                                                  created_time int,
                                                  author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_article_root_comment")
        sql = """create table zhihu_article_root_comment(root_comment_id int,
                                                         author_name char(40),
                                                         content text(59999),
                                                         answerid int,
                                                         child_comment_count int,
                                                         featured char(5),
                                                         created_time int,
                                                         author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_answer_picture_url")
        sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_article_picture_url")
        sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_thought")
        sql = """create table zhihu_thought(id char(50),
                                            content text(59999),
                                            created_time int,
                                            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_thought_comment")
        sql = """create table zhihu_thought_comment(thought_comment_id int,
                                                    author_name char(40),
                                                    content text(59999),
                                                    answerid char(50),
                                                    created_time int,
                                                    author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_thought_picture_url")
        sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()
        # Compile the signing script; Node must be available as the execjs runtime.
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except IOError:
            with open('/Users/xuwei/crawler/crawler_sys/site_crawler/zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
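        # Assumption: zhihu.js (not included in this commit) exposes a function
        # b(md5_hex) that returns the body of the x-zse-86 anti-crawler
        # signature consumed in headers_handle() below.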
    def get_search_page_cookies(self):
        """Fetch the profile page once to pick up fresh session cookies."""
        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
            "referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        }
        requests_res = retry_get_url(url, headers=headers)
        return requests_res.cookies.get_dict()
    def parse_single_page(self, data_dict, mark):
        # mark == 0: answer, mark == 1: article
        if mark == 0:
            into = "insert into zhihu_answer(title, content, id, created_time, comment_count) values(%s, %s, %s, %s, %s)"
            values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"])
        elif mark == 1:
            into = "insert into zhihu_article(title, content, id, created_time, comment_count) values(%s, %s, %s, %s, %s)"
            values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
        self.cur.execute(into, values)
        self.conn.commit()
        # Page through root comments, 20 per request, until a short page comes back.
        offset = 0
        if data_dict["comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_root_comment(data_dict["id"], offset, mark)
                offset = offset + 20
        # Collect in-body image URLs from the rendered HTML content.
        patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
        pattern = re.compile(patt)
        result = pattern.findall(data_dict["content"])
        for results in result:
            if mark == 0:
                into = "insert into zhihu_answer_picture_url(answer_id, url) values(%s, %s)"
            elif mark == 1:
                into = "insert into zhihu_article_picture_url(article_id, url) values(%s, %s)"
            values = (data_dict["id"], results)
            self.cur.execute(into, values)
            self.conn.commit()
        return
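    # The image regex above targets fragments like the following (hypothetical
    # sample): <noscript><img src="https://pic1.zhimg.com/v2-abc123_r.jpg" data-caption...
    # and captures only the URL between 'src="' and '" data-caption'.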
    def search_page(self, answer_page_max, article_page_max, thought_page_max):
        offset = 0
        for i in range(answer_page_max):
            self.search_answer_article_page(offset, 0)
            offset = offset + 20
        offset = 0
        for i in range(article_page_max):
            self.search_answer_article_page(offset, 1)
            offset = offset + 20
        offset = 0
        for i in range(thought_page_max):
            self.search_thought_page(offset)
            offset = offset + 20
        self.answer_picture_download_and_cut()
        self.answer_refresh_content()
        # self.article_picture_download_and_cut()
        # self.article_refresh_content()
        self.answer_data_complex()
        self.conn.close()
        return
    def search_answer_article_page(self, offset, mark, proxies_num=0):
        offset = str(offset)
        if mark == 0:
            url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset)
        elif mark == 1:
            url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created".format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("article_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    if one_line["content"] is not None:
                        self.parse_single_page(one_line, mark)
                        print("finished_article " + offset)
                except KeyError:
                    # It is fine to drop trailing entries; the search API may
                    # return items that are only loosely related to the query.
                    continue
        else:
            print("article_data_error")
        return
    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
        offset = str(offset)
        answerid = str(answerid)
        if mark == 0:
            url = "https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
        elif mark == 1:
            url = "https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("root_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.root_comment_data(one_line, answerid, mark)
                    print("finished_root " + offset)
                except KeyError:
                    continue
        else:
            print("root_data_error")
        # A full page (20 items) signals that another page may follow.
        has_next = 0
        if len(page_dict.get('data', [])) == 20:
            has_next = 1
        return has_next
    def root_comment_data(self, data_dict, answerid, mark):
        if mark == 0:
            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) values(%s, %s, %s, %s, %s, %s, %s, %s)"
        elif mark == 1:
            into = "insert into zhihu_article_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) values(%s, %s, %s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["child_comment_count"], data_dict["featured"], data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        # Page through child comments when the root comment has replies.
        offset = 0
        if data_dict["child_comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_child_comment(data_dict["id"], offset)
                offset = offset + 20
        return
    def search_child_comment(self, root_comment_id, offset, proxies_num=0):
        root_comment_id = str(root_comment_id)
        # The first page of child comments uses a parameterless endpoint.
        if offset == 0:
            url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments".format(root_comment_id)
        else:
            url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}".format(root_comment_id, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("child_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.child_comment_data(one_line, root_comment_id)
                except KeyError:
                    continue
        has_next = 0
        if len(page_dict.get('data', [])) == 20:
            has_next = 1
        return has_next
    def child_comment_data(self, data_dict, root_comment_id):
        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) values(%s, %s, %s, %s, %s, %s, %s)"
        values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"], data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return
    def headers_handle(self, url):
        res_cookies_dict = self.get_search_page_cookies()
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
        }
        cookies_dict = {
            "d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
            "KLBRSID": None
        }
        cookies_dict.update(res_cookies_dict)
        # Sign the request: md5 over "3_2.0+<api path>+<referer>+<d_c0>", then feed
        # the digest to the obfuscated b() function compiled from zhihu.js.
        f = "+".join(["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict
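    # Signature recipe, for illustration only (all values hypothetical):
    #   path   = "/api/v4/members/geng-mei-suo-chang/answers?offset=0&limit=20"
    #   signed = "3_2.0+" + path + "+" + referer + "+" + d_c0
    #   header = "1.0_" + b(md5(signed))
    # where b() is the obfuscated function compiled from zhihu.js.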
    def answer_picture_download_and_cut(self):
        sql = """select answer_id, url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        path = "/Users/xuwei/Desktop/answer_picture/"
        gif_patt = r'gif'
        for i in range(len(rows)):
            mark = re.search(gif_patt, rows[i][1])
            url = rows[i][1]
            [headers_search, cookies_dict] = self.headers_handle(url)
            r = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
            print(r.status_code)
            if mark:
                # GIFs are saved and re-uploaded as-is; cropping would break the animation.
                pathes = path + 'num' + str(i) + '.gif'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_answer_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
            else:
                pathes = path + 'num' + str(i) + '.jpg'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                # Keep the top 90% of the image, dropping the bottom strip, then re-upload.
                img = cv2.imread(pathes)
                high, width = img.shape[:2]
                cropped = img[0:int(high / 10 * 9), 0:width]
                pathes = "/Users/xuwei/Desktop/answer_picture_cut/num" + str(i) + ".jpg"
                cv2.imwrite(pathes, cropped)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_answer_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
    def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
        '''Upload an off-site image and return its new hosted URL.'''
        try:
            url = upload_file(file_path=path, img_type=img_type)
            print('upload ..... ', url)
            return url
        except Exception:
            print('upload ..... error')
            return None
    def answer_refresh_content(self):
        sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        for i in range(len(rows)):
            find_id = rows[i][0]
            temp = str(rows[i][1])
            # "?" is a regex metacharacter; swap it for "#" in both the pattern
            # and the content so the original URL matches literally.
            temp1 = temp.replace("?", "#")
            sql = """select content from zhihu_answer where zhihu_answer.id = '{}' """.format(find_id)
            self.cur.execute(sql)
            tuples = self.cur.fetchall()
            content = tuples[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + temp1, "</figure>")
            temp_tuples = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(rows[i][2]) + "\"></noscript></figure>",
                                 temp_tuples)
            new_content = escape_string(new_content)
            sql = """update zhihu_answer set content = '{}' WHERE id = '{}' """.format(new_content, rows[i][0])
            self.cur.execute(sql)
            self.conn.commit()
    def article_picture_download_and_cut(self):
        sql = """select article_id, url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        path = "/Users/xuwei/Desktop/article_picture/"
        gif_patt = r'gif'
        for i in range(len(rows)):
            mark = re.search(gif_patt, rows[i][1])
            url = rows[i][1]
            [headers_search, cookies_dict] = self.headers_handle(url)
            r = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
            print(r.status_code)
            if mark:
                # GIFs are saved and re-uploaded as-is; cropping would break the animation.
                pathes = path + 'num' + str(i) + '.gif'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_article_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
            else:
                pathes = path + 'num' + str(i) + '.jpg'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                # Keep the top 90% of the image, dropping the bottom strip, then re-upload.
                img = cv2.imread(pathes)
                high, width = img.shape[:2]
                cropped = img[0:int(high / 10 * 9), 0:width]
                pathes = "/Users/xuwei/Desktop/article_picture_cut/num" + str(i) + ".jpg"
                cv2.imwrite(pathes, cropped)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_article_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
    def article_refresh_content(self):
        sql = """select article_id, url, new_url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        for i in range(len(rows)):
            find_id = rows[i][0]
            temp = str(rows[i][1])
            # "?" is a regex metacharacter; swap it for "#" in both the pattern
            # and the content so the original URL matches literally.
            temp1 = temp.replace("?", "#")
            sql = """select content from zhihu_article where zhihu_article.id = '{}' """.format(find_id)
            self.cur.execute(sql)
            tuples = self.cur.fetchall()
            content = tuples[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + temp1, "</figure>")
            temp_tuples = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(rows[i][2]) + "\"></noscript></figure>",
                                 temp_tuples)
            new_content = escape_string(new_content)
            sql = """update zhihu_article set content = '{}' WHERE id = '{}' """.format(new_content, rows[i][0])
            self.cur.execute(sql)
            self.conn.commit()
    def search_thought_page(self, offset, proxies_num=0):
        offset = str(offset)
        url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("thought_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.parse_thought_single_page(one_line)
                    print("finished_thought " + offset)
                except KeyError:
                    # It is fine to drop trailing entries; the API may return
                    # items that are only loosely related to the request.
                    continue
        else:
            print("thought_data_error")
        return
    def parse_thought_single_page(self, data_dict):
        # A "pin" (thought) body is a list of typed segments: text or images.
        for one_dict in data_dict["content"]:
            if one_dict["type"] == "text":
                into = "insert into zhihu_thought(content, id, created_time, comment_count) values(%s, %s, %s, %s)"
                values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
                self.cur.execute(into, values)
                self.conn.commit()
            else:
                into = "insert into zhihu_thought_picture_url(thought_id, url) values(%s, %s)"
                values = (data_dict["id"], one_dict["url"])
                self.cur.execute(into, values)
                self.conn.commit()
        offset = 0
        if data_dict["comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_thought_comment(data_dict["id"], offset)
                offset = offset + 20
        return
    def search_thought_comment(self, answerid, offset, proxies_num=0):
        offset = str(offset)
        answerid = str(answerid)
        url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("thought_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.thought_comment_data(one_line, answerid)
                    print("finished_thought_comment " + offset)
                except KeyError:
                    continue
        else:
            print("thought_comment_data_error")
        # A full page (20 items) signals that another page may follow.
        has_next = 0
        if len(page_dict.get('data', [])) == 20:
            has_next = 1
        return has_next
    def thought_comment_data(self, data_dict, answerid):
        into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) values(%s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return
    def thought_picture_download_and_cut(self):
        sql = """select thought_id, url from zhihu_thought_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        path = "/Users/xuwei/Desktop/thought_picture/"
        gif_patt = r'gif'
        for i in range(len(rows)):
            mark = re.search(gif_patt, rows[i][1])
            url = rows[i][1]
            [headers_search, cookies_dict] = self.headers_handle(url)
            r = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
            print(r.status_code)
            if mark:
                # GIFs are saved and re-uploaded as-is; cropping would break the animation.
                pathes = path + 'num' + str(i) + '.gif'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_thought_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
            else:
                pathes = path + 'num' + str(i) + '.jpg'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                # Keep the top 90% of the image, dropping the bottom strip, then re-upload.
                img = cv2.imread(pathes)
                high, width = img.shape[:2]
                cropped = img[0:int(high / 10 * 9), 0:width]
                pathes = "/Users/xuwei/Desktop/thought_picture_cut/num" + str(i) + ".jpg"
                cv2.imwrite(pathes, cropped)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_thought_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
    def answer_data_complex(self):
        """Assemble answers, their images, and nested comments into one structure."""
        sql = """select id, content, created_time, comment_count from zhihu_answer"""
        self.cur.execute(sql)
        topics = []
        rows = self.cur.fetchall()
        for i in range(len(rows)):
            sql = """select url from zhihu_answer_picture_url as a where a.answer_id = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            images_url = self.cur.fetchall()
            sql = """select root_comment_id, child_comment_count, content, created_time, author_id from zhihu_answer_root_comment as a where a.answerid = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            root_comment = self.cur.fetchall()
            comment = []
            for j in range(len(root_comment)):
                reply = []
                if root_comment[j][1] != 0:
                    sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
                    self.cur.execute(sql)
                    child_comments = self.cur.fetchall()
                    reply = [{'id': item[0], 'comment': item[1], 'create_time': item[2], 'user': {'id': item[3]}} for item in child_comments]
                comment.append(
                    [
                        {
                            'id': root_comment[j][0],
                            'comment': root_comment[j][2],
                            'create_time': root_comment[j][3],
                            'user': {'id': root_comment[j][4]},
                            'reply': reply,
                        }
                    ])
            topics.append(
                {
                    'images': images_url,
                    'content': rows[i][1],
                    'id': rows[i][0],
                    'create_time': rows[i][2],
                    'comments': comment,
                }
            )
        return topics
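    # Shape of the list returned above (values hypothetical):
    # [{'images': (('https://pic1.zhimg.com/v2-abc123_r.jpg',),),
    #   'content': '<p>answer body html</p>',
    #   'id': 123456789,
    #   'create_time': 1605507877,
    #   'comments': [[{'id': 1, 'comment': 'text', 'create_time': 1605507900,
    #                  'user': {'id': 'abc'}, 'reply': []}]]}]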
    def article_data_complex(self):
        """Assemble articles, their images, and nested comments into one structure."""
        sql = """select id, content, created_time, comment_count from zhihu_article"""
        self.cur.execute(sql)
        topics = []
        rows = self.cur.fetchall()
        for i in range(len(rows)):
            sql = """select url from zhihu_article_picture_url as a where a.article_id = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            images_url = self.cur.fetchall()
            sql = """select root_comment_id, child_comment_count, content, created_time, author_id from zhihu_article_root_comment as a where a.answerid = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            root_comment = self.cur.fetchall()
            comment = []
            for j in range(len(root_comment)):
                reply = []
                if root_comment[j][1] != 0:
                    sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
                    self.cur.execute(sql)
                    child_comments = self.cur.fetchall()
                    reply = [{'id': item[0], 'comment': item[1], 'create_time': item[2], 'user': {'id': item[3]}} for item in child_comments]
                comment.append(
                    [
                        {
                            'id': root_comment[j][0],
                            'comment': root_comment[j][2],
                            'create_time': root_comment[j][3],
                            'user': {'id': root_comment[j][4]},
                            'reply': reply,
                        }
                    ])
            topics.append(
                {
                    'images': images_url,
                    'content': rows[i][1],
                    'id': rows[i][0],
                    'create_time': rows[i][2],
                    'comments': comment,
                }
            )
        return topics
    def thought_data_complex(self):
        """Assemble thoughts (pins), their images, and comments into one structure."""
        sql = """select id, content, created_time, comment_count from zhihu_thought"""
        self.cur.execute(sql)
        topics = []
        rows = self.cur.fetchall()
        for i in range(len(rows)):
            sql = """select url from zhihu_thought_picture_url as a where a.thought_id = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            images_url = self.cur.fetchall()
            sql = """select thought_comment_id, content, created_time, author_id from zhihu_thought_comment as a where a.answerid = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            root_comment = self.cur.fetchall()
            comment = []
            for j in range(len(root_comment)):
                comment.append(
                    [
                        {
                            'id': root_comment[j][0],
                            'comment': root_comment[j][1],
                            'create_time': root_comment[j][2],
                            'user': {'id': root_comment[j][3]},
                        }
                    ])
            topics.append(
                {
                    'images': images_url,
                    'content': rows[i][1],
                    'id': rows[i][0],
                    'create_time': rows[i][2],
                    'comments': comment,
                }
            )
        return topics
if __name__ == '__main__':
    print(datetime.now())
    zhihu = Crawler_zhihu()
    zhihu.search_page(1, 0, 0)
    print(datetime.now())
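# Note: search_page(1, 0, 0) crawls a single page (up to 20 items) of answers
# and skips the article and thought passes for this run.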
@@ -13,3 +13,4 @@ numpy==1.19.1
pymysql==0.10.0
qiniu==7.1.4
redis==3.5.3
opencv-python==4.4.0.46