Commit 9bde750f authored by haowang

modify zhihu spider

parent 7c2c462d
@@ -24,6 +24,7 @@ class RefreshContent(object):
         Initialize the database connection and adjust the JS rules
         '''
         self.update_error_content_id = []
+        self.update_error_url_content_id = {}
         self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                     passwd=PASSWD,
@@ -53,49 +54,75 @@ class RefreshContent(object):
     def create_new_content(self, content_id, content, pic_dict):
         content = self.replace_html_image_to_url(content)
         rich_obj = BeautifulSoup(content, features="html.parser")
+        update_error = False
         for item in rich_obj.find_all("img"):
-            url = item.get("src")
+            url = item.get("src")[23:]
             new_url = pic_dict.get(url)
             if not new_url:
-                self.update_error_content_id.append({content_id: url})
+                if content_id not in self.update_error_content_id:
+                    self.update_error_content_id.append(content_id)
+                self.update_error_url_content_id[url] = content_id
                 print({content_id: url})
+                update_error = True
                 continue
             item['src'] = new_url + '-w'
-        return rich_obj.decode()
+        new_content = r'%s' % (rich_obj.decode())
+        return escape_string(new_content), update_error
-    def get_all_content_ids(self, table, key_id):
-        sql = """select distinct {} from {}""".format(key_id, table)
-        self.cur.execute(sql)
-        res = self.cur.fetchall()
-        self.conn.commit()
-        if res:
-            return [item[0] for item in res]
-        return None
+    def get_all_content_ids(self, table, pic_table, key_id, offset=0, count=10):
+        if offset == 0:
+            sql = """select distinct {} from {}""".format(key_id, pic_table)
+            print(sql)
+            self.cur.execute(sql)
+            res = self.cur.fetchall()
+            self.conn.commit()
+            if res:
+                return [item[0] for item in res]
+            return None
+        else:
+            sql = """select distinct {} from {} limit {}, {}""".format(key_id, table, offset, count)
+            print(sql)
+            self.cur.execute(sql)
+            res = self.cur.fetchall()
+            self.conn.commit()
+            if res:
+                return [item[0] for item in res]
+            return None
     def refresh_content(self, table, pic_table, key_id, offset=0, count=10):
         '''
         Replace the image URLs and update the content
         '''
-        content_ids = self.get_all_content_ids(pic_table, key_id)
+        content_ids = self.get_all_content_ids(table, pic_table, key_id, offset, count)
         for content_id in content_ids:
             print('start deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
-            sql = """select url, new_url from {} where {} = {} and new_url is not null""".format(pic_table, key_id, content_id)
+            sql = """select content from {} where {} = {} and is_new = 0""".format(table, key_id, content_id)
+            print(sql)
             self.cur.execute(sql)
             res = self.cur.fetchall()
             self.conn.commit()
-            pic_dict = {item[0]: item[1] for item in res}
+            if not res:
+                continue
+            content = res[0][0]
-            sql = """select content from {} where {} = {}""".format(table, key_id, content_id)
+            sql = """select url, new_url from {} where {} = {} and new_url is not null""".format(
+                pic_table, key_id, content_id)
             self.cur.execute(sql)
             res = self.cur.fetchall()
             self.conn.commit()
-            content = res[0][0]
+            pic_dict = {
+                item[0][23:]: item[1] for item in res}
-            new_content = self.create_new_content(content_id, content, pic_dict)
+            new_content, update_error = self.create_new_content(content_id, content, pic_dict)
+            update_code = 1 if not update_error else 0
-            sql = """update {} set new_content = '{}' WHERE {} = '{}' """.format(table, new_content, key_id, content_id)
+            sql = """update {} set new_content = '{}', is_new = {} WHERE {} = '{}' """.format(
                table, new_content, update_code, key_id, content_id)
             self.cur.execute(sql)
             self.conn.commit()
             print('end deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
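Note: `create_new_content` now returns `escape_string(new_content)` so the result can be interpolated into the UPDATE above; in PyMySQL that helper lives in `pymysql.converters`, and the corresponding import is not visible in this diff. A minimal sketch of the assumed usage:

# Sketch only: escaping HTML before string-formatting it into SQL.
# Assumes PyMySQL; the import is not shown in the diff itself.
from pymysql.converters import escape_string

content = '<p>It\'s a "quoted" fragment</p>'
sql = "update zhihu_answer set new_content = '{}', is_new = 1 where answer_id = {}".format(
    escape_string(content), 10001)  # the answer id here is made up for illustration

A parameterized `cursor.execute(sql, params)` would avoid the manual escaping altogether.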
@@ -104,16 +131,19 @@ class RefreshContent(object):
 if __name__ == '__main__':
     """ python script_file mark offset count """
     print('Number of arguments:', len(sys.argv))
-    print('Argument list:', type(sys.argv[0]), sys.argv[0], type(sys.argv[1]), sys.argv[1])
+    print('Argument list:', type(sys.argv[0]), sys.argv[0], type(sys.argv[1]), sys.argv[1], type(sys.argv[2]), sys.argv[2], type(sys.argv[3]), sys.argv[3])
     mark = int(sys.argv[1])
+    offset = int(sys.argv[2])
+    count = int(sys.argv[3])
     print(datetime.now())
     refresh = RefreshContent()
     if mark == 0:
-        refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id')
+        refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id', offset, count)
     elif mark == 1:
         refresh.refresh_content('zhihu_article', 'zhihu_article_picture_url', 'article_id')
     elif mark == 2:
         refresh.refresh_content('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id')
-    print(refresh.update_error_content_id)
+    print('update_error_url_content_ids : ', refresh.update_error_url_content_id)
+    print('update_error_content_ids : ', refresh.update_error_content_id)
     print(datetime.now())
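With the new `offset`/`count` arguments a refresh run can be sharded: `offset == 0` walks every distinct id found in the picture-URL table, while a non-zero offset pages through the content table with `limit offset, count`. A hypothetical invocation (the script path is assumed here, by analogy with `tasks/zhihu/spider.py`):

# mark 0 = answers; process rows 20..29 of zhihu_answer
python tasks/zhihu/refresh_content.py 0 20 10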
@@ ... @@
 # import rsa
-# import os, sys
-# base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-# sys.path.append("/Users/xuwei")
-# sys.path.append("/Users/xuwei/crawler")
-# sys.path.append("/Users/xuwei/crawler/crawler_sys")
+import os, sys
+base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append("/Users/haowei")
+sys.path.append("/Users/haowei/workspace/gm/crawler")
+sys.path.append("/Users/haowei/workspace/gm/crawler/crawler_sys")
 import pymysql
 import hashlib
@@ -12,7 +12,7 @@ import execjs
 import os
 import re
 import sys
-from crawler_sys.utils.output_results import retry_get_url, retry_get_url_no_proxies
+import time
 from datetime import datetime

 HOST = '172.18.51.14'
@@ -20,21 +20,53 @@ PORT = 3306
 USER = 'spider'
 PASSWD = 'Gengmei123'
 DB = 'spider'
-JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
-SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
-ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
-ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
-THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
-ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
-ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
-CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
-CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
-THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
+# JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
+JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
+# SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
+# ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
+# ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
+# THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
+# ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
+# ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
+# CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
+# CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
+# THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
+
+
+def retry_get_url_no_proxies(url, retrys=3, timeout=10, **kwargs):
+    retry_c = 0
+    while retry_c < retrys:
+        try:
+            get_resp = requests.get(url, timeout=timeout, **kwargs)
+            return get_resp
+        except Exception as e:
+            retry_c += 1
+            time.sleep(1)
+            print(e)
+    print('Failed to get page %s after %d retries, %s'
+          % (url, retrys, datetime.now()))
+    return None
+
+
+def retry_get_url(url, retrys=3, timeout=10, **kwargs):
+    retry_c = 0
+    while retry_c < retrys:
+        try:
+            get_resp = requests.get(url, timeout=timeout, **kwargs)
+            return get_resp
+        except Exception as e:
+            retry_c += 1
+            time.sleep(1)
+            print(e)
+    print('Failed to get page %s after %d retries, %s'
+          % (url, retrys, datetime.now()))
+    return None
+
+
 class Spider(object):
-    def __init__(self):
+    def __init__(self, spider_url):
         '''
         Initialize the database connection and adjust the JS rules
         '''
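Note on this hunk: the two retry helpers are now defined locally rather than imported from `crawler_sys.utils.output_results`, and as written they are identical, so `retry_get_url_no_proxies` no longer behaves differently from `retry_get_url`. Both assume `requests` is imported at module level (not visible in the diff). Also, `ARTICLE_URL` and the comment-endpoint constants are commented out above while `search_answer_article_page` still references `ARTICLE_URL` for mark 1, so the article path would need that constant restored or derived from `spider_url` the way `self.ANSWER_URL` is below. A usage sketch of the helper, under those assumptions:

# Sketch: fetch a profile page with up to 3 retries; returns None on repeated failure.
resp = retry_get_url('https://www.zhihu.com/people/taoxi-1130',
                     headers={'user-agent': 'Mozilla/5.0'}, timeout=5)
if resp is not None:
    print(resp.status_code)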
@@ -43,6 +75,9 @@ class Spider(object):
                                     db=DB, charset='utf8')
         self.cur = self.conn.cursor()
+        self.spider_url = spider_url
+        self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members") + '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
         os.environ["EXECJS_RUNTIME"] = 'Node'
         try:
             with open('./zhihu.js', 'r', encoding='utf-8') as f:
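The answers endpoint is now derived from the profile URL passed to `Spider`, so the crawler is no longer hard-wired to the geng-mei-suo-chang account. A worked example of the rewrite, using the profile URL from the usage note in `__main__`:

spider_url = 'https://www.zhihu.com/people/taoxi-1130'
api_base = spider_url.replace('https://www.zhihu.com/people',
                              'https://www.zhihu.com/api/v4/members')
# api_base == 'https://www.zhihu.com/api/v4/members/taoxi-1130'
# appending the include/offset query string gives self.ANSWER_URL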
@@ -57,14 +92,13 @@ class Spider(object):
         Refresh the cookies
         '''
-        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
         headers = {
             "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
             "accept-encoding": "gzip, deflate, br",
             "accept-language": "zh-CN,zh;q=0.9",
             "cache-control": "max-age=0",
             "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
-            "referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1",
+            "referer": self.spider_url,
             "sec-fetch-dest": "document",
             "sec-fetch-mode": "navigate",
             "sec-fetch-site": "same-origin",
@@ -72,31 +106,55 @@ class Spider(object):
             "upgrade-insecure-requests": "1",
             "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
         }
-        requests_res = retry_get_url(url, headers=headers)
+        requests_res = retry_get_url(self.spider_url, headers=headers)
         return requests_res.cookies.get_dict()

-    def parse_sigle_page(self, data_dict, mark):
-        '''
-        Insert the main content and image URLs, then look for comments
-        '''
-        if mark == 0:
-            into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
-            values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
-                      data_dict["comment_count"], data_dict["content"])
-        elif mark == 1:
-            into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
-            values = (
-                data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
-                data_dict["content"])
-        self.cur.execute(into, values)
-        self.conn.commit()
-        offset = 0
-        if data_dict["comment_count"] != 0:
-            next = 1
-            while next == 1:
-                next = self.search_root_comment(data_dict["id"], offset, mark)
-                offset = offset + 20
+    def check_data_exist(self, data_dict, mark):
+        '''
+        Check whether the record already exists before inserting
+        '''
+        sql = "select id from {table} where {key} = {id_}"
+        exist = None
+        if mark == 0:
+            select_sql = sql.format(table='zhihu_answer', key='answer_id', id_=data_dict["id"])
+            self.cur.execute(select_sql)
+            exist = self.cur.fetchone()
+        if mark == 1:
+            select_sql = sql.format(table='zhihu_article', key='article_id', id_=data_dict["id"])
+            self.cur.execute(select_sql)
+            exist = self.cur.fetchone()
+        if exist:
+            return True
+        return False
+
+    def parse_sigle_page(self, data_dict, mark, need_comment=False):
+        '''
+        Insert the main content and image URLs, then look for comments
+        '''
+        if not self.check_data_exist(data_dict, mark):
+            if mark == 0:
+                into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
+                values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
+                          data_dict["comment_count"], data_dict["content"])
+            elif mark == 1:
+                into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
+                values = (
+                    data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
+                    data_dict["content"])
+            print(values[0])
+            self.cur.execute(into, values)
+            self.conn.commit()
+        if need_comment:
+            offset = 0
+            if data_dict["comment_count"] != 0:
+                next = 1
+                while next == 1:
+                    next = self.search_root_comment(data_dict["id"], offset, mark)
+                    offset = offset + 20

         # patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
         # pattern = re.compile(patt)
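`check_data_exist` interpolates `data_dict["id"]` straight into the SELECT; since that value arrives from the API response, a parameterized query is the safer equivalent. A sketch under that assumption (the helper name is illustrative, not part of this commit):

# Illustrative variant of the existence check using a bound parameter.
def record_exists(cur, table, key, record_id):
    # table/key are taken from a fixed whitelist such as
    # ('zhihu_answer', 'answer_id') or ('zhihu_article', 'article_id');
    # only the record id travels as a query parameter.
    cur.execute("select id from {} where {} = %s".format(table, key), (record_id,))
    return cur.fetchone() is not None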
@@ -112,7 +170,7 @@ class Spider(object):
         return

-    def search_page(self, mark, page_max, start_page=0):
+    def search_page(self, mark, page_max, start_page=0, need_commend=False):
         '''
         Main entry point
         '''
@@ -123,9 +181,9 @@ class Spider(object):
         for i in range(page_max):
             if mark == 0:
-                self.search_answer_article_page(offset, 0)
+                self.search_answer_article_page(offset, 0, 0, need_commend)
             elif mark == 1:
-                self.search_answer_article_page(offset, 1)
+                self.search_answer_article_page(offset, 1, 0, need_commend)
             elif mark == 2:
                 self.search_thought_page(offset)
@@ -134,13 +192,13 @@ class Spider(object):
         self.conn.close()
         return

-    def search_answer_article_page(self, offset, mark, proxies_num=0):
+    def search_answer_article_page(self, offset, mark, proxies_num=0, need_comment=False):
         '''
         Request the answer and article data pages
         '''
         offset = str(offset)
         if mark == 0:
-            url = ANSWER_URL.format(offset)
+            url = self.ANSWER_URL.format(offset)
         elif mark == 1:
             url = ARTICLE_URL.format(offset)
         [headers_search, cookies_dict] = self.headers_handle(url)
@@ -156,8 +214,8 @@ class Spider(object):
         for one_line in page_dict['data']:
             try:
                 if one_line["content"] != None:
-                    self.parse_sigle_page(one_line, mark)
-                    print("finshed_article" + offset)
+                    self.parse_sigle_page(one_line, mark, need_comment=need_comment)
+                    print("finished_crawler " + offset)
             except KeyError:
                 # It's totally ok to drop the last return data value.
                 # The search api just returns something that seems related to the search
@@ -279,6 +337,7 @@ class Spider(object):
         Disguise the headers for the URL request
         '''
         res_cookies_dict = self.get_serach_page_cookies()
+        referer = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")

         headers_search = {
             "accept": "*/*",
@@ -294,7 +353,7 @@ class Spider(object):
             "x-requested-with": "fetch",
             "x-zse-83": "3_2.0",
             "x-zse-86": None,
-            "referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
+            "referer": referer + "/answers?page=1",
         }

         cookies_dict = {
@@ -410,11 +469,18 @@ class Spider(object):

 if __name__ == '__main__':
+    '''
+    python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
+    python script_file_path mark (content type: 0 = answers, 1 = articles, 2 = thoughts) max_page (maximum page number) start_page (starting page, 0-based) spider_url (profile URL of the Zhihu user to crawl)
+    '''
     mark = int(sys.argv[1])
     max_page = int(sys.argv[2])
     start_page = int(sys.argv[3])
+    spider_url = sys.argv[4]
+    # spider_url = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers'
     print(datetime.now())
-    spider = Spider()
+    spider = Spider(spider_url=spider_url)
     if mark == 0:
         spider.search_page(mark, max_page, start_page)
     elif mark == 1:
......
@@ -10,6 +10,7 @@ import execjs
 from datetime import datetime
 from image_qiniu import upload_file, IMG_TYPE
 from bs4 import BeautifulSoup
+# pip3 install "requests[security]" -i https://pypi.tuna.tsinghua.edu.cn/simple

 # DATA_OS_PATH = '/data'
@@ -109,15 +110,16 @@ class UploadImage(object):
         return headers_search, cookies_dict

     @staticmethod
-    def retry_get_url(url, retrys=3, proxies=None, timeout=10, **kwargs):
+    def retry_get_url(url, retrys=5, proxies=None, timeout=10, **kwargs):
         retry_c = 0
         while retry_c < retrys:
             try:
-                get_resp = requests.get(url, timeout=timeout, **kwargs)
+                requests.packages.urllib3.disable_warnings()
+                get_resp = requests.get(url, verify=False, timeout=timeout, **kwargs)
                 return get_resp
             except Exception as e:
                 retry_c += 1
-                time.sleep(1)
+                time.sleep(2)
                 print(e)
         print('Failed to get page %s after %d retries, %s'
               % (url, retrys, datetime.now()))
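`verify=False` together with `requests.packages.urllib3.disable_warnings()` (called on every attempt) silences all urllib3 warnings. If only the TLS-verification warning is unwanted, disabling that one category once at import time is narrower; a sketch:

# Sketch: silence only InsecureRequestWarning, once, at import time.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)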
@@ -142,9 +144,9 @@ class UploadImage(object):
         Crop and download the article images
         '''

-        def _deal_image_by_path(file_path, old_url):
+        def _deal_image_by_path(res, file_path, old_url, i):
             img = cv2.imread(file_path)
-            if img:
+            if img is not None:
                 high, width = img.shape[:2]
                 cropped = img[0:int(high / 10 * 9), 0:width]
                 pathes = new_path + "num" + str(i) + ".jpg"
@@ -152,7 +154,7 @@ class UploadImage(object):
                 new_url = self.upload_image_with_path(pathes)
                 sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
-                    table, str(new_url), str(tuple[i][1]))
+                    table, str(new_url), str(res[i][1]))
                 self.cur.execute(sql)
                 self.conn.commit()
             else:
@@ -167,36 +169,44 @@ class UploadImage(object):
                 self.cur.execute(sql)
                 self.conn.commit()

+        def _download_picture():
+            sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
+            self.cur.execute(sql)
+            res = self.cur.fetchall()
+            self.conn.commit()
+            gif_patt = r'gif'
+            for i in range(len(res)):
+                mark = re.search(gif_patt, res[i][1])
+                url = res[i][1]
+                [headers_search, cookies_dict] = self.headers_handle(url)
+                r = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
+                if not r:
+                    continue
+                # try:
+                if mark:
+                    pathes = path + str('num') + str(i) + '.gif'
+                    with open(pathes, 'wb') as f:  # open pathes in binary write mode; the handle is f
+                        f.write(r.content)  # write the binary body of the response
+                        f.close()
+                    new_url = self.upload_image_with_path(pathes)
+                    sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
+                        table, str(new_url), str(url))
+                    self.cur.execute(sql)
+                    self.conn.commit()
+                else:
+                    pathes = path + str('num') + str(i) + '.jpg'
+                    with open(pathes, 'wb') as f:  # open pathes in binary write mode; the handle is f
+                        f.write(r.content)  # write the binary body of the response
+                        f.close()
+                    _deal_image_by_path(res, pathes, url, i)
+                # except Exception as e:
+                #     print(e)

         urls = self.find_all_url(content)
         self.insert_picture_urls(table, urls, content_id, key_id)
-
-        sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
-        self.cur.execute(sql)
-        tuple = self.cur.fetchall()
-        self.conn.commit()
-        gif_patt = r'gif'
-        for i in range(len(tuple)):
-            mark = re.search(gif_patt, tuple[i][1])
-            url = tuple[i][1]
-            [headers_search, cookies_dict] = self.headers_handle(url)
-            r = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
-            if mark:
-                pathes = path + str('num') + str(i) + '.gif'
-                with open(pathes, 'wb') as f:
-                    f.write(r.content)
-                    f.close()
-                new_url = self.upload_image_with_path(pathes)
-                sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
-                    table, str(new_url), str(url))
-                self.cur.execute(sql)
-                self.conn.commit()
-            else:
-                pathes = path + str('num') + str(i) + '.jpg'
-                with open(pathes, 'wb') as f:
-                    f.write(r.content)
-                    f.close()
-                _deal_image_by_path(pathes, url)
+        _download_picture()
+        # second pass: retry any images whose new_url is still null
+        _download_picture()

     def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
         content_dict = self.gets_content_dict(table, key_id, offset, count)
......
CREATE TABLE `zhihu_answer` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `title` varchar(40) DEFAULT NULL COMMENT 'title',
  `content` mediumtext COMMENT 'content',
  `answer_id` int(11) DEFAULT NULL COMMENT 'answer id',
  `created_time` int(11) DEFAULT NULL COMMENT 'created time',
  `comment_count` int(11) DEFAULT NULL COMMENT 'comment count',
  `new_content` mediumtext COMMENT 'new content',
  `is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'whether the content has been rewritten',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu answers';

alter table `zhihu_answer` add column `is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'whether the content has been rewritten';

CREATE TABLE `zhihu_answer_root_comment` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `root_comment_id` int(11) DEFAULT NULL COMMENT 'parent comment id',
  `author_name` varchar(40) DEFAULT NULL COMMENT 'author name',
  `content` mediumtext COMMENT 'content',
  `answerid` int(11) DEFAULT NULL COMMENT 'answer id',
  `child_comment_count` int(11) DEFAULT NULL COMMENT 'number of child comments',
  `featured` varchar(5) DEFAULT NULL COMMENT 'whether it is a featured comment',
  `created_time` int(11) DEFAULT NULL COMMENT 'created time',
  `author_id` varchar(50) DEFAULT NULL COMMENT 'author id',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu answer root comments';

CREATE TABLE `zhihu_article` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `title` varchar(40) DEFAULT NULL COMMENT 'title',
  `content` mediumtext COMMENT 'content',
  `article_id` int(11) DEFAULT NULL COMMENT 'article id',
  `created_time` int(11) DEFAULT NULL COMMENT 'created time',
  `comment_count` int(11) DEFAULT NULL COMMENT 'comment count',
  `new_content` mediumtext COMMENT 'new content',
  `is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'whether the content has been rewritten',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu articles';

CREATE TABLE `zhihu_child_comment` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `root_comment_id` int(11) DEFAULT NULL COMMENT 'parent comment id',
  `author_name` varchar(40) DEFAULT NULL COMMENT 'author name',
  `content` mediumtext COMMENT 'content',
  `reply_name` varchar(40) DEFAULT NULL COMMENT 'name of the user replied to',
  `child_comment_id` int(11) DEFAULT NULL COMMENT 'child comment id',
  `created_time` int(11) DEFAULT NULL COMMENT 'created time',
  `author_id` varchar(50) DEFAULT NULL COMMENT 'author id',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu child comments';

CREATE TABLE `zhihu_article_root_comment` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `root_comment_id` int(11) DEFAULT NULL COMMENT 'parent comment id',
  `author_name` varchar(40) DEFAULT NULL COMMENT 'author name',
  `content` mediumtext COMMENT 'content',
  `answerid` int(11) DEFAULT NULL COMMENT 'article id',
  `child_comment_count` int(11) DEFAULT NULL COMMENT 'number of child comments',
  `featured` varchar(5) DEFAULT NULL COMMENT 'whether it is a featured comment',
  `created_time` int(11) DEFAULT NULL COMMENT 'created time',
  `author_id` varchar(50) DEFAULT NULL COMMENT 'author id',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu article root comments';

CREATE TABLE `zhihu_answer_picture_url` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `answer_id` int(11) DEFAULT NULL COMMENT 'answer id',
  `url` mediumtext COMMENT 'url',
  `new_url` mediumtext COMMENT 'new url',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu answer images';

CREATE TABLE `zhihu_article_picture_url` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `article_id` int(11) DEFAULT NULL COMMENT 'article id',
  `url` mediumtext COMMENT 'url',
  `new_url` mediumtext COMMENT 'new url',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu article images';

CREATE TABLE `zhihu_thought` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `thought_id` varchar(50) DEFAULT NULL COMMENT 'thought id',
  `content` text COMMENT 'content',
  `created_time` int(11) DEFAULT NULL COMMENT 'created time',
  `comment_count` int(11) DEFAULT NULL COMMENT 'comment count',
  `new_content` mediumtext COMMENT 'new content',
  `is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'whether the content has been rewritten',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu thoughts';

CREATE TABLE `zhihu_thought_comment` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `thought_comment_id` int(11) DEFAULT NULL COMMENT 'thought comment id',
  `author_name` varchar(40) DEFAULT NULL COMMENT 'author name',
  `content` text COMMENT 'content',
  `answerid` varchar(50) DEFAULT NULL COMMENT 'thought id',
  `created_time` int(11) DEFAULT NULL COMMENT 'created time',
  `author_id` varchar(50) DEFAULT NULL COMMENT 'author id',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu thought comments';

CREATE TABLE `zhihu_thought_picture_url` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `thought_id` varchar(50) DEFAULT NULL COMMENT 'thought id',
  `url` text COMMENT 'url',
  `new_url` text COMMENT 'new url',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu thought images';
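The `is_new` flag on the three content tables is what lets the refresh script resume: a row is flipped to 1 only when every image URL in it was replaced, so rows with missing replacements stay at 0 and are picked up on the next run. A sketch of selecting the pending answers (the helper name is illustrative):

import pymysql

# Illustrative helper: ids of answers whose content still awaits rewriting.
def pending_answer_ids(conn, offset=0, count=10):
    with conn.cursor() as cur:
        cur.execute(
            "select answer_id from zhihu_answer where is_new = 0 limit %s, %s",
            (offset, count))
        return [row[0] for row in cur.fetchall()]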