Commit f1a51df3 authored by haowang's avatar haowang

modify

parent 778c9506
...@@ -22,15 +22,38 @@ import cv2 ...@@ -22,15 +22,38 @@ import cv2
from pymysql import escape_string from pymysql import escape_string
# --- MySQL connection settings ---
# NOTE(review): credentials are hard-coded here; consider moving them to
# environment variables or a config file outside version control.
HOST = '172.18.51.14'
PORT = 3306  # must be an int: pymysql.connect(port=...) expects an integer, not '3306'
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'

# Path to the JS file used to compute Zhihu request signatures (loaded via execjs).
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'

# --- Zhihu API endpoint templates ---
# Placeholders {0}/{1} are filled with str.format: {0} is the paging offset
# (or the answer/article/pin/comment id where two placeholders exist, in
# which case {1} is the offset).
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'

# --- Local directories for downloaded / cropped images ---
# *_PICTURE_PATH: raw downloads; *_PICTURE_CUT_PATH: images cropped to 9/10 height.
ANSWER_PICTURE_PATH = '/data/answer_picture/'
ARTICLE_PICTURE_PATH = '/data/article_picture/'
THOUGHT_PICTURE_PATH = '/data/thought_picture/'
ANSWER_PICTURE_CUT_PATH = '/data/answer_picture_cut/'
ARTICLE_PICTURE_CUT_PATH = '/data/article_picture_cut/'
THOUGHT_PICTURE_CUT_PATH = '/data/thought_picture_cut/'
class Crawler_zhihu(): class Crawler_zhihu():
def __init__(self): def __init__(self):
''' '''
初始化数据库,调整js规则 初始化数据库,调整js规则
''' '''
self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work', self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
passwd='Gengmei1', passwd=PASSWD,
db='mimas_dev', charset='utf8') db=DB, charset='utf8')
self.cur = self.conn.cursor() self.cur = self.conn.cursor()
# self.cur.execute("drop table if exists zhihu_answer") # self.cur.execute("drop table if exists zhihu_answer")
# sql = """create table zhihu_answer(title char(40), # sql = """create table zhihu_answer(title char(40),
...@@ -120,7 +143,7 @@ class Crawler_zhihu(): ...@@ -120,7 +143,7 @@ class Crawler_zhihu():
with open('./zhihu.js', 'r', encoding='utf-8') as f: with open('./zhihu.js', 'r', encoding='utf-8') as f:
js = f.read() js = f.read()
except: except:
with open('/Users/xuwei/crawler/crawler_sys/site_crawler/zhihu.js', 'r', encoding='utf-8') as f: with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
js = f.read() js = f.read()
# print(js) # print(js)
self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules') self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
...@@ -216,9 +239,9 @@ class Crawler_zhihu(): ...@@ -216,9 +239,9 @@ class Crawler_zhihu():
''' '''
offset = str(offset) offset = str(offset)
if mark == 0: if mark == 0:
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset) url = ANSWER_URL.format(offset)
elif mark == 1: elif mark == 1:
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created".format(offset) url = ARTICLE_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num) get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num)
...@@ -241,8 +264,6 @@ class Crawler_zhihu(): ...@@ -241,8 +264,6 @@ class Crawler_zhihu():
else: else:
print("article_data_error") print("article_data_error")
return return
def search_root_comment(self, answerid, offset, mark, proxies_num=0): def search_root_comment(self, answerid, offset, mark, proxies_num=0):
...@@ -252,9 +273,9 @@ class Crawler_zhihu(): ...@@ -252,9 +273,9 @@ class Crawler_zhihu():
offset = str(offset) offset = str(offset)
answerid = str(answerid) answerid = str(answerid)
if mark == 0: if mark == 0:
url = "https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset) url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
elif mark == 1: elif mark == 1:
url = "https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset) url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
...@@ -310,9 +331,9 @@ class Crawler_zhihu(): ...@@ -310,9 +331,9 @@ class Crawler_zhihu():
offsets = offset offsets = offset
offset = str(offset) offset = str(offset)
if offsets == 0: if offsets == 0:
url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments".format(root_comment_id) url = CHILD_COMMENT_START_URL.format(root_comment_id)
else: else:
url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}".format(root_comment_id, offset) url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num) get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
...@@ -391,7 +412,7 @@ class Crawler_zhihu(): ...@@ -391,7 +412,7 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
path = "/Users/xuwei/Desktop/answer_picture/" path = ANSWER_PICTURE_PATH
gif_patt = r'gif' gif_patt = r'gif'
for i in range(len(tuple)): for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1]) mark = re.search(gif_patt, tuple[i][1])
...@@ -419,7 +440,7 @@ class Crawler_zhihu(): ...@@ -419,7 +440,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes) img = cv2.imread(pathes)
high, width = img.shape[:2] high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width] cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/answer_picture_cut/num" + str(i) + ".jpg" pathes = ANSWER_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped) cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes) new_url = self.upload_image_with_path(pathes)
...@@ -502,7 +523,7 @@ class Crawler_zhihu(): ...@@ -502,7 +523,7 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
path = "/Users/xuwei/Desktop/article_picture/" path = ARTICLE_PICTURE_PATH
gif_patt = r'gif' gif_patt = r'gif'
for i in range(len(tuple)): for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1]) mark = re.search(gif_patt, tuple[i][1])
...@@ -530,7 +551,7 @@ class Crawler_zhihu(): ...@@ -530,7 +551,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes) img = cv2.imread(pathes)
high, width = img.shape[:2] high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width] cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/article_picture_cut/num" + str(i) + ".jpg" pathes = ARTICLE_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped) cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes) new_url = self.upload_image_with_path(pathes)
...@@ -590,7 +611,7 @@ class Crawler_zhihu(): ...@@ -590,7 +611,7 @@ class Crawler_zhihu():
想法数据包请求 想法数据包请求
''' '''
offset = str(offset) offset = str(offset)
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset) url = THOUGHT_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num) get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num)
...@@ -644,7 +665,7 @@ class Crawler_zhihu(): ...@@ -644,7 +665,7 @@ class Crawler_zhihu():
''' '''
offset = str(offset) offset = str(offset)
answerid = str(answerid) answerid = str(answerid)
url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset) url = THOUGHT_COMMENT_URL.format(answerid, offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
...@@ -690,7 +711,7 @@ class Crawler_zhihu(): ...@@ -690,7 +711,7 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
path = "/Users/xuwei/Desktop/thought_picture/" path = THOUGHT_PICTURE_PATH
gif_patt = r'gif' gif_patt = r'gif'
for i in range(len(tuple)): for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1]) mark = re.search(gif_patt, tuple[i][1])
...@@ -718,7 +739,7 @@ class Crawler_zhihu(): ...@@ -718,7 +739,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes) img = cv2.imread(pathes)
high, width = img.shape[:2] high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width] cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/thought_picture_cut/num" + str(i) + ".jpg" pathes = THOUGHT_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped) cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes) new_url = self.upload_image_with_path(pathes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment