Commit f1a51df3 authored by haowang's avatar haowang

modify

parent 778c9506
......@@ -22,15 +22,38 @@ import cv2
from pymysql import escape_string
# --- MySQL connection settings -------------------------------------------
# NOTE(review): credentials are hard-coded and committed to VCS; consider
# loading them from environment variables or a secrets store instead.
HOST = '172.18.51.14'
# pymysql.connect() requires `port` to be an int; the previous str value
# ('3306') would raise a TypeError when opening the socket.
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
# Path to the JS file whose functions are compiled with execjs for signing.
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
# Entry page of the target Zhihu profile.
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
# --- Zhihu API URL templates ----------------------------------------------
# Each template is filled with str.format(); placeholder {0} is the paging
# offset, and {1} (where present) is a second offset/id as noted below.
# {0} = offset
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
# {0} = offset
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
# {0} = offset
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
# {0} = answer id, {1} = offset
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
# {0} = article id, {1} = offset
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
# {0} = root comment id (first page of child comments)
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
# {0} = root comment id, {1} = offset (subsequent pages)
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
# {0} = pin (thought) id, {1} = offset
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
# --- Local image working directories --------------------------------------
ANSWER_PICTURE_PATH = '/data/answer_picture/'
ARTICLE_PICTURE_PATH = '/data/article_picture/'
THOUGHT_PICTURE_PATH = '/data/thought_picture/'
ANSWER_PICTURE_CUT_PATH = '/data/answer_picture_cut/'
ARTICLE_PICTURE_CUT_PATH = '/data/article_picture_cut/'
THOUGHT_PICTURE_CUT_PATH = '/data/thought_picture_cut/'
class Crawler_zhihu():
def __init__(self):
'''
初始化数据库,调整js规则
'''
self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
passwd='Gengmei1',
db='mimas_dev', charset='utf8')
self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
passwd=PASSWD,
db=DB, charset='utf8')
self.cur = self.conn.cursor()
# self.cur.execute("drop table if exists zhihu_answer")
# sql = """create table zhihu_answer(title char(40),
......@@ -120,7 +143,7 @@ class Crawler_zhihu():
with open('./zhihu.js', 'r', encoding='utf-8') as f:
js = f.read()
except:
with open('/Users/xuwei/crawler/crawler_sys/site_crawler/zhihu.js', 'r', encoding='utf-8') as f:
with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
js = f.read()
# print(js)
self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
......@@ -216,9 +239,9 @@ class Crawler_zhihu():
'''
offset = str(offset)
if mark == 0:
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset)
url = ANSWER_URL.format(offset)
elif mark == 1:
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created".format(offset)
url = ARTICLE_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num)
......@@ -241,8 +264,6 @@ class Crawler_zhihu():
else:
print("article_data_error")
return
def search_root_comment(self, answerid, offset, mark, proxies_num=0):
......@@ -252,9 +273,9 @@ class Crawler_zhihu():
offset = str(offset)
answerid = str(answerid)
if mark == 0:
url = "https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
elif mark == 1:
url = "https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
[headers_search, cookies_dict] = self.headers_handle(url)
......@@ -310,9 +331,9 @@ class Crawler_zhihu():
offsets = offset
offset = str(offset)
if offsets == 0:
url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments".format(root_comment_id)
url = CHILD_COMMENT_START_URL.format(root_comment_id)
else:
url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}".format(root_comment_id, offset)
url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, offset)
[headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
......@@ -391,7 +412,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
tuple = self.cur.fetchall()
self.conn.commit()
path = "/Users/xuwei/Desktop/answer_picture/"
path = ANSWER_PICTURE_PATH
gif_patt = r'gif'
for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1])
......@@ -419,7 +440,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes)
high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/answer_picture_cut/num" + str(i) + ".jpg"
pathes = ANSWER_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes)
......@@ -502,7 +523,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
tuple = self.cur.fetchall()
self.conn.commit()
path = "/Users/xuwei/Desktop/article_picture/"
path = ARTICLE_PICTURE_PATH
gif_patt = r'gif'
for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1])
......@@ -530,7 +551,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes)
high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/article_picture_cut/num" + str(i) + ".jpg"
pathes = ARTICLE_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes)
......@@ -590,7 +611,7 @@ class Crawler_zhihu():
想法数据包请求
'''
offset = str(offset)
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset)
url = THOUGHT_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num)
......@@ -644,7 +665,7 @@ class Crawler_zhihu():
'''
offset = str(offset)
answerid = str(answerid)
url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
url = THOUGHT_COMMENT_URL.format(answerid, offset)
[headers_search, cookies_dict] = self.headers_handle(url)
......@@ -690,7 +711,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
tuple = self.cur.fetchall()
self.conn.commit()
path = "/Users/xuwei/Desktop/thought_picture/"
path = THOUGHT_PICTURE_PATH
gif_patt = r'gif'
for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1])
......@@ -718,7 +739,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes)
high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/thought_picture_cut/num" + str(i) + ".jpg"
pathes = THOUGHT_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment