import urllib
import pymysql
import hashlib
import requests
import execjs
import os
import re
import cv2

from datetime import datetime
from pymysql import escape_string

from crawler_sys.utils.output_results import retry_get_url, retry_get_url_no_proxies
from gm_upload.gm_upload.upload import upload, upload_file
from gm_upload.gm_upload.consts import IMG_TYPE

# MySQL connection settings.
HOST = '172.18.51.14'
PORT = 3306  # pymysql expects an integer port
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'

# Path to the zhihu.js helper compiled via execjs (see Crawler_zhihu.__init__).
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'

# Profile entry page and paginated API endpoints ({0} is the item id or offset,
# {1} the comment offset, depending on the template).
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'

# Local directories for downloaded and cropped images.
ANSWER_PICTURE_PATH = '/data/answer_picture/'
ARTICLE_PICTURE_PATH = '/data/article_picture/'
THOUGHT_PICTURE_PATH = '/data/thought_picture/'
ANSWER_PICTURE_CUT_PATH = '/data/answer_picture_cut/'
ARTICLE_PICTURE_CUT_PATH = '/data/article_picture_cut/'
THOUGHT_PICTURE_CUT_PATH = '/data/thought_picture_cut/'
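
# --- Illustrative sketch (not part of the original crawler) ------------------
# The paginated endpoints above return at most 20 items per call, and the JSON
# body typically carries a "paging.is_end" flag that signals the last page.
# The helper below shows one plausible way to walk the answers feed with
# retry_get_url; the function name, the plain `headers` argument and
# `max_pages` are assumptions for illustration only -- the real crawler builds
# its own signed headers and cookies inside the Crawler_zhihu class below.
def _demo_iter_answer_pages(headers, max_pages=3):
    offset = 0
    for _ in range(max_pages):
        page = retry_get_url(ANSWER_URL.format(offset), headers=headers).json()
        # Yield each answer object of the current page.
        for item in page.get("data", []):
            yield item
        # Stop when the API reports that this was the last page.
        if page.get("paging", {}).get("is_end", True):
            break
        offset += 20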
sql = """create table zhihu_article(title char(40), # content text(59999), # id int, # created_time int, # comment_count int)""" # self.cur.execute(sql) # self.conn.commit() # self.cur.execute("drop table if exists zhihu_answer_root_comment") # sql = """create table zhihu_answer_root_comment(root_comment_id int, # author_name char(40), # content text(59999), # answerid int, # child_comment_count int, # featured char(5), # created_time int, # author_id char(50))""" # self.cur.execute(sql) # self.conn.commit() # self.cur.execute("drop table if exists zhihu_child_comment") # sql = """create table zhihu_child_comment(root_comment_id int, # author_name char(40), # content text(59999), # reply_name char(40), # child_comment_id int, # created_time int, # author_id char(50))""" # self.cur.execute(sql) # self.conn.commit() # # self.cur.execute("drop table if exists zhihu_article_root_comment") # sql = """create table zhihu_article_root_comment(root_comment_id int, # author_name char(40), # content text(59999), # answerid int, # child_comment_count int, # featured char(5), # created_time int, # author_id char(50))""" # self.cur.execute(sql) # self.conn.commit() # # self.cur.execute("drop table if exists zhihu_answer_picture_url") # sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))""" # self.cur.execute(sql) # self.conn.commit() # # self.cur.execute("drop table if exists zhihu_article_picture_url") # sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))""" # self.cur.execute(sql) # self.conn.commit() # # self.cur.execute("drop table if exists zhihu_thought") # sql = """create table zhihu_thought(id char(50), # content text(59999), # created_time int, # comment_count int)""" # self.cur.execute(sql) # self.conn.commit() # # self.cur.execute("drop table if exists zhihu_thought_comment") # sql = """create table zhihu_thought_comment(thought_comment_id int, # author_name char(40), # content text(59999), # answerid char(50), # created_time int, # author_id char(50))""" # self.cur.execute(sql) # self.conn.commit() # # self.cur.execute("drop table if exists zhihu_thought_picture_url") # sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))""" # self.cur.execute(sql) # self.conn.commit() os.environ["EXECJS_RUNTIME"] = 'Node' try: with open('./zhihu.js', 'r', encoding='utf-8') as f: js = f.read() except: with open(JS_FILE_PATH, 'r', encoding='utf-8') as f: js = f.read() # print(js) self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules') def get_serach_page_cookies(self): ''' cookies更新 ''' url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1" headers = { "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept-encoding": "gzip, deflate, br", "accept-language": "zh-CN,zh;q=0.9", "cache-control": "max-age=0", "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; 
capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234', "referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1", "sec-fetch-dest": "document", "sec-fetch-mode": "navigate", "sec-fetch-site": "same-origin", "sec-fetch-user": "?1", "upgrade-insecure-requests": "1", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36", } requests_res = retry_get_url(url, headers=headers) return requests_res.cookies.get_dict() def parse_sigle_page(self, data_dict, mark): ''' 插入主要内容数据和图片的url,寻找评论 ''' if mark == 0: into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)" values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"], data_dict["content"]) elif mark == 1: into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)" values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"], data_dict["content"]) self.cur.execute(into, values) self.conn.commit() offset = 0 if data_dict["comment_count"] != 0: next = 1 while next == 1: next = self.search_root_comment(data_dict["id"], offset, mark) offset = offset + 20 patt = r'%s(.+?)%s' % ("