# -*- coding:UTF-8 -*- # @Time : 2020/8/3 13:35 # @File : crawler_zhihu.py # @email : litao@igengmei.com # @author : litao # -*- coding: utf-8 -*- """ Created on Tue Aug 14 20:13:21 2018 @author: fangyucheng """ import copy import re # import rsa import time import json import urllib import base64 import binascii import datetime import requests # import execjs import hashlib import requests import execjs # from bs4 import BeautifulSoup from crawler.crawler_sys.framework.video_fields_std import Std_fields_video from crawler.crawler_sys.utils.output_results import retry_get_url, output_result from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration from crawler.crawler_sys.utils.util_logging import logged from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy from crawler.crawler_sys.utils.html_to_str import dehtml from write_data_into_es.func_get_releaser_id import * from write_data_into_es.func_cal_doc_id import cal_doc_id import os class Crawler_zhihu(): def __init__(self, timeout=5,): print(execjs.get().name) os.environ["EXECJS_RUNTIME"] = 'Node' print(execjs.get().name) self.platform = "zhihu" self.timeout = timeout self.session = requests.Session() std_fields = Std_fields_video() self.video_data = std_fields.video_data self.video_data['platform'] = self.platform # remove fields that crawled data don't have pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"] try: with open('./zhihu.js', 'r', encoding='utf-8') as f: js = f.read() except: with open('/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js', 'r', encoding='utf-8') as f: js = f.read() # print(js) self.exec_js = execjs.compile(js) for popk in pop_key_Lst: self.video_data.pop(popk) def get_single_answer_page(self, question_id,answer_id, proxies_num): headers = { "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0", "accept-encoding": "gzip, deflate", "accept-language": "zh-CN,zh;q=0.9", "cache-control": "max-age=0", # "cookie": '_zap=20547721-b576-4409-95c1-000c6f20517b; d_c0="AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"; __gads=ID=bdc51df6433d4288:T=1562072932:S=ALNI_MbUwg2TeI33p4EnEYpHr8bAKBUiNQ; _ga=GA1.2.929365035.1592357886; tst=r; q_c1=e59a45f95396455e871eb111bdd827e1|1596185954000|1562072927000; _gid=GA1.2.544062079.1596418493; capsion_ticket="2|1:0|10:1596418535|14:capsion_ticket|44:MmJhMzEyNzYzNzE5NDAyOTg3ZGQzNDFmYTFlYjJmMjE=|facc3f88969d538b60f0530ff9bbdb74aa1bb7012584b9dfd2a5f3a3c1fb9726"; z_c0="2|1:0|10:1596418574|4:z_c0|92:Mi4xSDJLUUhRQUFBQUFBZ083dl9NYXNEeVlBQUFCZ0FsVk5EYmdVWUFDcDlBZjhBb0stY3RHTnhNS013YXItcko0VXFn|73520023927845cb04e21a4a1fbfae5d25088de4ffae91090d55cf7a5ba5b008"; _xsrf=MuvTOIUy5KNtEQCk76uG0nAbiqt6IyKS; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1596418492,1596419419,1596517065,1596593781; SESSIONID=6vXjlwH0fidkMDZqOt89HQlyXmxlz1J4fckhELGprA4; JOID=Vl4UCkslT8WtTsL2TigpEe-nCIZaET6Vzia3uXZHJKbOCoKuOFdW7v9Ix_lPjZ6PEBBE8JId1q13KhAqfRCRrDg=; osd=VlATBkIlQcKhR8L4SSQgEeGgBI9aHzmZxya5vnpOJKjJBouuNlBa5_9GwPVGjZCIHBlE_pUR3615LRwjfR6WoDE=; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1596596132; KLBRSID=e42bab774ac0012482937540873c03cf|1596596133|1596593779', "referer": "https://www.zhihu.com/", "sec-fetch-dest": "document", "sec-fetch-mode": "navigate", "sec-fetch-site": "same-origin", "sec-fetch-user": "?1", "upgrade-insecure-requests": "1", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36", } url = "https://www.zhihu.com/question/{0}/answer/{1}".format(question_id,answer_id) try: requests_res = retry_get_url(url, headers=headers, proxies=proxies_num) tres_json_test = requests_res.text res_json = json.loads(re.findall('<script id="js-initialData" type="text/json">(.*?)</script>',tres_json_test)[0]) # print(res_json) data = res_json["initialState"] video_dic = {} video_dic["url"] = url video_dic["title"] = data["entities"]["answers"][answer_id]["question"]["title"] video_dic["fetch_time"] = int(datetime.datetime.now().timestamp() * 1e3) video_dic["release_time"] = int(data["entities"]["answers"][answer_id]["createdTime"]*1e3) video_dic["voteup_count"] = trans_play_count(data["entities"]["answers"][answer_id]["voteupCount"]) video_dic["comment_count"] = trans_play_count(data["entities"]["answers"][answer_id]["commentCount"]) video_dic["content"] = data["entities"]["answers"][answer_id]["content"] video_dic["releaser"] = data["entities"]["answers"][answer_id]["author"]["name"] video_dic["releaser_id"] = data["entities"]["answers"][answer_id]["author"]["urlToken"] video_dic["releaserUrl"] = "https://www.zhihu.com/people/%s" %video_dic["releaser_id"] video_dic["releaser_id_str"] = "zhihu_" + str(video_dic["releaser_id"]) video_dic["img_list"] = re.findall('img src="(.*?)"', video_dic["content"]) video_dic["id"] = 'zhihu_%s_%s' % (question_id, answer_id) return video_dic except Exception as e: print("single data row formate error %s" % e) def get_serach_page_cookies(self, keyword): url = "https://www.zhihu.com/search?type=content&q=%s" % keyword headers = { "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept-encoding": "gzip, deflate, br", "accept-language": "zh-CN,zh;q=0.9", "cache-control": "max-age=0", "cookie": '_zap=20547721-b576-4409-95c1-000c6f20517b; d_c0=AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925; __gads=ID=bdc51df6433d4288:T=1562072932:S=ALNI_MbUwg2TeI33p4EnEYpHr8bAKBUiNQ; _ga=GA1.2.929365035.1592357886; _xsrf=9883cfd3-4ae9-409d-9150-4bed1c5fb89e; tst=r; SESSIONID=u14cNx7BBTdkwNnJUEmVvebNsxweydFrakHmXhCPpfw; JOID=W18XAkKDlnFa_djmdIv4qxEeF55u-KJGMbOu1DXBzBAygpaGO9lMWQ3z0-B_XpsC-EdjO4KDArW2i-V8Y16DNXM=; osd=UVscCkyJknpS89Lif4P2oRUVH5Bk_KlOP7mq3z3PxhQ5ipiMP9JEVwf32OhxVJ8J8ElpP4mLDL-ygO1yaVqIPX0=; q_c1=e59a45f95396455e871eb111bdd827e1|1596185954000|1562072927000; _gid=GA1.2.544062079.1596418493; capsion_ticket=2|1:0|10:1596418535|14:capsion_ticket|44:MmJhMzEyNzYzNzE5NDAyOTg3ZGQzNDFmYTFlYjJmMjE=|facc3f88969d538b60f0530ff9bbdb74aa1bb7012584b9dfd2a5f3a3c1fb9726; z_c0="2|1:0|10:1596418574|4:z_c0|92:Mi4xSDJLUUhRQUFBQUFBZ083dl9NYXNEeVlBQUFCZ0FsVk5EYmdVWUFDcDlBZjhBb0stY3RHTnhNS013YXItcko0VXFn|73520023927845cb04e21a4a1fbfae5d25088de4ffae91090d55cf7a5ba5b008; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1596184903,1596185679,1596418492,1596419419; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1596435670; KLBRSID=53650870f91603bc3193342a80cf198c|1596435676|1596435655', "referer": "https://www.zhihu.com/search?type=content&q=%E7%83%AD%E7%8E%9B%E5%90%89", "sec-fetch-dest": "document", "sec-fetch-mode": "navigate", "sec-fetch-site": "same-origin", "sec-fetch-user": "?1", "upgrade-insecure-requests": "1", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36", } requests_res = retry_get_url(url, headers=headers) print(requests_res.cookies.get_dict()) return requests_res.cookies.get_dict() def parse_sigle_page(self,article_type,data_dict,proxies_num): res_dict = {} if article_type == "knowledge_ad": pass elif article_type == "zvideo": pass elif article_type == 'search_result': article_type == data_dict["object"]["type"] url = "https://www.zhihu.com/question/{0}/answer/{1}".format(data_dict["object"]["question"]["id"], data_dict["object"]["id"]) res_dict = self.get_single_answer_page(data_dict["object"]["question"]["id"],data_dict["object"]["id"],proxies_num) elif article_type == "search_club": pass elif article_type == "relevant_query": pass else: pass return res_dict def search_article_page(self, keyword, search_pages_max=10, output_to_es_raw=False, output_to_es_register=False, es_index=None,proxies_num=0,**kwargs): res_cookies_dict = self.get_serach_page_cookies(keyword=keyword) headers_search = { "accept": "*/*", "accept-encoding": "gzip, deflate", "accept-language": "zh-CN,zh;q=0.9", "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36", "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1", "x-api-version": "3.0.91", "x-app-za": "OS=Web", "x-requested-with": "fetch", "x-zse-83": "3_2.0", "x-zse-86": None, "referer": "https://www.zhihu.com/search?type=content&q={0}".format(urllib.parse.quote(keyword)), } cookies_dict = { "d_c0": '"AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"', "KLBRSID": None } # import pdb # pdb.set_trace() cookies_dict.update(res_cookies_dict) url = "https://www.zhihu.com/api/v4/search_v3?t=general&q={0}&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0".format( urllib.parse.quote(keyword)) offset = 0 f = "+".join(["3_2.0", url.replace("https://www.zhihu.com",""), headers_search["referer"], cookies_dict["d_c0"]]) fmd5 = hashlib.new('md5', f.encode()).hexdigest() headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b",fmd5) res_list = [] while offset <= search_pages_max * 20: offset += 20 get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num) if get_page.status_code != 200: # retry once get_page = requests.get(url) if get_page.status_code != 200: continue page_dict = get_page.json() url = page_dict["paging"]["next"] # print(get_page.cookies.get_dict()) cookies_dict.update(get_page.cookies.get_dict()) headers_search.pop("x-zse-86", 0) if page_dict.get("data"): for one_line in page_dict['data']: try: article_type = one_line['type'] res_data = self.parse_sigle_page(article_type,one_line,proxies_num) if not res_data: continue D0 = copy.deepcopy(self.video_data) D0['search_word'] = keyword D0["type"] = "article" D0["play_count"] = 0 try: D0.update(res_data) except Exception as e: print("method get_web_article_info error %s" % e) # print(D0) res_list.append(D0) except KeyError: # It's totally ok to drop the last return data value. # The search api just return something seems related to search continue else: break if len(res_list) >= 100: output_result(result_Lst=res_list, platform=self.platform, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, es_index=es_index,) res_list.clear() if res_list != []: output_result(result_Lst=res_list, platform=self.platform, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, es_index=es_index) return res_list def get_releaser_id(self, releaserUrl): return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl) def search_page(self, keyword, search_pages_max=30, output_to_es_raw=False, output_to_es_register=False, es_index=None, proxies_num=0,**kwargs): self.search_article_page(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, es_index=es_index, proxies_num=proxies_num,**kwargs) def get_releaser_id(self, releaserUrl): return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl) @staticmethod def get_img(data): img_list = [] if data.get("pics"): for one in data.get("pics"): try: img_list.append(one["large"]["url"]) except Exception as e: img_list.append(one["url"]) print("add img error %s" % e) return img_list def releaser_page(self, releaserUrl, output_to_file=False, filepath=None, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, releaser_page_num_max=10000, es_index=None, doc_type=None, proxies_num=None): print('Processing releaserUrl %s' % releaserUrl) result_Lst = [] releaser_id = self.get_releaser_id(releaserUrl) # xsrf_token,url_extr = self.get_weibo_info(releaser_id) headers = { "accept": "application/json, text/plain, */*", "accept-encoding": "gzip, deflate, br", "accept-language": "zh-CN,zh;q=0.9", # "cookie": "_T_WM=30976479190; XSRF-TOKEN=9e4bb8; WEIBOCN_FROM=1110006030; MLOGIN=0; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%25E8%25BF%25AA%25E4%25B8%25BD%25E7%2583%25AD%25E5%25B7%25B4%26fid%3D1076031669879400%26uicode%3D10000011", "mweibo-pwa": "1", # "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4", # "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0", "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36", "x-requested-with": "XMLHttpRequest", # "x-xsrf-token": xsrf_token, } pagenum = 0 has_more = True since_id = 0 if releaser_id: while pagenum <= releaser_page_num_max and has_more: pagenum += 1 time.sleep(0.5) "?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&type=uid&value=1669879400&containerid=1076031669879400&since_id=451822205602429" url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}".format( releaser_id, releaser_id, releaser_id, since_id) headers["referer"] = "https://m.weibo.cn/u/uid={0}&t=0".format(releaser_id) print('Page number: %d' % pagenum) try: if proxies_num: get_page = retry_get_url(url, headers=headers, timeout=self.timeout, proxies=proxies_num) else: get_page = retry_get_url(url, headers=headers, timeout=self.timeout) except: get_page = None has_more = False if get_page and get_page.status_code == 200: try: page_json = get_page.json() total = page_json["data"]["cardlistInfo"]["total"] if pagenum > total: break since_id = page_json["data"]["cardlistInfo"]["since_id"] page_dic = page_json["data"].get("cards") except Exception as e: print("load data error %s" % e) continue if page_dic: for one in page_dic: try: mblog = one.get("mblog") mid = mblog.get("mid") forward_text = "" forward_user = "" if one.get("source") == "绿洲": text_type = "绿洲" elif mblog.get("retweeted_status"): text_type = "转发" forward_text = mblog.get("retweeted_status").get("raw_text") forward_user = mblog.get("retweeted_status").get("user").get("screen_name") else: text_type = one.get("source") if mblog.get("isLongText"): text, repost_count, comment_count, favorite_count = self.get_single_page(mid) else: text = mblog["raw_text"] res_dic = { "release_time": trans_strtime_to_timestamp(mblog["created_at"]), "fetch_time": int(datetime.datetime.now().timestamp() * 1e3), "url": one["scheme"], "releaser": mblog["user"]["screen_name"], "repost_count": trans_play_count(mblog["reposts_count"]), "comment_count": trans_play_count(mblog["comments_count"]), "favorite_count": trans_play_count(mblog["attitudes_count"]), "title": text.replace("\u200b", ""), "wb_type": text_type, "forward_user": forward_user, "forward_text": forward_text, "mid": mid, "releaserUrl": "https://www.weibo.com/u/%s" % releaser_id, "releaser_id_str": "weibo_%s" % releaser_id, "img_list": self.get_img(mblog), "platform": "weibo", # "doc_id":doc_id } res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic, doc_id_type="all-time-url") yield res_dic except Exception as e: print(json.dumps(mblog)) print("row formate error %s" % e) continue def get_releaser_follower_num(self, releaserUrl): pass def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs): count_false = 0 for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")): video_time = res["release_time"] # print(res) if video_time: if start_time < video_time: if video_time < end_time: yield res else: count_false += 1 if count_false > allow: break else: yield res if __name__ == '__main__': zhihu = Crawler_zhihu() import os # print(execjs.get().name) # os.environ["EXECJS_RUNTIME"] = 'Node' # print(execjs.get().name ) # zhihu.get_serach_page_cookies("热玛吉") zhihu.search_page("双眼皮",search_pages_max=1,output_to_es_register=True) # zhihu.get_single_answer_page("325099876","1209953121") # print(user_page)