Commit 3bd3a181 authored by zhangmeng's avatar zhangmeng

整理知乎爬虫代码

parent dc9ebe26
...@@ -20,4 +20,10 @@ python /srv/apps/crawler/crawler_sys/framework/search_page_single_process.py ...@@ -20,4 +20,10 @@ python /srv/apps/crawler/crawler_sys/framework/search_page_single_process.py
#小红书爬取过程
1.将github上面litao分支的代码拉到服务器spider-prod-001上
2.配置环境 激活环境->python->import sys->sys.path->跳转到里面那个site-packages目录下面->vim mypath.pth->改成自己的项目路径 运行的时候就不会报出crawler包找不到
3.小红书导出为txt的文件 在maintenance的temfile目录下的那个脚本 使用的时候把邮箱跟密码替换成自己的
\ No newline at end of file
##知乎爬取
1.将项目部署在spider-001-prod上
2.运行脚本命令爬取特定网址 python tasks/zhihu/spider.py 0 119 58 'https://www.zhihu.com/people/zhaotianqiang'
3. 运行脚本命令清洗图片入库 python tasks/zhihu/upload_picture.py 0 0 84297 0
4.去mimas项目运行 python django_manage.py qa_insert_by_spider level=0 offset=0 count=322784 将数据导入自己的数据库
\ No newline at end of file
# coding=utf-8
import pymysql
import execjs
import os
import re
from datetime import datetime
from pymysql import escape_string
from bs4 import BeautifulSoup
import sys
# Connection settings for the spider MySQL database and the location of the
# Zhihu JS helper.  Every value can be overridden through an environment
# variable; the literals are only fallbacks kept for backward compatibility.
# NOTE(review): the fallback credentials are committed to source control --
# rotate them and supply the real values through the environment instead.
HOST = os.environ.get('SPIDER_DB_HOST', '172.18.51.14')
PORT = int(os.environ.get('SPIDER_DB_PORT', 3306))
USER = os.environ.get('SPIDER_DB_USER', 'spider')
PASSWD = os.environ.get('SPIDER_DB_PASSWD', 'Gengmei123')
DB = os.environ.get('SPIDER_DB_NAME', 'spider')
JS_FILE_PATH = os.environ.get(
    'ZHIHU_JS_FILE_PATH', '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js')
class RefreshContent(object):
    """Rewrite image URLs inside crawled HTML content and store the result.

    For every content id the stored HTML is loaded, each ``<img>`` source is
    looked up in the matching picture table, and the rewritten HTML is saved
    in ``new_content``.  Ids and URLs that could not be mapped are collected
    in ``update_error_content_id`` / ``update_error_url_content_id``.
    """

    # Number of leading characters dropped from an image URL before it is used
    # as a lookup key.
    # NOTE(review): presumably the length of a ``https://picN.zhimg.com/``
    # host prefix -- confirm against the crawler that fills the picture table.
    URL_PREFIX_LENGTH = 23

    def __init__(self, is_online=0):
        """Open the database connection and compile the Zhihu JS helper.

        :param is_online: value matched against the ``is_online`` column when
            content rows are selected.
        """
        self.update_error_content_id = []
        self.update_error_url_content_id = {}
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except (IOError, OSError, UnicodeError):
            # No usable local copy: fall back to the deployed file.
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
        self.is_online = is_online

    @staticmethod
    def replace_html_image_to_url(content):
        """Replace every ``<figure>`` by a bare ``<img>`` carrying its ``src``.

        Figures that contain no ``<img>`` are left untouched.

        :param content: HTML string.
        :return: the rewritten HTML string.
        """
        soup = BeautifulSoup(content, features="html.parser")
        for figure in soup.find_all("figure"):
            image = figure.find("img")
            if image is None:
                continue
            plain_img = soup.new_tag(name="img")
            plain_img["src"] = image.get("src", "")
            figure.replace_with(plain_img)
        return soup.decode()

    def _rewrite_image_urls(self, content_id, content, pic_dict):
        """Return ``(html, update_error)`` with image sources swapped in.

        ``html`` is NOT SQL-escaped.  ``update_error`` is True when at least
        one image had no replacement in ``pic_dict``; such images keep their
        original source and are recorded on the instance.
        """
        content = self.replace_html_image_to_url(content)
        soup = BeautifulSoup(content, features="html.parser")
        update_error = False
        for img in soup.find_all("img"):
            # A missing ``src`` is treated as an empty (unmappable) URL.
            url = (img.get("src") or "")[self.URL_PREFIX_LENGTH:]
            new_url = pic_dict.get(url)
            if not new_url:
                if content_id not in self.update_error_content_id:
                    self.update_error_content_id.append(content_id)
                self.update_error_url_content_id[url] = content_id
                print({content_id: url})
                update_error = True
                continue
            img['src'] = new_url + '-w'
        return soup.decode(), update_error

    def create_new_content(self, content_id, content, pic_dict):
        """Return ``(escaped_html, update_error)`` for one content row.

        Same as :meth:`_rewrite_image_urls` but the HTML is passed through
        ``escape_string`` so it can be embedded in a hand-built SQL statement.
        """
        new_content, update_error = self._rewrite_image_urls(
            content_id, content, pic_dict)
        return escape_string(new_content), update_error

    def get_all_content_ids(self, table, pic_table, key_id, offset=0, count=10):
        """Return the ids to process, or ``None`` when the query is empty.

        With ``offset == 0`` every distinct ``key_id`` of ``pic_table`` is
        returned; otherwise ``count`` rows of ``table`` starting at ``offset``.
        Table and column names are interpolated (they cannot be bound), the
        numeric limits are passed as query parameters.
        """
        if offset == 0:
            sql = """select distinct {} from {}""".format(key_id, pic_table)
            args = None
        else:
            sql = """select {} from {} limit %s, %s""".format(key_id, table)
            args = (offset, count)
        print(sql, args)
        self.cur.execute(sql, args)
        res = self.cur.fetchall()
        self.conn.commit()
        if res:
            return [item[0] for item in res]
        return None

    def refresh_content(self, table, pic_table, key_id, offset=0, count=10):
        """Replace image URLs and store the updated content for each id.

        Rows are only touched when ``is_new = 0`` and ``is_online`` matches
        the instance setting.  ``is_new`` is set to 1 when every image was
        mapped and stays 0 otherwise.
        """
        content_ids = self.get_all_content_ids(
            table, pic_table, key_id, offset, count) or []
        for content_id in content_ids:
            print('start deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
            sql = """select content from {} where {} = %s and is_new = 0 and is_online = %s""".format(
                table, key_id)
            print(sql, (content_id, self.is_online))
            self.cur.execute(sql, (content_id, self.is_online))
            res = self.cur.fetchall()
            self.conn.commit()
            if not res:
                continue
            content = res[0][0]

            sql = """select url, new_url from {} where {} = %s and new_url is not null""".format(
                pic_table, key_id)
            self.cur.execute(sql, (content_id,))
            res = self.cur.fetchall()
            self.conn.commit()
            pic_dict = {
                row[0][self.URL_PREFIX_LENGTH:]: row[1] for row in res}

            new_content, update_error = self._rewrite_image_urls(
                content_id, content, pic_dict)
            update_code = 0 if update_error else 1
            # Values are bound by the driver, so the raw HTML is passed here.
            sql = """update {} set new_content = %s, is_new = %s WHERE {} = %s """.format(
                table, key_id)
            self.cur.execute(sql, (new_content, update_code, content_id))
            self.conn.commit()
            print('end deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
if __name__ == '__main__':
    # Usage: python script_file mark offset count [is_online]
    #   mark       0 -> answers, 1 -> articles, 2 -> thoughts
    #   offset     0 processes every id found in the picture table,
    #              otherwise the row offset into the content table
    #   count      number of rows read when offset is non-zero
    #   is_online  optional, defaults to 0
    print('参数个数为:', len(sys.argv), '个参数。')
    print('参数列表:', type(sys.argv[0]), sys.argv[0], type(sys.argv[1]), sys.argv[1], type(sys.argv[2]), sys.argv[2], type(sys.argv[3]), sys.argv[3])
    mark = int(sys.argv[1])
    offset = int(sys.argv[2])
    count = int(sys.argv[3])
    # The fourth argument is optional; previously omitting it raised IndexError.
    is_online = int(sys.argv[4]) if len(sys.argv) > 4 else 0
    print(datetime.now())
    refresh = RefreshContent(is_online=is_online)
    if mark == 0:
        refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id', offset, count)
    elif mark == 1:
        # offset/count are not forwarded here, so the defaults apply.
        refresh.refresh_content('zhihu_article', 'zhihu_article_picture_url', 'article_id')
    elif mark == 2:
        refresh.refresh_content('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id')
    print('update_error_url_content_ids : ', refresh.update_error_url_content_id)
    print('update_error_content_ids : ', refresh.update_error_content_id)
    print(datetime.now())
from __future__ import absolute_import, unicode_literals
import hashlib
import datetime
import time
import six
from collections import OrderedDict
from qiniu import Auth, PersistentFop, urlsafe_base64_encode
from qiniu import put_data, put_file
from qiniu import BucketManager, build_batch_copy
import json
from urllib import parse
# Qiniu API credentials; QiniuTool builds its ``Auth`` object from them.
# NOTE(review): these secrets are hard-coded in source control -- consider
# loading them from the environment or a secret store and rotating the keys.
QINIU_ACCESS_KEY = "UPCOYIJkZOMcdd9FDzpBqYjzWUh55fBpVi3AhWpL"
QINIU_SECRET_KEY = "z5YvpDDSam_JE345Z8J_f3TufzelOW2VOGNoBl9e"
# Dummy value for Enum as EnumMeta explicitly checks for it, but of course
# until EnumMeta finishes running the first time the Enum class doesn't exist.
Enum = None
def _is_descriptor(obj):
"""Returns True if obj is a descriptor, False otherwise."""
return (
hasattr(obj, '__get__') or
hasattr(obj, '__set__') or
hasattr(obj, '__delete__'))
def _is_dunder(name):
"""Returns True if a __dunder__ name, False otherwise."""
return (name[:2] == name[-2:] == '__' and
name[2:3] != '_' and
name[-3:-2] != '_' and
len(name) > 4)
def _is_sunder(name):
"""Returns True if a _sunder_ name, False otherwise."""
return (name[0] == name[-1] == '_' and
name[1:2] != '_' and
name[-2:-1] != '_' and
len(name) > 2)
class _EnumDict(dict):
    """Class-namespace dict that records enum member names as they are set.

    Every non-descriptor, non-dunder assignment is validated and remembered
    in ``self._member_names``; EnumMeta later reads that list to decide which
    entries are enumeration members.
    """

    def __init__(self, allow_types):
        super(_EnumDict, self).__init__()
        self._member_names = []
        self._allow_types = allow_types

    @classmethod
    def hack4py2(cls, classdict, allow_types):
        """Build an instance from a plain class dict (Python 2 code path).

        Each item goes through ``__setitem__`` so the usual checks apply.
        """
        assert isinstance(classdict, dict)
        namespace = cls(allow_types=allow_types)
        for name, item in six.iteritems(classdict):
            namespace[name] = item
        return namespace

    def _check_value_type(self, v):
        """Validate the type of an enum definition.

        ``v`` is either a base value whose type is one of the allowed types,
        or a 2-tuple ``(base value, description string)``.  Raises TypeError
        otherwise.
        """
        def _check_basetype(candidate):
            if isinstance(candidate, self._allow_types):
                return
            allowed = ', '.join(str(t) for t in self._allow_types)
            raise TypeError('Value type must be one of [%s], '
                            'instead of %s' % (allowed, type(candidate)))

        if type(v) is not tuple:
            _check_basetype(v)
            return
        if len(v) != 2:
            raise TypeError('Tuple enum definition must be length of 2')
        _check_basetype(v[0])
        if not isinstance(v[1], six.string_types):
            raise TypeError('The second element of tuple enum definition '
                            'must be string, i.e. an explanation of the enum value')

    def __setitem__(self, key, value):
        """Store ``key`` and register it as a member when appropriate.

        Sunder names are rejected, dunder names are stored without checks,
        re-using a member name raises TypeError, and a non-descriptor value
        is type-checked and recorded as a new member.
        """
        if _is_sunder(key):
            raise ValueError('_names_ are reserved for future Enum use')
        if not _is_dunder(key):
            if key in self._member_names:
                # a descriptor would overwrite an enum member
                raise TypeError('Attempted to reuse key: %r' % key)
            if not _is_descriptor(value):
                if key in self:
                    # an enum member would overwrite a descriptor
                    raise TypeError('Key already defined as: %r' % self[key])
                self._check_value_type(value)
                self._member_names.append(key)
        super(_EnumDict, self).__setitem__(key, value)
class EnumMeta(type):
    """Metaclass that turns ``NAME = value`` class entries into an enumeration.

    An entry is either a bare value or a ``(value, description)`` tuple.
    The entries are removed from the class namespace and kept in
    ``_member_map_`` (name -> member) and ``_value2member_map_``
    (value -> member); ``choices`` collects ``[value, description]`` pairs.
    Attribute access on the class (``SomeEnum.NAME``) goes through
    ``__getattr__`` and yields the member's raw value.
    """

    @staticmethod
    def _find_allow_types_(cls, bases):
        """Return the tuple of value types permitted for the class being built.

        :param cls: name of the class under construction (a string).
        :param bases: base classes of the class under construction.
        """
        all_types = set(six.integer_types) | {six.text_type, str}
        allow_types = set()
        if Enum is None:  # Enum base class
            assert cls == 'Enum'
            return tuple(all_types)
        else:
            # Non-Enum bases (mix-in types) restrict the allowed value types.
            for base in bases:
                if not issubclass(base, Enum):
                    allow_types.add(base)
            if allow_types:
                return tuple(all_types & allow_types)
            else:
                return tuple(all_types)

    @classmethod
    def __prepare__(mcs, cls, bases):
        """Provide an _EnumDict as the namespace for the class body."""
        return _EnumDict(EnumMeta._find_allow_types_(cls, bases))

    def __new__(mcs, cls, bases, _dct):
        """Create the enum class and one member instance per definition."""
        # hacking to generate an _EnumDict Object
        if six.PY2:
            dct = _EnumDict.hack4py2(_dct, EnumMeta._find_allow_types_(cls, bases))
        else:
            dct = _dct
        # save enum items into separate mapping so they don't get baked into
        # the new class
        # Temporary workaround: on Python 2 using an OrderedDict has no effect
        # here and would change the enum order that existing business code
        # relies on.
        if six.PY2:
            members = {k: dct[k] for k in dct._member_names}
        else:
            members = OrderedDict([(k, dct[k]) for k in dct._member_names])
        for name in dct._member_names:
            del dct[name]
        # check for illegal enum names (any others?)
        invalid_names = set(members) & {'mro', }
        if invalid_names:
            raise ValueError('Invalid enum member name: {0}'.format(
                ','.join(invalid_names)))
        # create our new Enum type
        enum_class = super(EnumMeta, mcs).__new__(mcs, cls, bases, dct)
        enum_class._member_names_ = []  # names in definition order
        enum_class._member_map_ = OrderedDict()  # name->value map
        # Reverse value->name map for hashable values.
        enum_class._value2member_map_ = {}
        enum_class.choices = []
        # instantiate them, checking for duplicates as we go
        # we instantiate first instead of checking for duplicates first in case
        # a custom __new__ is doing something funky with the values -- such as
        # auto-numbering ;)
        for member_name, value in six.iteritems(members):
            # A tuple definition carries (value, description).
            if isinstance(value, tuple):
                real_value = value[0]
                desc = value[1]
            else:
                real_value = value
                desc = ''
            enum_member = enum_class()
            enum_member._name_ = member_name
            enum_member._value_ = real_value
            enum_member._desc_ = desc
            # If another member with the same value was already defined, the
            # new member becomes an alias to the existing one.
            for name, canonical_member in six.iteritems(enum_class._member_map_):
                if canonical_member._value_ == enum_member._value_:
                    enum_member = canonical_member
                    break
            else:
                # Aliases don't appear in member names (only in __members__).
                enum_class._member_names_.append(member_name)
            # now add to _member_map_
            enum_class._member_map_[member_name] = enum_member
            enum_class._value2member_map_[real_value] = enum_member
            enum_class.choices.append([real_value, desc])
        return enum_class

    def __contains__(cls, value):
        """Return True when ``value`` is the value of one of the members."""
        return value in cls._value2member_map_

    def __delattr__(cls, attr):
        # nicer error message when someone tries to delete an attribute
        # (see issue19025).
        if attr in cls._member_map_:
            raise AttributeError(
                "%s: cannot delete Enum member." % cls.__name__)
        super(EnumMeta, cls).__delattr__(attr)

    def __getattr__(cls, name):
        """Return the value of the enum member matching `name`

        We use __getattr__ instead of descriptors or inserting into the enum
        class' __dict__ in order to support `name` and `value` being both
        properties for enum members (which live in the class' __dict__) and
        enum members themselves.

        Note that the member's raw ``_value_`` is returned, not the member
        object.  Raises AttributeError for dunder names and unknown names.
        """
        # check if classmethod(bound now)
        if _is_dunder(name):
            raise AttributeError(name)
        try:
            enum_member = cls._member_map_[name]
            return enum_member._value_
        except KeyError:
            six.raise_from(AttributeError(name), None)

    def __desc__(cls, value):
        """Return the description of the member whose value is ``value``.

        Raises KeyError when no member has that value.
        """
        return cls._value2member_map_[value]._desc_

    @property
    def __members__(cls):
        """Returns a mapping of member name->member object.

        This mapping lists all enum members, including aliases. A copy of the
        internal mapping is returned, so changing it does not affect the enum.
        """
        return cls._member_map_.copy()

    def __getitem__(cls, name):
        """Return the member object (not its value) registered under ``name``."""
        return cls._member_map_[name]

    def __iter__(cls):
        """Return a generator of (member.value, member.desc) tuples, one for
        each non-alias member in definition order.
        """
        return ((
            cls._member_map_[name]._value_,
            cls._member_map_[name]._desc_) for name in cls._member_names_)

    def __len__(cls):
        """Return the number of non-alias members."""
        return len(cls._member_names_)

    def __repr__(cls):
        return "<enum %r>" % cls.__name__

    def __reversed__(cls):
        """Return a generator of member objects in reverse definition order."""
        return (cls._member_map_[name] for name in reversed(cls._member_names_))

    def __setattr__(cls, name, value):
        """Block attempts to reassign Enum members.

        A simple assignment to the class namespace only changes one of the
        several possible ways to get an Enum member from the Enum class,
        resulting in an inconsistent Enumeration.
        """
        # ``_member_map_`` is absent while __new__ is still populating the
        # class, hence the empty-dict default.
        member_map = cls.__dict__.get('_member_map_', {})
        if name in member_map:
            raise AttributeError('Cannot reassign members.')
        super(EnumMeta, cls).__setattr__(name, value)

    def __dir__(self):
        """List the regular class attributes plus the member names."""
        return list(super(EnumMeta, self).__dir__()) + self._member_names_
class Enum(six.with_metaclass(EnumMeta, object)):
    """Base class for enumerations.

    Subclass it and list the members as class attributes; EnumMeta does the
    rest.  Instances carry ``_name_``, ``_value_`` and ``_desc_``.
    """

    def __repr__(self):
        owner = self.__class__.__name__
        return "<%s.%s: %r>" % (owner, self._name_, self._value_)

    def __str__(self):
        owner = self.__class__.__name__
        # Members without a description omit the parenthesised suffix.
        if not self._desc_:
            return "%s.%s" % (owner, self._name_)
        return "%s.%s(%s)" % (owner, self._name_, self._desc_)

    def __hash__(self):
        return hash(self._name_)

    @classmethod
    def getDesc(cls, key, defaultValue=None):
        """Return the description for value ``key``, or ``defaultValue``.

        Backport of gaia.rpc.tool.enumeration.Enumeration.getDesc.
        """
        try:
            description = cls.__desc__(key)
        except KeyError:
            return defaultValue
        return description
def unique(enumeration):
    """Class decorator that rejects enumerations containing aliases.

    An alias is an entry of ``__members__`` whose key differs from the
    ``_name_`` of the member it maps to.  Returns the enumeration unchanged
    when there is none, raises ValueError listing them otherwise.
    """
    aliases = [
        (alias, member._name_)
        for alias, member in enumeration.__members__.items()
        if alias != member._name_
    ]
    if not aliases:
        return enumeration
    alias_details = ', '.join("%s -> %s" % pair for pair in aliases)
    raise ValueError('duplicate values found in %r: %s' %
                     (enumeration, alias_details))
@unique
class IMG_TYPE(Enum):
    """Kinds of uploaded resources.

    Every member is declared as ``(value, description)``.  ``@unique``
    guarantees that no two members share a value.  Value 4 is not used.
    """
    BANNER = (1, 'Banner')
    BODYPART = (2, 'BodyPart')
    DIARY = (3, '日记本')
    CAMPAIGNIMAGELINK = (5, '活动图片链接')
    CONSULTWIKI = (6, '咨询百科')
    DOCTOR = (7, '医生')
    FEEDBACKCATEGORY = (8, '反馈类型')
    FEEDBACKIMAGE = (9, '用户反馈上传的图片')
    GREETINGPOPUP = (10, '开屏页和首页提醒')
    HOSPITAL = (11, '医院')
    ITEMWIKI = (12, '整形项目(wiki)')
    PRIVATECONVERSATION = (13, '私信对话')
    ORGANIZATIONIMAGE = (14, '机构图片')
    PREOPERATIONIMAGE = (15, '术前图')
    RECOMMENDAPP = (16, '精品应用')
    WEBSLIDE = (17, 'web 轮播图')
    BULLETIN = (18, '公告(医生版)')
    SERVICE = (19, '美购')
    SERVICEACTIVITY = (20, '美购活动')
    SHARE = (21, '分享')
    SLIDE = (22, '轮播图')
    SMALLIMAGE = (23, '首页小图(单排横滑模版)')
    SPECIAL = (24, '专题')
    TAG = (25, 'TAG')
    TOPIC = (26, '帖子')
    TOPICREPLY = (27, '帖子回复')
    TOPICIMAGE = (28, '帖子图片')
    USEREXTRA = (29, '用户')
    POST = (30, '文章(医生版)')
    ARTICLE = (31, '所长推荐/扒扒扒')
    DOCTORREGISTER = (32, '医生注册')
    HOSPITALCAPTURE = (33, '医院(采集)')
    HOMEPOPUP = (34, '首页弹窗(医生)')
    HOMESLIDE = (35, '首页轮播图(医生)')
    ZEUS_STAFF_PROTRAIT = (36, 'ZEUS员工头像')
    CIRCLEBANNER = (37, '圈子圈子横幅')
    CAMPAIGNBANNER = (38, '活动圈子活动banner图')
    CIRCLEICON = (39, '圈子图标')
    ADVERTISE = (40, '广告位')
    INSTALLMENT = (41, '分期')
    BANK = (42, '银行logo')
    AUDIO = (43, '录音音频')
    ZHIBO = (44, '直播')
    # Advertising-system related
    ADCOVERMAP = (45, '广告封面图')
    TRADEIMAGE = (46, '交易图片')
    SERVICE_COMMENT = (47, '美购评价')
    APOLLO = (48, '分销')
    SERVICE_WATERMARK = (49, '促销标签')
    PLUTUS_QR = (50, '分期二维码')
    PLUTUS_PDF= (51, '金融pdf')
    POLYMER = (52, '聚合页')
    SERVICEHOME = (53, '美购主页')
    CATEGORY_POLYMER = (54, '品类聚合页')
    ICON = (55, '我的页面icon')
    # Untyped: only selects the watermarked / non-watermarked bucket
    WATERMARK = (98, '带水印')
    NOWATERMARK = (99, '不带水印')
# Storage locations on Qiniu.  Every domain carries its http/https scheme;
# when a scheme is changed, remember to update the test cases as well.
def _qiniu_location(domain, bucket):
    """Build one ``{'domain': ..., 'bucket': ...}`` storage description."""
    return {'domain': domain, 'bucket': bucket}


qiniu_no_watermark = _qiniu_location('https://heras.igengmei.com', 'hera')
qiniu_watermark = _qiniu_location('https://pic.igengmei.com', 'wanmeizhensuo')
qiniu_video = _qiniu_location('http://video-static.igengmei.com', 'video')

# Per-platform lookup tables; 'qiniu' is the only platform defined here.
qiniu_no_watermark_platform = {'qiniu': qiniu_no_watermark}
qiniu_watermark_platform = {'qiniu': qiniu_watermark}
qiniu_audio_platform = {
    'qiniu': _qiniu_location(
        'http://phonerecord.private.igengmei.com', 'phone-record'),
}
qiniu_installment_platform = {
    'qiniu': _qiniu_location('http://idcard.private.igengmei.com', 'id-card'),
}
# Storage rules per image type: the key prefix prepended to the saved name
# and the platform table that holds the bucket and domain.  Built from a flat
# (type, prefix, platform) table; every entry gets its own dict.
image_type = {
    kind: {'prefix': prefix, 'platform': platform}
    for kind, prefix, platform in (
        (IMG_TYPE.WATERMARK, '', qiniu_watermark_platform),
        (IMG_TYPE.NOWATERMARK, '', qiniu_no_watermark_platform),
        (IMG_TYPE.BANNER, 'banner', qiniu_no_watermark_platform),
        (IMG_TYPE.ICON, '', qiniu_no_watermark_platform),
        (IMG_TYPE.BODYPART, 'bodypart', qiniu_no_watermark_platform),
        (IMG_TYPE.DIARY, 'diary', qiniu_watermark_platform),
        (IMG_TYPE.CAMPAIGNIMAGELINK, 'campaignimagelink', qiniu_no_watermark_platform),
        (IMG_TYPE.CONSULTWIKI, 'consultwiki', qiniu_no_watermark_platform),
        (IMG_TYPE.DOCTOR, 'doctor', qiniu_no_watermark_platform),
        (IMG_TYPE.FEEDBACKCATEGORY, 'feedbackcategory', qiniu_no_watermark_platform),
        (IMG_TYPE.FEEDBACKIMAGE, 'feedbackimage', qiniu_watermark_platform),
        (IMG_TYPE.GREETINGPOPUP, 'greetingpopup', qiniu_no_watermark_platform),
        (IMG_TYPE.HOSPITAL, 'hospital', qiniu_no_watermark_platform),
        (IMG_TYPE.ITEMWIKI, 'itemwiki', qiniu_no_watermark_platform),
        (IMG_TYPE.PRIVATECONVERSATION, 'privateconversation', qiniu_watermark_platform),
        (IMG_TYPE.ORGANIZATIONIMAGE, 'organizationimage', qiniu_watermark_platform),
        (IMG_TYPE.PREOPERATIONIMAGE, 'preoperationimage', qiniu_watermark_platform),
        (IMG_TYPE.RECOMMENDAPP, 'recommendapp', qiniu_no_watermark_platform),
        (IMG_TYPE.WEBSLIDE, 'webslide', qiniu_no_watermark_platform),
        (IMG_TYPE.BULLETIN, 'bulletin', qiniu_no_watermark_platform),
        (IMG_TYPE.SERVICE, 'service', qiniu_no_watermark_platform),
        (IMG_TYPE.SERVICEACTIVITY, 'serviceactivity', qiniu_no_watermark_platform),
        (IMG_TYPE.SHARE, 'share', qiniu_no_watermark_platform),
        (IMG_TYPE.SLIDE, 'slide', qiniu_no_watermark_platform),
        (IMG_TYPE.SMALLIMAGE, 'smallimage', qiniu_no_watermark_platform),
        (IMG_TYPE.SPECIAL, 'special', qiniu_no_watermark_platform),
        (IMG_TYPE.TAG, 'tag', qiniu_no_watermark_platform),
        (IMG_TYPE.TOPIC, 'topic', qiniu_watermark_platform),
        (IMG_TYPE.TOPICREPLY, 'topicreply', qiniu_watermark_platform),
        (IMG_TYPE.TOPICIMAGE, 'topicimage', qiniu_watermark_platform),
        (IMG_TYPE.USEREXTRA, 'userextra', qiniu_watermark_platform),
        (IMG_TYPE.POST, 'consultwiki', qiniu_no_watermark_platform),
        (IMG_TYPE.ARTICLE, 'article', qiniu_no_watermark_platform),
        (IMG_TYPE.DOCTORREGISTER, 'doctorregister', qiniu_no_watermark_platform),
        (IMG_TYPE.HOSPITALCAPTURE, 'hospitalcapture', qiniu_no_watermark_platform),
        (IMG_TYPE.HOMEPOPUP, 'homepopup', qiniu_no_watermark_platform),
        (IMG_TYPE.HOMESLIDE, 'homeslide', qiniu_no_watermark_platform),
        (IMG_TYPE.ZEUS_STAFF_PROTRAIT, 'zeusstaffprotrait', qiniu_no_watermark_platform),
        (IMG_TYPE.ADVERTISE, 'advertise', qiniu_no_watermark_platform),
        (IMG_TYPE.CIRCLEBANNER, 'circlebanner', qiniu_no_watermark_platform),
        (IMG_TYPE.CIRCLEICON, 'circleicon', qiniu_no_watermark_platform),
        (IMG_TYPE.CAMPAIGNBANNER, 'campaignbanner', qiniu_no_watermark_platform),
        (IMG_TYPE.INSTALLMENT, 'installment', qiniu_installment_platform),
        (IMG_TYPE.BANK, 'bank', qiniu_no_watermark_platform),
        (IMG_TYPE.AUDIO, '', qiniu_audio_platform),
        (IMG_TYPE.ZHIBO, '', qiniu_no_watermark_platform),
        (IMG_TYPE.ADCOVERMAP, 'artemis', qiniu_no_watermark_platform),
        (IMG_TYPE.TRADEIMAGE, 'artemis', qiniu_no_watermark_platform),
        (IMG_TYPE.SERVICE_COMMENT, 'service_comment', qiniu_watermark_platform),
        (IMG_TYPE.APOLLO, 'apollo', qiniu_installment_platform),
        (IMG_TYPE.SERVICE_WATERMARK, 'service_watermark', qiniu_no_watermark_platform),
        (IMG_TYPE.PLUTUS_QR, 'plutus_qr', qiniu_no_watermark_platform),
        (IMG_TYPE.PLUTUS_PDF, 'plutus_pdf', qiniu_no_watermark_platform),
        (IMG_TYPE.POLYMER, 'polymer', qiniu_no_watermark_platform),
        (IMG_TYPE.SERVICEHOME, 'service_home', qiniu_no_watermark_platform),
        (IMG_TYPE.CATEGORY_POLYMER, 'category_polymer', qiniu_no_watermark_platform),
    )
}
class GmImageManager(object):
    """Look up the storage settings configured for one image type."""

    def __init__(self, img_type):
        assert img_type in IMG_TYPE
        self.img_type = img_type

    def _platform_settings(self, platform):
        """Return the domain/bucket mapping of ``platform`` for this type."""
        return image_type.get(self.img_type)['platform'].get(platform)

    def get_domain(self, platform='qiniu'):
        """Return the domain, including its http/https scheme."""
        return self._platform_settings(platform)['domain']

    def get_bucket(self, platform='qiniu'):
        """Return the bucket name."""
        return self._platform_settings(platform)['bucket']

    def get_prefix(self):
        """Return the key prefix configured for this image type."""
        return image_type.get(self.img_type)['prefix']
class QiniuTool(object):
    """Class-level helpers around the Qiniu SDK.

    Covers uploads, object management (delete / move / copy / prefetch),
    CDN refresh, persistent media operations and watermark URL building.
    All methods are classmethods sharing one ``Auth`` and ``BucketManager``.
    """

    access_key = QINIU_ACCESS_KEY
    secret_key = QINIU_SECRET_KEY
    q = Auth(access_key, secret_key)
    bucket = BucketManager(q)
    # CdnManager is optional: when it cannot be imported or created the CDN
    # helpers below turn into no-ops.
    try:
        from qiniu import CdnManager
        cdn = CdnManager(q)
    except Exception:
        cdn = None

    # Number of attempts made by upload() / upload_file() before giving up.
    _UPLOAD_ATTEMPTS = 5

    @classmethod
    def get_pritvate_url(cls, base_url, time=3600):
        """Return a signed download URL for ``base_url`` valid ``time`` seconds."""
        private_url = cls.q.private_download_url(base_url, expires=time)
        return private_url

    @classmethod
    def get_private_url(cls, bucket_domain, key, time=3600):
        """Return a signed download URL for ``key`` on ``bucket_domain``.

        ``http://`` is prepended when the domain carries no scheme.
        """
        if not bucket_domain.startswith(('http://', 'https://')):
            bucket_domain = 'http://' + bucket_domain
        base_url = '%s/%s' % (bucket_domain, key)
        return cls.get_pritvate_url(base_url, time)

    @classmethod
    def _upload_with_retry(cls, do_upload, error_message):
        """Call ``do_upload`` until its response contains a key.

        ``do_upload`` is a zero-argument callable returning the SDK response,
        whose first element is expected to be a dict such as
        ``{"hash": "<Hash string>", "key": "<Key string>"}``.

        :return: ``{'file': key}`` on success.
        :raises Exception: with ``error_message`` after all attempts failed.
        """
        for _ in range(cls._UPLOAD_ATTEMPTS):
            response = do_upload()
            if response and response[0] and 'key' in response[0]:
                return {'file': response[0]['key']}
        raise Exception(error_message)

    @classmethod
    def upload(cls, file, save_name, bucket_name):
        """Upload binary data to Qiniu and return ``{'file': key}``."""
        token = cls.q.upload_token(bucket_name)
        return cls._upload_with_retry(
            lambda: put_data(token, save_name, file, params=None,
                             mime_type='application/octet-stream',
                             check_crc=False, progress_handler=None),
            'upload filed')

    @classmethod
    def upload_file(cls, file_path, save_name, bucket_name):
        """Upload a local file to Qiniu and return ``{'file': key}``."""
        token = cls.q.upload_token(bucket_name)
        return cls._upload_with_retry(
            lambda: put_file(token, save_name, file_path),
            'upload file filed')

    @classmethod
    def delete(cls, key, bucket_name):
        """Delete ``key``; status 612 is also reported as success."""
        ret, info = cls.bucket.delete(bucket_name, key)
        return ret == {} or info.status_code == 612

    @classmethod
    def move(cls, old_key, new_key, old_bucket_name, new_bucket_name):
        """Move an object; status 614 is also reported as success."""
        ret, info = cls.bucket.move(old_bucket_name, old_key, new_bucket_name, new_key)
        return ret == {} or info.status_code == 614

    @classmethod
    def copy(cls, old_key, new_key, old_bucket_name, new_bucket_name):
        """Copy an object; status 614 is also reported as success."""
        ret, info = cls.bucket.copy(old_bucket_name, old_key, new_bucket_name, new_key)
        return ret == {} or info.status_code == 614

    @classmethod
    def refresh(cls, urls):
        """Refresh the CDN cache for ``urls``.

        Always returns True; nothing is done when ``urls`` is empty or no
        CdnManager is available.
        """
        if not urls or not cls.cdn:
            return True
        cls.cdn.refresh_urls(urls)
        return True

    @classmethod
    def refresh_qiniu_resource_cache(cls, urls):
        """Refresh and then prefetch the CDN cache of Qiniu resources.

        Notes (from the original author):
            1. ``urls`` is a list of absolute URLs with at most 100 entries,
               eg:["http://video-static.igengmei.com/883c4461af7c11c270afe9e80ae0d967.mp4", ]
            2. Both the origin node and the CDN nodes need refreshing; it
               takes about 15 minutes to become effective.

        :return: always True.
        """
        if not urls or not cls.cdn:
            return True
        cls.cdn.refresh_urls(urls)
        cls.cdn.prefetch_urls(urls)
        return True

    @classmethod
    def prefetch(cls, key, bucket_name):
        """Prefetch ``key`` into ``bucket_name``; True when the SDK returns ``{}``."""
        ret, info = cls.bucket.prefetch(bucket_name, key)
        return ret == {}

    @classmethod
    def get_token(cls, bucket_name):
        """Return an upload token for ``bucket_name``."""
        return cls.q.upload_token(bucket_name)

    @classmethod
    def set_video_watermark(cls, filename, newfilename, bucket_name, water_mark_url, pipeline):
        """Transcode ``filename`` to mp4 with an image watermark.

        The result is saved as ``newfilename`` in the same bucket.

        :return: the persistent operation id.
        """
        base64URL = urlsafe_base64_encode(water_mark_url)
        fops = 'avthumb/mp4/wmImage/' + base64URL
        saveas_key = urlsafe_base64_encode(bucket_name + ':' + newfilename)
        fops = fops + '|saveas/' + saveas_key
        pfop = PersistentFop(cls.q, bucket_name, pipeline)
        ret, info = pfop.execute(filename, [fops], 1)
        return ret['persistentId']

    @classmethod
    def video_clipping(cls, filename, new_filename, video_type, bucket_name, water_mark_url,
                       pipeline, start_time, duration, audio_no):
        """Transcode and/or clip a video.

        The frame rate is not set, so the service default applies.

        :param filename: key of the source video
        :param new_filename: key of the transcoded video
        :param video_type: target video type
        :param bucket_name: bucket holding both keys
        :param water_mark_url: watermark image URL (skipped when falsy)
        :param pipeline: dedicated processing pipeline
        :param start_time: offset in seconds at which the clip starts
        :param duration: length of the clip
        :param audio_no: audio-removal flag; omitted only when None
        :return: the persistent operation id.
        """
        fops_list = ['avthumb/{}'.format(video_type)]
        if water_mark_url:
            fops_list.append('wmImage/{}'.format(urlsafe_base64_encode(water_mark_url)))
        if start_time:
            fops_list.append('ss/{}'.format(start_time))
        if duration:
            fops_list.append('t/{}'.format(duration))
        if audio_no is not None:
            fops_list.append('an/{}'.format(audio_no))
        fops = '/'.join(fops_list)
        save_key = urlsafe_base64_encode(bucket_name + ':' + new_filename)
        fops = fops + '|saveas/' + save_key
        pfop = PersistentFop(cls.q, bucket_name, pipeline)
        ret, info = pfop.execute(filename, [fops], 1)
        return ret['persistentId']

    @classmethod
    def set_text_watermark(cls,
                           text, image_url, saved_name, bucket_name, font_name="黑体",
                           font_size=240, text_color="FF0000", dissolve=100,
                           gravity="SouthEast", dis_x=10, dis_y=10
                           ):
        """Put a text watermark on an image and save it as ``saved_name``.

        Options with a falsy value are left out of the operation string.

        :return: the persistent operation id, or None when the SDK returned
            no result.
        """
        fop = [
            'watermark/2/text/{}'.format(urlsafe_base64_encode(text))
        ]
        if font_name:
            fop.append("font/{}".format(urlsafe_base64_encode(font_name)))
        if font_size:
            fop.append("fontsize/{}".format(font_size))
        if text_color:
            fop.append("fill/{}".format(urlsafe_base64_encode('#' + text_color)))
        if dissolve:
            fop.append("dissolve/{}".format(dissolve))
        if gravity:
            fop.append("gravity/{}".format(gravity))
        if dis_x:
            fop.append("dx/{}".format(dis_x))
        if dis_y:
            fop.append("dy/{}".format(dis_y))
        fops = '/'.join(fop)
        fops = fops + '|saveas/' + urlsafe_base64_encode(bucket_name + ':' + saved_name)
        pfop = PersistentFop(cls.q, bucket_name)
        ret, info = pfop.execute(image_url, [fops], 1)
        if ret:
            return ret['persistentId']
        return None

    @classmethod
    def batch_move_pic(cls, filename_list, bucket_name, new_bucket_name):
        """Batch-copy files to ``new_bucket_name`` under the same names.

        :return: one entry per submitted file -- the file name when its copy
            returned code 200 or 614, otherwise None; ``[]`` when nothing was
            submitted or no response was received.
        """
        # An ordered mapping keeps the submitted operations aligned with the
        # names the results are matched against (a plain dict drops duplicates
        # and, on Python 2, the order).
        # NOTE(review): assumes build_batch_copy emits the operations in the
        # mapping's iteration order -- confirm against the SDK.
        keys = OrderedDict((filename, filename) for filename in filename_list)
        ops = build_batch_copy(bucket_name, keys, new_bucket_name)
        if ops:
            ret, infos = cls.bucket.batch(ops)
            if infos:
                outcomes = json.loads(infos.text_body)
                return [
                    name if outcome['code'] in (200, 614) else None
                    for name, outcome in zip(list(keys), outcomes)
                ]
        return []

    @classmethod
    def set_picture_watermark(cls, img_url, water_mark_url, dissolve=100, gravity='NorthEast', dx=40, dy=40):
        """Build the URL of an image already on Qiniu with an image watermark.

        https://developer.qiniu.com/dora/manual/1316/image-watermarking-processing-watermark

        :param img_url: URL of the image stored on Qiniu
        :param water_mark_url: URL of the watermark image
        :param dissolve: opacity
        :param gravity: watermark position
        :param dx: horizontal margin
        :param dy: vertical margin
        :return: the composed URL, or "" when either URL is missing.
        """
        if not img_url or not water_mark_url:
            return ""
        base64_url = urlsafe_base64_encode(water_mark_url)
        f_url = "{img_url}?watermark/1/image/{base64_url}/dissolve/{dissolve}/gravity/{gravity}/dx/{dx}/dy/{dy}".format(
            img_url=img_url,
            base64_url=base64_url,
            dissolve=dissolve,
            gravity=gravity,
            dx=dx,
            dy=dy
        )
        return f_url

    @classmethod
    def mkzip(cls, bucket, index_file_name, new_file_name):
        """Zip several files into one archive.

        :param bucket: bucket name
        :param index_file_name: key of the index file listing the files to zip
        :param new_file_name: key of the archive to create
        :return: the persistent operation id.
        """
        fops = 'mkzip/4/'
        saveas_key = urlsafe_base64_encode(bucket + ':' + new_file_name)
        fops = fops + '|saveas/' + saveas_key
        pfop = PersistentFop(cls.q, bucket)
        ret, info = pfop.execute(index_file_name, [fops], 1)
        return ret['persistentId']
def gen_rnd_filename(ext=None):
    """Return a random storage name of the form ``YYYY/MM/DD/<10 hex chars>``.

    The hex part is an md5 prefix of the current timestamp mixed with a
    random UUID, so two calls within the same clock tick no longer produce
    the same name.

    :param ext: optional file extension (without the dot) to append.
    :return: the generated name, with ``.ext`` appended when given.
    """
    import uuid

    day_prefix = datetime.datetime.today().strftime("%Y/%m/%d")
    seed = '%s-%s' % (time.time(), uuid.uuid4().hex)
    digest = hashlib.md5(seed.encode("utf8")).hexdigest()[:10]
    name = day_prefix + '/' + digest
    if ext is not None:
        name = name + '.' + ext
    return name
def upload_with_short(image_file, img_type=IMG_TYPE.DIARY, save_name=None, platform='qiniu'):
    """Upload image data and return ``(full_url, save_name)``.

    A random name is generated when ``save_name`` is empty, and the prefix
    configured for ``img_type`` (if any) is prepended to it.
    """
    # The uploader is taken from gm_upload at call time.
    from gm_upload.utils.qiniu_tool import QiniuTool

    manager = GmImageManager(img_type)
    prefix = manager.get_prefix()
    save_name = save_name or gen_rnd_filename()
    if prefix:
        save_name = '/'.join((prefix, save_name))

    QiniuTool.upload(image_file, save_name, manager.get_bucket())
    full_url = get_full_path(save_name, manager.get_domain())
    return full_url, save_name
def upload(image_file, img_type=IMG_TYPE.DIARY, save_name=None, platform='qiniu'):
    """Upload image data and return only the full URL of the stored file."""
    uploaded = upload_with_short(image_file, img_type, save_name, platform)
    return uploaded[0]
def store_picture_and_get_key(picture):
    """Upload ``picture`` as a non-watermarked image under a random name.

    Returns whatever ``upload`` returns for the stored picture.
    """
    random_name = gen_rnd_filename(None)
    return upload(picture, IMG_TYPE.NOWATERMARK, random_name)
def get_domain(domain):
    """Normalise ``domain`` to ``scheme://host/``.

    ``http://`` is prepended when the domain has no http/https scheme, and a
    trailing slash is appended when missing.
    """
    if not domain.startswith(('http://', 'https://')):
        domain = 'http://' + domain
    if not domain.endswith('/'):
        domain = domain + '/'
    return domain


def get_full_path(path_name, domain, extra=''):
    """Return the absolute URL of ``path_name`` with ``extra`` appended.

    :param path_name: stored key or an absolute http/https URL; it is
        percent-decoded first when possible.
    :param domain: domain the key lives on (see :func:`get_domain`).
    :param extra: suffix appended to the result.
    :return: ``path_name`` itself when it already is an absolute URL, the
        key joined onto the domain otherwise, and just ``extra`` when
        ``path_name`` is empty or None.
    """
    domain = get_domain(domain)
    try:
        path_name = parse.unquote(path_name)
    except Exception:
        # Best effort: a value that cannot be decoded (e.g. None) is kept as is.
        pass

    if not path_name:
        full_path = ''
    elif path_name.startswith(('http://', 'https://')):
        full_path = path_name
    else:
        full_path = parse.urljoin(domain, path_name)
    return full_path + extra
def upload_file(file_path, img_type=IMG_TYPE.NOWATERMARK, save_name=None, platform='qiniu'):
    """Upload a local file and return only the full URL of the stored file."""
    uploaded = upload_file_with_short(file_path, img_type, save_name, platform)
    return uploaded[0]
def upload_file_with_short(file_path, img_type=IMG_TYPE.WATERMARK, save_name=None, platform='qiniu'):
    """Upload a local file and return ``(full_url, save_name)``.

    A random name is generated when ``save_name`` is empty, and the prefix
    configured for ``img_type`` (if any) is prepended to it.
    """
    manager = GmImageManager(img_type)
    prefix = manager.get_prefix()
    save_name = save_name or gen_rnd_filename()
    if prefix:
        save_name = '/'.join((prefix, save_name))

    QiniuTool.upload_file(file_path, save_name, manager.get_bucket())
    full_url = get_full_path(save_name, manager.get_domain())
    return full_url, save_name
import os
import time
from datetime import datetime
if __name__ == '__main__':
plans = [
# "python tasks/zhihu/spider.py 0 3 0 'https://www.zhihu.com/people/chun-feng-zhu-lang'",
# "python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/drunkxiaojingguai'",
# "python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/kokokou'",
# "python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/yo-he-14-20'",
# "python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/wen-zi-ding-dan'",
# "python tasks/zhihu/spider.py 0 169 0 'https://www.zhihu.com/people/zhengxingba'",
#"python tasks/zhihu/spider.py 0 24 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lu-hui'",
# "python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/xiao-zhu-99-20'",
"python tasks/zhihu/spider.py 0 119 58 'https://www.zhihu.com/people/zhaotianqiang'",
"python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/zuo-zheng-xing-de-liu-zhe-qi'",
"python tasks/zhihu/spider.py 0 14 0 'https://www.zhihu.com/people/cao-yang-yan-da-shi'",
"python tasks/zhihu/spider.py 0 2 0 'https://www.zhihu.com/people/zhe-mang-guo-mang'",
"python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/yuxi624'",
"python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/key-70-48'",
"python tasks/zhihu/spider.py 0 4 0 'https://www.zhihu.com/people/dryanling'",
"python tasks/zhihu/spider.py 0 2 0 'https://www.zhihu.com/people/shu-er-29-7'",
"python tasks/zhihu/spider.py 0 13 0 'https://www.zhihu.com/people/chen-shi-long-69-98'",
"python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/ka-li-yu-dan-94'",
"python tasks/zhihu/spider.py 0 4 0 'https://www.zhihu.com/people/nuo-nuo-jiang-46-40'",
"python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/lenhzin'",
"python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/tu-zi-ai-chi-cao-78'",
"python tasks/zhihu/spider.py 0 19 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-shi-hu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-ji-jie-jie-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huan-xing-mei-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/suan-jie-shen-ba'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-ji-hua-8'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dong-yan-10-84'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-yu-yan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-mao-wen-46'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/drfranklin-eps'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sxsxp1'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/judyxiao-jie-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-jun'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hu-tao-tao-de-yan-jiu-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-fan-ceng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-zi-xun-shi-yu-er-duo-duo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ju-zi-guo-jiang-20-66'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-qi-shu-shu-15'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chi-bu-ding-de-pang-ding-27-43'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jliyimei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/reseted1503325608'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-wu-a-wu-666'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-ye-xiang-wo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liang-yi-31-45'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhao-xue-qi-2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-gang-2017'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-mao-xue-yuan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mai-tian-li-de-ke-ai-duo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-dobby'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-da-xue-tang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-min-94-15'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-nu-mei-rong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-xiao-xiao-72-30'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wen-bao-guo-bao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhu-li-666'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/si-te-li-ke-lan-de-xian-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-she'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sou-di-leng-hua-zheng-qiao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yimeisiba'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-jie-bu-xing-hua'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bitibo/pins'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-cheng-sheng-zhu-li'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-yi-chao-63'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-shi-er-78-24'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-tian-jiao-sweet'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/can-ma-de-can-lan-sheng-huo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-chi-xue-gao-24'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-bei-bu-ai-hua-zhuang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pluto-54-88'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bai-meng-meng-79-43'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-de-mi-xue'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-yan-lin-22-95'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ning-meng-jing-tang-dou-dou'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wo-you-yizhi-xiao-mao-mi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhengrongjia'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-li-36-48-1'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jin-xiao-mei-88-37'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liang-yi-sheng-99-9'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xia-xiao-xiao-20-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wai-hao-da-biao-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhou-yu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhi-zhi-fei-ji'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pan-er-bo-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cheng-du-long-bi-bo-shi-zhang-han'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/well-1-95'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ying-ying-37-60'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-51-29-23'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-rui-64-16'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-dao-ge'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-yi-he-tian-ling'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13397228518'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-zhi-bing-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-gang-2017'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-lian-mei-9'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zi-fei-yu-79-53'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zengyy-5'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-jiang-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wu-di-30-63-36'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-shuo-ming-shu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shan-shan-lai-zao-liao-35'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-zi-xun-shi-abbie-71'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tou-fa-ke'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/0329'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-hao-90-58-31'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-jing-20-41-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-xiao-62-85'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jian-si-bu-neng-jiu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mi-qi-xian-sheng-72'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiangjy58'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-jiao-zi-mo-mo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/haozemaliang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xie-wen-hui-21-66'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-shua-shua-98'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/amazingbird'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/desperate'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-pei-jun-15'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-mei-xiao-qin'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/meilikepujun'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/juan-mao-er-1-90'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-jun-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hai-da-li-28'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xieyangchun'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/song-song-1-3-27'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-sheng-60-21-71'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/CQ_Hazel'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-you-zi-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-ban-61-99'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-jing-ba-da-chu-jin-xiao-lei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-bai-zhen-hua-54'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-rong-yi-sheng-gao-si'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/du-rui-mi-66'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ke-le-niu-nai-38'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-xiao-de-chun-93'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-chuan-kai-67'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qiang-jiao-de-xiao-tiao-wa'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-le-34-73-57'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lao-lun-si-de-bai-ri-meng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hong-li-34-89'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhao-xiu-li-67'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-er-gou-zi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-zheng-yi-11'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pi-fu-mei-rong-zhang-wei-hua'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cai-xiao-bai-60-17'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-xia-7-29'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qi-sh-57'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-jiao-wo-yi-mei-shuo-ming-shu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-mo-fang-jiang-zhi-fang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-zai-xian-50'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-jiu-sheng-25-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-jiang-3'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-lan-wu-chang-xiong-41'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hk17320188886'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-fu-wo-xiang-zhi-dao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zuo-zheng-xing-de-ren-han-xiao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gan-cheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-yi-ma'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/aloedeng-ni'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhoushaolong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chaigang9h'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-jian-93-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-hu-hu-69-77'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhihu88'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zou-yu-75-32'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-mo-mo-49-86'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-gu-shi-de-j-xiao-jie-22'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-li-feng-yong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sunzhicheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-bi-yi-sheng-li-chang-fu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-da-da-57'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-bai-58-40'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dong-ming-xian-48'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-luo-bu-gan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zaoxaomei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/da-xiao-bei-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xu-rong-yang-71'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-da-yan-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-wei-68-2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-kong-ju-le-bu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/winni-2-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-jiu-jiu-80-61'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/leon-5-35-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dr-yan-93'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tong-yan-jun-25'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jennykissedme'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-chuang-26-64'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-lang-zi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-miao-64-41'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-guan-86-73'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-hou-you-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ce-ge-mo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhong-ri-zheng-xing-huang-wen-gang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-di-xing-kong-70'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/drchenbing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-liu-53-15-36'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-san-gou-67-76'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-xiao-tian-mian-diao-zhuan-jia'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-ye-song-57'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-du-zheng-xing-zhang-yi-sheng-19'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ouruila'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-liang-zong-jian'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-83-93-34'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ren-dong-ivy'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13718001372'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-jing-xiao-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-zheng-cai-hui-ying'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.kdocs.cn/view/l/coV2TD4LExp2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-meng-ming-xing-38'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/alice-95-96-90'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-xing-hu-die-jian-14/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huang-yi-xiang-28-29'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/beauty-44-9'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-bo-ying-xiao-xiao-dou'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/my-world-70-8'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ishuangyanpi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-tang-ka-pei-60'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-miao-miao-5-43'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hejiongchuan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xin-han-50-61'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiang-yan-zhi-kao-jin-de-you-zi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-ai-dou-xiao-mi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ni-gu-man-man'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/DrZhaoyanyong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhi-fang-wei-diao-qiu-li-dong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-gui-zong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiecan1'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lu-shu-rong-6'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiang-tian-xie-1'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-yu-bing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lu-jian-jian-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-li-qiu-yue'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhang-shu-gong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiu-ai-he-nai-cha-62'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jie-wo-yiba-qiang-49'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13522127270'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-ji-guang-zhong-xin-feng-yong-qiang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bu-zhi-dao-jiao-sha-79-74'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-zhou-xu-99'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-liu-tun-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-xian-76-39/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jing-jing-70-85-21'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bing-he-lao-dao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-tai-ling'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mo-xiao-xin-28-35'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-ma-zhu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lovelife-96-78'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sorry-7'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qi-yong-le-38'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-rong-yi-sheng-wang-shu-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-mi-jie-33-58'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shan-he-yi-qiu-94'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zong-xian-lei-yi-sheng-49'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiong-mao-jiang-zi-18'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-liao-56-14-3'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-qing-ping-ping-36'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhe-2-83'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tao-hua-huan-jiu-qian-64'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-wang-yue'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-jing-ba-da-chu-ma-xiao-yang-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-yu-hao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-hai-ming-97-8'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-xiao-shi-49'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/san-qi-h-79'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tong-xiao-yi-sheng-85'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-yin-hong-yu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-meng-xi-52'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zai-zai-76-64'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-di-35-64'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shu-yan-ni-ni/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-li-peng-67'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/Swahili1230'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gorgeous-1-48'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/luna-10-2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qia-wa-yi-fu-50/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lin-xiao-pang-30-84'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/you-zi-shu-27-18'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mc-goldenli'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-dai-zheng-zhou'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-dong-qi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ding-xiang-17-21'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-xiao-lan-56'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wo-shi-ann'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-fu-jun-li'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fan-rong-jie-43'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-liu-zheng-xing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-xiao-xiao-43-80'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tea-5-1-57/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-man-3-54'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jing-guo-shen-she-94'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-miao-miao-50-58'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/masakijiang-83'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-han-xing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-zhou-xiao-ke-ai-79-36'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zxys-zhanglong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xin-li-xiao-tian-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hart-48-54'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ou-bu-zhi-dao-52'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/kiki77-21'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chun-bu-zheng-xing-yi-sheng-li-hui'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xia-mo-87-11'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tiffanybaby-11'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-qing-6-40'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiang-sijun-70-90-26'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wangling-31-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xi-82-72-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-ai-hong-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhong-zheng-xing-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-xiao-shi-pin-lu-ren-jia'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-yan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hong-kao-shi-ba-v'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-wai-ke-wang-xiao-yang-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xie-he-zheng-xing-yi-sheng-wang-yang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-yi-san-yuan-zheng-xing-yi-sheng-li-bi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/nan-jing-ruan-tang-xiao-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-meng-chun-16-30'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/column/meidaila'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/column/cosmotology'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miss-miao-jun-81'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/guan-bo-lan-14'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-xue-pi-fu-guan-li-gao-jin-ling'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ji-guang-yu-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jia-qu-jing-31'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fu-chen-yi-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-liu-xu-li'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhu-dong-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/christy-20-58/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-shen-shen-21-52'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-chen-bo-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bo-luo-15-5-93'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhou-ifaceai-fei-si'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-wai-ke-yi-sheng-nie-yun-fei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lanzhiyong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gong-da-yi-mei-dsh3225'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-mei-jie-niu-er'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/binghan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-fu-yi-sheng-yi-yang-liang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-li-da-ren-40'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-xing-shuo-33'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-tian-xin-ne'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shi-jun-li-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/han-ao-xi-65'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tang-jian-bing-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/nan-feng-18-6-16'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-xiang-xin-17'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/18611026629'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-shuai-47-80'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-mu-97-35-35'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miss-fei-fei-68'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-jiang-jun'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-xiao-zhang-89'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/joy-71-96-54'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cat-49-49'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-ni-zi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yimei-38-82'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/CrossoverMarketing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pi-fu-yi-sheng-zhousir'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-yuan-wai-07'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-you-tuan-zi-2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-mei-yuan-chang-fu-guo-you-bo-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-hai-feng-38-32'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lian-qia-fo-ju-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cong-cong-64-65'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shang-hai-zheng-xing-chuang-shi-ren'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-deng-gang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ning-meng-mei-wo-meng-29-98'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-huang-rui'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-qi-gang-44'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cheng-zong-yu-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bai-li-tian-7-72'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-sun-zhong-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/likemei-shi-yan-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/deng-zheng-jun-zhu-ren'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-shao-30-97'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/man-lun-hua-kai'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fschaomei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wen-bi-yun-96'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-hua-wu-jia-jun-bo-shi-zheng-xing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/15104568657'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dr-fang-81'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qiong-an-gu-niang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gong-fu-qiang-65-23'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-xue-mei-rong-wang-xi-you-yi-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-shuo-yi-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-zhao-ha-ge/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiu-fu-yi-sheng-wang-shao-guo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-666'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-lin-lin'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zang-meng-qing-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-lian-mao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-da-dong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/LZGdoctor'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lin-wei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhou-xiao-dong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/15680033702'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-yin-hong-yu-45'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huang-jia-cheng-15-88'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-bi-yi-sheng-liu-guo-quan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-qin-zi-kan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ya-tou-20-6-24'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-kong-ke-pu-67'",
]
for plan in plans:
print('start plan ', plan, ' at ', datetime.now())
os.system(plan)
print('end plan ', plan, ' at ', datetime.now())
time.sleep(10)
import os
import time
from datetime import datetime
if __name__ == '__main__':
    # Each entry is a complete shell command line; entries are executed
    # one after another, in list order.
    plans = [
        # Disabled plan, kept for reference:
        # "python tasks/zhihu/spider_self.py 0 169 0 'https://www.zhihu.com/people/bai-fu-mei-yan-jiu-zhong-xin' 1",
        "python tasks/zhihu/spider_self.py 0 169 66 'https://www.zhihu.com/people/geng-mei-suo-chang' 1",
    ]
    for command in plans:
        started_at = datetime.now()
        print('start plan ', command, ' at ', started_at)
        # Blocks until the child process exits; its exit status is ignored.
        os.system(command)
        finished_at = datetime.now()
        print('end plan ', command, ' at ', finished_at)
        # Pause between consecutive crawls.
        time.sleep(10)
# import rsa
# import os, sys
# base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("/Users/xuwei")
# sys.path.append("/Users/xuwei/crawler")
# sys.path.append("/Users/xuwei/crawler/crawler_sys")
import pymysql
import hashlib
import requests
import execjs
import os
import re
import sys
import time
from datetime import datetime
import kdl
import random
# MySQL connection settings passed to pymysql.connect() in Spider.__init__.
# NOTE(review): the host and credentials are hard-coded in source control;
# consider loading them from the environment or a config file instead.
HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
# Absolute fallback path of the signing script; Spider.__init__ reads it when
# the relative 'tasks/zhihu/zhihu.js' cannot be opened.
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
# URL templates below are pinned to the member "geng-mei-suo-chang".
# In the listing templates {0} is the paging offset (20 items per page).
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
# Comment endpoint templates: {0} is the parent object's id, {1} the paging offset.
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
class Spider(object):
    """Crawl one Zhihu member's answers/articles and store them in MySQL.

    ``mark`` selects the content type throughout the class: 0 = answer,
    1 = article, 2 = thought (pin).
    """

    def __init__(self, url, is_online=False):
        """Open the database connection and compile the request-signing script.

        Args:
            url: Profile URL of the member to crawl. The member slug is taken
                as ``url[29:]``, i.e. everything after the 29-character prefix
                ``https://www.zhihu.com/people/``.
            is_online: Truthy to store answers with ``is_online`` set.
        """
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('tasks/zhihu/zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except OSError:
            # The relative path only resolves when run from the project root;
            # fall back to the absolute deployment path.
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js, cwd='/Library/Java/JavaVirtualMachines/graalvm-ce-java11-20.2.0/Contents/Home/languages/js/npm/node_modules')
        self.is_online = bool(is_online)
        self.url = url
        self.ANSWER_URL = 'https://www.zhihu.com/api/v4/members/' + url[29:] + '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
        # 1000 doubles as the "total not known yet" sentinel; it is replaced
        # by the real page count after the first successful listing request.
        self.page_count = 1000
        self.ips = []

    def get_proxy(self):
        """Return one proxy mapping, refilling the pool from the provider when empty.

        Returns:
            A dict with ``"http"`` and ``"https"`` proxy URLs.

        Raises:
            IndexError: If the provider returned no proxies.
        """
        if self.ips:
            return self.ips.pop()
        auth = kdl.Auth("981538205840388", 'p7wrheahef20800z0zh4bedu4005ofsb')
        client = kdl.Client(auth)
        ips = client.get_dps(10, sign_type='hmacsha1', format='json')
        print("dps proxy: ", ips, client.get_proxy_authorization())
        for item in ips:
            # Queue every returned address ten times. The previous code
            # formatted ips[0] here, so the pool only ever held the first proxy.
            for _ in range(10):
                self.ips.append({"http": "http://{}".format(item), "https": "https://{}".format(item)})
        random.shuffle(self.ips)
        return self.ips.pop()

    def retry_get_url(self, url, retrys=3, proxies=None, timeout=10, **kwargs):
        """GET ``url``, retrying on any exception.

        Args:
            url: Address to request.
            retrys: Maximum number of attempts.
            proxies: Accepted for call compatibility; not used (see note below).
            timeout: Per-request timeout in seconds.
            **kwargs: Forwarded to ``requests.get`` (headers, cookies, ...).

        Returns:
            The response of the first attempt that does not raise, or ``None``
            when every attempt failed.
        """
        retry_c = 0
        while retry_c < retrys:
            time.sleep(5)
            try:
                print(3131313131312133)
                # NOTE(review): a proxy is taken from the pool but is not
                # passed to requests.get, so requests go out directly.
                # Confirm whether proxying is intended before wiring it in.
                self.get_proxy()
                print("70704204902402")
                get_resp = requests.get(url, timeout=timeout, **kwargs)
                print(get_resp.status_code,7897897897)
                return get_resp
            except Exception as e:
                retry_c += 1
                time.sleep(5)
                print(e)
        print('Failed to get page %s after %d retries, %s'
              % (url, retrys, datetime.now()))
        return None

    def get_serach_page_cookies(self):
        """Request the member's answers page and return the cookies it sets.

        Returns:
            A dict of cookie name to value taken from the response.
        """
        url = self.url + "/answers?page=1"
        print(url,1341231)
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
            "referer": url,
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        }
        requests_res = self.retry_get_url(url, headers=headers)
        # retry_get_url returns None after exhausting its retries, in which
        # case the attribute access below raises AttributeError.
        fresh_cookies = requests_res.cookies.get_dict()
        print(fresh_cookies,131313)
        return fresh_cookies

    def check_data_exist(self, data_dict, mark):
        """Tell whether the item is already stored, flagging answers online if asked.

        Args:
            data_dict: One API item; only ``data_dict["id"]`` is read.
            mark: 0 for answers, 1 for articles. Other values are never
                looked up and report ``False``.

        Returns:
            ``True`` when a row with this id already exists, else ``False``.
        """
        # Each table keys the Zhihu id under its own column, matching the
        # insert statements in parse_sigle_page.
        lookup = {
            0: ('zhihu_answer', 'answer_id'),
            1: ('zhihu_article', 'article_id'),
        }.get(mark)
        if lookup is None:
            return False
        table, id_column = lookup
        # Table and column come from the fixed mapping above; the id, which
        # originates from the remote API, is bound as a parameter.
        select_sql = "select id from {table} where {column} = %s".format(
            table=table, column=id_column)
        self.cur.execute(select_sql, (data_dict["id"],))
        exist = self.cur.fetchone()
        if not exist:
            return False
        # Only zhihu_answer is written with an is_online column in this class,
        # so an article hit must not flip a row in the answer table.
        if self.is_online and mark == 0:
            self.cur.execute(
                "update zhihu_answer set is_online = 1 where id = %s",
                (exist[0],))
            self.conn.commit()
        return True

    def parse_sigle_page(self, data_dict, mark):
        """Insert one answer or article unless it is already stored.

        Args:
            data_dict: One item from the listing API response.
            mark: 0 for answers, 1 for articles. Other values are ignored.
        """
        if mark not in (0, 1):
            # No insert statement exists for other content types.
            return
        if self.check_data_exist(data_dict, mark):
            return
        if mark == 0:
            into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content, is_online) value(%s, %s, %s, %s, %s, %s, %s)"
            values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
                      data_dict["comment_count"], data_dict["content"], self.is_online)
        else:
            into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (
                data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
                data_dict["content"])
        self.cur.execute(into, values)
        self.conn.commit()

    def search_page(self, mark, page_max, start_page=0):
        """Main entry point: crawl consecutive listing pages, then close the connection.

        Args:
            mark: 0 answer, 1 article, 2 thought.
            page_max: Upper bound on pages; ``page_max - 1`` pages are
                requested at most.
            start_page: Zero-based page to start from (20 items per page).
        """
        offset = start_page * 20
        try:
            for i in range(page_max - 1):
                print(i, self.page_count)
                if i+start_page >= self.page_count:
                    break
                if mark in (0, 1):
                    self.search_answer_article_page(offset, mark)
                elif mark == 2:
                    # NOTE(review): search_thought_page is not defined on this
                    # class, so mark 2 currently raises AttributeError.
                    self.search_thought_page(offset)
                offset = offset + 20
        finally:
            # Release the connection even when a page request raises.
            self.conn.close()

    def update_page_count(self, answer_count):
        """Set ``self.page_count`` to the pages needed for ``answer_count`` items (20 per page)."""
        count = int(answer_count / 20)
        temp = int(answer_count % 20)
        if temp > 0:
            count += 1
        self.page_count = count

    def get_page_data(self, url, headers_search, cookies_dict, offset, max_attempts=10):
        """Fetch ``url`` and decode its JSON body, retrying a bounded number of times.

        Args:
            url: Listing API address.
            headers_search: Request headers, including the signature header.
            cookies_dict: Cookies to send.
            offset: Paging offset; used only in log output.
            max_attempts: How many fetch-and-decode rounds to try.

        Returns:
            The decoded JSON, or an empty dict once every attempt has failed.
        """
        for _ in range(max_attempts):
            try:
                get_page = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict)
                if get_page.status_code != 200:
                    # retry once
                    get_page = self.retry_get_url(url)
                    if get_page.status_code != 200:
                        print("article_error, url : ", url, " status_code: ", get_page.status_code)
                page_dict = get_page.json()
            except Exception:
                # Covers a None response (all retries failed) and a body that
                # is not JSON. A loop replaces the former unbounded recursion.
                print('retry get page data : {}'.format(offset))
                continue
            print('get page json data success! offset: ', offset, ' url: ', url)
            return page_dict
        print('give up get page data : {}'.format(offset))
        return {}

    def search_answer_article_page(self, offset, mark, proxies_num=0):
        """Request one listing page of answers or articles and store its items.

        Args:
            offset: Paging offset of the first item on the page.
            mark: 0 for answers, 1 for articles.
            proxies_num: Unused; kept for call compatibility.

        Raises:
            ValueError: If ``mark`` is neither 0 nor 1.
        """
        offset = str(offset)
        if mark == 0:
            url = self.ANSWER_URL.format(offset)
        elif mark == 1:
            # NOTE(review): ARTICLE_URL is pinned to one member and ignores
            # self.url, unlike ANSWER_URL - confirm this is intended.
            url = ARTICLE_URL.format(offset)
        else:
            raise ValueError('unsupported mark: {!r}'.format(mark))
        headers_search, cookies_dict = self.headers_handle(url)
        # The former extra request before this call discarded its response
        # and crashed when it was None; get_page_data does the same fetch.
        page_dict = self.get_page_data(url, headers_search, cookies_dict, offset) or {}
        if not page_dict.get("data"):
            time.sleep(60*5)
            print("article_data_error")
            return
        if self.page_count == 1000:
            paging = page_dict.get("paging") or {}
            self.update_page_count(paging.get("totals", 0))
        for one_line in page_dict['data']:
            try:
                if one_line["content"] is not None:
                    print(one_line)
                    self.parse_sigle_page(one_line, mark)
                    print("finshed_article" + offset)
            except KeyError:
                # Skip items that lack a field needed for storage.
                continue

    def headers_handle(self, url):
        """Build the disguised request headers and cookies for an API call.

        Args:
            url: API address to be requested; its path and query are part of
                the signed string.

        Returns:
            A ``(headers, cookies)`` pair with the ``x-zse-86`` signature
            header filled in.
        """
        res_cookies_dict = self.get_serach_page_cookies()
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            # NOTE(review): the referer is pinned to one member and also
            # feeds the signature below - confirm it should not follow self.url.
            "referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
        }
        cookies_dict = {
            "d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
            "KLBRSID": None
        }
        # Cookies returned by the profile page override the defaults above.
        cookies_dict.update(res_cookies_dict)
        # Signed string: version tag, URL path + query, referer and the d_c0
        # cookie joined by "+"; its MD5 hex digest goes through the JS routine "b".
        f = "+".join(
            ["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict
if __name__ == '__main__':
    # Usage: <script> <mark> <max_page> <start_page> <profile_url> [is_online]
    #   mark: 0 answer, 1 article, 2 thought
    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
    spider_url = sys.argv[4]
    try:
        is_online = int(sys.argv[5])
    except (IndexError, ValueError):
        # The fifth argument is optional; treat a missing or non-numeric
        # value as "not online".
        is_online = False
    print(datetime.now())
    spider = Spider(spider_url, is_online)
    # All supported marks share one entry point; any other value is a no-op.
    if mark in (0, 1, 2):
        spider.search_page(mark, max_page, start_page)
    print(datetime.now())
# import rsa
# import os, sys
# base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("/Users/xuwei")
# sys.path.append("/Users/xuwei/crawler")
# sys.path.append("/Users/xuwei/crawler/crawler_sys")
import pymysql
import hashlib
import requests
import execjs
import os
import re
import sys
import time
from datetime import datetime
import kdl
import random
# MySQL connection settings passed to pymysql.connect() in Spider.__init__.
# NOTE(review): the host and credentials are hard-coded in source control;
# consider loading them from the environment or a config file instead.
HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
# Absolute fallback path of the signing script; Spider.__init__ reads it when
# the relative 'tasks/zhihu/zhihu.js' cannot be opened.
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
# URL templates below are pinned to the member "geng-mei-suo-chang".
# In the listing templates {0} is the paging offset (20 items per page).
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
# Comment endpoint templates: {0} is the parent object's id, {1} the paging offset.
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
class Spider(object):
def __init__(self, url, is_online=False):
    """Open the database connection and compile the request-signing script.

    Args:
        url: Profile URL of the member to crawl. The member slug is taken
            as ``url[29:]``, i.e. everything after the 29-character prefix
            ``https://www.zhihu.com/people/``.
        is_online: Truthy to store answers with ``is_online`` set.
    """
    self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                passwd=PASSWD,
                                db=DB, charset='utf8')
    self.cur = self.conn.cursor()
    os.environ["EXECJS_RUNTIME"] = 'Node'
    try:
        with open('tasks/zhihu/zhihu.js', 'r', encoding='utf-8') as f:
            js = f.read()
    except OSError:
        # The relative path only resolves when run from the project root;
        # fall back to the absolute deployment path.
        with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
            js = f.read()
    self.exec_js = execjs.compile(js,
                                  cwd='/Library/Java/JavaVirtualMachines/graalvm-ce-java11-20.2.0/Contents/Home/languages/js/npm/node_modules')
    self.is_online = bool(is_online)
    self.url = url
    self.ANSWER_URL = 'https://www.zhihu.com/api/v4/members/' + url[29:] + '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
    # 1000 doubles as the "total not known yet" sentinel checked by
    # search_answer_article_page.
    self.page_count = 1000
    self.ips = []
def get_proxy(self):
    """Return one proxy mapping, refilling the pool from the provider when empty.

    Returns:
        A dict with ``"http"`` and ``"https"`` proxy URLs.

    Raises:
        IndexError: If the provider returned no proxies.
    """
    if self.ips:
        return self.ips.pop()
    auth = kdl.Auth("981538205840388", 'p7wrheahef20800z0zh4bedu4005ofsb')
    client = kdl.Client(auth)
    ips = client.get_dps(10, sign_type='hmacsha1', format='json')
    print("dps proxy: ", ips, client.get_proxy_authorization())
    for item in ips:
        # Queue every returned address ten times. The previous code
        # formatted ips[0] here, so the pool only ever held the first proxy.
        for _ in range(10):
            self.ips.append({"http": "http://{}".format(item), "https": "https://{}".format(item)})
    random.shuffle(self.ips)
    return self.ips.pop()
def retry_get_url(self, url, retrys=3, proxies=None, timeout=10, **kwargs):
    """GET ``url``, retrying on any exception.

    Sleeps five seconds before every attempt and five more after a failure.

    Args:
        url: Address to request.
        retrys: Maximum number of attempts.
        proxies: Accepted for call compatibility; the value is not used.
        timeout: Per-request timeout in seconds.
        **kwargs: Forwarded to ``requests.get`` (headers, cookies, ...).

    Returns:
        The response of the first attempt that does not raise, or ``None``
        when every attempt failed.
    """
    attempts_left = retrys
    while attempts_left > 0:
        time.sleep(5)
        try:
            print(3131313131312133)
            # NOTE(review): a proxy is taken from the pool here but is never
            # handed to requests.get, so the request goes out directly.
            self.get_proxy()
            print("70704204902402")
            response = requests.get(url, timeout=timeout, **kwargs)
            print(response.status_code, 7897897897)
            return response
        except Exception as error:
            attempts_left -= 1
            time.sleep(5)
            print(error)
    failure_note = 'Failed to get page %s after %d retries, %s' % (url, retrys, datetime.now())
    print(failure_note)
    return None
def get_serach_page_cookies(self):
    """Request the member's answers page and return the cookies it sets.

    Returns:
        A dict of cookie name to value taken from the response.
    """
    profile_page = self.url + "/answers?page=1"
    print(profile_page, 1341231)
    # Browser-like headers; the cookie header replays a recorded session.
    browser_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
        "referer": profile_page,
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
    }
    page_response = self.retry_get_url(profile_page, headers=browser_headers)
    # retry_get_url returns None after exhausting its retries, in which case
    # the attribute access below raises AttributeError.
    fresh_cookies = page_response.cookies.get_dict()
    print(fresh_cookies, 131313)
    return fresh_cookies
def check_data_exist(self, data_dict, mark):
    """Tell whether the item is already stored, flagging answers online if asked.

    Args:
        data_dict: One API item; only ``data_dict["id"]`` is read.
        mark: 0 for answers, 1 for articles. Other values are never
            looked up and report ``False``.

    Returns:
        ``True`` when a row with this id already exists, else ``False``.
    """
    # Each table keys the Zhihu id under its own column, matching the
    # insert statements in parse_sigle_page.
    lookup = {
        0: ('zhihu_answer', 'answer_id'),
        1: ('zhihu_article', 'article_id'),
    }.get(mark)
    if lookup is None:
        return False
    table, id_column = lookup
    # Table and column come from the fixed mapping above; the id, which
    # originates from the remote API, is bound as a parameter.
    select_sql = "select id from {table} where {column} = %s".format(
        table=table, column=id_column)
    self.cur.execute(select_sql, (data_dict["id"],))
    exist = self.cur.fetchone()
    if not exist:
        return False
    # Only zhihu_answer is written with an is_online column by the insert
    # statements in parse_sigle_page, so an article hit must not flip a row
    # in the answer table.
    if self.is_online and mark == 0:
        self.cur.execute(
            "update zhihu_answer set is_online = 1 where id = %s",
            (exist[0],))
        self.conn.commit()
    return True
def parse_sigle_page(self, data_dict, mark):
    """Insert one answer or article unless it is already stored.

    Args:
        data_dict: One item from the listing API response.
        mark: 0 for answers, 1 for articles. Other values are ignored,
            because no insert statement exists for them.
    """
    if mark not in (0, 1):
        # Previously fell through to execute() with unbound locals.
        return
    if self.check_data_exist(data_dict, mark):
        return
    if mark == 0:
        into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content, is_online) value(%s, %s, %s, %s, %s, %s, %s)"
        values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
                  data_dict["comment_count"], data_dict["content"], self.is_online)
    else:
        into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
        values = (
            data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"],
            data_dict["comment_count"],
            data_dict["content"])
    self.cur.execute(into, values)
    self.conn.commit()
def search_page(self, mark, page_max, start_page=0):
    """Main entry point: crawl consecutive listing pages, then close the connection.

    Args:
        mark: 0 answer, 1 article, 2 thought.
        page_max: Upper bound on pages; ``page_max - 1`` pages are
            requested at most.
        start_page: Zero-based page to start from (20 items per page).
    """
    offset = start_page * 20
    try:
        for i in range(page_max - 1):
            print(i, self.page_count)
            if i + start_page >= self.page_count:
                break
            if mark in (0, 1):
                self.search_answer_article_page(offset, mark)
            elif mark == 2:
                # NOTE(review): search_thought_page is not defined in the
                # visible part of this class - confirm it exists.
                self.search_thought_page(offset)
            offset = offset + 20
    finally:
        # Release the connection even when a page request raises.
        self.conn.close()
def update_page_count(self, answer_count):
    """Store in ``self.page_count`` how many 20-item pages ``answer_count`` items need."""
    full_pages = int(answer_count / 20)
    # A non-empty remainder means one additional, partially filled page.
    has_partial_page = int(answer_count % 20) > 0
    self.page_count = full_pages + 1 if has_partial_page else full_pages
def get_page_data(self, url, headers_search, cookies_dict, offset, max_attempts=10):
    """Fetch ``url`` and decode its JSON body, retrying a bounded number of times.

    Args:
        url: Listing API address.
        headers_search: Request headers, including the signature header.
        cookies_dict: Cookies to send.
        offset: Paging offset; used only in log output.
        max_attempts: How many fetch-and-decode rounds to try.

    Returns:
        The decoded JSON, or an empty dict once every attempt has failed.
    """
    for _ in range(max_attempts):
        try:
            get_page = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict)
            if get_page.status_code != 200:
                # retry once
                get_page = self.retry_get_url(url)
                if get_page.status_code != 200:
                    print("article_error, url : ", url, " status_code: ", get_page.status_code)
            page_dict = get_page.json()
        except Exception:
            # Covers a None response (all retries failed) and a body that is
            # not JSON. A loop replaces the former unbounded recursion, and
            # KeyboardInterrupt is no longer swallowed.
            print('retry get page data : {}'.format(offset))
            continue
        print('get page json data success! offset: ', offset, ' url: ', url)
        return page_dict
    print('give up get page data : {}'.format(offset))
    return {}
def search_answer_article_page(self, offset, mark, proxies_num=0):
    """Request one result page and store every item that carries content.

    Args:
        offset: Paging offset; used only in log output here, because the
            request URL below is fixed.
        mark: Content type forwarded to ``parse_sigle_page``.
        proxies_num: Unused; kept for call compatibility.
    """
    offset = str(offset)
    # NOTE(review): the per-mark URL selection is disabled in this copy and
    # a fixed search query with offset=0 is requested instead - confirm
    # this is intended.
    url = "https://www.zhihu.com/api/v4/search_v3?q=%E6%B0%B4%E5%85%89%E9%92%88&t=general&lc_idx=0&correction=1&offset=0&advert_count=0&limit=20&is_real_time=0&show_all_topics=0&search_source=History&raw_query="
    headers_search, cookies_dict = self.headers_handle(url)
    # The former extra request before this call discarded its response and
    # crashed when it was None; get_page_data does the same fetch.
    page_dict = self.get_page_data(url, headers_search, cookies_dict, offset) or {}
    if not page_dict.get("data"):
        time.sleep(60 * 5)
        print("article_data_error")
        return
    if self.page_count == 1000:
        # 1000 is the "total not known yet" sentinel; tolerate a response
        # without paging information.
        paging = page_dict.get("paging") or {}
        self.update_page_count(paging.get("totals", 0))
    for one_line in page_dict['data']:
        print(one_line,1313131313131)
        try:
            if one_line["content"] is not None:
                print(one_line)
                self.parse_sigle_page(one_line, mark)
                print("finshed_article" + offset)
        except KeyError:
            # Skip items that lack a field needed for storage.
            continue
def headers_handle(self, url):
    '''
    Build the disguised request headers and the cookies for one API call.

    The "x-zse-86" header is "1.0_" followed by the result of calling the
    compiled script's function "b" on the MD5 hex digest of
    "3_2.0+<url without the site prefix>+<referer>+<d_c0 cookie>".

    returns:
        (headers, cookies)
    '''
    referer = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1"
    # Built-in cookies first; whatever the profile page returns overrides them.
    cookies_dict = {
        "d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
        "KLBRSID": None
    }
    cookies_dict.update(self.get_serach_page_cookies())
    path_and_query = url.replace("https://www.zhihu.com", "")
    sign_source = "+".join(["3_2.0", path_and_query, referer, cookies_dict["d_c0"]])
    digest = hashlib.new('md5', sign_source.encode()).hexdigest()
    signature = "1.0_" + self.exec_js.call("b", digest)
    headers_search = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate",
        "accept-language": "zh-CN,zh;q=0.9",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
        "x-api-version": "3.0.91",
        "x-app-za": "OS=Web",
        "x-requested-with": "fetch",
        "x-zse-83": "3_2.0",
        "x-zse-86": signature,
        "referer": referer,
    }
    return headers_search, cookies_dict
if __name__ == '__main__':
    # Usage: python <script> mark max_page start_page spider_url [is_online]
    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
    spider_url = sys.argv[4]
    try:
        is_online = int(sys.argv[5])
    except (IndexError, ValueError):
        # The fifth argument is optional; absent or non-numeric means offline.
        is_online = False
    print(datetime.now())
    spider = Spider(spider_url, is_online)
    # All three supported marks are handled by the same entry point.
    if mark in (0, 1, 2):
        spider.search_page(mark, max_page, start_page)
    print(datetime.now())
import pymysql
import hashlib
import requests
import execjs
import os
import re
import sys
import time
from datetime import datetime
import kdl
import random
# MySQL connection settings, passed to pymysql.connect in Spider.__init__.
# NOTE(review): the password is committed in plain text; consider loading
# these from configuration or the environment.
HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
# Fallback location of the signing script, used when ./zhihu.js cannot be opened.
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
# URL templates for the member "geng-mei-suo-chang". {0} is the paging
# offset in the listing URLs.
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
# Comment URL templates: {0} is the parent object's id, {1} the paging offset.
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
class Spider(object):
    '''
    Crawls the answers or articles of one Zhihu member and stores them in
    the zhihu_answer / zhihu_article MySQL tables.

    Requests go through rotating proxies (get_proxy) and carry a signed
    "x-zse-86" header produced by the compiled zhihu.js script
    (headers_handle).
    '''

    def __init__(self, url, is_online=False):
        '''
        Open the database connection and compile the signing script.

        params:
            url        member profile URL, e.g. https://www.zhihu.com/people/<slug>
            is_online  any truthy value marks stored answers as online
        '''
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except (OSError, UnicodeDecodeError):
            # No usable copy in the working directory: use the deployed one.
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
        self.is_online = True if is_online else False
        self.url = url
        self.ANSWER_URL = 'https://www.zhihu.com/api/v4/members/' + self._member_token(url) + '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
        # 1000 means "total page count not known yet"; see update_page_count.
        self.page_count = 1000
        # Pool of proxies dicts handed out by get_proxy.
        self.ips = []

    @staticmethod
    def _member_token(url):
        '''Return the member slug that follows "/people/" in a profile URL.'''
        marker = '/people/'
        if marker in url:
            tail = url.split(marker, 1)[1]
            return tail.split('?', 1)[0].strip('/').split('/')[0]
        # Unknown URL shape: keep the historical fixed-length slice.
        return url[29:]

    def get_proxy(self):
        '''
        Return a requests-style proxies dict, refilling the pool when empty.

        Ten addresses are requested from the kdl client and each one is
        queued ten times, in shuffled order. The previous version queued
        only the first address for every slot.
        '''
        if self.ips:
            return self.ips.pop()
        # NOTE(review): the proxy account credentials are hard-coded here and
        # the authorization value is printed below; consider moving them to
        # configuration and dropping the print.
        auth = kdl.Auth("981538205840388", 'p7wrheahef20800z0zh4bedu4005ofsb')
        client = kdl.Client(auth)
        ips = client.get_dps(10, sign_type='hmacsha1', format='json')
        print("dps proxy: ", ips, client.get_proxy_authorization())
        for item in ips:
            for _ in range(10):
                self.ips.append({"http": "http://{}".format(item), "https": "https://{}".format(item)})
        random.shuffle(self.ips)
        return self.ips.pop()

    def retry_get_url(self, url, retrys=3, proxies=None, timeout=10, **kwargs):
        '''
        GET url through a fresh proxy, trying up to retrys times.

        params:
            proxies  ignored: a proxy from get_proxy is always used
            kwargs   forwarded to requests.get (headers, cookies, ...)
        returns:
            the response, or None when every attempt raised
        '''
        retry_c = 0
        while retry_c < retrys:
            # Fixed pause before every attempt.
            time.sleep(5)
            try:
                proxies = self.get_proxy()
                return requests.get(url, timeout=timeout, proxies=proxies, **kwargs)
            except Exception as e:
                retry_c += 1
                time.sleep(5)
                print(e)
        print('Failed to get page %s after %d retries, %s'
              % (url, retrys, datetime.now()))
        return None

    def get_serach_page_cookies(self):
        '''
        Load the member's answers page and return the cookies it sets.

        returns:
            dict of cookie name -> value; empty when the page could not be
            fetched (the previous version crashed on a None response)
        '''
        url = self.url + "/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
            "referer": url,
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        }
        requests_res = self.retry_get_url(url, headers=headers)
        if requests_res is None:
            return {}
        return requests_res.cookies.get_dict()

    def check_data_exist(self, data_dict, mark):
        '''
        Tell whether the item is already stored, before inserting it.

        When the row exists and the spider runs in online mode, the row is
        also flagged is_online = 1. The lookup and the update now use the
        table and id column that belong to mark; previously articles were
        looked up by "answer_id" and the flag was always written to
        zhihu_answer.

        params:
            data_dict  item from the API; only data_dict["id"] is read
            mark       0 answer, 1 article
        returns:
            True when a row with that id exists, else False
        '''
        tables = {
            0: ('zhihu_answer', 'answer_id'),
            1: ('zhihu_article', 'article_id'),
        }
        if mark not in tables:
            return False
        table, key_column = tables[mark]
        # Identifiers come from the fixed map above; values are bound.
        select_sql = "select id from {table} where {key} = %s".format(table=table, key=key_column)
        self.cur.execute(select_sql, (data_dict["id"],))
        exist = self.cur.fetchone()
        if not exist:
            return False
        if self.is_online:
            update_sql = "update {table} set is_online = 1 where id = %s".format(table=table)
            self.cur.execute(update_sql, (exist[0],))
            self.conn.commit()
        return True

    def parse_sigle_page(self, data_dict, mark):
        '''
        Insert one answer or article, unless it is already stored.

        params:
            data_dict  item from the API
            mark       0 answer, 1 article
        raises:
            ValueError for any other mark
            KeyError   when the item lacks an expected field
        '''
        if mark not in (0, 1):
            raise ValueError("unsupported mark: {!r}".format(mark))
        if self.check_data_exist(data_dict, mark):
            return
        if mark == 0:
            into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content, is_online) value(%s, %s, %s, %s, %s, %s, %s)"
            values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
                      data_dict["comment_count"], data_dict["content"], self.is_online)
        else:
            into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (
                data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
                data_dict["content"])
        self.cur.execute(into, values)
        self.conn.commit()
        return

    def search_page(self, mark, page_max, start_page=0):
        '''
        Main entry point: crawl consecutive pages, then close the database
        connection (now also when a page raises).

        params:
            mark        0 answer, 1 article, 2 thought
            page_max    loop bound; page_max - 1 pages are requested at most
            start_page  first page, converted to an offset of 20 per page
        '''
        offset = start_page * 20
        try:
            for i in range(page_max - 1):
                if i >= self.page_count:
                    break
                if mark in (0, 1):
                    self.search_answer_article_page(offset, mark)
                elif mark == 2:
                    # NOTE(review): search_thought_page is not defined in
                    # this class, so mark 2 raises AttributeError.
                    self.search_thought_page(offset)
                offset = offset + 20
        finally:
            self.conn.close()
        return

    def update_page_count(self, answer_count):
        '''Store in self.page_count how many 20-item pages hold answer_count items.'''
        count = int(answer_count / 20)
        if int(answer_count % 20) > 0:
            count += 1
        self.page_count = count

    def get_page_data(self, url, headers_search, cookies_dict, offset, max_retries=None):
        '''
        Download url and return its decoded JSON body.

        Retrying is done with a loop instead of self-recursion, so a long
        outage can no longer end in RecursionError, and only Exception is
        caught so Ctrl-C still stops the crawler.

        params:
            offset       paging offset, used only in log messages
            max_retries  None (default) retries forever, as before; a number
                         caps the attempts, after which {} is returned
        '''
        attempts = 0
        while max_retries is None or attempts < max_retries:
            attempts += 1
            get_page = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict)
            try:
                if get_page.status_code != 200:
                    # retry once, this time without the signed headers/cookies
                    get_page = self.retry_get_url(url)
                    if get_page.status_code != 200:
                        print("article_error, url : ", url, " status_code: ", get_page.status_code)
                page_dict = get_page.json()
            except Exception:
                # Covers a None response and an undecodable body.
                print('retry get page data : {}'.format(offset))
                continue
            print('get page json data success! offset: ', offset, ' url: ', url)
            return page_dict
        return {}

    def search_answer_article_page(self, offset, mark, proxies_num=0):
        '''
        Fetch one page of answers or articles and store every item that
        carries content.

        The page is downloaded once, through get_page_data; the earlier
        extra request, whose response was discarded and which crashed on a
        None response, is gone.

        params:
            offset       index of the first item of the page
            mark         0 answer, 1 article
            proxies_num  unused, kept so existing callers keep working
        raises:
            ValueError for any other mark
        '''
        offset = str(offset)
        if mark == 0:
            url = self.ANSWER_URL.format(offset)
        elif mark == 1:
            # NOTE(review): ARTICLE_URL is the module constant for the member
            # "geng-mei-suo-chang"; unlike ANSWER_URL it does not follow the
            # url given to the constructor. Confirm whether that is intended.
            url = ARTICLE_URL.format(offset)
        else:
            raise ValueError("unsupported mark: {!r}".format(mark))
        headers_search, cookies_dict = self.headers_handle(url)
        page_dict = self.get_page_data(url, headers_search, cookies_dict, offset)
        if not page_dict.get("data"):
            # Empty page: back off for five minutes before the caller moves on.
            time.sleep(60 * 5)
            print("article_data_error")
            return
        if self.page_count == 1000:
            paging = page_dict.get("paging") or {}
            self.update_page_count(paging.get("totals", 0))
        for one_line in page_dict['data']:
            try:
                if one_line["content"] is not None:
                    self.parse_sigle_page(one_line, mark)
                    print("finshed_article" + offset)
            except KeyError:
                # Items without the expected keys are skipped.
                continue
        return

    def headers_handle(self, url):
        '''
        Build the disguised request headers and the cookies for one API call.

        The "x-zse-86" header is "1.0_" followed by the result of calling
        the compiled script's function "b" on the MD5 hex digest of
        "3_2.0+<url without the site prefix>+<referer>+<d_c0 cookie>".

        returns:
            (headers, cookies)
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
        }
        # Built-in cookies first; whatever the profile page returned overrides them.
        cookies_dict = {
            "d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
            "KLBRSID": None
        }
        cookies_dict.update(res_cookies_dict)
        f = "+".join(
            ["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict
if __name__ == '__main__':
    # Usage: python <script> mark max_page start_page spider_url [is_online]
    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
    spider_url = sys.argv[4]
    # is_online is optional and numeric. It used to be read unconditionally
    # (IndexError for the four-argument invocation) and kept as a string, so
    # even "0" was truthy once it reached the Spider constructor.
    try:
        is_online = int(sys.argv[5])
    except (IndexError, ValueError):
        is_online = False
    print(datetime.now())
    spider = Spider(spider_url, is_online)
    # All three supported marks are handled by the same entry point.
    if mark in (0, 1, 2):
        spider.search_page(mark, max_page, start_page)
    print(datetime.now())
import os
import sys
import re
import time
import pymysql
import requests
import hashlib
import cv2
import execjs
from datetime import datetime
from image_qiniu import upload_file, IMG_TYPE
from bs4 import BeautifulSoup
# pip3 install "requests[security]" -i https://pypi.tuna.tsinghua.edu.cn/simple
DATA_OS_PATH = '/data'
PROJECT_PATH = '/srv/apps/crawler'
class UploadImage(object):
    '''
    Finds the images referenced by stored Zhihu content, downloads them,
    crops the non-GIF ones, uploads the result with upload_file and records
    the new URL in the matching *_picture_url table.
    '''

    def __init__(self, is_online=0):
        '''
        Open the database connection, set the working directories and
        compile the signing script.

        params:
            is_online  value matched against the is_online column when
                       content rows are selected
        '''
        # NOTE(review): credentials are hard-coded; consider configuration.
        HOST = '172.18.51.14'
        PORT = 3306
        USER = 'spider'
        PASSWD = 'Gengmei123'
        DB = 'spider'
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        # Download directories and their "cut" (cropped) counterparts.
        self.ANSWER_PICTURE_PATH = DATA_OS_PATH + '/answer_picture/'
        self.ARTICLE_PICTURE_PATH = DATA_OS_PATH + '/article_picture/'
        self.THOUGHT_PICTURE_PATH = DATA_OS_PATH + '/thought_picture/'
        self.ANSWER_PICTURE_CUT_PATH = DATA_OS_PATH + '/answer_picture_cut/'
        self.ARTICLE_PICTURE_CUT_PATH = DATA_OS_PATH + '/article_picture_cut/'
        self.THOUGHT_PICTURE_CUT_PATH = DATA_OS_PATH + '/thought_picture_cut/'
        self.JS_FILE_PATH = PROJECT_PATH + '/crawler_sys/site_crawler/zhihu.js'
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except (OSError, UnicodeDecodeError):
            # No usable copy in the working directory: use the deployed one.
            with open(self.JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
        self.is_online = is_online

    def get_serach_page_cookies(self):
        '''
        Load a fixed member's answers page and return the cookies it sets.

        returns:
            dict of cookie name -> value; empty when the page could not be
            fetched (the previous version crashed on a None response)
        '''
        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
            "referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        }
        requests_res = self.retry_get_url(url, headers=headers)
        if requests_res is None:
            return {}
        return requests_res.cookies.get_dict()

    def headers_handle(self, url):
        '''
        Build the disguised request headers and the cookies for one request.

        The "x-zse-86" header is "1.0_" followed by the result of calling
        the compiled script's function "b" on the MD5 hex digest of
        "3_2.0+<url without the site prefix>+<referer>+<d_c0 cookie>".

        returns:
            (headers, cookies)
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
        }
        # Built-in cookies first; whatever the profile page returned overrides them.
        cookies_dict = {
            "d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
            "KLBRSID": None
        }
        cookies_dict.update(res_cookies_dict)
        f = "+".join(
            ["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict

    @staticmethod
    def retry_get_url(url, retrys=5, proxies=None, timeout=10, **kwargs):
        '''
        GET url with certificate verification disabled, trying up to retrys
        times with a two-second pause after each failure.

        params:
            proxies  accepted for call compatibility but not used
            kwargs   forwarded to requests.get (headers, cookies, ...)
        returns:
            the response, or None when every attempt raised
        '''
        # verify=False would otherwise emit a warning on every request.
        requests.packages.urllib3.disable_warnings()
        retry_c = 0
        while retry_c < retrys:
            try:
                return requests.get(url, verify=False, timeout=timeout, **kwargs)
            except Exception as e:
                retry_c += 1
                time.sleep(2)
                print(e)
        print('Failed to get page %s after %d retries, %s'
              % (url, retrys, datetime.now()))
        return None

    @staticmethod
    def upload_image_with_path(path, img_type=IMG_TYPE.TOPICIMAGE):
        '''
        Upload the file at path and return its new URL.

        returns:
            the URL given by upload_file, or None when the upload raised
        '''
        try:
            url = upload_file(file_path=path, img_type=img_type)
            print('upload ..... ', url)
            return url
        except Exception:
            print('upload ..... error')
            return None

    def picture_download_and_cut(self, path, new_path, table, key_id, content_id, content):
        '''
        Register, download, crop and upload the pictures of one content row.

        params:
            path        directory for downloaded files
            new_path    directory for cropped files
            table       picture table, e.g. zhihu_answer_picture_url
            key_id      column of table that holds the content id
            content_id  id of the content row being processed
            content     HTML of that row

        SQL values are bound as parameters now. A failed upload leaves
        new_url NULL, so the row is picked up again, instead of storing the
        text "None" as a URL.
        '''
        fallback_url = 'https://static.igengmei.com/ship/static/mobile/img/footer/logo-af69f752.png'

        def _update_new_url(new_url, url):
            # Record the uploaded location for every row holding this source URL.
            if new_url is None:
                print('upload failed, new_url left empty : ', url)
                return
            self.cur.execute("UPDATE {} SET new_url = %s WHERE url = %s".format(table), (new_url, url))
            self.conn.commit()

        def _reuse_uploaded_url(url):
            # If this source URL was uploaded before, copy its new_url to the
            # rows that still lack one and report True.
            self.cur.execute(
                "SELECT new_url from {} where new_url is not null and url = %s".format(table), (url,))
            row = self.cur.fetchone()
            if not row:
                return False
            self.cur.execute(
                "UPDATE {} SET new_url = %s WHERE url = %s and new_url is null".format(table), (row[0], url))
            self.conn.commit()
            return True

        def _crop_and_upload(file_path, old_url, index):
            img = cv2.imread(file_path)
            # imread gives None for a file it cannot decode.
            if img is None or img.size == 0:
                print('image open error : ', file_path)
                return
            high, width = img.shape[:2]
            if high < 10 or width < 10:
                cropped = img
            else:
                # Keep the top nine tenths of the rows, full width.
                cropped = img[0:int(high / 10 * 9), 0:width]
            pathes = new_path + "num" + str(index) + ".jpg"
            cv2.imwrite(pathes, cropped)
            _update_new_url(self.upload_image_with_path(pathes), old_url)

        def _download_picture():
            self.cur.execute(
                "select {0}, url from {1} where {0} = %s and new_url is null".format(key_id, table),
                (content_id,))
            res = self.cur.fetchall()
            self.conn.commit()
            for i, row in enumerate(res):
                url = row[1]
                if _reuse_uploaded_url(url):
                    continue
                headers_search, cookies_dict = self.headers_handle(url)
                r = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
                if not r:
                    # Download failed or returned an error status: use the placeholder.
                    _update_new_url(fallback_url, url)
                    continue
                is_gif = re.search(r'gif', url)
                pathes = path + 'num' + str(i) + ('.gif' if is_gif else '.jpg')
                with open(pathes, 'wb') as f:
                    f.write(r.content)
                if is_gif:
                    # GIFs are uploaded as downloaded, without cropping.
                    _update_new_url(self.upload_image_with_path(pathes), url)
                else:
                    _crop_and_upload(pathes, url, i)

        urls = self.find_all_url(content, content_id)
        self.insert_picture_urls(table, urls, content_id, key_id)
        # Two passes: the second one retries rows whose new_url is still NULL.
        _download_picture()
        _download_picture()

    def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
        '''
        Run picture_download_and_cut for every content row selected from
        table by gets_content_dict(table, key_id, offset, count).
        '''
        content_dict = self.gets_content_dict(table, key_id, offset, count)
        for content_id, content in content_dict.items():
            self.picture_download_and_cut(path, new_path, pic_table, key_id, content_id, content)

    def insert_picture_urls(self, table, urls, content_id, key_id):
        '''
        Insert into table the urls not yet recorded for content_id.

        The id column written is key_id, the same column used for the
        existence check; it used to be hard-coded to answer_id. Values are
        bound as parameters because the URLs come from scraped HTML.
        '''
        select_sql = "select id from {} where {} = %s and url = %s".format(table, key_id)
        new_rows = []
        for url in urls:
            self.cur.execute(select_sql, (content_id, url))
            already_stored = self.cur.fetchall()
            self.conn.commit()
            if already_stored:
                continue
            new_rows.append((content_id, url))
        if new_rows:
            into = "insert into {} ({}, url) values (%s, %s)".format(table, key_id)
            print(into, new_rows)
            self.cur.executemany(into, new_rows)
            self.conn.commit()

    def find_all_url(self, content, content_id):
        '''
        Return the distinct, non-empty src values of the <img> tags in
        content, after <figure> elements have been flattened.
        '''
        new_content = self.replace_html_image_to_url(content)
        rich_obj = BeautifulSoup(new_content, features="html.parser")
        urls = set()
        for item in rich_obj.find_all("img"):
            src = item.get('src')
            print('url:', src, ' content_id:', content_id)
            if src:
                urls.add(src)
        return list(urls)

    @staticmethod
    def replace_html_image_to_url(content):
        '''
        Replace each <figure> that contains an <img> with a bare <img>
        carrying only the src attribute, and return the resulting HTML.
        A <figure> without an <img> is left untouched (it used to raise).
        '''
        rich_obj = BeautifulSoup(content, features="html.parser")
        for item in rich_obj.find_all("figure"):
            image_obj = item.find("img")
            if image_obj is None:
                continue
            new_rich_obj = rich_obj.new_tag(name="img")
            new_rich_obj["src"] = image_obj.get("src", "")
            item.replace_with(new_rich_obj)
        return rich_obj.decode()

    def gets_content_dict(self, table, key_id, offset=0, count=10):
        '''
        Return {key_id value: content} for count rows of table, starting at
        offset, whose is_online column equals self.is_online.

        The leftover "and id = 6925" filter, which limited every run to a
        single row, has been removed.
        '''
        sql = "select {}, content from {} where is_online = %s limit %s, %s".format(key_id, table)
        self.cur.execute(sql, (self.is_online, offset, count))
        res = self.cur.fetchall()
        self.conn.commit()
        return {item[0]: item[1] for item in res}
if __name__ == '__main__':
    # Usage: python file_name mark offset count is_online
    def _int_arg(position, default):
        '''Read sys.argv[position] as an int; a missing or zero value gives default.'''
        if len(sys.argv) <= position:
            # The "or default" fallbacks never applied before, because
            # indexing a missing argument raised IndexError first.
            return default
        return int(sys.argv[position]) or default

    mark = _int_arg(1, 0)
    offset = _int_arg(2, 0)
    count = _int_arg(3, 10)
    is_online = _int_arg(4, 0)
    print(datetime.now())
    a = UploadImage(is_online=is_online)
    # mark -> (download dir, cropped dir, content table, picture table, key column)
    targets = {
        0: (a.ANSWER_PICTURE_PATH, a.ANSWER_PICTURE_CUT_PATH,
            'zhihu_answer', 'zhihu_answer_picture_url', 'answer_id'),
        1: (a.ARTICLE_PICTURE_PATH, a.ARTICLE_PICTURE_CUT_PATH,
            'zhihu_article', 'zhihu_article_picture_url', 'article_id'),
        2: (a.THOUGHT_PICTURE_PATH, a.THOUGHT_PICTURE_CUT_PATH,
            'zhihu_thought', 'zhihu_thought_picture_url', 'thought_id'),
    }
    if mark in targets:
        a.picture_process(*(targets[mark] + (offset, count)))
    print(datetime.now())
// Create an empty jsdom document and expose its window object as
// global.window, so later code can reference `window` under Node.
const jsdom = require("jsdom");
const {JSDOM} = jsdom;
const { window } = new JSDOM('<!doctype html><html><body></body></html>');
global.window = window;
// Returns a type name for e. On its first call t rebinds itself to one of
// two implementations: plain `typeof e` when `typeof Symbol.A` is "symbol",
// otherwise a version that reports "symbol" for values whose constructor is
// Symbol (other than Symbol.prototype) and `typeof e` for everything else.
// The ALU handler (a.prototype.e, case 10) calls this function.
function t(e) {
return (t = "function" == typeof Symbol && "symbol" == typeof Symbol.A ? function(e) {
return typeof e
}
: function(e) {
return e && "function" == typeof Symbol && e.constructor === Symbol && e !== Symbol.prototype ? "symbol" : typeof e
}
)(e)
}
// Mark the CommonJS exports object as a transpiled ES module.
Object.defineProperty(exports, "__esModule", {
value: !0
});
// A: version string. __g: empty object, presumably filled in later by code
// outside this excerpt — TODO confirm.
var A = "2.0"
, __g = {};
// Instruction type with no operands (key 0 in G's J table).
function s() {}
// Instruction type (key 1 in G's J table). Splits the operand word e into:
// t = bit 11, s = bits 9-10, i and h = bits 0-8 (same value in both).
function i(e) {
this.t = (2048 & e) >> 11,
this.s = (1536 & e) >> 9,
this.i = 511 & e,
this.h = 511 & e
}
// Instruction type (key 2 in G's J table). Operand fields:
// s = bits 10-11, h = bits 0-9.
function h(e) {
this.s = (3072 & e) >> 10,
this.h = 1023 & e
}
// Instruction type (key 3 in G's J table). Operand fields:
// a = bits 10-11, c = bits 8-9, n = bits 6-7, t = bits 0-5.
function a(e) {
this.a = (3072 & e) >> 10,
this.c = (768 & e) >> 8,
this.n = (192 & e) >> 6,
this.t = 63 & e
}
// Instruction type (key 4 in G's J table). Operand fields:
// s = bits 10-11, i = bits 0-9.
function c(e) {
this.s = e >> 10 & 3,
this.i = 1023 & e
}
// Instruction type with no operands (key 5 in G's J table).
function n() {}
// Instruction type (key 6 in G's J table). Operand fields:
// a = bits 10-11, c = bits 8-9, n = bits 6-7, t = bits 0-5.
// Note: the parameter shadows the function's own name inside the body.
function e(e) {
this.a = (3072 & e) >> 10,
this.c = (768 & e) >> 8,
this.n = (192 & e) >> 6,
this.t = 63 & e
}
// Instruction type (key 7 in G's J table). Operand fields:
// h = bits 2-11, t = bits 0-1.
function o(e) {
this.h = (4095 & e) >> 2,
this.t = 3 & e
}
// Instruction type (key 8 in G's J table). Operand fields:
// s = bits 10-11, i = bits 2-9, t = bits 0-1.
function r(e) {
this.s = e >> 10 & 3,
this.i = e >> 2 & 255,
this.t = 3 & e
}
// Execute handler for s: sets the machine's `o` flag to false.
// (e is the machine state object, see G.)
s.prototype.e = function(e) {
e.o = !1
}
,
// Execute handler for i: loads register r[s].
// t = 0: with the immediate value i; t = 1: with slot k[h].
i.prototype.e = function(e) {
switch (this.t) {
case 0:
e.r[this.s] = this.i;
break;
case 1:
e.r[this.s] = e.k[this.h]
}
}
,
// Execute handler for h: stores register r[s] into slot k[h].
h.prototype.e = function(e) {
e.k[this.h] = e.r[this.s]
}
,
// Execute handler for a: arithmetic/logic on registers. The field t picks
// the operation; the result goes to r[a]. Binary operations read r[c] and
// r[n]; the unary ones (cases 10, 13, 20) read only r[c].
a.prototype.e = function(e) {
switch (this.t) {
case 0:
e.r[this.a] = e.r[this.c] + e.r[this.n];
break;
case 1:
e.r[this.a] = e.r[this.c] - e.r[this.n];
break;
case 2:
e.r[this.a] = e.r[this.c] * e.r[this.n];
break;
case 3:
e.r[this.a] = e.r[this.c] / e.r[this.n];
break;
case 4:
e.r[this.a] = e.r[this.c] % e.r[this.n];
break;
case 5:
e.r[this.a] = e.r[this.c] == e.r[this.n];
break;
case 6:
e.r[this.a] = e.r[this.c] >= e.r[this.n];
break;
case 7:
e.r[this.a] = e.r[this.c] || e.r[this.n];
break;
case 8:
e.r[this.a] = e.r[this.c] && e.r[this.n];
break;
case 9:
e.r[this.a] = e.r[this.c] !== e.r[this.n];
break;
case 10:
// `t` here is the global type-name helper, not a local.
e.r[this.a] = t(e.r[this.c]);
break;
case 11:
e.r[this.a] = e.r[this.c]in e.r[this.n];
break;
case 12:
e.r[this.a] = e.r[this.c] > e.r[this.n];
break;
case 13:
e.r[this.a] = -e.r[this.c];
break;
case 14:
e.r[this.a] = e.r[this.c] < e.r[this.n];
break;
case 15:
e.r[this.a] = e.r[this.c] & e.r[this.n];
break;
case 16:
e.r[this.a] = e.r[this.c] ^ e.r[this.n];
break;
case 17:
e.r[this.a] = e.r[this.c] << e.r[this.n];
break;
case 18:
e.r[this.a] = e.r[this.c] >>> e.r[this.n];
break;
case 19:
e.r[this.a] = e.r[this.c] | e.r[this.n];
break;
case 20:
e.r[this.a] = !e.r[this.c]
}
}
,
// Execute handler for c: saves C on Q and k on B, sets C to r[s], builds a
// new k from the top i values of f (kept in their original order), then
// saves f on g and starts an empty f. Reads like entering a subroutine;
// n.prototype.e restores the saved values.
c.prototype.e = function(e) {
e.Q.push(e.C),
e.B.push(e.k),
e.C = e.r[this.s],
e.k = [];
for (var t = 0; t < this.i; t++)
e.k.unshift(e.f.pop());
e.g.push(e.f),
e.f = []
}
,
// Execute handler for n: restores C, k and f from the Q, B and g stacks
// (the values saved by c.prototype.e).
n.prototype.e = function(e) {
e.C = e.Q.pop(),
e.k = e.B.pop(),
e.f = e.g.pop()
}
,
// Execute handler for e: sets the condition flag u. Cases 0-5 compare r[a]
// with r[c] (>=, <=, >, <, ==, !=); case 6 copies r[a]; case 7 negates it.
e.prototype.e = function(e) {
switch (this.t) {
case 0:
e.u = e.r[this.a] >= e.r[this.c];
break;
case 1:
e.u = e.r[this.a] <= e.r[this.c];
break;
case 2:
e.u = e.r[this.a] > e.r[this.c];
break;
case 3:
e.u = e.r[this.a] < e.r[this.c];
break;
case 4:
e.u = e.r[this.a] == e.r[this.c];
break;
case 5:
e.u = e.r[this.a] != e.r[this.c];
break;
case 6:
e.u = e.r[this.a];
break;
case 7:
e.u = !e.r[this.a]
}
}
,
// Execute handler for o: assigns h to C (a jump, if C is the program
// counter — TODO confirm). t = 0: always; t = 1: only when u is truthy;
// t = 2: only when u is falsy; t = 3: always, and also clears w.
// The flag u is reset to false afterwards in every case.
o.prototype.e = function(e) {
switch (this.t) {
case 0:
e.C = this.h;
break;
case 1:
e.u && (e.C = this.h);
break;
case 2:
e.u || (e.C = this.h);
break;
case 3:
e.C = this.h,
e.w = null
}
e.u = !1
}
,
// Execute handler for r: invokes a host function and stores the result in
// r[3]. i values are popped from f as arguments (original order kept), but
// only the first two are passed on.
// t = 0: calls r[s](arg0, arg1).
// t = 1: first pops a property name, then calls r[s][name](arg0, arg1).
// t = 2: constructs new r[s](arg0, arg1).
r.prototype.e = function(e) {
switch (this.t) {
case 0:
for (var t = [], n = 0; n < this.i; n++)
t.unshift(e.f.pop());
e.r[3] = e.r[this.s](t[0], t[1]);
break;
case 1:
for (var r = e.f.pop(), o = [], i = 0; i < this.i; i++)
o.unshift(e.f.pop());
e.r[3] = e.r[this.s][r](o[0], o[1]);
break;
case 2:
for (var a = [], s = 0; s < this.i; s++)
a.unshift(e.f.pop());
e.r[3] = new e.r[this.s](a[0],a[1])
}
}
;
var k = function(e) {
for (var t = 66, n = [], r = 0; r < e.length; r++) {
var o = 24 ^ e.charCodeAt(r) ^ t;
n.push(String.fromCharCode(o)),
t = o
}
return n.join("")
};
function Q(e) {
this.t = (4095 & e) >> 10,
this.s = (1023 & e) >> 8,
this.i = 1023 & e,
this.h = 63 & e
}
function C(e) {
this.t = (4095 & e) >> 10,
this.a = (1023 & e) >> 8,
this.c = (255 & e) >> 6
}
function B(e) {
this.s = (3072 & e) >> 10,
this.h = 1023 & e
}
function f(e) {
this.h = 4095 & e
}
function g(e) {
this.s = (3072 & e) >> 10
}
function u(e) {
this.h = 4095 & e
}
function w(e) {
this.t = (3840 & e) >> 8,
this.s = (192 & e) >> 6,
this.i = 63 & e
}
function G() {
this.r = [0, 0, 0, 0],
this.C = 0,
this.Q = [],
this.k = [],
this.B = [],
this.f = [],
this.g = [],
this.u = !1,
this.G = [],
this.b = [],
this.o = !1,
this.w = null,
this.U = null,
this.F = [],
this.R = 0,
this.J = {
0: s,
1: i,
2: h,
3: a,
4: c,
5: n,
6: e,
7: o,
8: r,
9: Q,
10: C,
11: B,
12: f,
13: g,
14: u,
15: w
}
}
Q.prototype.e = function(e) {
switch (this.t) {
case 0:
e.f.push(e.r[this.s]);
break;
case 1:
e.f.push(this.i);
break;
case 2:
e.f.push(e.k[this.h]);
break;
case 3:
e.f.push(k(e.b[this.h]))
}
}
,
C.prototype.e = function(A) {
switch (this.t) {
case 0:
var t = A.f.pop();
A.r[this.a] = A.r[this.c][t];
break;
case 1:
var s = A.f.pop()
, i = A.f.pop();
A.r[this.c][s] = i;
break;
case 2:
var h = A.f.pop();
A.r[this.a] = eval(h)
}
}
,
B.prototype.e = function(e) {
e.r[this.s] = k(e.b[this.h])
}
,
f.prototype.e = function(e) {
e.w = this.h
}
,
g.prototype.e = function(e) {
throw e.r[this.s]
}
,
u.prototype.e = function(e) {
var t = this
, n = [0];
e.k.forEach(function(e) {
n.push(e)
});
var r = function(r) {
var o = new G;
return o.k = n,
o.k[0] = r,
o.v(e.G, t.h, e.b, e.F),
o.r[3]
};
r.toString = function() {
return "() { [native code] }"
}
,
e.r[3] = r
}
,
w.prototype.e = function(e) {
switch (this.t) {
case 0:
for (var t = {}, n = 0; n < this.i; n++) {
var r = e.f.pop();
t[e.f.pop()] = r
}
e.r[this.s] = t;
break;
case 1:
for (var o = [], i = 0; i < this.i; i++)
o.unshift(e.f.pop());
e.r[this.s] = o
}
}
,
G.prototype.D = function(e) {
for (var t = new Buffer(e,"base64").toString("binary"), n = t.charCodeAt(0) << 8 | t.charCodeAt(1), r = [], o = 2; o < n + 2; o += 2)
r.push(t.charCodeAt(o) << 8 | t.charCodeAt(o + 1));
this.G = r;
for (var i = [], a = n + 2; a < t.length; ) {
var s = t.charCodeAt(a) << 8 | t.charCodeAt(a + 1)
, c = t.slice(a + 2, a + 2 + s);
i.push(c),
a += s + 2
}
this.b = i
}
,
G.prototype.v = function(e, t, n) {
for (t = t || 0,
n = n || [],
this.C = t,
"string" == typeof e ? this.D(e) : (this.G = e,
this.b = n),
this.o = !0,
this.R = Date.now(); this.o; ) {
var r = this.G[this.C++];
if ("number" != typeof r)
break;
var o = Date.now();
if (500 < o - this.R)
return;
this.R = o;
try {
this.e(r)
} catch (e) {
this.U = e,
this.w && (this.C = this.w)
}
}
}
,
G.prototype.e = function(e) {
var t = (61440 & e) >> 12;
new this.J[t](e).e(this)
}
,
"undefined" != typeof window && (new G).v("AxjgB5MAnACoAJwBpAAAABAAIAKcAqgAMAq0AzRJZAZwUpwCqACQACACGAKcBKAAIAOcBagAIAQYAjAUGgKcBqFAuAc5hTSHZAZwqrAIGgA0QJEAJAAYAzAUGgOcCaFANRQ0R2QGcOKwChoANECRACQAsAuQABgDnAmgAJwMgAGcDYwFEAAzBmAGcSqwDhoANECRACQAGAKcD6AAGgKcEKFANEcYApwRoAAxB2AGcXKwEhoANECRACQAGAKcE6AAGgKcFKFANEdkBnGqsBUaADRAkQAkABgCnBagAGAGcdKwFxoANECRACQAGAKcGKAAYAZx+rAZGgA0QJEAJAAYA5waoABgBnIisBsaADRAkQAkABgCnBygABoCnB2hQDRHZAZyWrAeGgA0QJEAJAAYBJwfoAAwFGAGcoawIBoANECRACQAGAOQALAJkAAYBJwfgAlsBnK+sCEaADRAkQAkABgDkACwGpAAGAScH4AJbAZy9rAiGgA0QJEAJACwI5AAGAScH6AAkACcJKgAnCWgAJwmoACcJ4AFnA2MBRAAMw5gBnNasCgaADRAkQAkABgBEio0R5EAJAGwKSAFGACcKqAAEgM0RCQGGAYSATRFZAZzshgAtCs0QCQAGAYSAjRFZAZz1hgAtCw0QCQAEAAgB7AtIAgYAJwqoAASATRBJAkYCRIANEZkBnYqEAgaBxQBOYAoBxQEOYQ0giQKGAmQABgAnC6ABRgBGgo0UhD/MQ8zECALEAgaBxQBOYAoBxQEOYQ0gpEAJAoYARoKNFIQ/zEPkAAgChgLGgkUATmBkgAaAJwuhAUaCjdQFAg5kTSTJAsQCBoHFAE5gCgHFAQ5hDSCkQAkChgBGgo0UhD/MQ+QACAKGAsaCRQCOYGSABoAnC6EBRoKN1AUEDmRNJMkCxgFGgsUPzmPkgAaCJwvhAU0wCQFGAUaCxQGOZISPzZPkQAaCJwvhAU0wCQFGAUaCxQMOZISPzZPkQAaCJwvhAU0wCQFGAUaCxQSOZISPzZPkQAaCJwvhAU0wCQFGAkSAzRBJAlz/B4FUAAAAwUYIAAIBSITFQkTERwABi0GHxITAAAJLwMSGRsXHxMZAAk0Fw8HFh4NAwUABhU1EBceDwAENBcUEAAGNBkTGRcBAAFKAAkvHg4PKz4aEwIAAUsACDIVHB0QEQ4YAAsuAzs7AAoPKToKDgAHMx8SGQUvMQABSAALORoVGCQgERcCAxoACAU3ABEXAgMaAAsFGDcAERcCAxoUCgABSQAGOA8LGBsPAAYYLwsYGw8AAU4ABD8QHAUAAU8ABSkbCQ4BAAFMAAktCh8eDgMHCw8AAU0ADT4TGjQsGQMaFA0FHhkAFz4TGjQsGQMaFA0FHhk1NBkCHgUbGBEPAAFCABg9GgkjIAEmOgUHDQ8eFSU5DggJAwEcAwUAAUMAAUAAAUEADQEtFw0FBwtdWxQTGSAACBwrAxUPBR4ZAAkqGgUDAwMVEQ0ACC4DJD8eAx8RAAQ5GhUYAAFGAAAABjYRExELBAACWhgAAVoAQAg/PTw0NxcQPCQ5C3JZEBs9fkcnDRcUAXZia0Q4EhQgXHojMBY3MWVCNT0uDhMXcGQ7AUFPHigkQUwQFkhaAkEACjkTEQspNBMZPC0ABjkTEQsrLQ==");
var b = function(e) {
console.log(encodeURIComponent(e));
return __g._encrypt(encodeURIComponent(e));
};
exports.ENCRYPT_VERSION = A,
exports.default = b;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment