Chengyang Zhong / crawler / Commits / d1b63291

Commit d1b63291 authored Jan 01, 2021 by haowang
modify
parent 607c82f7

Showing 7 changed files with 935 additions and 111 deletions (+935 -111)
.gitignore                              +3    -0
tasks/zhihu/captcha/__init__.py         +2    -0
tasks/zhihu/captcha/ocrmodel.py         +229  -0
tasks/zhihu/captcha/utils.py            +175  -0
tasks/zhihu/captcha/zhihu_captcha.py    +74   -0
tasks/zhihu/old_spider.py               +358  -0
tasks/zhihu/spider.py                   +94   -111
.gitignore
0 → 100644

.idea
__pycache__
env
tasks/zhihu/captcha/__init__.py
0 → 100644

from zhihu_captcha import zhihu_captcha
\ No newline at end of file
tasks/zhihu/captcha/ocrmodel.py
0 → 100644

import tensorflow as tf
from zhihu_captcha import utils
from tensorflow.python.training import moving_averages

FLAGS = utils.FLAGS
num_classes = utils.num_classes


class LSTMOCR(object):
    def __init__(self, mode):
        self.mode = mode
        # image input
        self.inputs = tf.placeholder(tf.float32,
                                     [None, FLAGS.image_height, FLAGS.image_width, FLAGS.image_channel])
        # ctc_loss expects a sparse tensor
        self.labels = tf.sparse_placeholder(tf.int32)
        # 1-D array of size [batch_size]
        self.seq_len = tf.placeholder(tf.int32, [None])
        # l2
        self._extra_train_ops = []  # ops that update the moving mean and moving variance

    def build_graph(self):
        self._build_model()
        self._build_train_op()
        self.merged_summay = tf.summary.merge_all()

    def _build_model(self):
        """
        Build the model. The kernel sizes of the first two convolutions (7 and 5) matter a lot;
        other sizes perform noticeably worse.
        """
        filters = [32, 64, 128, 128, FLAGS.max_stepsize]
        strides = [1, 2]

        with tf.variable_scope('cnn'):
            with tf.variable_scope('unit-0'):
                x = self._conv2d(self.inputs, 'cnn-0', 7, 1, filters[0], strides[0])  # convolution
                x = self._batch_norm('bn0', x)        # batch normalization
                x = self._leaky_relu(x, 0.01)         # non-linear activation
                x = self._max_pool(x, 2, strides[0])  # pooling
            with tf.variable_scope('unit-1'):
                x = self._conv2d(x, 'cnn-1', 5, filters[0], filters[1], strides[0])
                x = self._batch_norm('bn1', x)
                x = self._leaky_relu(x, 0.01)
                x = self._max_pool(x, 2, strides[1])
            with tf.variable_scope('unit-2'):
                x = self._conv2d(x, 'cnn-2', 3, filters[1], filters[2], strides[0])
                x = self._batch_norm('bn2', x)
                x = self._leaky_relu(x, 0.01)
                x = self._max_pool(x, 2, strides[1])
            with tf.variable_scope('unit-3'):
                x = self._conv2d(x, 'cnn-3', 3, filters[2], filters[3], strides[0])
                x = self._batch_norm('bn3', x)
                x = self._leaky_relu(x, 0.01)
                x = self._max_pool(x, 2, strides[1])
            with tf.variable_scope('unit-4'):
                x = self._conv2d(x, 'cnn-4', 3, filters[3], filters[4], strides[0])
                x = self._batch_norm('bn4', x)
                x = self._leaky_relu(x, 0.01)
                x = self._max_pool(x, 2, strides[1])

        with tf.variable_scope('lstm'):
            shp = x.get_shape().as_list()
            x = tf.reshape(x, [-1, filters[4], shp[1] * shp[2]])
            # tf.nn.rnn_cell.RNNCell, tf.nn.rnn_cell.GRUCell
            cell = tf.contrib.rnn.LSTMCell(FLAGS.num_hidden, state_is_tuple=True)
            if self.mode == 'train':
                cell = tf.contrib.rnn.DropoutWrapper(cell=cell, output_keep_prob=0.8)
            cell1 = tf.contrib.rnn.LSTMCell(FLAGS.num_hidden, state_is_tuple=True)
            if self.mode == 'train':
                cell1 = tf.contrib.rnn.DropoutWrapper(cell=cell1, output_keep_prob=0.8)
            # stack the RNN two layers deep
            stack = tf.contrib.rnn.MultiRNNCell([cell, cell1], state_is_tuple=True)
            # outputs holds every step's result; state is only the last step's result and is not needed here
            outputs, _ = tf.nn.dynamic_rnn(stack, x, self.seq_len, dtype=tf.float32)
            # reshape to fit the model's step length
            outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden])
            W = tf.get_variable(name='W',
                                shape=[FLAGS.num_hidden, num_classes],
                                dtype=tf.float32,
                                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.get_variable(name='b',
                                shape=[num_classes],
                                dtype=tf.float32,
                                initializer=tf.constant_initializer())
            self.logits = tf.matmul(outputs, W) + b
            # reshape so the last dimension is num_classes
            shape = tf.shape(x)
            self.logits = tf.reshape(self.logits, [shape[0], -1, num_classes])
            # Time major
            self.logits = tf.transpose(self.logits, (1, 0, 2))

    def _build_train_op(self):
        self.global_step = tf.Variable(0, trainable=False)
        # CTC loss, using the forward-backward algorithm and maximum likelihood
        self.loss = tf.nn.ctc_loss(labels=self.labels,
                                   inputs=self.logits,
                                   sequence_length=self.seq_len)
        self.cost = tf.reduce_mean(self.loss)
        tf.summary.scalar('cost', self.cost)
        self.lrn_rate = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                                   self.global_step,
                                                   FLAGS.decay_steps,
                                                   FLAGS.decay_rate,
                                                   staircase=True)
        # self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.lrn_rate,
        #                                            momentum=FLAGS.momentum).minimize(self.cost,
        #                                                                              global_step=self.global_step)
        # self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.lrn_rate,
        #                                             momentum=FLAGS.momentum,
        #                                             use_nesterov=True).minimize(self.cost,
        #                                                                         global_step=self.global_step)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.initial_learning_rate,
                                                beta1=FLAGS.beta1,
                                                beta2=FLAGS.beta2).minimize(self.loss,
                                                                            global_step=self.global_step)
        train_ops = [self.optimizer] + self._extra_train_ops
        self.train_op = tf.group(*train_ops)

        # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
        # (it's slower but you'll get better results)
        # decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len, merge_repeated=False)
        self.decoded, self.log_prob = tf.nn.ctc_beam_search_decoder(self.logits,
                                                                    self.seq_len,
                                                                    merge_repeated=False)  # search for the best path
        self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0], default_value=-1)  # decode

    # convolution
    def _conv2d(self, x, name, filter_size, in_channels, out_channels, strides):
        with tf.variable_scope(name):
            kernel = tf.get_variable(name='DW',
                                     shape=[filter_size, filter_size, in_channels, out_channels],
                                     dtype=tf.float32,
                                     initializer=tf.contrib.layers.xavier_initializer())
            b = tf.get_variable(name='bais',
                                shape=[out_channels],
                                dtype=tf.float32,
                                initializer=tf.constant_initializer())
            con2d_op = tf.nn.conv2d(x, kernel, [1, strides, strides, 1], padding='SAME')
        return tf.nn.bias_add(con2d_op, b)  # add the bias, then return

    # batch normalization
    def _batch_norm(self, name, x):
        """Batch normalization."""
        with tf.variable_scope(name):
            params_shape = [x.get_shape()[-1]]  # the tensor's last dimension; the mean and variance below share it
            # after normalizing to zero mean and unit variance there is still an x = x * gamma + beta adjustment,
            # which keeps being tuned during training
            beta = tf.get_variable('beta', params_shape, tf.float32,
                                   initializer=tf.constant_initializer(0.0, tf.float32))
            gamma = tf.get_variable('gamma', params_shape, tf.float32,
                                    initializer=tf.constant_initializer(1.0, tf.float32))

            # During training the moving mean and moving variance are updated continuously.
            # At inference time the restored weights use the moving mean/variance tuned during training.
            if self.mode == 'train':
                # batch mean and variance, sized like the last dimension
                mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')
                # the names moving_mean / moving_variance must be identical at train and inference time,
                # otherwise the trained values cannot be restored
                moving_mean = tf.get_variable('moving_mean',
                                              params_shape, tf.float32,
                                              initializer=tf.constant_initializer(0.0, tf.float32),
                                              trainable=False)
                moving_variance = tf.get_variable('moving_variance',
                                                  params_shape, tf.float32,
                                                  initializer=tf.constant_initializer(1.0, tf.float32),
                                                  trainable=False)
                self._extra_train_ops.append(moving_averages.assign_moving_average(moving_mean, mean, 0.9))
                self._extra_train_ops.append(moving_averages.assign_moving_average(moving_variance, variance, 0.9))
            else:
                # the mean's name must match the one used during training: moving_mean
                mean = tf.get_variable('moving_mean',
                                       params_shape, tf.float32,
                                       initializer=tf.constant_initializer(0.0, tf.float32),
                                       trainable=False)
                # the variance's name must match the one used during training: moving_variance
                variance = tf.get_variable('moving_variance',
                                           params_shape, tf.float32,
                                           initializer=tf.constant_initializer(1.0, tf.float32),
                                           trainable=False)
                # visualization
                tf.summary.histogram(mean.op.name, mean)
                tf.summary.histogram(variance.op.name, variance)

            # apply the normalization; the last argument is the epsilon, a small constant is fine
            x_bn = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
            x_bn.set_shape(x.get_shape())
            return x_bn

    # A variant of ReLU.
    # ReLU is simple yet effective and easy to differentiate;
    # the constant gradient over the non-negative range helps against vanishing gradients.
    def _leaky_relu(self, x, leakiness=0.0):
        return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')

    def _max_pool(self, x, ksize, strides):
        return tf.nn.max_pool(x,
                              ksize=[1, ksize, ksize, 1],
                              strides=[1, strides, strides, 1],
                              padding='SAME',
                              name='max_pool')
\ No newline at end of file
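A minimal sketch of how this LSTMOCR graph is typically driven (TensorFlow 1.x style). It assumes the package resolves as zhihu_captcha, as the file's own imports do, and uses the DataIterator from utils.py below; the training directory and step count are illustrative only.

import tensorflow as tf
from zhihu_captcha import utils, ocrmodel

# build the training graph
model = ocrmodel.LSTMOCR('train')
model.build_graph()

# load captcha images named like 00000_abcd.png from an assumed directory
train_data = utils.DataIterator('./imgs/train/')
inputs, seq_len, labels = train_data.input_index_generate_batch()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {model.inputs: inputs, model.labels: labels, model.seq_len: seq_len}
    for step in range(100):  # illustrative step count
        _, cost = sess.run([model.train_op, model.cost], feed)
        if step % 10 == 0:
            print('step {} cost {:.4f}'.format(step, cost))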
tasks/zhihu/captcha/utils.py
0 → 100644

import os
import numpy as np
import tensorflow as tf
# import cv2
from PIL import Image

# +-* + () + 10 digit + blank + space
num_classes = 38  # 3 + 2 + 10 + 1 + 1
maxPrintLen = 100

tf.app.flags.DEFINE_boolean('restore', True, 'whether to restore from the latest checkpoint')
tf.app.flags.DEFINE_string('checkpoint_dir', './checkpoint/', 'the checkpoint dir')
tf.app.flags.DEFINE_float('initial_learning_rate', 1e-3, 'inital lr')
tf.app.flags.DEFINE_integer('image_height', 60, 'image height')
tf.app.flags.DEFINE_integer('image_width', 150, 'image width')
tf.app.flags.DEFINE_integer('image_channel', 1, 'image channels as input')
tf.app.flags.DEFINE_integer('max_stepsize', 64, 'max stepsize in lstm, as well as the output channels of last layer in CNN')
tf.app.flags.DEFINE_integer('num_hidden', 128, 'number of hidden units in lstm')
tf.app.flags.DEFINE_integer('num_epochs', 1000, 'maximum epochs')
tf.app.flags.DEFINE_integer('batch_size', 128, 'the batch_size')
tf.app.flags.DEFINE_integer('save_steps', 500, 'the step to save checkpoint')
tf.app.flags.DEFINE_integer('validation_steps', 500, 'the step to validation')
tf.app.flags.DEFINE_float('decay_rate', 0.98, 'the lr decay rate')
tf.app.flags.DEFINE_float('beta1', 0.9, 'parameter of adam optimizer beta1')
tf.app.flags.DEFINE_float('beta2', 0.999, 'adam parameter beta2')
tf.app.flags.DEFINE_integer('decay_steps', 1000, 'the lr decay_step for optimizer')
tf.app.flags.DEFINE_float('momentum', 0.9, 'the momentum')
tf.app.flags.DEFINE_string('train_dir', './imgs/train/', 'the train data dir')
tf.app.flags.DEFINE_string('val_dir', './imgs/val/', 'the val data dir')
tf.app.flags.DEFINE_string('infer_dir', './imgs/infer/', 'the infer data dir')
tf.app.flags.DEFINE_string('logs_dir', './log', 'the logging dir')
tf.app.flags.DEFINE_string('mode', 'train', 'train, val or infer')
tf.app.flags.DEFINE_integer('num_gpus', 1, 'num of gpus')

FLAGS = tf.app.flags.FLAGS

# num_batches_per_epoch = int(num_train_samples/FLAGS.batch_size)

import string
charset = string.digits + string.ascii_lowercase  # '0123456789+-*()'

encode_maps = {}
decode_maps = {}
for i, char in enumerate(charset, 1):
    encode_maps[char] = i
    decode_maps[i] = char

SPACE_INDEX = 0
SPACE_TOKEN = ''
encode_maps[SPACE_TOKEN] = SPACE_INDEX
decode_maps[SPACE_INDEX] = SPACE_TOKEN


class DataIterator:
    def __init__(self, data_dir):
        self.image = []  # all images are loaded into memory and kept ready to fetch; in memory for speed, of course
        self.labels = []
        for root, sub_folder, file_list in os.walk(data_dir):
            for file_path in file_list:
                image_name = os.path.join(root, file_path)
                # im = np.array(Image.open(image_name)).astype(np.float32)/255.
                im = np.array(Image.open(image_name).convert("L")).astype(np.float32) / 255.
                # im = np.array(Image.open(image_name).convert("L").point(lambda x: 0 if x < 150 else 1)).astype(np.float32)
                # im = cv2.imread(image_name, 0).astype(np.float32)/255.
                # resize to same height, different width will consume time on padding
                # im = cv2.resize(im, (image_width, image_height))
                im = np.reshape(im, [FLAGS.image_height, FLAGS.image_width, FLAGS.image_channel])
                self.image.append(im)

                # image is named as /.../<folder>/00000_abcd.png
                code = image_name.split(os.sep)[-1].split('_')[1].split('.')[0]  # code is the captcha text
                code = [SPACE_INDEX if code == SPACE_TOKEN else encode_maps[c] for c in list(code)]
                # code becomes a list of character indices such as [1, 2, 3, 4]
                self.labels.append(code)

    # make size a property so it can be read as self.size instead of self.size()
    # (the benefit of @property is not really visible here)
    @property
    def size(self):
        return len(self.labels)

    # given indices, extract the corresponding labels
    def the_label(self, indexs):
        labels = []
        for i in indexs:
            labels.append(self.labels[i])
        return labels

    # given indices, build one batch of training data
    def input_index_generate_batch(self, index=None):
        if index:
            image_batch = [self.image[i] for i in index]
            label_batch = [self.labels[i] for i in index]
        else:
            image_batch = self.image
            label_batch = self.labels

        def get_input_lens(sequences):
            # the sequence length of each slice; captcha images all have the same length,
            # unlike sentences of varying lengths, so every length here is identical
            lengths = np.asarray([FLAGS.max_stepsize for _ in sequences], dtype=np.int64)
            return sequences, lengths

        batch_inputs, batch_seq_len = get_input_lens(np.array(image_batch))
        batch_labels = sparse_tuple_from_label(label_batch)  # convert to a sparse tuple
        return batch_inputs, batch_seq_len, batch_labels


# compare the decoded labels with the ground-truth labels and compute the accuracy
def accuracy_calculation(original_seq, decoded_seq, ignore_value=-1, isPrint=False):
    if len(original_seq) != len(decoded_seq):
        print('original lengths is different from the decoded_seq, please check again')
        return 0
    count = 0
    for i, origin_label in enumerate(original_seq):
        decoded_label = [j for j in decoded_seq[i] if j != ignore_value]
        if isPrint and i < maxPrintLen:
            # print('seq{0:4d}: origin: {1} decoded:{2}'.format(i, origin_label, decoded_label))
            with open('./test.csv', 'w') as f:
                f.write(str(origin_label) + '\t' + str(decoded_label))
                f.write('\n')
        if origin_label == decoded_label:
            count += 1
    return count * 1.0 / len(original_seq)


def sparse_tuple_from_label(sequences, dtype=np.int32):
    """
    Build a sparse tensor tuple from label lists of the form [[1,2,3,4],[5,2,6,5],...].
    The sparse tuple has three elements: (indices, values, shape).
    indices and values are lists whose elements correspond one to one:
    indices is a list of coordinates such as [(0,1),(0,2),...,(10,3),(10,4),...]
    locating each value inside the dense matrix, values is the list of values
    such as [1,2,3,...,100,...], and shape describes the dense matrix's shape.

    Example:
        indices = [(0,1),(0,2),(0,3),(1,1),(1,3),(2,2)]
        values = [1,2,3,4,5,6]
        shape = [3,4]
    corresponds to the dense matrix
        0 1 2 3
        0 4 0 5
        0 0 6 0

    Args:
        sequences: a list whose elements are the character-index lists of each captcha
    Returns:
        (indices, values, shape)
    """
    indices = []
    values = []
    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)
    return indices, values, shape
\ No newline at end of file
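As a quick illustration of the sparse tuple described in sparse_tuple_from_label's docstring, here is a small worked example using the function above; the label values are arbitrary.

labels = [[1, 2], [11, 12, 13]]            # two captcha label sequences, values chosen arbitrarily
indices, values, shape = sparse_tuple_from_label(labels)
# indices -> [[0 0] [0 1] [1 0] [1 1] [1 2]]   (row = which sequence, column = position within it)
# values  -> [ 1     2     11    12    13 ]
# shape   -> [2 3]                             (2 sequences, the longest has 3 characters)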
tasks/zhihu/captcha/zhihu_captcha.py
0 → 100644

import requests
import time
import json
import os
import sys
from bs4 import BeautifulSoup as BS
import urllib.parse
import webbrowser
from io import BytesIO
from zhihu_captcha import utils
from zhihu_captcha import ocrmodel as orcmodel  # the module added in this commit is ocrmodel.py
import tensorflow as tf
from PIL import Image
import numpy as np

try:
    type(eval('model'))
except:
    model = orcmodel.LSTMOCR('infer')
    model.build_graph()

config = tf.ConfigProto(allow_soft_placement=True)
checkpoint_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "checkpoint")


class ZhihuCaptcha():
    def __init__(self, username=None, password=None):
        if sys.path[0]:
            os.chdir(sys.path[0])  # use the script's directory as the current working directory
        # restore the weights
        self.__sess = self.__restoreSess(checkpoint_dir)

    # restore the weights
    def __restoreSess(self, checkpoint=checkpoint_dir):
        sess = tf.Session(config=config)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
        ckpt = tf.train.latest_checkpoint(checkpoint)
        if ckpt:
            # restore the weights; global_step is loaded here as well
            saver.restore(sess, ckpt)
            # print('restore from the checkpoint{0}'.format(ckpt))
            print('loaded checkpoint {0}'.format(ckpt))
        else:
            print('Warning: no checkpoint was loaded')
            print('If this is not what you expected, make sure a usable checkpoint exists under:\n{0}'.format(checkpoint_dir))
        return sess

    def recgImg(self, img):
        """
        Recognize a captcha image (can be used to test recognition online).
        Args:
            img: an image of size (60, 150)
        """
        im = np.array(img.convert("L")).astype(np.float32) / 255.
        im = np.reshape(im, [60, 150, 1])
        inp = np.array([im])
        seq_len_input = np.array([np.array([64 for _ in inp], dtype=np.int64)])
        # seq_len_input = np.asarray(seq_len_input)
        seq_len_input = np.reshape(seq_len_input, [-1])
        imgs_input = np.asarray([im])
        feed = {model.inputs: imgs_input,
                model.seq_len: seq_len_input}
        dense_decoded_code = self.__sess.run(model.dense_decoded, feed)
        expression = ''
        for i in dense_decoded_code[0]:
            if i == -1:
                expression += ''
            else:
                expression += utils.decode_maps[i]
        return expression
\ No newline at end of file
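A minimal usage sketch for the class above, assuming a trained checkpoint already exists under tasks/zhihu/captcha/checkpoint and that the import path resolves the way spider.py's import does; the image file name is made up.

from PIL import Image
from captcha.zhihu_captcha import ZhihuCaptcha   # import path as used by spider.py in this commit

recognizer = ZhihuCaptcha()                      # restores the latest checkpoint on construction
img = Image.open('captcha_sample.png')           # a 150x60 captcha image, matching image_width/image_height
print(recognizer.recgImg(img))                   # prints the decoded captcha text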
tasks/zhihu/old_spider.py
0 → 100644

import pymysql
import hashlib
import requests
import execjs
import os
import re
import sys
import time
from datetime import datetime
import kdl

HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
# JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'


def login():
    url = 'https://www.zhihu.com'
    loginUrl = 'https://www.zhihu.com/login/email'
    headers = {
        # "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36',
        "Referer": "http://www.zhihu.com/",
        'Host': 'www.zhihu.com',
        'rememberme': "true"
    }
    data = {
        'email': 'yousangdandan@yeah.net',
        'password': '5358569'
    }
    global session
    session = requests.session()
    login_req = session.post(loginUrl, data=data, headers=headers)
    print('loginReq:{}'.format(login_req.status_code))
    return login_req


def get_proxy():
    auth = kdl.Auth("990866563045611", APIKEY)
    client = kdl.Client(auth)
    ips = client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
    print("dps proxy: ", ips, client.get_proxy_authorization())
    # return { "http": "http://{}".format(ips[0]), "https": "https://{}".format(ips[0]), }, client.get_proxy_authorization()
    return {
        "http": "http://{}".format('171.35.213.172:9999'),
        "https": "https://{}".format('171.35.213.172:9999'),
    }, client.get_proxy_authorization()


def retry_get_url(url, retrys=5, headers={}, timeout=10, **kwargs):
    retry_c = 0
    while retry_c < retrys:
        time.sleep(3)
        try:
            # proxies, proxy_authorization = get_proxy()
            # headers.update(proxy_authorization)
            # get_resp = requests.get(url, headers=headers, timeout=timeout, proxies=proxies, **kwargs)
            get_resp = requests.get(url, headers=headers, timeout=timeout, **kwargs)
            return get_resp
        except Exception as e:
            retry_c += 1
            print(e)
            # proxies, proxy_authorization = get_proxy()
    print('Failed to get page %s after %d retries, %s' % (url, retrys, datetime.now()))
    return None


class Spider(object):

    def __init__(self, spider_url):
        '''
        Initialize the database connection and set up the js rules.
        '''
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER, passwd=PASSWD, db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        self.page_count = 1000
        self.use_proxy = True
        self.spider_url = spider_url
        detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={}&limit=20&sort_by=created'
        self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members") + detail_url
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except:
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        # self.exec_js = execjs.compile(js)
        self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')

    def headers_handle(self, url):
        '''
        Disguise the request headers for the url.
        '''
        '''
{
'Server': 'CLOUD ELB 1.0.0',
'Date': 'Fri, 01 Jan 2021 08:36:59 GMT',
'Content-Type': 'text/html; charset=utf-8',
'Vary': 'Accept-Encoding',
'content-security-policy': "default-src * blob:; img-src * data: blob: resource: t.captcha.qq.com cstaticdun.126.net necaptcha.nosdn.127.net; connect-src * wss: blob: resource:; frame-src 'self' *.zhihu.com mailto: tel: weixin: *.vzuu.com mo.m.taobao.com getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 zhihujs: captcha.guard.qcloud.com pos.baidu.com dup.baidustatic.com openapi.baidu.com wappass.baidu.com passport.baidu.com *.cme.qcloud.com vs-cdn.tencent-cloud.com t.captcha.qq.com c.dun.163.com; script-src 'self' blob: *.zhihu.com g.alicdn.com qzonestyle.gtimg.cn res.wx.qq.com open.mobile.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com resource: captcha.gtimg.com captcha.guard.qcloud.com pagead2.googlesyndication.com cpro.baidustatic.com pos.baidu.com dup.baidustatic.com i.hao61.net 'nonce-8555a150-24a4-490b-9a1e-d48bdb590dfe' hm.baidu.com zz.bdstatic.com b.bdstatic.com imgcache.qq.com vs-cdn.tencent-cloud.com gw.alipayobjects.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net; style-src 'self' 'unsafe-inline' *.zhihu.com unicom.zhimg.com resource: captcha.gtimg.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net", 'x-content-security-policy': "default-src * blob:; img-src * data: blob: resource: t.captcha.qq.com cstaticdun.126.net necaptcha.nosdn.127.net; connect-src * wss: blob: resource:; frame-src 'self' *.zhihu.com mailto: tel: weixin: *.vzuu.com mo.m.taobao.com getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 zhihujs: captcha.guard.qcloud.com pos.baidu.com dup.baidustatic.com openapi.baidu.com wappass.baidu.com passport.baidu.com *.cme.qcloud.com vs-cdn.tencent-cloud.com t.captcha.qq.com c.dun.163.com; script-src 'self' blob: *.zhihu.com g.alicdn.com qzonestyle.gtimg.cn res.wx.qq.com open.mobile.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com resource: captcha.gtimg.com captcha.guard.qcloud.com pagead2.googlesyndication.com cpro.baidustatic.com pos.baidu.com dup.baidustatic.com i.hao61.net 'nonce-8555a150-24a4-490b-9a1e-d48bdb590dfe' hm.baidu.com zz.bdstatic.com b.bdstatic.com imgcache.qq.com vs-cdn.tencent-cloud.com gw.alipayobjects.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net; style-src 'self' 'unsafe-inline' *.zhihu.com unicom.zhimg.com resource: captcha.gtimg.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net", 'x-webkit-csp': "default-src * blob:; img-src * data: blob: resource: t.captcha.qq.com cstaticdun.126.net necaptcha.nosdn.127.net; connect-src * wss: blob: resource:; frame-src 'self' *.zhihu.com mailto: tel: weixin: *.vzuu.com mo.m.taobao.com getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 zhihujs: captcha.guard.qcloud.com pos.baidu.com dup.baidustatic.com openapi.baidu.com wappass.baidu.com passport.baidu.com *.cme.qcloud.com vs-cdn.tencent-cloud.com t.captcha.qq.com c.dun.163.com; script-src 'self' blob: *.zhihu.com g.alicdn.com qzonestyle.gtimg.cn res.wx.qq.com open.mobile.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com resource: captcha.gtimg.com captcha.guard.qcloud.com pagead2.googlesyndication.com cpro.baidustatic.com pos.baidu.com dup.baidustatic.com i.hao61.net 'nonce-8555a150-24a4-490b-9a1e-d48bdb590dfe' hm.baidu.com zz.bdstatic.com b.bdstatic.com imgcache.qq.com vs-cdn.tencent-cloud.com 
gw.alipayobjects.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net; style-src 'self' 'unsafe-inline' *.zhihu.com unicom.zhimg.com resource: captcha.gtimg.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net",
'x-frame-options': 'SAMEORIGIN',
'strict-transport-security': 'max-age=15552000; includeSubDomains',
'surrogate-control': 'no-store',
'cache-control': 'no-cache, no-store, must-revalidate, private, max-age=0',
'pragma': 'no-cache',
'expires': '0',
'x-content-type-options': 'nosniff',
'x-xss-protection': '1; mode=block',
'X-Backend-Response': '0.022',
'Referrer-Policy': 'no-referrer-when-downgrade',
'X-SecNG-Response': '0.023000001907349',
'x-lb-timing': '0.023',
'x-idc-id': '2',
'Set-Cookie': 'KLBRSID=fb3eda1aa35a9ed9f88f346a7a3ebe83|1609490219|1609490131; Path=/',
'X-Cache-Lookup': 'Cache Miss, Cache Miss',
'Content-Encoding': 'gzip',
'Transfer-Encoding': 'chunked',
'X-NWS-LOG-UUID': '16179100128830453442',
'Connection': 'keep-alive',
'x-edge-timing': '0.061',
'x-cdn-provider': 'tencent'}
'''
'''
html
cache-control: no-cache, no-store, must-revalidate, private, max-age=0
content-encoding: gzip
content-type: application/json
date: Fri, 01 Jan 2021 07:01:52 GMT
etag: W/"6a7ddf80b3ab19ba789d570163ac1eacb4bde53e"
expires: Fri, 02 Jan 2000 00:00:00 GMT
pragma: no-cache
referrer-policy: no-referrer-when-downgrade
server: CLOUD ELB 1.0.0
set-cookie: KLBRSID=5430ad6ccb1a51f38ac194049bce5dfe|1609484511|1609484497; Path=/
vary: Accept-Encoding
x-backend-response: 0.573
x-cache-lookup: Cache Miss
x-cdn-provider: tencent
x-edge-timing: 0.634
x-idc-id: 2
x-lb-timing: 0.599
x-nws-log-uuid: 12448536375904178345
x-secng-response: 0.59800004959106
x-udid: AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=
'''
        res_cookies_dict = self.get_serach_page_cookies()
        referer = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
            "x-ab-param":
"li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1"
            ,
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": referer + "/answers?page=1",
        }
        # cookies_dict = {
        #     "d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
        #     "KLBRSID": '5430ad6ccb1a51f38ac194049bce5dfe|1609484506|1609484497',
        # }
        # cookies_dict.update(res_cookies_dict)
        f = "+".join(["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict

    def get_serach_page_cookies(self):
        '''
        Refresh the cookies.
        '''
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie":
'_xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609325059,1609337218,1609401296,1609405637; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609484506; KLBRSID=5430ad6ccb1a51f38ac194049bce5dfe|1609484506|1609484497'
            ,
            "referer": self.spider_url,
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        }
        requests_res = retry_get_url(self.spider_url, headers=headers)
        return requests_res.cookies.get_dict()

    def update_page_count(self, answer_count):
        count = int(answer_count / 20)
        temp = int(answer_count % 20)
        if temp > 0:
            count += 1
        self.page_count = count

    def check_data_exist(self, data_dict, mark):
        '''
        Check whether the record already exists before inserting.
        '''
        sql = "select id from {table} where answer_id = {id_}"
        exist = None
        if mark == 0:
            select_sql = sql.format(table='zhihu_answer', id_=data_dict["id"])
            self.cur.execute(select_sql)
            exist = self.cur.fetchone()
        if mark == 1:
            select_sql = sql.format(table='zhihu_article', id_=data_dict["id"])
            self.cur.execute(select_sql)
            exist = self.cur.fetchone()
        if exist:
            return True
        return False

    def parse_sigle_page(self, data_dict, mark):
        '''
        Insert the main content data and the image urls.
        '''
        if not self.check_data_exist(data_dict, mark):
            if mark == 0:
                into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
                values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"], data_dict["content"])
            elif mark == 1:
                into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
                values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"], data_dict["content"])
            print(data_dict["question"]["title"])
            self.cur.execute(into, values)
            self.conn.commit()
        return

    def search_page(self, mark, page_max, start_page=0, need_commend=False):
        '''
        Main entry point.
        params:
            mark 0 answer, 1 article, 2 thought
        '''
        offset = start_page
        for i in range(page_max):
            if i > self.page_count - 1:
                break
            if mark == 0:
                self.search_answer_article_page(offset, 0, 0)
            elif mark == 1:
                self.search_answer_article_page(offset, 1, 0)
            elif mark == 2:
                self.search_thought_page(offset)
            offset = offset + 20
            time.sleep(10)
        self.conn.close()
        return

    def get_page_data(self, url, headers_search, cookies_dict):
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict)
        if get_page.status_code != 200:
            # retry once
            get_page = retry_get_url(url)
            if get_page.status_code != 200:
                print("article_error, url : ", url, " status_code: ", get_page.status_code)
        try:
            page_dict = get_page.json()
            print('get page json data success! {}', url)
            return page_dict
        except:
            print('retry get page data : {}', url)
            return self.get_page_data(url, headers_search, cookies_dict)

    def search_answer_article_page(self, offset, mark, proxies_num=0):
        '''
        Request the data packets for answers and articles.
        '''
        offset = str(offset)
        if mark == 0:
            url = self.ANSWER_URL.format(offset)
        elif mark == 1:
            url = ARTICLE_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        page_dict = self.get_page_data(url, headers_search, cookies_dict)
        if page_dict.get("data"):
            print(self.page_count)
            if self.page_count == 1000:
                self.update_page_count(page_dict["paging"].get("totals", 0))
            for one_line in page_dict['data']:
                try:
                    if one_line["content"] != None:
                        self.parse_sigle_page(one_line, mark)
                        print("finshed_crawler " + offset)
                except KeyError:
                    # It's totally ok to drop the last return data value.
                    # The search api just return something seems related to search
                    print('page data error')
                    continue
        else:
            print("article_data_error, offset: ", offset, " url: ", url)
            self.use_proxy = True
            self.search_answer_article_page(offset=offset, mark=mark)
        return


if __name__ == '__main__':
    '''
    python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
    python script_file_path mark (what to crawl: 0 = answers, 1 = articles, 2 = thoughts) max_page (maximum page number) start_page (starting page, counted from 0) spider_url (the Zhihu profile url of the user to crawl)
    '''
    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
    spider_url = sys.argv[4]
    # spider_url = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers'
    print(datetime.now())
    spider = Spider(spider_url=spider_url)
    if mark == 0:
        spider.search_page(mark, max_page, start_page)
    elif mark == 1:
        spider.search_page(mark, max_page, start_page)
    elif mark == 2:
        spider.search_page(mark, max_page, start_page)
    print(datetime.now())
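The request signing that headers_handle performs boils down to a few lines; the sketch below walks the same md5-plus-zhihu.js step with placeholder inputs (path, referer and d_c0 are made-up values, and the final call into the compiled zhihu.js is left commented).

import hashlib

path = "/api/v4/members/some-user/answers?offset=0&limit=20"                 # assumed API path
referer = "https://www.zhihu.com/api/v4/members/some-user/answers?page=1"    # assumed referer
d_c0 = '"AADj...=|1545899265"'                                                # placeholder d_c0 cookie value
f = "+".join(["3_2.0", path, referer, d_c0])
fmd5 = hashlib.new('md5', f.encode()).hexdigest()
# x_zse_86 = "1.0_" + exec_js.call("b", fmd5)   # exec_js is the compiled zhihu.js from Spider.__init__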
tasks/zhihu/spider.py

...
@@ -7,43 +7,18 @@ import re
import sys
import time
from datetime import datetime
import kdl
import base64
from requests_html import HTMLSession
from PIL import Image
from captcha.zhihu_captcha import ZhihuCaptcha

HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
# JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'


def get_proxy():
    auth = kdl.Auth("990866563045611", APIKEY)
    client = kdl.Client(auth)
    ips = client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
    print("dps proxy: ", ips)
    return {
        "http": "http://{}".format(ips[0]),
        "https": "https://{}".format(ips[0]),
    }


def retry_get_url(url, retrys=5, timeout=10, proxies=None, **kwargs):
    retry_c = 0
    while retry_c < retrys:
        try:
            if proxies:
                get_resp = requests.get(url, timeout=timeout, proxies=proxies, **kwargs)
            else:
                get_resp = requests.get(url, timeout=timeout, **kwargs)
            return get_resp
        except Exception as e:
            retry_c += 1
            time.sleep(3)
            print(e)
            proxies = get_proxy()
    print('Failed to get page %s after %d retries, %s' % (url, retrys, datetime.now()))
    return None


JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
# JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'


class Spider(object):
...
@@ -52,6 +27,7 @@ class Spider(object):
        '''
        Initialize the database connection and set up the js rules.
        '''
        self.login_req = self._login()
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER, passwd=PASSWD, db=DB, charset='utf8')
...
@@ -71,8 +47,76 @@ class Spider(object):
        except:
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js)
        # self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
        # self.exec_js = execjs.compile(js)
        self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
        self.session = requests.session()
        self.HTMLSession = HTMLSession()

    def _login(self):
        url = 'https://www.zhihu.com'
        loginUrl = 'https://www.zhihu.com/login/email'
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36',
            "Referer": "http://www.zhihu.com/",
            'Host': 'www.zhihu.com',
            'rememberme': "true"
        }
        self.headers = headers
        data = {
            'email': 'yousangdandan@yeah.net',
            'password': '5358569'
        }
        login_req = self.session.post(loginUrl, data=data, headers=headers)
        print('loginReq:{}'.format(login_req.status_code))
        self.set_cookies(self.session.cookies)
        return login_req

    def set_cookies(self, cookies):
        self.HTMLSession.cookies = cookies
        self.session.cookies = cookies

    def appeal(self, url):
        self.captcha_model = ZhihuCaptcha()
        r = self.HTMLSession.get('https://www.zhihu.com/api/v4/anticrawl/captcha_appeal')
        captchaUrl = r.json()['img_base64']
        captchaUrl = re.sub('\n', '', captchaUrl)
        with open('cache/captcha2.png', 'wb') as f:
            img_base64 = base64.b64decode(captchaUrl.strip('data:image/png;base64,').strip())
            print(img_base64)
            f.write(img_base64)
        im = Image.open('cache/captcha2.png')
        captcha = self.captcha_model.recgImg(im)
        print(captcha)
        r = self.HTMLSession.post(
            'https://www.zhihu.com/api/v4/anticrawl/captcha_appeal',
            data=json.dumps({"captcha": captcha}),
            headers={
                "User-Agent": user_agent,
                "referer": 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E7%9B%91%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E7%BD%91%E7%BB%9C%E7%8E%AF%E5%A2%83%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%EF%BC%8C%E4%B8%BA%E4%BF%9D%E8%AF%81%E6%82%A8%E7%9A%84%E6%AD%A3%E5%B8%B8%E8%AE%BF%E9%97%AE%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81%E8%BF%9B%E8%A1%8C%E9%AA%8C%E8%AF%81%E3%80%82&need_login=false',
                'Content-Type': 'application/json',
                'x-xsrftoken': self.HTMLSession.cookies._cookies['.zhihu.com']['/']['_xsrf'].value
            })
        return self.HTMLSession.get(url, allow_redirects=False)

    def retry_get_url(self, url, retrys=5, timeout=10, **kwargs):
        retry_c = 0
        while retry_c < retrys:
            time.sleep(5)
            try:
                import pdb; pdb.set_trace()
                get_resp = self.session.get(url, headers=self.headers, timeout=timeout, **kwargs)
                if get_resp.status_code == 403:
                    get_resp = self.appeal(url)
                return get_resp
            except Exception as e:
                retry_c += 1
                print(e)
        print('Failed to get page %s after %d retries, %s' % (url, retrys, datetime.now()))
        return None

    def update_page_count(self, answer_count):
        count = int(answer_count / 20)
...
@@ -81,28 +125,6 @@ class Spider(object):
            count += 1
        self.page_count = count

    def get_serach_page_cookies(self):
        '''
        Refresh the cookies.
        '''
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie":
'SESSIONID=UIZ9mtCMPNttU11zx8a9e5eJcTm92PhBGKiz9oqWgDr; JOID=UlsQAkk86vULooRFHj177i1UY9UKHM_SK4OkYDkdytAsgqVlOzFzH1WjgUcZ-R9yCKLnnTcSKj5UlS_DhJu9iUI=; osd=VFoXB0466_IOpYJEGTh86CxTZtIMHcjXLIWlZzwazNErh6JjOjZ2GFOihkIe_x51DaXhnDAXLThVkirEgpq6jEU=; SESSIONID=rsVkcWbq9ESuP7O4fOw4qdMJdkNGnCFu59zCNAAkoIL; JOID=VV4TCkoAD-uttc-DPQ6Y9IZDJxUtIizHhpDtoBElLciBnuqhHkmjAfyyzow6twj5biJFaHi7j_WoTqKkbWlN0QI=; osd=UFkVCk4FCO2tscqEOw6c8YFFJxEoJSrHgpXqphEhKM-Hnu6kGU-jBfm1yIw-sg__biZAb367i_CvSKKgaG5L0QY=; _xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609325059,1609337218,1609401296,1609405637; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609410807; KLBRSID=0a401b23e8a71b70de2f4b37f5b4e379|1609410806|1609401296'
            ,
            "referer": self.spider_url,
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        }
        requests_res = retry_get_url(self.spider_url, headers=headers)
        return requests_res.cookies.get_dict()

    def check_data_exist(self, data_dict, mark):
        '''
        Check whether the record already exists before inserting.
...
@@ -124,7 +146,7 @@ class Spider(object):
    def parse_sigle_page(self, data_dict, mark):
        '''
        Insert the main content data and image urls, and look up the comments
        Insert the main content data and image urls
        '''
        if not self.check_data_exist(data_dict, mark):
...
@@ -142,8 +164,6 @@ class Spider(object):
            self.cur.execute(into, values)
            self.conn.commit()
        return

    def search_page(self, mark, page_max, start_page=0, need_commend=False):
        '''
        Main entry point.
...
@@ -167,7 +187,20 @@ class Spider(object):
            time.sleep(10)
        self.conn.close()
        return

    def get_page_data(self, url):
        get_page = self.retry_get_url(url)
        if get_page.status_code != 200:
            # retry once
            get_page = self.retry_get_url(url)
            if get_page.status_code != 200:
                print("article_error, url : ", url, " status_code: ", get_page.status_code)
        try:
            page_dict = get_page.json()
            print('get page json data success! {}', url)
            return page_dict
        except:
            print('retry get page data : {}', url)
            return self.get_page_data(url)

    def search_answer_article_page(self, offset, mark, proxies_num=0):
        '''
...
@@ -178,19 +211,8 @@ class Spider(object):
            url = self.ANSWER_URL.format(offset)
        elif mark == 1:
            url = ARTICLE_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        proxies = None
        if self.use_proxy:
            proxies = get_proxy()
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies)
        if get_page.status_code != 200:
            # retry once
            time.sleep(3)
            get_page = retry_get_url(url, proxies=proxies)
            if get_page.status_code != 200:
                print("article_error, url : ", url, " status_code: ", get_page.status_code)
        page_dict = get_page.json()
        page_dict = self.get_page_data(url)
        if page_dict.get("data"):
            print(self.page_count)
            if self.page_count == 1000:
...
@@ -201,54 +223,15 @@ class Spider(object):
                    self.parse_sigle_page(one_line, mark)
                    print("finshed_crawler " + offset)
                except KeyError:
                    # It's totally ok to drop the last return data value.
                    # The search api just return something seems related to search
                    print('page data error')
                    continue
        else:
            print("article_data_error, offset: ", offset, " url: ", url)
            self.use_proxy = True
            time.sleep(3)
            self.search_answer_article_page(offset=offset, mark=mark)
        return

    def headers_handle(self, url):
        '''
        Disguise the request headers for the url.
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        referer = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
            "x-ab-param":
"li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1"
            ,
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": referer + "/answers?page=1",
        }
        cookies_dict = {
            "d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
            "KLBRSID": '0a401b23e8a71b70de2f4b37f5b4e379|1609410806|1609401296'
        }
        cookies_dict.update(res_cookies_dict)
        f = "+".join(["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict


if __name__ == '__main__':
    '''
...