Chengyang Zhong / crawler · Commits · 607c82f7

Commit 607c82f7, authored Dec 31, 2020 by haowang
Commit message: fix

parent e3acccb8
Showing 3 changed files with 972 additions and 260 deletions (+972 / -260):

    tasks/zhihu/plans.py            +412  -0
    tasks/zhihu/spider.py           +49   -260
    tasks/zhihu/spider_complex.py   +511  -0
tasks/zhihu/plans.py · new file (0 → 100644) · view file @ 607c82f7
import os
import time
from datetime import datetime

if __name__ == '__main__':
    plans = [
        "python tasks/zhihu/spider.py 0 3 0 'https://www.zhihu.com/people/chun-feng-zhu-lang'",
        "python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/drunkxiaojingguai'",
        "python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/kokokou'",
        "python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/yo-he-14-20'",
        "python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/wen-zi-ding-dan'",
        "python tasks/zhihu/spider.py 0 169 0 'https://www.zhihu.com/people/zhengxingba'",
        "python tasks/zhihu/spider.py 0 24 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lu-hui'",
        "python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/xiao-zhu-99-20'",
        "python tasks/zhihu/spider.py 0 119 0 'https://www.zhihu.com/people/zhaotianqiang'",
        "python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/zuo-zheng-xing-de-liu-zhe-qi'",
        "python tasks/zhihu/spider.py 0 14 0 'https://www.zhihu.com/people/cao-yang-yan-da-shi'",
        "python tasks/zhihu/spider.py 0 2 0 'https://www.zhihu.com/people/zhe-mang-guo-mang'",
        "python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/yuxi624'",
        "python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/key-70-48'",
        "python tasks/zhihu/spider.py 0 4 0 'https://www.zhihu.com/people/dryanling'",
        "python tasks/zhihu/spider.py 0 2 0 'https://www.zhihu.com/people/shu-er-29-7'",
        "python tasks/zhihu/spider.py 0 13 0 'https://www.zhihu.com/people/chen-shi-long-69-98'",
        "python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/ka-li-yu-dan-94'",
        "python tasks/zhihu/spider.py 0 4 0 'https://www.zhihu.com/people/nuo-nuo-jiang-46-40'",
        "python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/lenhzin'",
        "python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/tu-zi-ai-chi-cao-78'",
        "python tasks/zhihu/spider.py 0 19 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-shi-hu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-ji-jie-jie-12'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huan-xing-mei-mei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/suan-jie-shen-ba'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-ji-hua-8'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dong-yan-10-84'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-yu-yan'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-mao-wen-46'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/drfranklin-eps'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sxsxp1'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/judyxiao-jie-jie'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-jun'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hu-tao-tao-de-yan-jiu-shi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-fan-ceng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-zi-xun-shi-yu-er-duo-duo'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ju-zi-guo-jiang-20-66'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-qi-shu-shu-15'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chi-bu-ding-de-pang-ding-27-43'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jliyimei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/reseted1503325608'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-wu-a-wu-666'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-ye-xiang-wo'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liang-yi-31-45'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhao-xue-qi-2'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-gang-2017'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-mao-xue-yuan'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mai-tian-li-de-ke-ai-duo'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-dobby'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-da-xue-tang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-min-94-15'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-nu-mei-rong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-xiao-xiao-72-30'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wen-bao-guo-bao'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhu-li-666'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/si-te-li-ke-lan-de-xian-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-she'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sou-di-leng-hua-zheng-qiao'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yimeisiba'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-jie-bu-xing-hua'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bitibo/pins'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-cheng-sheng-zhu-li'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-yi-chao-63'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-shi-er-78-24'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-tian-jiao-sweet'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/can-ma-de-can-lan-sheng-huo'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-chi-xue-gao-24'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-bei-bu-ai-hua-zhuang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pluto-54-88'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bai-meng-meng-79-43'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-de-mi-xue'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-yan-lin-22-95'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ning-meng-jing-tang-dou-dou'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wo-you-yizhi-xiao-mao-mi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhengrongjia'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-li-36-48-1'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jin-xiao-mei-88-37'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liang-yi-sheng-99-9'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xia-xiao-xiao-20-12'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wai-hao-da-biao-jie'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhou-yu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhi-zhi-fei-ji'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pan-er-bo-shi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cheng-du-long-bi-bo-shi-zhang-han'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/well-1-95'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ying-ying-37-60'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-51-29-23'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-rui-64-16'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-dao-ge'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-yi-he-tian-ling'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13397228518'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-zhi-bing-13'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-gang-2017'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-lian-mei-9'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zi-fei-yu-79-53'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zengyy-5'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-jiang-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wu-di-30-63-36'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-shuo-ming-shu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shan-shan-lai-zao-liao-35'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-zi-xun-shi-abbie-71'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tou-fa-ke'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/0329'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-hao-90-58-31'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-jing-20-41-10'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-xiao-62-85'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jian-si-bu-neng-jiu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mi-qi-xian-sheng-72'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiangjy58'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-jiao-zi-mo-mo'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/haozemaliang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xie-wen-hui-21-66'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-shua-shua-98'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/amazingbird'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/desperate'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-pei-jun-15'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-mei-xiao-qin'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/meilikepujun'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/juan-mao-er-1-90'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-jun-shi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hai-da-li-28'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xieyangchun'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/song-song-1-3-27'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-sheng-60-21-71'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/CQ_Hazel'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-you-zi-13'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-ban-61-99'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-jing-ba-da-chu-jin-xiao-lei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-bai-zhen-hua-54'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-rong-yi-sheng-gao-si'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/du-rui-mi-66'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ke-le-niu-nai-38'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-xiao-de-chun-93'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-chuan-kai-67'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qiang-jiao-de-xiao-tiao-wa'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-le-34-73-57'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lao-lun-si-de-bai-ri-meng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hong-li-34-89'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhao-xiu-li-67'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-er-gou-zi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-zheng-yi-11'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pi-fu-mei-rong-zhang-wei-hua'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cai-xiao-bai-60-17'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-xia-7-29'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qi-sh-57'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-jiao-wo-yi-mei-shuo-ming-shu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-mo-fang-jiang-zhi-fang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-zai-xian-50'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-jiu-sheng-25-10'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-jiang-3'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-lan-wu-chang-xiong-41'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hk17320188886'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-fu-wo-xiang-zhi-dao'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zuo-zheng-xing-de-ren-han-xiao'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gan-cheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-yi-ma'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/aloedeng-ni'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhoushaolong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chaigang9h'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-jian-93-10'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-hu-hu-69-77'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhihu88'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zou-yu-75-32'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-mo-mo-49-86'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-gu-shi-de-j-xiao-jie-22'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-li-feng-yong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sunzhicheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-bi-yi-sheng-li-chang-fu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-da-da-57'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-bai-58-40'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dong-ming-xian-48'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-luo-bu-gan'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zaoxaomei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/da-xiao-bei-13'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xu-rong-yang-71'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-da-yan-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-wei-68-2'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-kong-ju-le-bu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/winni-2-13'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-jiu-jiu-80-61'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/leon-5-35-12'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dr-yan-93'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tong-yan-jun-25'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jennykissedme'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-chuang-26-64'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-lang-zi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-miao-64-41'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-guan-86-73'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-hou-you-10'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ce-ge-mo'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhong-ri-zheng-xing-huang-wen-gang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-di-xing-kong-70'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/drchenbing'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-liu-53-15-36'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-san-gou-67-76'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-xiao-tian-mian-diao-zhuan-jia'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-ye-song-57'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-du-zheng-xing-zhang-yi-sheng-19'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ouruila'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-liang-zong-jian'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-83-93-34'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ren-dong-ivy'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13718001372'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-jing-xiao-mei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-zheng-cai-hui-ying'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.kdocs.cn/view/l/coV2TD4LExp2'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-meng-ming-xing-38'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/alice-95-96-90'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-xing-hu-die-jian-14/answers'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huang-yi-xiang-28-29'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/beauty-44-9'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-bo-ying-xiao-xiao-dou'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/my-world-70-8'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ishuangyanpi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-tang-ka-pei-60'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-miao-miao-5-43'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hejiongchuan'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xin-han-50-61'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiang-yan-zhi-kao-jin-de-you-zi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-ai-dou-xiao-mi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ni-gu-man-man'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/DrZhaoyanyong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhi-fang-wei-diao-qiu-li-dong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-gui-zong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiecan1'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lu-shu-rong-6'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiang-tian-xie-1'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-yu-bing'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lu-jian-jian-12'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-li-qiu-yue'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhang-shu-gong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiu-ai-he-nai-cha-62'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jie-wo-yiba-qiang-49'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13522127270'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-ji-guang-zhong-xin-feng-yong-qiang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bu-zhi-dao-jiao-sha-79-74'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-zhou-xu-99'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-liu-tun-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-xian-76-39/answers'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jing-jing-70-85-21'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bing-he-lao-dao'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-tai-ling'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mo-xiao-xin-28-35'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-ma-zhu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lovelife-96-78'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sorry-7'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qi-yong-le-38'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-rong-yi-sheng-wang-shu-jie'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-mi-jie-33-58'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shan-he-yi-qiu-94'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zong-xian-lei-yi-sheng-49'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiong-mao-jiang-zi-18'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-liao-56-14-3'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-qing-ping-ping-36'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhe-2-83'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tao-hua-huan-jiu-qian-64'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-wang-yue'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-jing-ba-da-chu-ma-xiao-yang-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-yu-hao'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-hai-ming-97-8'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-xiao-shi-49'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/san-qi-h-79'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tong-xiao-yi-sheng-85'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-yin-hong-yu'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-meng-xi-52'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zai-zai-76-64'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-di-35-64'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shu-yan-ni-ni/answers'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-li-peng-67'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/Swahili1230'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gorgeous-1-48'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/luna-10-2'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qia-wa-yi-fu-50/answers'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lin-xiao-pang-30-84'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/you-zi-shu-27-18'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mc-goldenli'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-dai-zheng-zhou'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-dong-qi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ding-xiang-17-21'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-xiao-lan-56'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wo-shi-ann'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-fu-jun-li'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fan-rong-jie-43'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-liu-zheng-xing'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-xiao-xiao-43-80'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tea-5-1-57/answers'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-man-3-54'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jing-guo-shen-she-94'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-miao-miao-50-58'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/masakijiang-83'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-han-xing'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-zhou-xiao-ke-ai-79-36'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zxys-zhanglong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xin-li-xiao-tian-shi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hart-48-54'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ou-bu-zhi-dao-52'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/kiki77-21'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chun-bu-zheng-xing-yi-sheng-li-hui'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xia-mo-87-11'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tiffanybaby-11'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-qing-6-40'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiang-sijun-70-90-26'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wangling-31-13'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xi-82-72-10'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-ai-hong-mei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhong-zheng-xing-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-xiao-shi-pin-lu-ren-jia'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-yan'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hong-kao-shi-ba-v'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-wai-ke-wang-xiao-yang-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xie-he-zheng-xing-yi-sheng-wang-yang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-yi-san-yuan-zheng-xing-yi-sheng-li-bi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/nan-jing-ruan-tang-xiao-jie'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-meng-chun-16-30'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/column/meidaila'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/column/cosmotology'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miss-miao-jun-81'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/guan-bo-lan-14'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-xue-pi-fu-guan-li-gao-jin-ling'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ji-guang-yu-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jia-qu-jing-31'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fu-chen-yi-mei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-liu-xu-li'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhu-dong-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/christy-20-58/answers'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-shen-shen-21-52'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-chen-bo-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bo-luo-15-5-93'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhou-ifaceai-fei-si'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-wai-ke-yi-sheng-nie-yun-fei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lanzhiyong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gong-da-yi-mei-dsh3225'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-mei-jie-niu-er'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/binghan'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-fu-yi-sheng-yi-yang-liang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-li-da-ren-40'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-xing-shuo-33'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-tian-xin-ne'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shi-jun-li-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/han-ao-xi-65'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tang-jian-bing-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/nan-feng-18-6-16'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-xiang-xin-17'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/18611026629'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-shuai-47-80'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-mu-97-35-35'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miss-fei-fei-68'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-jiang-jun'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-xiao-zhang-89'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/joy-71-96-54'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cat-49-49'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-ni-zi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yimei-38-82'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/CrossoverMarketing'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pi-fu-yi-sheng-zhousir'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-yuan-wai-07'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-you-tuan-zi-2'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-mei-yuan-chang-fu-guo-you-bo-shi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-hai-feng-38-32'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lian-qia-fo-ju-jie'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cong-cong-64-65'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shang-hai-zheng-xing-chuang-shi-ren'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-deng-gang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ning-meng-mei-wo-meng-29-98'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-huang-rui'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-qi-gang-44'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cheng-zong-yu-12'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bai-li-tian-7-72'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-sun-zhong-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/likemei-shi-yan-shi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/deng-zheng-jun-zhu-ren'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-shao-30-97'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/man-lun-hua-kai'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fschaomei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wen-bi-yun-96'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-hua-wu-jia-jun-bo-shi-zheng-xing'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/15104568657'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dr-fang-81'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qiong-an-gu-niang'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gong-fu-qiang-65-23'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-xue-mei-rong-wang-xi-you-yi-shi'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-shuo-yi-mei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-zhao-ha-ge/answers'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiu-fu-yi-sheng-wang-shao-guo'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-666'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-lin-lin'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zang-meng-qing-yi-sheng'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-lian-mao'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-da-dong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/LZGdoctor'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lin-wei'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhou-xiao-dong'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/15680033702'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-yin-hong-yu-45'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huang-jia-cheng-15-88'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-bi-yi-sheng-liu-guo-quan'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-qin-zi-kan'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ya-tou-20-6-24'",
        "python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-kong-ke-pu-67'",
    ]
    for plan in plans:
        print('start plan ', plan, ' at ', datetime.now())
        os.system(plan)
        print('end plan ', plan, ' at ', datetime.now())
        time.sleep(10)
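The runner above simply shells out: each plan string is handed to os.system, with a 10-second pause between accounts. spider.py's argument parsing is not part of this commit, so the following sketch of how the four positional arguments are presumably consumed is an inference from Spider.search_page(mark, page_max, start_page) in the diff below, not confirmed code:

# Hypothetical argv mapping for "spider.py <mark> <page_max> <start_page> '<url>'".
import sys

if __name__ == '__main__':
    mark = int(sys.argv[1])        # 0 = answers, 1 = articles, 2 = thoughts (per search_page)
    page_max = int(sys.argv[2])    # per-account page budget; 1000 acts as "everything"
    start_page = int(sys.argv[3])  # starting offset
    spider_url = sys.argv[4]       # profile/column URL to crawl
    print(mark, page_max, start_page, spider_url)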
tasks/zhihu/spider.py · view file @ 607c82f7
-# import rsa
-import os, sys
-base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.append("/Users/haowei")
-sys.path.append("/Users/haowei/workspace/gm/crawler")
-sys.path.append("/Users/haowei/workspace/gm/crawler/crawler_sys")
 import pymysql
 import hashlib
 import requests
...
@@ -14,6 +7,7 @@ import re
 import sys
 import time
 from datetime import datetime
+import kdl
 HOST = '172.18.51.14'
 PORT = 3306
...
@@ -22,34 +16,31 @@ PASSWD = 'Gengmei123'
 DB = 'spider'
 # JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
 JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
+APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'
-def retry_get_url_no_proxies(url, retrys=3, timeout=10, **kwargs):
-    retry_c = 0
-    while retry_c < retrys:
-        try:
-            get_resp = requests.get(url, timeout=timeout, **kwargs)
-            return get_resp
-        except Exception as e:
-            retry_c += 1
-            time.sleep(1)
-            print(e)
-    print('Failed to get page %s after %d retries, %s' % (url, retrys, datetime.now()))
-    return None
+def get_proxy():
+    auth = kdl.Auth("990866563045611", APIKEY)
+    client = kdl.Client(auth)
+    ips = client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
+    print("dps proxy: ", ips)
+    return {
+        "http": "http://{}".format(ips[0]),
+        "https": "https://{}".format(ips[0]),
+    }
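The new get_proxy() replaces the proxy-less retry helper: it asks the kuaidaili (kdl) SDK for one private proxy ("dps") restricted to the listed areas and wraps the returned ip:port in the scheme-keyed dict that requests expects. A minimal usage sketch, assuming the order ID and APIKEY above are still valid:

# Sketch: pass the kdl proxy dict straight to requests.
proxies = get_proxy()  # e.g. {"http": "http://1.2.3.4:5678", "https": "https://1.2.3.4:5678"}
resp = requests.get("https://www.zhihu.com", proxies=proxies, timeout=10)
print(resp.status_code)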
-def retry_get_url(url, retrys=3, timeout=10, **kwargs):
+def retry_get_url(url, retrys=5, timeout=10, proxies=None, **kwargs):
     retry_c = 0
     while retry_c < retrys:
         try:
-            get_resp = requests.get(url, timeout=timeout, **kwargs)
+            if proxies:
+                get_resp = requests.get(url, timeout=timeout, proxies=proxies, **kwargs)
+            else:
+                get_resp = requests.get(url, timeout=timeout, **kwargs)
             return get_resp
         except Exception as e:
             retry_c += 1
-            time.sleep(1)
+            time.sleep(3)
             print(e)
+            proxies = get_proxy()
     print('Failed to get page %s after %d retries, %s'
           % (url, retrys, datetime.now()))
     return None
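retry_get_url now defaults to 5 attempts and, on any exception, sleeps 3 seconds and swaps in a fresh proxy from get_proxy() before the next try; it returns None when every attempt fails. A hedged example of the calling pattern (it mirrors how search_answer_article_page uses it further down):

# Sketch: proxied fetch that rotates to a new kdl proxy on each failure.
resp = retry_get_url('https://www.zhihu.com/api/v4/members/kokokou', proxies=get_proxy())
if resp is not None:
    print(resp.status_code)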
...
@@ -66,8 +57,11 @@ class Spider(object):
                                     db=DB, charset='utf8')
         self.cur = self.conn.cursor()
+        self.page_count = 1000
+        self.use_proxy = True
         self.spider_url = spider_url
-        detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset=20&limit=20&sort_by=created'
+        detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={}&limit=20&sort_by=created'
         self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members") + detail_url
         os.environ["EXECJS_RUNTIME"] = 'Node'
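The substantive fix in this hunk: detail_url used to hardcode offset=20, so every request fetched the same slice of answers; it now carries an offset={} placeholder that the paging code fills with str.format. An illustration (the long include parameter is elided here; the profile slug is an arbitrary example):

# Sketch: how ANSWER_URL's {} placeholder is filled per page.
spider_url = 'https://www.zhihu.com/people/kokokou'
api_base = spider_url.replace("https://www.zhihu.com/people",
                              "https://www.zhihu.com/api/v4/members")
answer_url = api_base + '/answers?include=...&offset={}&limit=20&sort_by=created'
print(answer_url.format(40))  # third page of 20 answers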
...
@@ -80,6 +74,13 @@ class Spider(object):
         self.exec_js = execjs.compile(js)
         # self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')

+    def update_page_count(self, answer_count):
+        count = int(answer_count / 20)
+        temp = int(answer_count % 20)
+        if temp > 0:
+            count += 1
+        self.page_count = count
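update_page_count is ceiling division: the account's total answer count divided by the page size of 20, rounded up. Equivalent check for reference:

import math
for answer_count in (0, 1, 20, 21, 169):
    count = int(answer_count / 20) + (1 if answer_count % 20 > 0 else 0)  # as above
    assert count == math.ceil(answer_count / 20)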
     def get_serach_page_cookies(self):
         '''
         update cookies
         '''
...
@@ -90,14 +91,14 @@ class Spider(object):
"accept-encoding"
:
"gzip, deflate, br"
,
"accept-encoding"
:
"gzip, deflate, br"
,
"accept-language"
:
"zh-CN,zh;q=0.9"
,
"accept-language"
:
"zh-CN,zh;q=0.9"
,
"cache-control"
:
"max-age=0"
,
"cache-control"
:
"max-age=0"
,
"cookie"
:
'
_xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609321344,1609321689,1609321744,1609322777; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609322777; KLBRSID=d017ffedd50a8c265f0e648afe355952|1609323283|1609315999
'
,
"cookie"
:
'
SESSIONID=UIZ9mtCMPNttU11zx8a9e5eJcTm92PhBGKiz9oqWgDr; JOID=UlsQAkk86vULooRFHj177i1UY9UKHM_SK4OkYDkdytAsgqVlOzFzH1WjgUcZ-R9yCKLnnTcSKj5UlS_DhJu9iUI=; osd=VFoXB0466_IOpYJEGTh86CxTZtIMHcjXLIWlZzwazNErh6JjOjZ2GFOihkIe_x51DaXhnDAXLThVkirEgpq6jEU=; SESSIONID=rsVkcWbq9ESuP7O4fOw4qdMJdkNGnCFu59zCNAAkoIL; JOID=VV4TCkoAD-uttc-DPQ6Y9IZDJxUtIizHhpDtoBElLciBnuqhHkmjAfyyzow6twj5biJFaHi7j_WoTqKkbWlN0QI=; osd=UFkVCk4FCO2tscqEOw6c8YFFJxEoJSrHgpXqphEhKM-Hnu6kGU-jBfm1yIw-sg__biZAb367i_CvSKKgaG5L0QY=; _xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609325059,1609337218,1609401296,1609405637; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609410807; KLBRSID=0a401b23e8a71b70de2f4b37f5b4e379|1609410806|1609401296
'
,
"referer"
:
self
.
spider_url
,
"referer"
:
self
.
spider_url
,
"sec-fetch-dest"
:
"document"
,
"sec-fetch-dest"
:
"document"
,
"sec-fetch-mode"
:
"navigate"
,
"sec-fetch-mode"
:
"navigate"
,
"sec-fetch-site"
:
"same-origin"
,
"sec-fetch-site"
:
"same-origin"
,
"sec-fetch-user"
:
"?1"
,
"sec-fetch-user"
:
"?1"
,
"upgrade-insecure-requests"
:
"1"
,
"upgrade-insecure-requests"
:
"1"
,
"user-agent"
:
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_
6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193
Safari/537.36"
,
"user-agent"
:
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_
7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88
Safari/537.36"
,
}
}
requests_res
=
retry_get_url
(
self
.
spider_url
,
headers
=
headers
)
requests_res
=
retry_get_url
(
self
.
spider_url
,
headers
=
headers
)
return
requests_res
.
cookies
.
get_dict
()
return
requests_res
.
cookies
.
get_dict
()
...
@@ -121,7 +122,7 @@ class Spider(object):
             return True
         return False

-    def parse_sigle_page(self, data_dict, mark, need_comment=False):
+    def parse_sigle_page(self, data_dict, mark):
         '''
         insert the main content data and image URLs, then look for comments
         '''
...
@@ -141,26 +142,6 @@ class Spider(object):
         self.cur.execute(into, values)
         self.conn.commit()
-        if need_comment:
-            offset = 0
-            if data_dict["comment_count"] != 0:
-                next = 1
-                while next == 1:
-                    next = self.search_root_comment(data_dict["id"], offset, mark)
-                    offset = offset + 20
-        # patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
-        # pattern = re.compile(patt)
-        # result = pattern.findall(data_dict["content"])
-        # for results in result:
-        #     if mark == 0:
-        #         into = "insert into zhihu_answer_picture_url(answer_id, url) value(%s, %s)"
-        #     elif mark == 1:
-        #         into = "insert into zhihu_article_picture_url(article_id, url) value(%s, %s)"
-        #     values = (data_dict["id"], results)
-        #     self.cur.execute(into, values)
-        #     self.conn.commit()
         return

     def search_page(self, mark, page_max, start_page=0, need_commend=False):
...
@@ -173,10 +154,12 @@ class Spider(object):
         offset = start_page
         for i in range(page_max):
+            if i > self.page_count - 1:
+                break
             if mark == 0:
-                self.search_answer_article_page(offset, 0, 0, need_commend)
+                self.search_answer_article_page(offset, 0, 0)
             elif mark == 1:
-                self.search_answer_article_page(offset, 1, 0, need_commend)
+                self.search_answer_article_page(offset, 1, 0)
             elif mark == 2:
                 self.search_thought_page(offset)
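With the added break, search_page now stops as soon as i passes self.page_count - 1, i.e. once the real page total learned from the API via update_page_count is exhausted, so the blanket page_max=1000 in plans.py is only an upper bound. A hedged driver sketch; the Spider constructor signature is not shown in this diff, so Spider(spider_url) is an assumption:

# Hypothetical driver, assuming Spider(spider_url) is the constructor.
spider = Spider('https://www.zhihu.com/people/kokokou')
spider.search_page(mark=0, page_max=1000, start_page=0)  # halts at the real page count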
...
@@ -186,7 +169,7 @@ class Spider(object):
         self.conn.close()
         return

-    def search_answer_article_page(self, offset, mark, proxies_num=0, need_comment=False):
+    def search_answer_article_page(self, offset, mark, proxies_num=0):
         '''
         request the article and answer data packets
         '''
...
@@ -197,18 +180,25 @@ class Spider(object):
         url = ARTICLE_URL.format(offset)
         [headers_search, cookies_dict] = self.headers_handle(url)
-        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
+        proxies = None
+        if self.use_proxy:
+            proxies = get_proxy()
+        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies)
         if get_page.status_code != 200:
             # retry once
-            get_page = requests.get(url)
+            time.sleep(3)
+            get_page = retry_get_url(url, proxies=proxies)
             if get_page.status_code != 200:
                 print("article_error, url : ", url, " status_code: ", get_page.status_code)
         page_dict = get_page.json()
         if page_dict.get("data"):
+            print(self.page_count)
+            if self.page_count == 1000:
+                self.update_page_count(page_dict["paging"].get("totals", 0))
             for one_line in page_dict['data']:
                 try:
                     if one_line["content"] != None:
-                        self.parse_sigle_page(one_line, mark, need_comment=need_comment)
+                        self.parse_sigle_page(one_line, mark)
                     print("finshed_crawler " + offset)
                 except KeyError:
                     # It's totally ok to drop the last return data value.
...
@@ -216,113 +206,9 @@ class Spider(object):
                     continue
         else:
             print("article_data_error, offset: ", offset, " url: ", url)
-            return
+            self.use_proxy = True
+            time.sleep(3)
+            self.search_answer_article_page(offset=offset, mark=mark)

-    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
-        '''
-        request the root-comment data packets
-        '''
-        offset = str(offset)
-        answerid = str(answerid)
-        if mark == 0:
-            url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
-        elif mark == 1:
-            url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
-        [headers_search, cookies_dict] = self.headers_handle(url)
-        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
-        if get_page.status_code != 200:
-            # retry once
-            get_page = requests.get(url)
-            if get_page.status_code != 200:
-                print("root_comment_error")
-        page_dict = get_page.json()
-        if page_dict.get("data"):
-            for one_line in page_dict['data']:
-                try:
-                    self.root_comment_data(one_line, answerid, mark)
-                    print("finshed_root" + offset)
-                except KeyError:
-                    continue
-        else:
-            print("root_data_error")
-        next = 0
-        if len(page_dict['data']) == 20:
-            next = 1
-        return next
-
-    def root_comment_data(self, data_dict, answerid, mark):
-        '''
-        insert the root-comment data and chase its child comments
-        '''
-        if mark == 0:
-            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
-        elif mark == 1:
-            into = "insert into zhihu_article_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
-        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["child_comment_count"], data_dict["featured"], data_dict["created_time"], data_dict["author"]["member"]["id"])
-        self.cur.execute(into, values)
-        self.conn.commit()
-        offset = 0
-        if data_dict["child_comment_count"] != 0:
-            next = 1
-            while next == 1:
-                next = self.search_child_comment(data_dict["id"], offset, mark)
-                offset = offset + 20
-        return
-
-    def search_child_comment(self, root_comment_id, offset, proxies_num=0):
-        '''
-        data packet request for articles and answers
-        '''
-        root_comment_id = str(root_comment_id)
-        offsets = offset
-        offset = str(offset)
-        if offsets == 0:
-            url = CHILD_COMMENT_START_URL.format(root_comment_id)
-        else:
-            url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, offset)
-        [headers_search, cookies_dict] = self.headers_handle(url)
-        get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
-        if get_page.status_code != 200:
-            # retry once
-            get_page = requests.get(url)
-            if get_page.status_code != 200:
-                print("child_comment_error")
-        page_dict = get_page.json()
-        if page_dict.get("data"):
-            for one_line in page_dict['data']:
-                try:
-                    self.child_comment_data(one_line, root_comment_id)
-                except KeyError:
-                    continue
-        else:
-            pass
-        next = 0
-        if len(page_dict['data']) == 20:
-            next = 1
-        return next
-
-    def child_comment_data(self, data_dict, root_comment_id):
-        '''
-        insert child-comment data
-        '''
-        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
-        values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"], data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"], data_dict["author"]["member"]["name"])
-        self.cur.execute(into, values)
-        self.conn.commit()
         return
...
@@ -340,7 +226,7 @@ class Spider(object):
             "sec-fetch-dest": "empty",
             "sec-fetch-mode": "cors",
             "sec-fetch-site": "same-origin",
-            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
             "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
             "x-api-version": "3.0.91",
             "x-app-za": "OS=Web",
...
@@ -352,7 +238,7 @@ class Spider(object):
        }
        cookies_dict = {
            "d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
-           "KLBRSID": None
+           "KLBRSID": '0a401b23e8a71b70de2f4b37f5b4e379|1609410806|1609401296'
        }
        cookies_dict.update(res_cookies_dict)
...
@@ -363,110 +249,13 @@ class Spider(object):
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict

    def search_thought_page(self, offset, proxies_num=0):
        '''
        Request one page of thought ("想法") data.
        '''
        offset = str(offset)
        url = THOUGHT_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("article_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.parse_thought_sigle_page(one_line)
                    print("finshed_article" + offset)
                except KeyError:
                    # It's fine to drop the last returned data value;
                    # the search api just returns items loosely related to the search.
                    continue
        else:
            print("article_data_error")
        return

    def parse_thought_sigle_page(self, data_dict):
        '''
        Insert thought content (text rows and picture urls), then walk its comments.
        '''
        for one_dict in data_dict["content"]:
            if one_dict["type"] == "text":
                into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
                values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
                self.cur.execute(into, values)
                self.conn.commit()
            else:
                into = "insert into zhihu_thought_picture_url(thought_id, url) value(%s, %s)"
                values = (data_dict["id"], one_dict["url"])
                self.cur.execute(into, values)
                self.conn.commit()
        offset = 0
        if data_dict["comment_count"] != 0:
            next = 1
            while next == 1:
                next = self.search_thought_comment(data_dict["id"], offset)
                offset = offset + 20
        return

    def search_thought_comment(self, answerid, offset, proxies_num=0):
        '''
        Request one page of thought comments.
        '''
        offset = str(offset)
        answerid = str(answerid)
        url = THOUGHT_COMMENT_URL.format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("root_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.thought_comment_data(one_line, answerid)
                    print("finshed_root" + offset)
                except KeyError:
                    continue
        else:
            print("root_data_error")
        next = 0
        if len(page_dict.get('data', [])) == 20:  # guard against responses without "data"
            next = 1
        return next

    def thought_comment_data(self, data_dict, answerid):
        '''
        Insert one thought-comment row.
        '''
        into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"],
                  data_dict["author"]["member"]["name"],
                  data_dict["content"],
                  answerid,
                  data_dict["created_time"],
                  data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return

if __name__ == '__main__':
    '''
    python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
    python script_file_path mark(content type: 0 = answers, 1 = articles, 2 = thoughts) max_page(maximum page number) start_page(starting page, 0-based) spider_url(profile URL of the zhihu user to crawl)
    '''
    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
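For quick checks, the same entry point can be driven from Python rather than the shell; a minimal sketch, assuming spider.py is importable as tasks.zhihu.spider (the profile URL is the example from the docstring above):

# hedged sketch: mirrors what the __main__ block does with sys.argv
from tasks.zhihu.spider import Spider

spider = Spider(spider_url='https://www.zhihu.com/people/taoxi-1130')
# mark=0 (answers), max_page=1 (one page of 20), start_page=0
spider.search_page(0, 1, 0)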
...
tasks/zhihu/spider_complex.py
0 → 100644
View file @ 607c82f7
# import rsa
import os, sys

base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append("/Users/haowei")
sys.path.append("/Users/haowei/workspace/gm/crawler")
sys.path.append("/Users/haowei/workspace/gm/crawler/crawler_sys")

import pymysql
import hashlib
import requests
import execjs
import re
import time
from datetime import datetime

import kdl

HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'

# JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'


def get_proxy():
    # import pdb; pdb.set_trace()  # debug breakpoint, disabled so the function can run unattended
    auth = kdl.Auth("990866563045611", APIKEY)
    client = kdl.Client(auth)
    ips = client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
    print("dps proxy: ", ips)
    # url = 'http://dps.kuaidaili.com/api/getdps/'
    # params = {
    #     'orderid': '990866563045611',
    #     'num': '1',
    #     'ut': '1',
    #     'format': 'json',
    #     'sep': '1',
    # }
    # res = request.get(url, **params)
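get_proxy only prints the kdl response; a hedged sketch of how one returned "ip:port" entry could be shaped into the proxies dict requests expects (proxies_from_entry is a hypothetical helper, mirroring the hard-coded dicts used below):

def proxies_from_entry(ip_port):
    # e.g. ip_port = "120.38.67.137:15374"
    return {
        "http": "http://" + ip_port,
        "https": "https://" + ip_port,
    }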
def retry_get_url_no_proxies(url, retrys=5, timeout=10, **kwargs):
    retry_c = 0
    while retry_c < retrys:
        try:
            # a proxy dict is defined here but deliberately not passed to requests.get
            proxies = {
                "http": "http://120.38.67.137:15374",
                "https": "https://120.38.67.137:15374",
            }
            get_resp = requests.get(url, timeout=timeout, **kwargs)
            # import pdb; pdb.set_trace()  # debug breakpoint, disabled
            return get_resp
        except Exception as e:
            retry_c += 1
            time.sleep(1)
            print(e)
    print('Failed to get page %s after %d retries, %s' % (url, retrys, datetime.now()))
    return None


def retry_get_url(url, retrys=5, timeout=10, **kwargs):
    # callers pass proxies=<num> as a flag; drop it so it does not clash with the
    # hard-coded proxy dict handed to requests.get below (otherwise: TypeError)
    kwargs.pop('proxies', None)
    retry_c = 0
    while retry_c < retrys:
        try:
            proxies = {
                "http": "http://120.38.67.137:15374",
                "https": "https://120.38.67.137:15374",
            }
            get_resp = requests.get(url, timeout=timeout, proxies=proxies, **kwargs)
            # import pdb; pdb.set_trace()  # debug breakpoint, disabled
            return get_resp
        except Exception as e:
            retry_c += 1
            time.sleep(1)
            print(e)
    print('Failed to get page %s after %d retries, %s' % (url, retrys, datetime.now()))
    return None
class Spider(object):

    def __init__(self, spider_url):
        '''
        Initialise the database connection and load the signing js.
        '''
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD, db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        self.spider_url = spider_url
        # note: offset is hard-coded to 20 in this query string, so the later .format(offset) is a no-op
        detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset=20&limit=20&sort_by=created'
        self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people",
                                                  "https://www.zhihu.com/api/v4/members") + detail_url
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except Exception:
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js)
        # self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
    def get_serach_page_cookies(self):
        '''
        Refresh cookies from the profile page.
        '''
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609321344,1609321689,1609321744,1609322777; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609322777; KLBRSID=d017ffedd50a8c265f0e648afe355952|1609323283|1609315999',
            "referer": self.spider_url,
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        }
        requests_res = retry_get_url(self.spider_url, headers=headers)
        return requests_res.cookies.get_dict()

    def check_data_exist(self, data_dict, mark):
        '''
        Check whether a row already exists before inserting.
        '''
        sql = "select id from {table} where answer_id = {id_}"
        exist = None
        if mark == 0:
            select_sql = sql.format(table='zhihu_answer', id_=data_dict["id"])
            self.cur.execute(select_sql)
            exist = self.cur.fetchone()
        if mark == 1:
            select_sql = sql.format(table='zhihu_article', id_=data_dict["id"])
            self.cur.execute(select_sql)
            exist = self.cur.fetchone()
        if exist:
            return True
        return False

    def parse_sigle_page(self, data_dict, mark, need_comment=False):
        '''
        Insert the main content and picture urls, then walk the comments.
        '''
        if not self.check_data_exist(data_dict, mark):
            if mark == 0:
                into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
                values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"],
                          data_dict["created_time"], data_dict["comment_count"], data_dict["content"])
                print(data_dict["question"]["title"])
            elif mark == 1:
                into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
                values = (data_dict["title"], data_dict["content"], data_dict["id"],
                          data_dict["created"], data_dict["comment_count"], data_dict["content"])
                print(data_dict["title"])  # articles have no "question" key; printing it here used to raise KeyError
            self.cur.execute(into, values)
            self.conn.commit()
        if need_comment:
            offset = 0
            if data_dict["comment_count"] != 0:
                next = 1
                while next == 1:
                    next = self.search_root_comment(data_dict["id"], offset, mark)
                    offset = offset + 20
        # patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
        # pattern = re.compile(patt)
        # result = pattern.findall(data_dict["content"])
        # for results in result:
        #     if mark == 0:
        #         into = "insert into zhihu_answer_picture_url(answer_id, url) value(%s, %s)"
        #     elif mark == 1:
        #         into = "insert into zhihu_article_picture_url(article_id, url) value(%s, %s)"
        #     values = (data_dict["id"], results)
        #     self.cur.execute(into, values)
        #     self.conn.commit()
        return
    def search_page(self, mark, page_max, start_page=0, need_commend=False):
        '''
        Main entry point.
        params:
            mark -- 0 answer, 1 article, 2 thought
        '''
        offset = start_page
        for i in range(page_max):
            if mark == 0:
                self.search_answer_article_page(offset, 0, 0, need_commend)
            elif mark == 1:
                self.search_answer_article_page(offset, 1, 0, need_commend)
            elif mark == 2:
                self.search_thought_page(offset)
            offset = offset + 20
            time.sleep(10)
        self.conn.close()
        return

    def search_answer_article_page(self, offset, mark, proxies_num=0, need_comment=False):
        '''
        Request one page of answers or articles.
        '''
        offset = str(offset)
        if mark == 0:
            url = self.ANSWER_URL.format(offset)
        elif mark == 1:
            # ARTICLE_URL, like the other *_URL templates referenced below, is not defined in this file
            url = ARTICLE_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("article_error, url : ", url, " status_code: ", get_page.status_code)
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    if one_line["content"] != None:
                        self.parse_sigle_page(one_line, mark, need_comment=need_comment)
                        print("finshed_crawler " + offset)
                except KeyError:
                    # It's fine to drop the last returned data value;
                    # the search api just returns items loosely related to the search.
                    continue
        else:
            print("article_data_error, offset: ", offset, " url: ", url)
        return
    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
        '''
        Request one page of root (parent) comments.
        '''
        offset = str(offset)
        answerid = str(answerid)
        if mark == 0:
            url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
        elif mark == 1:
            url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("root_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.root_comment_data(one_line, answerid, mark)
                    print("finshed_root" + offset)
                except KeyError:
                    continue
        else:
            print("root_data_error")
        next = 0
        if len(page_dict.get('data', [])) == 20:  # guard against responses without "data"
            next = 1
        return next

    def root_comment_data(self, data_dict, answerid, mark):
        '''
        Insert a root comment and walk its child comments.
        '''
        if mark == 0:
            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
        elif mark == 1:
            into = "insert into zhihu_article_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"],
                  data_dict["author"]["member"]["name"],
                  data_dict["content"],
                  answerid,
                  data_dict["child_comment_count"],
                  data_dict["featured"],
                  data_dict["created_time"],
                  data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        offset = 0
        if data_dict["child_comment_count"] != 0:
            next = 1
            while next == 1:
                next = self.search_child_comment(data_dict["id"], offset, mark)
                offset = offset + 20
        return

    def search_child_comment(self, root_comment_id, offset, proxies_num=0):
        '''
        Request one page of child comments.
        '''
        root_comment_id = str(root_comment_id)
        offsets = offset
        offset = str(offset)
        if offsets == 0:
            url = CHILD_COMMENT_START_URL.format(root_comment_id)
        else:
            url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("child_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.child_comment_data(one_line, root_comment_id)
                except KeyError:
                    continue
        else:
            pass
        next = 0
        if len(page_dict.get('data', [])) == 20:
            next = 1
        return next

    def child_comment_data(self, data_dict, root_comment_id):
        '''
        Insert one child-comment row.
        '''
        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
        values = (root_comment_id,
                  data_dict["author"]["member"]["name"],
                  data_dict["content"],
                  data_dict["reply_to_author"]["member"]["name"],
                  data_dict["id"],
                  data_dict["created_time"],
                  data_dict["author"]["member"]["id"])  # author_id column: member id, not the display name
        self.cur.execute(into, values)
        self.conn.commit()
        return
    def headers_handle(self, url):
        '''
        Forge the request headers (including the x-zse-86 signature) for a url.
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        referer = self.spider_url.replace("https://www.zhihu.com/people",
                                          "https://www.zhihu.com/api/v4/members")
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": referer + "/answers?page=1",
        }
        cookies_dict = {
            "d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
            "KLBRSID": None
        }
        cookies_dict.update(res_cookies_dict)
        f = "+".join(["3_2.0", url.replace("https://www.zhihu.com", ""),
                      headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict
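For reference, the signing input built above can be reproduced standalone; a minimal sketch (the api path and referer values are illustrative, and the final header still needs the call into zhihu.js, which is only stubbed here):

import hashlib

def zse86_md5(api_path, referer, d_c0):
    # same "+"-join the spider uses: version + api path + referer + d_c0 cookie
    f = "+".join(["3_2.0", api_path, referer, d_c0])
    return hashlib.md5(f.encode()).hexdigest()

fmd5 = zse86_md5("/api/v4/members/example/answers",
                 "https://www.zhihu.com/api/v4/members/example/answers?page=1",
                 '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"')
# the real header would then be "1.0_" + exec_js.call("b", fmd5)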
    def search_thought_page(self, offset, proxies_num=0):
        '''
        Request one page of thought ("想法") data.
        '''
        offset = str(offset)
        url = THOUGHT_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("article_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.parse_thought_sigle_page(one_line)
                    print("finshed_article" + offset)
                except KeyError:
                    # It's fine to drop the last returned data value;
                    # the search api just returns items loosely related to the search.
                    continue
        else:
            print("article_data_error")
        return

    def parse_thought_sigle_page(self, data_dict):
        '''
        Insert thought content (text rows and picture urls), then walk its comments.
        '''
        for one_dict in data_dict["content"]:
            if one_dict["type"] == "text":
                into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
                values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
                self.cur.execute(into, values)
                self.conn.commit()
            else:
                into = "insert into zhihu_thought_picture_url(thought_id, url) value(%s, %s)"
                values = (data_dict["id"], one_dict["url"])
                self.cur.execute(into, values)
                self.conn.commit()
        offset = 0
        if data_dict["comment_count"] != 0:
            next = 1
            while next == 1:
                next = self.search_thought_comment(data_dict["id"], offset)
                offset = offset + 20
        return

    def search_thought_comment(self, answerid, offset, proxies_num=0):
        '''
        Request one page of thought comments.
        '''
        offset = str(offset)
        answerid = str(answerid)
        url = THOUGHT_COMMENT_URL.format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("root_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.thought_comment_data(one_line, answerid)
                    print("finshed_root" + offset)
                except KeyError:
                    continue
        else:
            print("root_data_error")
        next = 0
        if len(page_dict.get('data', [])) == 20:
            next = 1
        return next

    def thought_comment_data(self, data_dict, answerid):
        '''
        Insert one thought-comment row.
        '''
        into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"],
                  data_dict["author"]["member"]["name"],
                  data_dict["content"],
                  answerid,
                  data_dict["created_time"],
                  data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return
if __name__ == '__main__':
    '''
    python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
    python script_file_path mark(content type: 0 = answers, 1 = articles, 2 = thoughts) max_page(maximum page number) start_page(starting page, 0-based) spider_url(profile URL of the zhihu user to crawl)
    '''
    # get_proxy()
    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
    spider_url = sys.argv[4]
    # spider_url = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers'
    print(datetime.now())
    spider = Spider(spider_url=spider_url)
    # marks 0, 1 and 2 all route through the same entry point
    if mark in (0, 1, 2):
        spider.search_page(mark, max_page, start_page)
    print(datetime.now())