Commit 607c82f7 authored by haowang

fix

parent e3acccb8
import os
import time
from datetime import datetime

if __name__ == '__main__':
    plans = [
"python tasks/zhihu/spider.py 0 3 0 'https://www.zhihu.com/people/chun-feng-zhu-lang'",
"python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/drunkxiaojingguai'",
"python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/kokokou'",
"python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/yo-he-14-20'",
"python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/wen-zi-ding-dan'",
"python tasks/zhihu/spider.py 0 169 0 'https://www.zhihu.com/people/zhengxingba'",
"python tasks/zhihu/spider.py 0 24 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lu-hui'",
"python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/xiao-zhu-99-20'",
"python tasks/zhihu/spider.py 0 119 0 'https://www.zhihu.com/people/zhaotianqiang'",
"python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/zuo-zheng-xing-de-liu-zhe-qi'",
"python tasks/zhihu/spider.py 0 14 0 'https://www.zhihu.com/people/cao-yang-yan-da-shi'",
"python tasks/zhihu/spider.py 0 2 0 'https://www.zhihu.com/people/zhe-mang-guo-mang'",
"python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/yuxi624'",
"python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/key-70-48'",
"python tasks/zhihu/spider.py 0 4 0 'https://www.zhihu.com/people/dryanling'",
"python tasks/zhihu/spider.py 0 2 0 'https://www.zhihu.com/people/shu-er-29-7'",
"python tasks/zhihu/spider.py 0 13 0 'https://www.zhihu.com/people/chen-shi-long-69-98'",
"python tasks/zhihu/spider.py 0 5 0 'https://www.zhihu.com/people/ka-li-yu-dan-94'",
"python tasks/zhihu/spider.py 0 4 0 'https://www.zhihu.com/people/nuo-nuo-jiang-46-40'",
"python tasks/zhihu/spider.py 0 10 0 'https://www.zhihu.com/people/lenhzin'",
"python tasks/zhihu/spider.py 0 7 0 'https://www.zhihu.com/people/tu-zi-ai-chi-cao-78'",
"python tasks/zhihu/spider.py 0 19 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-shi-hu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-ji-jie-jie-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huan-xing-mei-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/suan-jie-shen-ba'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-ji-hua-8'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dong-yan-10-84'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-yu-yan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-mao-wen-46'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/drfranklin-eps'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sxsxp1'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/judyxiao-jie-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-jun'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hu-tao-tao-de-yan-jiu-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-fan-ceng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-zi-xun-shi-yu-er-duo-duo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ju-zi-guo-jiang-20-66'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-qi-shu-shu-15'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chi-bu-ding-de-pang-ding-27-43'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jliyimei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/reseted1503325608'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-wu-a-wu-666'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-ye-xiang-wo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liang-yi-31-45'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhao-xue-qi-2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-gang-2017'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-mao-xue-yuan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mai-tian-li-de-ke-ai-duo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-dobby'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-da-xue-tang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-min-94-15'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-nu-mei-rong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-xiao-xiao-72-30'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wen-bao-guo-bao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhu-li-666'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/si-te-li-ke-lan-de-xian-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-she'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sou-di-leng-hua-zheng-qiao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yimeisiba'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-jie-bu-xing-hua'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bitibo/pins'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-cheng-sheng-zhu-li'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-yi-chao-63'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-shi-er-78-24'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-tian-jiao-sweet'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/can-ma-de-can-lan-sheng-huo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-chi-xue-gao-24'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-bei-bu-ai-hua-zhuang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pluto-54-88'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bai-meng-meng-79-43'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-de-mi-xue'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-yan-lin-22-95'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ning-meng-jing-tang-dou-dou'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wo-you-yizhi-xiao-mao-mi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhengrongjia'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-li-36-48-1'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jin-xiao-mei-88-37'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liang-yi-sheng-99-9'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xia-xiao-xiao-20-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wai-hao-da-biao-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhou-yu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhi-zhi-fei-ji'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pan-er-bo-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cheng-du-long-bi-bo-shi-zhang-han'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/well-1-95'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ying-ying-37-60'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-51-29-23'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-rui-64-16'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-dao-ge'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-yi-he-tian-ling'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13397228518'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-zhi-bing-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-gang-2017'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-lian-mei-9'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zi-fei-yu-79-53'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zengyy-5'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-jiang-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wu-di-30-63-36'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-shuo-ming-shu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shan-shan-lai-zao-liao-35'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-zi-xun-shi-abbie-71'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tou-fa-ke'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/0329'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-hao-90-58-31'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-jing-20-41-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-xiao-62-85'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jian-si-bu-neng-jiu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mi-qi-xian-sheng-72'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiangjy58'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-jiao-zi-mo-mo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/haozemaliang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xie-wen-hui-21-66'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-shua-shua-98'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/amazingbird'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/desperate'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-pei-jun-15'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-mei-xiao-qin'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/meilikepujun'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/juan-mao-er-1-90'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-jun-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hai-da-li-28'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xieyangchun'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/song-song-1-3-27'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-sheng-60-21-71'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/CQ_Hazel'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-you-zi-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-ban-61-99'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-jing-ba-da-chu-jin-xiao-lei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-bai-zhen-hua-54'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-rong-yi-sheng-gao-si'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/du-rui-mi-66'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ke-le-niu-nai-38'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-xiao-de-chun-93'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-chuan-kai-67'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qiang-jiao-de-xiao-tiao-wa'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-le-34-73-57'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lao-lun-si-de-bai-ri-meng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hong-li-34-89'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhao-xiu-li-67'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-er-gou-zi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-zhi-zheng-yi-11'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pi-fu-mei-rong-zhang-wei-hua'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cai-xiao-bai-60-17'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-xia-7-29'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qi-sh-57'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-jiao-wo-yi-mei-shuo-ming-shu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-mo-fang-jiang-zhi-fang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-zai-xian-50'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-jiu-sheng-25-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-jiang-3'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-lan-wu-chang-xiong-41'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hk17320188886'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-fu-wo-xiang-zhi-dao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zuo-zheng-xing-de-ren-han-xiao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gan-cheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-yi-ma'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/aloedeng-ni'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhoushaolong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chaigang9h'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-jian-93-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-hu-hu-69-77'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhihu88'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zou-yu-75-32'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-mo-mo-49-86'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-gu-shi-de-j-xiao-jie-22'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-li-feng-yong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sunzhicheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-bi-yi-sheng-li-chang-fu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-da-da-57'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-bai-58-40'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dong-ming-xian-48'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-luo-bu-gan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zaoxaomei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/da-xiao-bei-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xu-rong-yang-71'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-da-yan-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-wei-68-2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-kong-ju-le-bu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/winni-2-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-jiu-jiu-80-61'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/leon-5-35-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dr-yan-93'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tong-yan-jun-25'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jennykissedme'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/li-chuang-26-64'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-lang-zi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-miao-64-41'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-guan-86-73'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-hou-you-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ce-ge-mo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhong-ri-zheng-xing-huang-wen-gang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-di-xing-kong-70'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/drchenbing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-liu-53-15-36'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-san-gou-67-76'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-xiao-tian-mian-diao-zhuan-jia'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-ye-song-57'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-du-zheng-xing-zhang-yi-sheng-19'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ouruila'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-liang-zong-jian'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-83-93-34'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ren-dong-ivy'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13718001372'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-jing-xiao-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-zheng-cai-hui-ying'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.kdocs.cn/view/l/coV2TD4LExp2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-meng-ming-xing-38'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/alice-95-96-90'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-xing-hu-die-jian-14/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huang-yi-xiang-28-29'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/beauty-44-9'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wei-bo-ying-xiao-xiao-dou'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/my-world-70-8'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ishuangyanpi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ban-tang-ka-pei-60'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miao-miao-miao-5-43'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hejiongchuan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xin-han-50-61'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiang-yan-zhi-kao-jin-de-you-zi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-ai-dou-xiao-mi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ni-gu-man-man'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/DrZhaoyanyong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhi-fang-wei-diao-qiu-li-dong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-gui-zong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiecan1'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lu-shu-rong-6'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiang-tian-xie-1'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-yu-bing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lu-jian-jian-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-li-qiu-yue'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhang-shu-gong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiu-ai-he-nai-cha-62'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jie-wo-yiba-qiang-49'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/13522127270'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-ji-guang-zhong-xin-feng-yong-qiang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bu-zhi-dao-jiao-sha-79-74'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-zhou-xu-99'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-liu-tun-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xian-xian-76-39/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jing-jing-70-85-21'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bing-he-lao-dao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-tai-ling'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mo-xiao-xin-28-35'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-ma-zhu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lovelife-96-78'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/sorry-7'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qi-yong-le-38'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-rong-yi-sheng-wang-shu-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-mi-jie-33-58'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shan-he-yi-qiu-94'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zong-xian-lei-yi-sheng-49'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiong-mao-jiang-zi-18'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-liao-56-14-3'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-qing-ping-ping-36'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhe-2-83'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tao-hua-huan-jiu-qian-64'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-wang-yue'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-jing-ba-da-chu-ma-xiao-yang-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-yu-hao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-hai-ming-97-8'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-xiao-shi-49'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/san-qi-h-79'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tong-xiao-yi-sheng-85'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-yin-hong-yu'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ai-mei-meng-xi-52'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zai-zai-76-64'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/a-di-35-64'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shu-yan-ni-ni/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-li-peng-67'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/Swahili1230'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gorgeous-1-48'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/luna-10-2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qia-wa-yi-fu-50/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lin-xiao-pang-30-84'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/you-zi-shu-27-18'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mc-goldenli'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-dai-zheng-zhou'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-dong-qi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ding-xiang-17-21'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-xiao-lan-56'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wo-shi-ann'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-fu-jun-li'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fan-rong-jie-43'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/liu-liu-zheng-xing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-xiao-xiao-43-80'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tea-5-1-57/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-man-3-54'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jing-guo-shen-she-94'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhou-miao-miao-50-58'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/masakijiang-83'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-han-xing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-zhou-xiao-ke-ai-79-36'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zxys-zhanglong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xin-li-xiao-tian-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hart-48-54'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ou-bu-zhi-dao-52'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/kiki77-21'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chun-bu-zheng-xing-yi-sheng-li-hui'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xia-mo-87-11'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tiffanybaby-11'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-qing-6-40'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jiang-sijun-70-90-26'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wangling-31-13'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xi-82-72-10'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-ai-hong-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhong-zheng-xing-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gao-xiao-shi-pin-lu-ren-jia'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-wang-yan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hong-kao-shi-ba-v'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-wai-ke-wang-xiao-yang-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xie-he-zheng-xing-yi-sheng-wang-yang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bei-yi-san-yuan-zheng-xing-yi-sheng-li-bi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/nan-jing-ruan-tang-xiao-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-meng-chun-16-30'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/column/meidaila'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/column/cosmotology'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miss-miao-jun-81'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/guan-bo-lan-14'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-xue-pi-fu-guan-li-gao-jin-ling'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ji-guang-yu-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/jia-qu-jing-31'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fu-chen-yi-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-liu-xu-li'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhu-dong-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/christy-20-58/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-shen-shen-21-52'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-chen-bo-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bo-luo-15-5-93'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yang-zhou-ifaceai-fei-si'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-wai-ke-yi-sheng-nie-yun-fei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lanzhiyong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gong-da-yi-mei-dsh3225'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-mei-jie-niu-er'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/binghan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-fu-yi-sheng-yi-yang-liang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-li-da-ren-40'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mei-xing-shuo-33'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-tian-xin-ne'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shi-jun-li-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/han-ao-xi-65'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/tang-jian-bing-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/nan-feng-18-6-16'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/chen-xiang-xin-17'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/18611026629'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zhang-shuai-47-80'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/mu-mu-97-35-35'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/miss-fei-fei-68'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-jiang-jun'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/le-xiao-zhang-89'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/joy-71-96-54'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cat-49-49'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-mei-xiao-ni-zi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yimei-38-82'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/CrossoverMarketing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/pi-fu-yi-sheng-zhousir'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-yuan-wai-07'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xi-you-tuan-zi-2'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-mei-yuan-chang-fu-guo-you-bo-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wang-hai-feng-38-32'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/lian-qia-fo-ju-jie'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cong-cong-64-65'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/shang-hai-zheng-xing-chuang-shi-ren'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-deng-gang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ning-meng-mei-wo-meng-29-98'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-huang-rui'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yu-qi-gang-44'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/cheng-zong-yu-12'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bai-li-tian-7-72'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-sun-zhong-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/likemei-shi-yan-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/deng-zheng-jun-zhu-ren'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/hua-shao-30-97'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/man-lun-hua-kai'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/fschaomei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/wen-bi-yun-96'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qing-hua-wu-jia-jun-bo-shi-zheng-xing'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/15104568657'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/dr-fang-81'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/qiong-an-gu-niang'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/gong-fu-qiang-65-23'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yi-xue-mei-rong-wang-xi-you-yi-shi'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/xiao-xiao-shuo-yi-mei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/bian-mei-zhao-ha-ge/answers'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiu-fu-yi-sheng-wang-shao-guo'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-666'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zheng-xing-yi-sheng-lin-lin'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ba-da-chu-zang-meng-qing-yi-sheng'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-xiao-lian-mao'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-da-dong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/LZGdoctor'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-lin-wei'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-zhou-xiao-dong'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/15680033702'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-yin-hong-yu-45'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/huang-jia-cheng-15-88'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/long-bi-yi-sheng-liu-guo-quan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/zheng-xing-yi-sheng-qin-zi-kan'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/ya-tou-20-6-24'",
"python tasks/zhihu/spider.py 0 1000 0 'https://www.zhihu.com/people/yan-kong-ke-pu-67'",
    ]
    for plan in plans:
        print('start plan ', plan, ' at ', datetime.now())
        os.system(plan)
        print('end plan ', plan, ' at ', datetime.now())
        time.sleep(10)
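os.system returns the child process's exit status, which the loop above discards, so a spider that crashes on startup is indistinguishable from one that finished. A minimal sketch of a failure-aware runner, assuming the same plans strings; the subprocess-based helper is illustrative, not part of this commit:

import shlex
import subprocess

def run_plan(plan):
    # shlex.split keeps the quoted profile URL as one argv entry.
    result = subprocess.run(shlex.split(plan))
    if result.returncode != 0:
        # Non-zero exit status: the spider died or was killed.
        print('plan failed (code %d): %s' % (result.returncode, plan))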
-# import rsa
-import os, sys
-base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.append("/Users/haowei")
-sys.path.append("/Users/haowei/workspace/gm/crawler")
-sys.path.append("/Users/haowei/workspace/gm/crawler/crawler_sys")
 import pymysql
 import hashlib
 import requests
@@ -14,6 +7,7 @@ import re
 import sys
 import time
 from datetime import datetime
+import kdl

 HOST = '172.18.51.14'
 PORT = 3306
@@ -22,34 +16,31 @@ PASSWD = 'Gengmei123'
 DB = 'spider'

 # JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
 JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
+APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'


-def retry_get_url_no_proxies(url, retrys=3, timeout=10, **kwargs):
-    retry_c = 0
-    while retry_c < retrys:
-        try:
-            get_resp = requests.get(url, timeout=timeout, **kwargs)
-            return get_resp
-        except Exception as e:
-            retry_c += 1
-            time.sleep(1)
-            print(e)
-    print('Failed to get page %s after %d retries, %s'
-          % (url, retrys, datetime.now()))
-    return None
+def get_proxy():
+    auth = kdl.Auth("990866563045611", APIKEY)
+    client = kdl.Client(auth)
+    ips = client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
+    print("dps proxy: ", ips)
+    return {"http": "http://{}".format(ips[0]), "https": "https://{}".format(ips[0])}


-def retry_get_url(url, retrys=3, timeout=10, **kwargs):
+def retry_get_url(url, retrys=5, timeout=10, proxies=None, **kwargs):
     retry_c = 0
     while retry_c < retrys:
         try:
-            get_resp = requests.get(url, timeout=timeout, **kwargs)
+            if proxies:
+                get_resp = requests.get(url, timeout=timeout, proxies=proxies, **kwargs)
+            else:
+                get_resp = requests.get(url, timeout=timeout, **kwargs)
             return get_resp
         except Exception as e:
             retry_c += 1
-            time.sleep(1)
+            time.sleep(3)
             print(e)
+            proxies = get_proxy()
     print('Failed to get page %s after %d retries, %s'
           % (url, retrys, datetime.now()))
     return None
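get_proxy above returns the scheme-to-URL mapping that requests expects in its proxies argument, and retry_get_url now rotates to a fresh proxy after each failed attempt. A usage sketch with a made-up proxy address; only retry_get_url and get_proxy come from the code above:

# Illustrative only: the address below is a placeholder, not a real proxy.
proxies = {
    "http": "http://59.38.241.25:23916",
    "https": "https://59.38.241.25:23916",
}
resp = retry_get_url('https://www.zhihu.com/people/taoxi-1130', proxies=proxies)
if resp is not None:
    print(resp.status_code)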
@@ -65,9 +56,12 @@ class Spider(object):
                                     passwd=PASSWD,
                                     db=DB, charset='utf8')
         self.cur = self.conn.cursor()
+        self.page_count = 1000
+        self.use_proxy = True
         self.spider_url = spider_url
-        detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset=20&limit=20&sort_by=created'
+        detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={}&limit=20&sort_by=created'
         self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members") + detail_url
         os.environ["EXECJS_RUNTIME"] = 'Node'
@@ -80,6 +74,13 @@ class Spider(object):
         self.exec_js = execjs.compile(js)
         # self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')

+    def update_page_count(self, answer_count):
+        count = int(answer_count / 20)
+        temp = int(answer_count % 20)
+        if temp > 0:
+            count += 1
+        self.page_count = count
+
     def get_serach_page_cookies(self):
         '''
         update cookies
         '''
@@ -90,14 +91,14 @@ class Spider(object):
             "accept-encoding": "gzip, deflate, br",
             "accept-language": "zh-CN,zh;q=0.9",
             "cache-control": "max-age=0",
-            "cookie": '_xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609321344,1609321689,1609321744,1609322777; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609322777; KLBRSID=d017ffedd50a8c265f0e648afe355952|1609323283|1609315999',
+            "cookie": 'SESSIONID=UIZ9mtCMPNttU11zx8a9e5eJcTm92PhBGKiz9oqWgDr; JOID=UlsQAkk86vULooRFHj177i1UY9UKHM_SK4OkYDkdytAsgqVlOzFzH1WjgUcZ-R9yCKLnnTcSKj5UlS_DhJu9iUI=; osd=VFoXB0464_IOpYJEGTh86CxTZtIMHcjXLIWlZzwazNErh6JjOjZ2GFOihkIe_x51DaXhnDAXLThVkirEgpq6jEU=; SESSIONID=rsVkcWbq9ESuP7O4fOw4qdMJdkNGnCFu59zCNAAkoIL; JOID=VV4TCkoAD-uttc-DPQ6Y9IZDJxUtIizHhpDtoBElLciBnuqhHkmjAfyyzow6twj5biJFaHi7j_WoTqKkbWlN0QI=; osd=UFkVCk4FCO2tscqEOw6c8YFFJxEoJSrHgpXqphEhKM-Hnu6kGU-jBfm1yIw-sg__biZAb367i_CvSKKgaG5L0QY=; _xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609325059,1609337218,1609401296,1609405637; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609410807; KLBRSID=0a401b23e8a71b70de2f4b37f5b4e379|1609410806|1609401296',
             "referer": self.spider_url,
             "sec-fetch-dest": "document",
             "sec-fetch-mode": "navigate",
             "sec-fetch-site": "same-origin",
             "sec-fetch-user": "?1",
             "upgrade-insecure-requests": "1",
-            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
         }
         requests_res = retry_get_url(self.spider_url, headers=headers)
         return requests_res.cookies.get_dict()
@@ -121,7 +122,7 @@ class Spider(object):
             return True
         return False

-    def parse_sigle_page(self, data_dict, mark, need_comment=False):
+    def parse_sigle_page(self, data_dict, mark):
         '''
         insert the main content data and picture urls, then look for comments
         '''
@@ -141,26 +142,6 @@ class Spider(object):
         self.cur.execute(into, values)
         self.conn.commit()

-        if need_comment:
-            offset = 0
-            if data_dict["comment_count"] != 0:
-                next = 1
-                while next == 1:
-                    next = self.search_root_comment(data_dict["id"], offset, mark)
-                    offset = offset + 20
-
-        # patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
-        # pattern = re.compile(patt)
-        # result = pattern.findall(data_dict["content"])
-        # for results in result:
-        #     if mark == 0:
-        #         into = "insert into zhihu_answer_picture_url(answer_id, url) value(%s, %s)"
-        #     elif mark == 1:
-        #         into = "insert into zhihu_article_picture_url(article_id, url) value(%s, %s)"
-        #     values = (data_dict["id"], results)
-        #     self.cur.execute(into, values)
-        #     self.conn.commit()
         return

     def search_page(self, mark, page_max, start_page=0, need_commend=False):
@@ -173,10 +154,12 @@ class Spider(object):
         offset = start_page
         for i in range(page_max):
+            if i > self.page_count - 1:
+                break
             if mark == 0:
-                self.search_answer_article_page(offset, 0, 0, need_commend)
+                self.search_answer_article_page(offset, 0, 0)
             elif mark == 1:
-                self.search_answer_article_page(offset, 1, 0, need_commend)
+                self.search_answer_article_page(offset, 1, 0)
             elif mark == 2:
                 self.search_thought_page(offset)
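The new early-break relies on self.page_count, which update_page_count (added earlier in this diff) derives from the answer total at 20 items per page; the divide-and-bump-on-remainder logic is plain ceiling division. An equivalent one-liner, shown for clarity only:

import math

answer_count = 169  # e.g. the "totals" value from the paging payload
page_count = math.ceil(answer_count / 20)  # same result as update_page_count
assert page_count == 9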
@@ -186,7 +169,7 @@ class Spider(object):
         self.conn.close()
         return

-    def search_answer_article_page(self, offset, mark, proxies_num=0, need_comment=False):
+    def search_answer_article_page(self, offset, mark, proxies_num=0):
         '''
         request the answer and article data payloads
         '''
@@ -197,18 +180,25 @@ class Spider(object):
             url = ARTICLE_URL.format(offset)
         [headers_search, cookies_dict] = self.headers_handle(url)
-        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
+        proxies = None
+        if self.use_proxy:
+            proxies = get_proxy()
+        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies)
         if get_page.status_code != 200:
             # retry once
-            get_page = requests.get(url)
+            time.sleep(3)
+            get_page = retry_get_url(url, proxies=proxies)
             if get_page.status_code != 200:
                 print("article_error, url : ", url, " status_code: ", get_page.status_code)
         page_dict = get_page.json()
         if page_dict.get("data"):
+            print(self.page_count)
+            if self.page_count == 1000:
+                self.update_page_count(page_dict["paging"].get("totals", 0))
             for one_line in page_dict['data']:
                 try:
                     if one_line["content"] != None:
-                        self.parse_sigle_page(one_line, mark, need_comment=need_comment)
+                        self.parse_sigle_page(one_line, mark)
                     print("finshed_crawler " + offset)
                 except KeyError:
                     # It's totally ok to drop the last return data value.
@@ -216,113 +206,9 @@ class Spider(object):
                     continue
         else:
             print("article_data_error, offset: ", offset, " url: ", url)
-        return
+            self.use_proxy = True
+            time.sleep(3)
+            self.search_answer_article_page(offset=offset, mark=mark)
-
-    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
-        '''
-        request the root (parent) comment data payloads
-        '''
-        offset = str(offset)
-        answerid = str(answerid)
-        if mark == 0:
-            url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
-        elif mark == 1:
-            url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
-        [headers_search, cookies_dict] = self.headers_handle(url)
-        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
-        if get_page.status_code != 200:
-            # retry once
-            get_page = requests.get(url)
-            if get_page.status_code != 200:
-                print("root_comment_error")
-        page_dict = get_page.json()
-        if page_dict.get("data"):
-            for one_line in page_dict['data']:
-                try:
-                    self.root_comment_data(one_line, answerid, mark)
-                    print("finshed_root" + offset)
-                except KeyError:
-                    continue
-        else:
-            print("root_data_error")
-        next = 0
-        if len(page_dict['data']) == 20:
-            next = 1
-        return next
-
-    def root_comment_data(self, data_dict, answerid, mark):
-        '''
-        insert root-comment data and link up its child comments
-        '''
-        if mark == 0:
-            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
-        elif mark == 1:
-            into = "insert into zhihu_article_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
-        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid,
-                  data_dict["child_comment_count"], data_dict["featured"], data_dict["created_time"],
-                  data_dict["author"]["member"]["id"])
-        self.cur.execute(into, values)
-        self.conn.commit()
-        offset = 0
-        if data_dict["child_comment_count"] != 0:
-            next = 1
-            while next == 1:
-                next = self.search_child_comment(data_dict["id"], offset, mark)
-                offset = offset + 20
-        return
-
-    def search_child_comment(self, root_comment_id, offset, proxies_num=0):
-        '''
-        request the child-comment data payloads
-        '''
-        root_comment_id = str(root_comment_id)
-        offsets = offset
-        offset = str(offset)
-        if offsets == 0:
-            url = CHILD_COMMENT_START_URL.format(root_comment_id)
-        else:
-            url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, offset)
-        [headers_search, cookies_dict] = self.headers_handle(url)
-        get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
-        if get_page.status_code != 200:
-            # retry once
-            get_page = requests.get(url)
-            if get_page.status_code != 200:
-                print("child_comment_error")
-        page_dict = get_page.json()
-        if page_dict.get("data"):
-            for one_line in page_dict['data']:
-                try:
-                    self.child_comment_data(one_line, root_comment_id)
-                except KeyError:
-                    continue
-        else:
-            pass
-        next = 0
-        if len(page_dict['data']) == 20:
-            next = 1
-        return next
-
-    def child_comment_data(self, data_dict, root_comment_id):
-        '''
-        insert child-comment data
-        '''
-        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
-        values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"],
-                  data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"],
-                  data_dict["author"]["member"]["name"])
-        self.cur.execute(into, values)
-        self.conn.commit()
         return
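Note the shape of the error path added above: on a bad payload the code enables the proxy, sleeps, and calls search_answer_article_page again, so a persistently blocked profile recurses without a depth limit. Purely as an alternative sketch (not what the commit does), the same retry intent can be expressed as a bounded loop; fetch_with_bounded_retries and its max_retries parameter are invented names:

import time

def fetch_with_bounded_retries(fetch_once, max_retries=5):
    # fetch_once() should return True on success, False on a bad payload.
    for _ in range(max_retries):
        if fetch_once():
            return True
        time.sleep(3)  # same backoff interval the commit uses
    return False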
@@ -340,7 +226,7 @@ class Spider(object):
             "sec-fetch-dest": "empty",
             "sec-fetch-mode": "cors",
             "sec-fetch-site": "same-origin",
-            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
             "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
             "x-api-version": "3.0.91",
             "x-app-za": "OS=Web",
@@ -352,7 +238,7 @@ class Spider(object):
         }
         cookies_dict = {
             "d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
-            "KLBRSID": None
+            "KLBRSID": '0a401b23e8a71b70de2f4b37f5b4e379|1609410806|1609401296'
         }
         cookies_dict.update(res_cookies_dict)
...@@ -363,110 +249,13 @@ class Spider(object): ...@@ -363,110 +249,13 @@ class Spider(object):
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5) headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
return headers_search, cookies_dict return headers_search, cookies_dict
# import rsa
import os
import sys

# Make the local project checkout importable when running the script directly.
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append("/Users/haowei")
sys.path.append("/Users/haowei/workspace/gm/crawler")
sys.path.append("/Users/haowei/workspace/gm/crawler/crawler_sys")

import hashlib
import re
import time
from datetime import datetime

import execjs
import kdl
import pymysql
import requests
HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
# JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'
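# HOST/PORT/USER/PASSWD/DB configure the spider MySQL instance, JS_FILE_PATH
# points at the x-zse-86 signing script, and APIKEY is the Kuaidaili account
# key used by get_proxy() below.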
def get_proxy():
# Fetch one short-lived proxy IP from the Kuaidaili (kdl) dps API.
auth = kdl.Auth("990866563045611", APIKEY)
client = kdl.Client(auth)
ips = client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
print("dps proxy: ", ips)
# url = 'http://dps.kuaidaili.com/api/getdps/'
# params = {
# 'orderid': '990866563045611',
# 'num': '1',
# 'ut': '1',
# 'format': 'json',
# 'sep': '1',
# }
# res = requests.get(url, params=params)
def retry_get_url_no_proxies(url, retrys=5, timeout=10, **kwargs):
# Plain fetch without a proxy; discard any proxies flag passed by callers so
# it never reaches requests.get.
kwargs.pop('proxies', None)
retry_c = 0
while retry_c < retrys:
try:
get_resp = requests.get(url, timeout=timeout, **kwargs)
return get_resp
except Exception as e:
retry_c += 1
time.sleep(1)
print(e)
print('Failed to get page %s after %d retries, %s'
% (url, retrys, datetime.now()))
return None
def retry_get_url(url, retrys=5, timeout=10, **kwargs):
# Callers pass proxies=proxies_num (an int flag); pop it so it does not
# collide with the real proxies dict handed to requests.get.
kwargs.pop('proxies', None)
retry_c = 0
while retry_c < retrys:
try:
proxies = { "http": "http://120.38.67.137:15374", "https": "https://120.38.67.137:15374", }
get_resp = requests.get(url, timeout=timeout, proxies=proxies, **kwargs)
return get_resp
except Exception as e:
retry_c += 1
time.sleep(1)
print(e)
print('Failed to get page %s after %d retries, %s'
% (url, retrys, datetime.now()))
return None
class Spider(object):
def __init__(self, spider_url):
'''
Initialize the database connection and compile the JS signing rules.
'''
self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
passwd=PASSWD,
db=DB, charset='utf8')
self.cur = self.conn.cursor()
self.spider_url = spider_url
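# Derive the answers API endpoint from the profile URL: swap the /people/
# path for /api/v4/members/ and append the long field-include query string.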
detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={}&limit=20&sort_by=created'
self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members") + detail_url
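# Run the signing script under Node (zhihu.js presumably needs a full Node runtime).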
os.environ["EXECJS_RUNTIME"] = 'Node'
try:
with open('./zhihu.js', 'r', encoding='utf-8') as f:
js = f.read()
except (IOError, OSError):
with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
js = f.read()
self.exec_js = execjs.compile(js)
# self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
def get_serach_page_cookies(self):
'''
Refresh cookies by fetching the user's profile page.
'''
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
"cookie": '_xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609321344,1609321689,1609321744,1609322777; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609322777; KLBRSID=d017ffedd50a8c265f0e648afe355952|1609323283|1609315999',
"referer": self.spider_url,
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
}
requests_res = retry_get_url(self.spider_url, headers=headers)
if requests_res is None:
return {}
return requests_res.cookies.get_dict()
def check_data_exist(self, data_dict, mark):
'''
Check whether the record already exists (dedupe on answer_id) before inserting.
'''
sql = "select id from {table} where answer_id = {id_}"
exist = None
if mark == 0:
select_sql = sql.format(table='zhihu_answer', id_=data_dict["id"])
self.cur.execute(select_sql)
exist = self.cur.fetchone()
if mark == 1:
select_sql = sql.format(table='zhihu_article', id_=data_dict["id"])
self.cur.execute(select_sql)
exist = self.cur.fetchone()
if exist:
return True
return False
def parse_sigle_page(self, data_dict, mark, need_comment=False):
'''
Insert the main content data, then look up its comments when requested.
'''
if not self.check_data_exist(data_dict, mark):
if mark == 0:
into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
data_dict["comment_count"], data_dict["content"])
print(data_dict["question"]["title"])
elif mark == 1:
into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (
data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
data_dict["content"])
# Articles carry "title" at the top level, not under a nested "question" key.
print(data_dict["title"])
self.cur.execute(into, values)
self.conn.commit()
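# Page through root comments 20 at a time: search_root_comment returns 1
# while full pages keep arriving and 0 once a short page signals the end.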
if need_comment:
offset = 0
if data_dict["comment_count"] != 0:
next = 1
while next == 1:
next = self.search_root_comment(data_dict["id"], offset, mark)
offset = offset + 20
# patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
# pattern = re.compile(patt)
# result = pattern.findall(data_dict["content"])
# for results in result:
# if mark == 0:
# into = "insert into zhihu_answer_picture_url(answer_id, url) value(%s, %s)"
# elif mark == 1:
# into = "insert into zhihu_article_picture_url(article_id, url) value(%s, %s)"
# values = (data_dict["id"], results)
# self.cur.execute(into, values)
# self.conn.commit()
return
def search_page(self, mark, page_max, start_page=0, need_commend=False):
'''
Main entry point.
params:
mark: 0 = answer, 1 = article, 2 = thought
'''
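# Each API page holds up to 20 items, so the offset advances by 20;
# the 10-second sleep throttles requests between pages.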
offset = start_page
for i in range(page_max):
if mark == 0:
self.search_answer_article_page(offset, 0, 0, need_commend)
elif mark == 1:
self.search_answer_article_page(offset, 1, 0, need_commend)
elif mark == 2:
self.search_thought_page(offset)
offset = offset + 20
time.sleep(10)
self.conn.close()
return
def search_answer_article_page(self, offset, mark, proxies_num=0, need_comment=False):
'''
Request the data pages for answers and articles.
'''
offset = str(offset)
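# mark selects the endpoint: 0 hits the answers feed, 1 the articles feed.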
if mark == 0:
url = self.ANSWER_URL.format(offset)
elif mark == 1:
url = ARTICLE_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
if get_page is None or get_page.status_code != 200:
# retry once without the signed headers
get_page = requests.get(url)
if get_page.status_code != 200:
print("article_error, url : ", url, " status_code: ", get_page.status_code)
return
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
try:
if one_line["content"] != None:
self.parse_sigle_page(one_line, mark, need_comment=need_comment)
print("finshed_crawler " + offset)
except KeyError:
# It's totally ok to drop the last return data value.
# The search api just return something seems related to search
continue
else:
print("article_data_error, offset: ", offset, " url: ", url)
return
def search_root_comment(self, answerid, offset, mark, proxies_num=0):
'''
Request the data pages for root (parent) comments.
'''
offset = str(offset)
answerid = str(answerid)
if mark == 0:
url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
elif mark == 1:
url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
[headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
if get_page is None or get_page.status_code != 200:
# retry once
get_page = requests.get(url)
if get_page.status_code != 200:
print("root_comment_error")
return 0
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
try:
self.root_comment_data(one_line, answerid, mark)
print("finshed_root" + offset)
except KeyError:
continue
else:
print("root_data_error")
next = 0
if len(page_dict.get('data', [])) == 20:
next = 1
return next
def root_comment_data(self, data_dict, answerid, mark):
'''
Insert the root comment and then page through its child comments.
'''
if mark == 0:
into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
elif mark == 1:
into = "insert into zhihu_article_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid,
data_dict["child_comment_count"], data_dict["featured"], data_dict["created_time"],
data_dict["author"]["member"]["id"])
self.cur.execute(into, values)
self.conn.commit()
offset = 0
if data_dict["child_comment_count"] != 0:
next = 1
while next == 1:
next = self.search_child_comment(data_dict["id"], offset)
offset = offset + 20
return
def search_child_comment(self, root_comment_id, offset, proxies_num=0):
'''
Request the data pages for child comments.
'''
root_comment_id = str(root_comment_id)
# The first page of child comments uses a different endpoint than the
# subsequent offset-based pages.
if offset == 0:
url = CHILD_COMMENT_START_URL.format(root_comment_id)
else:
url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, str(offset))
[headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
if get_page is None or get_page.status_code != 200:
# retry once
get_page = requests.get(url)
if get_page.status_code != 200:
print("child_comment_error")
return 0
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
try:
self.child_comment_data(one_line, root_comment_id)
except KeyError:
continue
else:
pass
next = 0
if len(page_dict.get('data', [])) == 20:
next = 1
return next
def child_comment_data(self, data_dict, root_comment_id):
'''
Insert a child comment row.
'''
into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"],
data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"],
data_dict["author"]["member"]["id"])
self.cur.execute(into, values)
self.conn.commit()
return
def headers_handle(self, url):
'''
Build the disguised (signed) request headers and cookies for a given url.
'''
res_cookies_dict = self.get_serach_page_cookies()
referer = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")
headers_search = {
"accept": "*/*",
"accept-encoding": "gzip, deflate",
"accept-language": "zh-CN,zh;q=0.9",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
"x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
"x-api-version": "3.0.91",
"x-app-za": "OS=Web",
"x-requested-with": "fetch",
"x-zse-83": "3_2.0",
"x-zse-86": None,
"referer": referer + "/answers?page=1",
}
cookies_dict = {
"d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
"KLBRSID": None
}
cookies_dict.update(res_cookies_dict)
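# x-zse-86 anti-crawler signature: md5 over "3_2.0+{api path}+{referer}+{d_c0}",
# then encoded by the "b" function exported from zhihu.js.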
f = "+".join(
["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
fmd5 = hashlib.new('md5', f.encode()).hexdigest()
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
return headers_search, cookies_dict
def search_thought_page(self, offset, proxies_num=0):
'''
Request the data pages for thoughts (Zhihu pins).
'''
offset = str(offset)
url = THOUGHT_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
if get_page is None or get_page.status_code != 200:
# retry once
get_page = requests.get(url)
if get_page.status_code != 200:
print("thought_error")
return
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
try:
self.parse_thought_sigle_page(one_line)
print("finshed_article" + offset)
except KeyError:
# It's totally ok to drop the last return data value.
# The search api just return something seems related to search
continue
else:
print("thought_data_error")
return
def parse_thought_sigle_page(self, data_dict):
'''
Insert thought content and picture URLs, then crawl its comments.
'''
for one_dict in data_dict["content"]:
if one_dict["type"] == "text":
into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
self.cur.execute(into, values)
self.conn.commit()
else:
into = "insert into zhihu_thought_picture_url(thought_id, url) value(%s, %s)"
values = (data_dict["id"], one_dict["url"])
self.cur.execute(into, values)
self.conn.commit()
offset = 0
if data_dict["comment_count"] != 0:
next = 1
while next == 1:
next = self.search_thought_comment(data_dict["id"], offset)
offset = offset + 20
return
def search_thought_comment(self, answerid, offset, proxies_num=0):
'''
Request the data pages for thought comments.
'''
offset = str(offset)
answerid = str(answerid)
url = THOUGHT_COMMENT_URL.format(answerid, offset)
[headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
if get_page is None or get_page.status_code != 200:
# retry once
get_page = requests.get(url)
if get_page.status_code != 200:
print("root_comment_error")
return 0
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
try:
self.thought_comment_data(one_line, answerid)
print("finshed_root" + offset)
except KeyError:
continue
else:
print("root_data_error")
next = 0
if len(page_dict.get('data', [])) == 20:
next = 1
return next
def thought_comment_data(self, data_dict, answerid):
'''
Insert a thought comment row.
'''
into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid,
data_dict["created_time"], data_dict["author"]["member"]["id"])
self.cur.execute(into, values)
self.conn.commit()
return
if __name__ == '__main__':
'''
python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
python script_file_path mark(content type: 0 = answer, 1 = article, 2 = thought) max_page(maximum number of pages) start_page(starting page, 0-based) spider_url(Zhihu profile URL of the user to crawl)
'''
# get_proxy()
mark = int(sys.argv[1])
max_page = int(sys.argv[2])
start_page = int(sys.argv[3])
spider_url = sys.argv[4]
# spider_url = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers'
print(datetime.now())
spider = Spider(spider_url=spider_url)
if mark in (0, 1, 2):
spider.search_page(mark, max_page, start_page)
print(datetime.now())