Commit 2f3b73ca authored by litaolemo's avatar litaolemo

update

parent 288613fc
......@@ -12,10 +12,12 @@ Due to the data-type used in redis is set so I changed the word from list to set
platform_redis_set_reg = {
'toutiao': 'toutiao_url_set',
'腾讯视频': 'v_qq_url_set',
'youku': 'youku_url_set',
'iqiyi': 'iqiyi_url_set',
'toutiao': 'toutiao_url_hash',
'腾讯视频': 'v_qq_url_hash',
'youku': 'youku_url_hash',
'iqiyi': 'iqiyi_url_hash',
'weibo': 'weibo_url_hash',
'douban': 'douban_url_hash',
}
......
......@@ -4,22 +4,12 @@ Created on Wed Jun 6 18:18:09 2018
@author: hanye
"""
import redis
import redis, json
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=0)
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19)
def url_reformer(platform, url):
    """
    Normalize *url* for the given *platform* before pushing it to redis.

    Different URLs may point at the same resource -- e.g.
    http://www.toutiao.com/group/1234567890123456789 versus
    http://www.365yg.com/u/1234567890123456789 -- so they should be
    collapsed into one canonical form to avoid duplicate crawling.
    Currently this is an identity transform; platform-specific rewrite
    rules can be plugged in here later.
    """
    return url
def feed_url_into_redis(dict_Lst, platform,
release_time_lower_bdr=None,
......@@ -32,35 +22,15 @@ def feed_url_into_redis(dict_Lst, platform,
is not given when call this function, all urls will be
pushed into redis.
"""
redis_list_name = get_redis_list_name(platform, batch_str)
if redis_list_name is None:
print('Failed to get correct redis list name '
'in platform_redis_register for platform: '
% platform)
return (None, None)
else:
print('Feeding url into redis list %s ...' % redis_list_name)
url_counter = 0
for data_dict in dict_Lst:
try:
url = data_dict['url']
url_reformed = url_reformer(platform, url)
if release_time_lower_bdr is None:
sadd_c = rds.sadd(redis_list_name, url_reformed)
url_counter += sadd_c
else:
url_release_time = data_dict['release_time']
if url_release_time >= release_time_lower_bdr:
sadd_c = rds.sadd(redis_list_name, url_reformed)
url_counter += sadd_c
except:
print('Failed to push url into redis, '
'might because of lack of url field '
'or lack of release_time field, or '
'has wrong typed release_time value. '
'The failed data dict is: \n %s' % data_dict)
print('Pushed %d urls into redis' % url_counter)
return (redis_list_name, url_counter)
for data_dict in dict_Lst:
try:
doc_id = data_dict['doc_id']
sadd_c = rds.lpush(doc_id, json.dumps(data_dict))
rds.expire(doc_id,259200)
except:
print('Failed to push data into redis')
print('Pushed data into redis')
return True
def pull_url_from_es(platform, release_time_lower_bdr=None):
"""
......
......@@ -16,7 +16,7 @@ Data in es will be update when run this program once.
"""
from crawler.crawler_sys.site_crawler_by_redis import (crawler_toutiao, crawler_v_qq, crawler_tudou, crawler_haokan,
crawler_tencent_news,
crawler_wangyi_news, crawler_kwai, crawler_douyin,toutiao_article,crawler_weibo)
crawler_wangyi_news, crawler_kwai, crawler_douyin,toutiao_article,crawler_weibo,crawler_douban)
import sys
from crawler.crawler_sys.utils.output_results import output_result
import argparse, copy, datetime, time
......@@ -46,14 +46,12 @@ parser.add_argument('-n', '--max_page', default=30, type=int,
'must be an int value, default to 30.'))
parser.add_argument('-f', '--output_file_path', default='', type=str,
help=('Specify output file path, default None.'))
parser.add_argument('-r', '--push_to_redis', default='False', type=str,
parser.add_argument('-r', '--push_to_redis', default=False, type=bool,
help=('Write urls to redis or not, default to True'))
parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-index', '--es_index', default='crawler-data-raw', type=str,
help=('assign a es_index to write into, default to crawler-data-raw'))
parser.add_argument('-doc', '--doc_type', default='doc', type=str,
help=('assign a doc to write into, default to doc'))
parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-processes', '--processes_num', default=8, type=int,
......@@ -84,7 +82,8 @@ platform_crawler_reg = {
'抖音': crawler_douyin.Crawler_douyin,
"网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
"kwai": crawler_kwai.Crawler_kwai,
"weibo": crawler_weibo.Crawler_weibo
"weibo": crawler_weibo.Crawler_weibo,
"douban":crawler_douban.Crawler_douban
}
......@@ -212,6 +211,7 @@ def single_thead(processe,name):
if not count_has:
releaser_body["mssage"] = "爬取失败,请检查账号"
rds_1.hset("error",releaser_body["platform"] + "/" +releaser_body["releaserUrl"],json.dumps(releaser_body))
if data_list != []:
output_result(result_Lst=data_list,
platform=platform,
......@@ -219,7 +219,6 @@ def single_thead(processe,name):
filepath=None,
output_to_es_raw=output_to_es_raw,
es_index=es_index,
doc_type=doc_type,
output_to_es_register=output_to_es_register)
print(len(data_list))
data_list.clear()
......@@ -249,6 +248,7 @@ def start_crawler(processe,name):
# # t.setDaemon(False) #
t.start()
if __name__ == "__main__":
executor = ProcessPoolExecutor(max_workers=processes_num)
futures = []
......
......@@ -21,7 +21,7 @@ from crawler.crawler_sys.utils.util_logging import logged
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id
class Crawler_weibo():
def __init__(self, timeout=None, platform='weibo'):
......@@ -65,6 +65,20 @@ class Crawler_weibo():
def get_releaser_id(self, releaserUrl):
return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
@staticmethod
def get_img(data):
img_list = []
if data.get("pics"):
for one in data.get("pics"):
try:
img_list.append(one["large"]["url"])
except Exception as e:
img_list.append(one["url"])
print("add img error %s" % e)
return img_list
def releaser_page(self, releaserUrl,
output_to_file=False, filepath=None,
output_to_es_raw=False,
......@@ -75,7 +89,7 @@ class Crawler_weibo():
doc_type=None, proxies_num=None):
print('Processing releaserUrl %s' % releaserUrl)
result_Lst = []
releaser_id, containerid = self.get_releaser_id(releaserUrl)
releaser_id = self.get_releaser_id(releaserUrl)
# xsrf_token,url_extr = self.get_weibo_info(releaser_id)
headers = {
"accept": "application/json, text/plain, */*",
......@@ -131,6 +145,9 @@ class Crawler_weibo():
mid = mblog.get("mid")
forward_text = ""
forward_user = ""
doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
doc_id_type="all-time-url")
if one.get("source") == "绿洲":
text_type = "绿洲"
elif mblog.get("retweeted_status"):
......@@ -156,7 +173,9 @@ class Crawler_weibo():
"forward_text": forward_text,
"mid": mid,
"releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
"releaser_id_str": "weibo_%s" % releaser_id
"releaser_id_str": "weibo_%s" % releaser_id,
"img_list":self.get_img(mblog),
"doc_id":doc_id
}
yield res_dic
except Exception as e:
......@@ -189,7 +208,7 @@ if __name__ == '__main__':
# releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d'
url_list = [
# "https://weibo.com/u/1764615662",
# "https://weibo.com/u/3662247177",
"https://weibo.com/u/3662247177",
# "https://weibo.com/u/2378564111",
# "https://weibo.com/u/2983578965",
# "https://weibo.com/u/3938976579",
......@@ -198,13 +217,14 @@ if __name__ == '__main__':
# "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place",
]
# res = test.releaser_page(url, output_to_es_raw=True,
# es_index='crawler-data-raw',
# releaser_page_num_max=400,proxies_num=0)
# for r in res:
# print(r)
for u in url_list:
test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_raw=False,
es_index='crawler-data-raw',
doc_type='doc', releaser_page_num_max=4000)
for url in url_list:
res = test.releaser_page(url, output_to_es_raw=True,
es_index='crawler-data-raw',
releaser_page_num_max=400,proxies_num=0)
for r in res:
print(r)
# for u in url_list:
# test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_raw=False,
# es_index='crawler-data-raw',
# doc_type='doc', releaser_page_num_max=4000)
# test.get_single_page(4524055937468233)
......@@ -27,8 +27,10 @@ from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_tim
# from crawler.crawler_sys.utils.util_logging import logged
# from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from bs4 import BeautifulSoup
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id
class Crawler_douban():
......@@ -70,17 +72,23 @@ class Crawler_douban():
# content = dehtml(page_json["content"])
if page_json.get('localized_message'):
continue
# content_html = """<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>Title</title></head><body>%s</body></html>""" % page_json["content"]
# bs = BeautifulSoup(content_html, "html.parser")
# content = bs.textarea.get_text()
content = page_json["content"]
repost_count = trans_play_count(page_json["reshares_count"])
comment_count = trans_play_count(page_json["comments_count"])
favorite_count = trans_play_count(page_json["like_count"])
collection_count = trans_play_count(page_json["collections_count"])
img_list = re.findall('img src=".*?"',content)
dic = {
"content":content,
"repost_count":repost_count,
"comment_count":comment_count,
"favorite_count":favorite_count,
"collection_count":collection_count,
"img_list":img_list,
}
return dic
except Exception as e:
......@@ -148,6 +156,8 @@ class Crawler_douban():
for one in page_dic:
releaser_id = one["author"]["id"]
mid = one["id"]
doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,doc_id_type="all-time-url")
try:
res_dic = {
"release_time": trans_strtime_to_timestamp(one["create_time"]),
......@@ -160,10 +170,14 @@ class Crawler_douban():
"releaserUrl": "https://www.douban.com/people/%s" % releaser_id,
"releaser_id_str": "douban_%s" % releaser_id,
'video_img':one["cover_url"],
"mid":mid
"mid":mid,
"platform":"douban",
"doc_id":doc_id
}
res_dic.update(self.get_single_page(mid,proxies_num))
print(res_dic)
yield res_dic
except Exception as e:
print(one)
......
......@@ -192,8 +192,13 @@ class Crawler_weibo():
"forward_text":forward_text,
"mid":mid,
"releaserUrl":"https://www.weibo.com/u/%s" % releaser_id,
"releaser_id_str":"weibo_%s" % releaser_id
"releaser_id_str":"weibo_%s" % releaser_id,
"platform":"weibo"
}
# from write_data_into_es.func_cal_doc_id import cal_doc_id
# id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
# doc_id_type="all-time-url")
# print(id)
yield res_dic
except Exception as e:
print(mblog)
......
......@@ -68,6 +68,7 @@ def hot_words_output_result(result_Lst,output_index="short-video-hotwords"):
if eror_dic['errors'] is True:
print(eror_dic)
def output_result(result_Lst, platform,
output_to_file=False, filepath=None,
output_to_es_raw=False,
......@@ -91,10 +92,8 @@ def output_result(result_Lst, platform,
# feed url into redis
if push_to_redis:
redis_list_name, url_counter = feed_url_into_redis(
result_Lst, platform,
batch_str=batch_str,
release_time_lower_bdr=release_time_lower_bdr)
feed_url_into_redis(
result_Lst, platform)
# output into file according to passed in parameters
if output_to_file is True and filepath is not None:
......@@ -144,6 +143,7 @@ def get_ill_encoded_str_posi(UnicodeEncodeError_msg):
pass
return posi_nums
def bulk_write_into_es(dict_Lst,
index,
construct_id=False,
......@@ -275,7 +275,6 @@ def scan_redis_to_crawl():
break
def remove_fetched_url_from_redis(remove_interval=10):
time.sleep(remove_interval)
cur = 0
......
......@@ -4,4 +4,5 @@ tqdm==4.46.1
absl-py==0.9.0
kdl==0.2.15
redis==3.5.3
elasticsearch==7.8.0
\ No newline at end of file
elasticsearch==7.8.0
qiniu==7.2.8
\ No newline at end of file
......@@ -275,6 +275,13 @@ def weibo(releaserUrl,**kwargs):
except:
return None
def douban(releaserUrl, **kwargs):
    """Return the douban releaser id parsed from a people-page URL.

    For URLs like https://www.douban.com/people/<id> the trailing <id>
    (everything after the first "people/") is returned; any other URL
    yields None.
    """
    matched = re.search(r"people/(.*)", releaserUrl)
    if matched:
        return matched.group(1)
    return None
plantform_func = {
"toutiao": toutiao,
"西瓜":toutiao,
......@@ -289,7 +296,8 @@ plantform_func = {
"抖音":douyin,
"weixin":weixin,
"weibo":weibo,
"pearvideo":pearvideo
"pearvideo":pearvideo,
"douban":douban
}
......
......@@ -273,8 +273,98 @@ if __name__ == "__main__":
{"releaserUrl": "https://weibo.com/p/1005055634795408/home?from=page_100505&mod=TAB#place", "releaser": "圈八戒 ",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6511173721", "releaser": "圈内课代表", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place",
"releaser": "娱闻少女", "platform": "weibo"}
{"releaserUrl": "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB#place", "releaser": "娱闻少女",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3193443435", "releaser": "圈太妹", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2022990945", "releaser": "圈内狙击手", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1809782810?is_all=1", "releaser": "全娱乐爆料", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5157190426?is_all=1", "releaser": "娱乐扒少", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2125613987?is_all=1", "releaser": "圈内一把手 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005051948622644/home?from=page_100505&mod=TAB#place",
"releaser": "影视圈扒姐 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2611791490", "releaser": "娱评八公", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1652840683", "releaser": "追星", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5086098727?is_hot=1", "releaser": "闻娱教主", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5101787982?is_all=1", "releaser": "扒婆说", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5101844765?is_hot=1", "releaser": "星娱客 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052115034114/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐明星团 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6473952993?is_hot=1", "releaser": "偶像日报", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5106602573?is_hot=1", "releaser": "八哥", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5909342713?", "releaser": "圈内教父", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3200673035?", "releaser": "扒圈老鬼", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005055965621313/home?from=page_100505&mod=TAB#place", "releaser": "圈内师爷",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1915749764?is_hot=1", "releaser": "迷妹速报", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1002061836328652/home?from=page_100206&mod=TAB#place", "releaser": "前线娱乐",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5896207859?is_hot=1", "releaser": "娱记者", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5717515328?is_hot=1", "releaser": "娱老汉", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005051795994180/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐News", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5978818414?is_hot=1", "releaser": "娱圈蜀黍", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2489917511?is_hot=1", "releaser": "芒果捞扒婆 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5279487569?is_hot=1", "releaser": "娱姐速报 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5106602573?is_hot=1", "releaser": "八哥 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5323541229?profile_ftype=1&is_all=1#_0", "releaser": "国内外白富美揭秘 ",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1003062512591982/home?from=page_100306&mod=TAB#place", "releaser": "圈少爷",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2821843050?profile_ftype=1&is_all=1#_0", "releaser": "圈内老鬼",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3028215832?profile_ftype=1&is_all=1#_0", "releaser": "娱扒爷",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5336756846?profile_ftype=1&is_all=1#_0", "releaser": "兔兔热议",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005051844235935/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐圈外汉", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052586409491/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐圈吃瓜指南 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5255814135", "releaser": "八组兔区爆料", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2871033210?is_hot=1", "releaser": "八组兔区热议 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052813285937/home?from=page_100505&mod=TAB#place",
"releaser": "八组兔区娱乐圈", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052831749482/home?from=page_100505&mod=TAB#place",
"releaser": "八组兔区揭秘", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2709814831", "releaser": "娱大蜀黍", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5634795408", "releaser": "圈八戒", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5176743404", "releaser": "瓜瓜搬运机", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5039775130", "releaser": "娱乐揭秘蜀黍", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/7123521074", "releaser": "饭圈日报", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1746658980", "releaser": "饭圈阿姨", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052453653365/home?from=page_100505&mod=TAB#place", "releaser": "圈内星探",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6311417880?profile_ftype=1&is_all=1#_0", "releaser": "星扒婆 ",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1420816495?profile_ftype=1&is_all=1#_0", "releaser": "娱尾纹",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1974754790", "releaser": "教父娱乐", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1818950785?refer_flag=1028035010_&is_hot=1", "releaser": "扒圈有鱼",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1893711543", "releaser": "娱乐有饭", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1002061653255165/home?from=page_100206&mod=TAB#place",
"releaser": "娱乐日爆社", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052391322817/home?from=page_100505&mod=TAB#place", "releaser": "小娱乐家",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1003061994712500/home?from=page_100306&mod=TAB#place",
"releaser": "星扒客push", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5700087877", "releaser": "毒舌八卦", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3779202361", "releaser": "西皮娱乐", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1632619962", "releaser": "瓜组新鲜事", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052103460752/home?from=page_100505&mod=TAB#place", "releaser": "娱嬷嬷 ",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5874584452", "releaser": "吃瓜鹅每日搬", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052397961280/home?from=page_100505&mod=TAB#place", "releaser": "娱大白",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005053246379064/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐圈扒姐 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1830483711", "releaser": "娱乐女记", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005053847401640/home?from=page_100505&mod=TAB#place",
"releaser": "吃瓜爆料每日搬 ", "platform": "weibo"},
{"releaserUrl": "https://www.douban.com/people/hot_tag",
"releaser": "hot_tag", "platform": "douban"},
{"releaserUrl": "https://www.douban.com/people/new_tag",
"releaser": "new_tag", "platform": "douban"}
]
extra_dic = {
"department_tags":["策略组"],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment