Commit 42f394f9 authored by litaolemo's avatar litaolemo

update

parent c6dc1c1a
......@@ -98,7 +98,7 @@ def write_releaserUrl_to_redis(data_dic):
for platform in platforms:
# 2 get releaserUrl list on each platform from target-releasers index
releaserUrl_Lst = get_releaserUrls_from_es(platform=platform, frequency=frequency,target_index="target_releasers",project_tags=args.project_tags)
releaserUrl_Lst = get_releaserUrls_from_es(platform=platform, frequency=frequency,target_index="target_releasers")
if is_article:
platform = platform + "_article"
rds.hset("process_num",platform,processes_num)
......
......@@ -159,7 +159,6 @@ class CrawlerDouban():
for one in page_dic:
releaser_id = one["author"]["id"]
mid = one["id"]
doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,doc_id_type="all-time-url")
try:
res_dic = {
......@@ -175,9 +174,11 @@ class CrawlerDouban():
'video_img':one["cover_url"],
"mid":mid,
"platform":"douban",
"doc_id":doc_id
# "doc_id":doc_id
}
doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
doc_id_type="all-time-url")
res_dic["doc_id"] = doc_id
res_dic.update(self.get_single_page(mid,proxies_num))
print(res_dic)
......@@ -196,69 +197,23 @@ class CrawlerDouban():
    def get_releaser_follower_num(self, releaserUrl):
        """Return the follower count for *releaserUrl*.

        Not implemented for this platform — always returns None.
        NOTE(review): presumably kept as a stub for interface parity with
        the other platform crawler classes; confirm whether Douban exposes
        a follower count that should be fetched here.
        """
        pass
def releaser_page_by_time(self, start_time, end_time, url,**kwargs):
data_lis = []
def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
count_false = 0
output_to_file = kwargs.get("output_to_file")
filepath = kwargs.get("filepath")
push_to_redis = kwargs.get("push_to_redis")
output_to_es_register = kwargs.get("output_to_es_register")
output_to_es_raw = kwargs.get("output_to_es_raw")
es_index = kwargs.get("es_index")
for res in self.releaser_page(url,proxies_num=kwargs.get("proxies_num")):
for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
video_time = res["release_time"]
# print(res)
if video_time:
if start_time <= video_time:
if start_time < video_time:
if video_time < end_time:
try:
# res["fetch_time"] = datetime.datetime.fromtimestamp(res.get("fetch_time") / 1000).strftime('%Y-%m-%d %H:%M:%S')
res["release_time"] = datetime.datetime.fromtimestamp(res.get("release_time") / 1000).strftime('%Y-%m-%d %H:%M:%S')
except:
pass
data_lis.append(res)
if len(data_lis) >= 100:
output_result(result_Lst=data_lis,
platform=self.platform,
output_to_file=output_to_file,
filepath=filepath,
push_to_redis=push_to_redis,
output_to_es_register=output_to_es_register,
output_to_es_raw=output_to_es_raw,
es_index=es_index,
)
data_lis.clear()
yield res
else:
count_false += 1
if count_false > 10:
if count_false > allow:
break
else:
continue
# if data_lis != []:
# output_result(result_Lst=data_lis,
# platform=self.platform,
# output_to_file=output_to_file,
# filepath=filepath,
# push_to_redis=push_to_redis,
# output_to_es_register=output_to_es_register,
# output_to_es_raw=output_to_es_raw,
# es_index=es_index,
# )
import pandas as pd
data = pd.DataFrame(data_lis)
s = datetime.datetime.now()
ss = str(s)[0:19].replace(' ', '-').replace(':', '-')
res = data.to_csv('%s%sall_s1.csv' % ("all_", ss), encoding='gb18030',
# columns=columns
)
data_lis.clear()
yield res
if __name__ == '__main__':
test = Crawler_Douban()
test = CrawlerDouban()
url = 'https://weibo.com/p/1644114654/home?from=page_100306&mod=TAB#place'
# releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d'
url_list = [
......
......@@ -23,6 +23,7 @@ from crawler.crawler_sys.utils.html_to_str import dehtml
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id
class Crawler_weibo():
def __init__(self, timeout=None, platform='weibo'):
if timeout == None:
......@@ -145,8 +146,6 @@ class Crawler_weibo():
mid = mblog.get("mid")
forward_text = ""
forward_user = ""
doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
doc_id_type="all-time-url")
if one.get("source") == "绿洲":
text_type = "绿洲"
......@@ -175,8 +174,10 @@ class Crawler_weibo():
"releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
"releaser_id_str": "weibo_%s" % releaser_id,
"img_list":self.get_img(mblog),
"doc_id":doc_id
# "doc_id":doc_id
}
res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic,
doc_id_type="all-time-url")
yield res_dic
except Exception as e:
print(mblog)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment