Commit f74bbb7f authored by litaolemo

update

parent 42f394f9
...@@ -62,7 +62,7 @@ class CrawlerDouban(): ...@@ -62,7 +62,7 @@ class CrawlerDouban():
def get_single_page(self,mid,proxies): def get_single_page(self,mid,proxies):
count_true = 0 count_true = 0
while count_true <= 3: while count_true <= 5:
try: try:
count_true += 1 count_true += 1
url = "https://frodo.douban.com/api/v2/group/topic/{0}?event_source=search&os_rom=android&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market&_sig={2}&udid=dc{1}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(mid,random.randint(10000,99999),random.choice(self.sig_list)) url = "https://frodo.douban.com/api/v2/group/topic/{0}?event_source=search&os_rom=android&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market&_sig={2}&udid=dc{1}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(mid,random.randint(10000,99999),random.choice(self.sig_list))
...@@ -93,6 +93,8 @@ class CrawlerDouban(): ...@@ -93,6 +93,8 @@ class CrawlerDouban():
except Exception as e: except Exception as e:
print("single page error %s"% e) print("single page error %s"% e)
continue continue
print("single page error")
return None
def get_releaser_id(self, releaserUrl): def get_releaser_id(self, releaserUrl):
return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl) return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
...@@ -159,8 +161,8 @@ class CrawlerDouban(): ...@@ -159,8 +161,8 @@ class CrawlerDouban():
for one in page_dic: for one in page_dic:
releaser_id = one["author"]["id"] releaser_id = one["author"]["id"]
mid = one["id"] mid = one["id"]
if True:
try: # try:
res_dic = { res_dic = {
"release_time": trans_strtime_to_timestamp(one["create_time"]), "release_time": trans_strtime_to_timestamp(one["create_time"]),
"url": one["url"], "url": one["url"],
...@@ -180,13 +182,13 @@ class CrawlerDouban(): ...@@ -180,13 +182,13 @@ class CrawlerDouban():
doc_id_type="all-time-url") doc_id_type="all-time-url")
res_dic["doc_id"] = doc_id res_dic["doc_id"] = doc_id
res_dic.update(self.get_single_page(mid,proxies_num)) res_dic.update(self.get_single_page(mid,proxies_num))
print(res_dic) # print(res_dic)
yield res_dic yield res_dic
except Exception as e: # except Exception as e:
print(one) # print(one)
print("row formate error %s" % e) # print("row formate error %s" % e)
continue # continue
# @logged # @logged
def releaser_page(self, releaserUrl, def releaser_page(self, releaserUrl,
...@@ -244,7 +246,9 @@ if __name__ == '__main__': ...@@ -244,7 +246,9 @@ if __name__ == '__main__':
# for r in res: # for r in res:
# print(r) # print(r)
for u in url_list: for u in url_list:
test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True, ttt = test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
es_index='crawler-data-raw', es_index='crawler-data-raw',
doc_type='doc', releaser_page_num_max=4000) doc_type='doc', releaser_page_num_max=4000,allow=20)
for t in ttt:
print(t)
# test.get_single_page(4524055937468233) # test.get_single_page(4524055937468233)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment