Commit 8c5bb9e8 authored by litaolemo

update

parent ea7d0c06
# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 16:07
# @File : crawler_douban.py
# @email : litao@igengmei.com
# @author : litao
import os
import copy
import requests
import re
import datetime, time
import json
import urllib
import random
# from bs4 import BeautifulSoup
# from multiprocessing import Pool
# from multiprocessing import Process
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.output_results import output_result
# from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
# from crawler.crawler_sys.utils import connect_with_redis
# from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
# from crawler.crawler_sys.utils.util_logging import logged
# from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from bs4 import BeautifulSoup
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id
class Crawler_douban():
    def __init__(self, timeout=None, platform='douban'):
        if timeout is None:
            self.timeout = 10
        else:
            self.timeout = timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        pop_key_Lst = ['describe', 'repost_count', 'isOriginal',
                       'video_id']
        for popk in pop_key_Lst:
            self.video_data.pop(popk)
        # pre-computed _sig values for the frodo.douban.com API; one is picked
        # at random for each request
        self.sig_list = [
            "aOI2VYvkFvPfUngaeoz%2BNYQ7MQM%3D",
            "Glc52sbPO46I%2FR%2FOCjl%2BGwKo94I%3D",
            "l9oVu%2FYau2UwMyhc5m8ldALp5eU%3D",
            "tL36trbi73v7Y057K10%2FQ9fdCiA%3D"
        ]
        # request headers that mimic the official douban Android client
        self.headers = {
            "User-Agent": "api-client/1 com.douban.frodo/10.39.0(189) Android/23 product/cancro vendor/Netease model/Miui rom/android network/wifi platform/AndroidPad",
            "Host": "frodo.douban.com",
            "Connection": "Keep-Alive",
            "Accept-Encoding": "gzip",
        }
    def get_single_page(self, mid, proxies):
        count_true = 0
        while count_true <= 3:
            try:
                count_true += 1
                url = "https://frodo.douban.com/api/v2/group/topic/{0}?event_source=search&os_rom=android&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market&_sig={2}&udid=dc{1}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(
                    mid, random.randint(10000, 99999), random.choice(self.sig_list))
                page_res = retry_get_url(url, headers=self.headers, proxies=proxies)
                page_json = page_res.json()
                # content = dehtml(page_json["content"])
                if page_json.get('localized_message'):
                    continue
                # content_html = """<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>Title</title></head><body>%s</body></html>""" % page_json["content"]
                # bs = BeautifulSoup(content_html, "html.parser")
                # content = bs.textarea.get_text()
                content = page_json["content"]
                repost_count = trans_play_count(page_json["reshares_count"])
                comment_count = trans_play_count(page_json["comments_count"])
                favorite_count = trans_play_count(page_json["like_count"])
                collection_count = trans_play_count(page_json["collections_count"])
                img_list = re.findall('img src=".*?"', content)
                dic = {
                    "content": content,
                    "repost_count": repost_count,
                    "comment_count": comment_count,
                    "favorite_count": favorite_count,
                    "collection_count": collection_count,
                    "img_list": img_list,
                }
                return dic
            except Exception as e:
                print("single page error %s" % e)
                continue
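    # Hedged sketch, not called anywhere above: an alternative to the
    # re.findall('img src=".*?"', content) line that returns bare image URLs via
    # BeautifulSoup (already imported). The method name _extract_img_urls is
    # illustrative only; `content` is assumed to be the HTML string from
    # page_json["content"].
    def _extract_img_urls(self, content):
        soup = BeautifulSoup(content, "html.parser")
        return [img.get("src") for img in soup.find_all("img") if img.get("src")]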
    def get_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
    def gooseneck(self, releaserUrl, output_to_file=False, filepath=None,
                  output_to_es_raw=False,
                  output_to_es_register=False,
                  push_to_redis=False,
                  releaser_page_num_max=10000,
                  es_index=None, proxies_num=None):
        page = 0
        has_more = True
        # query parameters mimicking the official app; start and _ts are
        # refreshed on every page request
        url_dic = {
            "start": None,
            "count": "20",
            "sortby": "new",
            "apple": "389276ed556d40cada2e208482b51cd7",
            "icecream": "ffd8f7d71419a98e48819cbac587ebbd",
            "mooncake": "0f607264fc6318a92b9e13c65db7cd3c",
            "webview_ua": "Mozilla%2F5.0%20%28Linux%3B%20Android%2010.0.1%3B%20Miui%20Build%2FV417IR%3B%20wv%29%20AppleWebKit%2F537.36%20%28KHTML%2C%20like%20Gecko%29%20Version%2F4.0%20Chrome%2F52.0.2743.100%20Mobile%20Safari%2F537.36",
            "screen_width": "810",
            "screen_height": "1440",
            "sugar": "0",
            "longitude": "0",
            "latitude": "0",
            "os_rom": "android",
            "apikey": "0dad551ec0f84ed02907ff5c42e8ec70",
            "channel": "Baidu_Market",
            "udid": "dc{0}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(random.randint(10000, 99999)),
            "_sig": random.choice(self.sig_list),
            "_ts": None,
        }
        while page <= releaser_page_num_max and has_more:
            url_dic["_ts"] = int(datetime.datetime.now().timestamp())
            url_dic["start"] = str(page * 20)
            if "hot_tag" in releaserUrl:
                url_dic["sortby"] = "hot"
            elif "new_tag" in releaserUrl:
                url_dic["sortby"] = "new"
            # the group id (248952) is hard-coded; releaserUrl only selects the sort order
            url = "https://frodo.douban.com/api/v2/group/248952/topics?%s" % urllib.parse.urlencode(url_dic)
            try:
                if proxies_num:
                    get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout, proxies=proxies_num)
                else:
                    get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout)
            except:
                get_page = None
                has_more = False
            if get_page and get_page.status_code == 200:
                try:
                    page_json = get_page.json()
                    total = page_json["total"]
                    page += 1
                    if page > total:
                        break
                    page_dic = page_json["topics"]
                except Exception as e:
                    print("load data error %s" % e)
                    continue
                if page_dic:
                    for one in page_dic:
                        releaser_id = one["author"]["id"]
                        mid = one["id"]
                        try:
                            res_dic = {
                                "release_time": trans_strtime_to_timestamp(one["create_time"]),
                                "url": one["url"],
                                "releaser": one["author"]["name"],
                                "repost_count": None,
                                "comment_count": trans_play_count(one["comments_count"]),
                                "favorite_count": None,
                                "title": one["title"],
                                "releaserUrl": "https://www.douban.com/people/%s" % releaser_id,
                                "releaser_id_str": "douban_%s" % releaser_id,
                                'video_img': one["cover_url"],
                                "mid": mid,
                                "platform": "douban",
                            }
                            # doc_id is derived from the record's own platform/url fields,
                            # so it has to be computed after res_dic is built
                            res_dic["doc_id"] = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"],
                                                           data_dict=res_dic, doc_id_type="all-time-url")
                            res_dic.update(self.get_single_page(mid, proxies_num))
                            print(res_dic)
                            yield res_dic
                        except Exception as e:
                            print(one)
                            print("row format error %s" % e)
                            continue
    # @logged
    def releaser_page(self, releaserUrl,
                      **kwargs):
        return self.gooseneck(releaserUrl, **kwargs)

    def get_releaser_follower_num(self, releaserUrl):
        pass
    def releaser_page_by_time(self, start_time, end_time, url, **kwargs):
        data_lis = []
        count_false = 0
        output_to_file = kwargs.get("output_to_file")
        filepath = kwargs.get("filepath")
        push_to_redis = kwargs.get("push_to_redis")
        output_to_es_register = kwargs.get("output_to_es_register")
        output_to_es_raw = kwargs.get("output_to_es_raw")
        es_index = kwargs.get("es_index")
        for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
            video_time = res["release_time"]
            # print(res)
            if video_time:
                if start_time <= video_time:
                    if video_time < end_time:
                        try:
                            # res["fetch_time"] = datetime.datetime.fromtimestamp(res.get("fetch_time") / 1000).strftime('%Y-%m-%d %H:%M:%S')
                            res["release_time"] = datetime.datetime.fromtimestamp(
                                res.get("release_time") / 1000).strftime('%Y-%m-%d %H:%M:%S')
                        except:
                            pass
                        data_lis.append(res)
                        if len(data_lis) >= 100:
                            output_result(result_Lst=data_lis,
                                          platform=self.platform,
                                          output_to_file=output_to_file,
                                          filepath=filepath,
                                          push_to_redis=push_to_redis,
                                          output_to_es_register=output_to_es_register,
                                          output_to_es_raw=output_to_es_raw,
                                          es_index=es_index,
                                          )
                            data_lis.clear()
                else:
                    count_false += 1
                    if count_false > 10:
                        break
            else:
                continue
        # if data_lis != []:
        #     output_result(result_Lst=data_lis,
        #                   platform=self.platform,
        #                   output_to_file=output_to_file,
        #                   filepath=filepath,
        #                   push_to_redis=push_to_redis,
        #                   output_to_es_register=output_to_es_register,
        #                   output_to_es_raw=output_to_es_raw,
        #                   es_index=es_index,
        #                   )
        import pandas as pd
        data = pd.DataFrame(data_lis)
        s = datetime.datetime.now()
        ss = str(s)[0:19].replace(' ', '-').replace(':', '-')
        data.to_csv('%s%sall_s1.csv' % ("all_", ss), encoding='gb18030',
                    # columns=columns
                    )
        data_lis.clear()
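    # Hedged helper sketch, not used below: converts a 'YYYY-MM-DD HH:MM:SS'
    # string into the millisecond timestamp that releaser_page_by_time appears
    # to expect (release_time is divided by 1000 before fromtimestamp above).
    # The method name _to_ms_timestamp is illustrative only.
    @staticmethod
    def _to_ms_timestamp(time_str):
        dt = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
        return int(dt.timestamp() * 1000)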
if __name__ == '__main__':
    test = Crawler_douban()
    url = 'https://weibo.com/p/1644114654/home?from=page_100306&mod=TAB#place'
    # releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d'
    url_list = [
        # "https://weibo.com/u/1764615662",
        # "https://weibo.com/u/3662247177",
        # "https://weibo.com/u/2378564111",
        # "https://weibo.com/u/2983578965",
        # "https://weibo.com/u/3938976579",
        # "https://weibo.com/u/6511177474",
        # "https://weibo.com/u/6343916471",
        # "https://weibo.com/u/6511177474",
        # "https://weibo.com/u/2921603920",
        # "https://weibo.com/u/6470919752",
        # "https://weibo.com/u/2653906910?refer_flag=1001030103_&is_hot=1",
        # "https://weibo.com/u/3115996363?is_hot=1",
        # "https://weibo.com/p/1005053212093237/home?from=page_100505&mod=TAB#place",
        # "https://weibo.com/u/3926129482",
        # "https://weibo.com/u/5509337969?is_hot=1",
        # "https://weibo.com/u/5477320351",
        # "https://weibo.com/p/1005055634795408/home?from=page_100505&mod=TAB#place",
        "https://weibo.com/u/6511173721",
        # "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place",
    ]
    # res = test.releaser_page(url, output_to_es_raw=True,
    #                          es_index='crawler-data-raw',
    #                          releaser_page_num_max=400, proxies_num=0)
    # for r in res:
    #     print(r)
    for u in url_list:
        # the two positional arguments are millisecond timestamps
        # (roughly 2020-06-01 to 2020-07-23)
        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
                                   es_index='crawler-data-raw',
                                   doc_type='doc', releaser_page_num_max=4000)
    # test.get_single_page(4524055937468233)