Commit 3ef6a963 authored by litaolemo

update

parent dd6dac06
@@ -11,7 +11,7 @@
import redis,random
import kdl,requests
from redis.sentinel import Sentinel
# from redis.sentinel import Sentinel
# sentinel = Sentinel([('192.168.17.65', 26379),
# ('192.168.17.66', 26379),
@@ -23,7 +23,8 @@ from redis.sentinel import Sentinel
# slave = sentinel.discover_slaves('ida_redis_master')
# # 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
# rds = redis.StrictRedis(host='192.168.17.60', port=6378, db=7, decode_responses=True)
rds = redis.StrictRedis(host='154.8.190.251', port=6378, db=18, decode_responses=True)
def get_proxy_from_redis():
try:
one_proxy = rds.randomkey()
......
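For reference, a minimal sketch of how the truncated get_proxy_from_redis above might pull a random proxy from this pool and shape it for requests; the 'ip:port' key layout and the returned dict are assumptions, not taken from this diff:

import redis

rds = redis.StrictRedis(host='154.8.190.251', port=6378, db=18, decode_responses=True)

def get_proxy_from_redis():
    # Assumption: the pool stores proxies as 'ip:port' keys; pick one at random
    # and build a requests-style proxies dict from it.
    try:
        one_proxy = rds.randomkey()
        if not one_proxy:
            return None
        return {"http": "http://%s" % one_proxy,
                "https": "https://%s" % one_proxy}
    except redis.RedisError:
        return None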
@@ -10,8 +10,8 @@ import requests
import datetime
import json, random, urllib
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# from selenium import webdriver
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import output_result
from crawler.crawler_sys.utils.util_logging import logged
@@ -380,6 +380,6 @@ if __name__ == '__main__':
ttt = test.releaser_page(releaserUrl=u, output_to_es_raw=True,
es_index='crawler-data-raw',
doc_type='doc',
releaser_page_num_max=3, proxies_num=10)
releaser_page_num_max=3, proxies_num=0)
# test.get_releaser_page(u)
# break
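The proxies_num argument drops from 10 to 0 here, so this test run goes direct rather than through proxies. The real retry_get_url helper is not shown in this diff; a hypothetical sketch of how such a helper could consume proxies_num together with the Redis pool above (name, signature and behaviour are all assumptions):

import requests

def retry_get_url_sketch(url, headers=None, timeout=10, proxies_num=0,
                         max_retries=3, proxy_getter=None):
    # Hypothetical stand-in: retry the GET a few times; when proxies_num is
    # non-zero, route each attempt through a proxy supplied by proxy_getter
    # (e.g. the get_proxy_from_redis sketch earlier), otherwise go direct.
    for _ in range(max_retries):
        proxy_dict = proxy_getter() if (proxies_num and proxy_getter) else None
        try:
            resp = requests.get(url, headers=headers, timeout=timeout, proxies=proxy_dict)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            continue
    return None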
@@ -1792,7 +1792,7 @@ class Crawler_toutiao():
# pass
data_lis.append(res)
if len(data_lis) >= 100:
if len(data_lis) >= 10:
output_result(result_Lst=data_lis,
platform=self.platform,
output_to_file=output_to_file,
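This hunk only lowers the flush threshold from 100 to 10 results per batch. A minimal sketch of the accumulate-and-flush pattern around it, using the project's output_result helper from the import shown earlier (the items generator and the extra keyword handling are simplified assumptions):

from crawler.crawler_sys.utils.output_results import output_result

def flush_in_batches(items, platform, batch_size=10, **output_kwargs):
    # Buffer crawl results and hand them to output_result() every batch_size
    # items, then clear the buffer; flush whatever is left at the end.
    data_lis = []
    for res in items:
        data_lis.append(res)
        if len(data_lis) >= batch_size:
            output_result(result_Lst=data_lis, platform=platform, **output_kwargs)
            data_lis.clear()
    if data_lis:
        output_result(result_Lst=data_lis, platform=platform, **output_kwargs)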
@@ -1882,7 +1882,7 @@ if __name__ == '__main__':
for url in data_lis:
test.releaser_page_by_time(1595088000000, 1595319362610, url, output_to_es_raw=True,
es_index='crawler-data-raw', releaser_page_num_max=2,
proxies_num=0
proxies_num=2
)
# test.get_releaser_follower_num(url)
# test.get_releaser_image(releaserUrl=data_lis[0])
@@ -1103,8 +1103,7 @@ class Crawler_v_qq():
push_to_redis=push_to_redis,
output_to_es_register=output_to_es_register,
output_to_es_raw=output_to_es_raw,
es_index=es_index,
doc_type=doc_type)
es_index=es_index)
if __name__ == '__main__':
@@ -1122,7 +1121,7 @@ if __name__ == '__main__':
continue
# releaserUrl=url,)
#test.get_releaser_follower_num("http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d")
# test.releaser_page_by_time(1558972800000, 1562603029917, releaserUrl, output_to_es_raw=True,
# es_index='crawler-data-raw',
# doc_type='doc', releaser_page_num_max=4000)
test.releaser_page_by_time(1558972800000, 1562603029917, releaserUrl, output_to_es_raw=True,
es_index='crawler-data-raw',
doc_type='doc', releaser_page_num_max=4000)
#test.get_releaser_image(releaserUrl)
\ No newline at end of file
@@ -10,7 +10,7 @@ import os
import copy
import requests
import re
import datetime
import datetime ,time
import json
# import aiohttp
import random
@@ -135,10 +135,11 @@ class Crawler_weibo():
if releaser_id != None:
while pagenum <= releaser_page_num_max and has_more:
pagenum += 1
time.sleep(0.5)
"?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&type=uid&value=1669879400&containerid=1076031669879400&since_id=451822205602429"
url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}".format(releaser_id,releaser_id,releaser_id,since_id)
headers["referer"] = "https://m.weibo.cn/u/uid={0}&t=0".format(releaser_id)
print('Page number": "%d' % pagenum)
print('Page number: %d' % pagenum)
try:
if proxies_num:
get_page = retry_get_url(url,headers=headers, timeout=self.timeout,proxies=proxies_num)
@@ -148,13 +149,17 @@ class Crawler_weibo():
get_page = None
has_more = False
if get_page != None and get_page.status_code == 200:
try:
page_json = get_page.json()
total = page_json["data"]["cardlistInfo"]["total"]
if pagenum > total:
break
since_id = page_json["data"]["cardlistInfo"]["since_id"]
page_dic = page_json["data"].get("cards")
except Exception as e:
print("load data error %s" % e)
continue
if page_dic:
for one in page_dic:
try:
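The hunk above adds a 0.5 s delay, a total-page check and a try/except around the JSON parse. As a standalone illustration, a minimal sketch of the since_id pagination loop it belongs to (headers, cookies and the project's error handling are simplified; the endpoint and JSON fields are those visible in the diff):

import time
import requests

def iter_releaser_cards(releaser_id, page_max=10):
    # Walk the m.weibo.cn container feed page by page: each response carries
    # the next since_id in data.cardlistInfo, and paging stops once the page
    # counter passes cardlistInfo.total or the JSON cannot be parsed.
    since_id = ""
    pagenum = 0
    while pagenum < page_max:
        pagenum += 1
        time.sleep(0.5)  # politeness delay, mirroring the diff
        url = ("https://m.weibo.cn/api/container/getIndex"
               "?uid={0}&t=0&type=uid&value={0}"
               "&containerid=107603{0}&since_id={1}").format(releaser_id, since_id)
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            break
        try:
            data = resp.json()["data"]
            if pagenum > data["cardlistInfo"]["total"]:
                break
            since_id = data["cardlistInfo"]["since_id"]
            cards = data.get("cards") or []
        except (KeyError, ValueError):
            continue
        for card in cards:
            yield card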
@@ -199,19 +204,100 @@ class Crawler_weibo():
def get_releaser_follower_num(self, releaserUrl):
pass
def releaser_page_by_time(self, start_time, end_time, url,**kwargs):
data_lis = []
count_false = 0
output_to_file = kwargs.get("output_to_file")
filepath = kwargs.get("filepath")
push_to_redis = kwargs.get("push_to_redis")
output_to_es_register = kwargs.get("output_to_es_register")
output_to_es_raw = kwargs.get("output_to_es_raw")
es_index = kwargs.get("es_index")
for res in self.releaser_page(url,proxies_num=kwargs.get("proxies_num")):
video_time = res["release_time"]
# print(res)
if video_time:
if start_time <= video_time:
if video_time < end_time:
try:
# res["fetch_time"] = datetime.datetime.fromtimestamp(res.get("fetch_time") / 1000).strftime('%Y-%m-%d %H:%M:%S')
res["release_time"] = datetime.datetime.fromtimestamp(res.get("release_time") / 1000).strftime('%Y-%m-%d %H:%M:%S')
except:
pass
data_lis.append(res)
# if len(data_lis) >= 100:
# output_result(result_Lst=data_lis,
# platform=self.platform,
# output_to_file=output_to_file,
# filepath=filepath,
# push_to_redis=push_to_redis,
# output_to_es_register=output_to_es_register,
# output_to_es_raw=output_to_es_raw,
# es_index=es_index,
# )
# data_lis.clear()
else:
count_false += 1
if count_false > 10:
break
else:
continue
# if data_lis != []:
# output_result(result_Lst=data_lis,
# platform=self.platform,
# output_to_file=output_to_file,
# filepath=filepath,
# push_to_redis=push_to_redis,
# output_to_es_register=output_to_es_register,
# output_to_es_raw=output_to_es_raw,
# es_index=es_index,
# )
import pandas as pd
data = pd.DataFrame(data_lis)
s = datetime.datetime.now()
ss = str(s)[0:19].replace(' ', '-').replace(':', '-')
res = data.to_csv('%s%sall_s1.csv' % ("all_", ss), encoding='gb18030',
# columns=columns
)
data_lis.clear()
if __name__ == '__main__':
test = Crawler_weibo()
url = 'https://weibo.com/p/1644114654/home?from=page_100306&mod=TAB#place'
# releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d'
# nnn = test.video_page(url)
# kw = '任正非 BBC'
# #sr = test.search_page(kw, search_pages_max=2)
res = test.releaser_page(url, output_to_es_raw=True,
url_list = [
# "https://weibo.com/u/1764615662",
# "https://weibo.com/u/3662247177",
# "https://weibo.com/u/2378564111",
# "https://weibo.com/u/2983578965",
# "https://weibo.com/u/3938976579",
# "https://weibo.com/u/6511177474",
# "https://weibo.com/u/6343916471",
# "https://weibo.com/u/6511177474",
# "https://weibo.com/u/2921603920",
# "https://weibo.com/u/6470919752",
# "https://weibo.com/u/2653906910?refer_flag=1001030103_&is_hot=1",
# "https://weibo.com/u/3115996363?is_hot=1",
# "https://weibo.com/p/1005053212093237/home?from=page_100505&mod=TAB#place",
# "https://weibo.com/u/3926129482",
# "https://weibo.com/u/5509337969?is_hot=1",
# "https://weibo.com/u/5477320351",
# "https://weibo.com/p/1005055634795408/home?from=page_100505&mod=TAB#place",
"https://weibo.com/u/6511173721",
# "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place",
]
# res = test.releaser_page(url, output_to_es_raw=True,
# es_index='crawler-data-raw',
# releaser_page_num_max=400,proxies_num=0)
# for r in res:
# print(r)
for u in url_list:
test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_raw=False,
es_index='crawler-data-raw',
releaser_page_num_max=400,proxies_num=0)
for r in res:
print(r)
doc_type='doc', releaser_page_num_max=4000)
# test.get_single_page(4524055937468233)
\ No newline at end of file
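The new releaser_page_by_time above filters posts by an epoch-millisecond window and dumps them to a gb18030 CSV (the ES writes are commented out in this revision). A minimal standalone sketch of that filter-and-dump step; the posts iterable stands in for self.releaser_page(...), the filename pattern follows the diff, and column selection is omitted:

import datetime
import pandas as pd

def dump_posts_in_window(posts, start_ms, end_ms, max_too_old=10):
    # Keep posts whose release_time (epoch ms) lies in [start_ms, end_ms),
    # convert the timestamp to a readable string, stop after max_too_old
    # posts that fall before the window, and write a gb18030-encoded CSV.
    kept, too_old = [], 0
    for res in posts:
        ts = res.get("release_time")
        if not ts:
            continue
        if ts < start_ms:
            too_old += 1
            if too_old > max_too_old:
                break
            continue
        if ts < end_ms:
            res["release_time"] = datetime.datetime.fromtimestamp(ts / 1000).strftime('%Y-%m-%d %H:%M:%S')
            kept.append(res)
    stamp = str(datetime.datetime.now())[0:19].replace(' ', '-').replace(':', '-')
    pd.DataFrame(kept).to_csv('all_%sall_s1.csv' % stamp, encoding='gb18030')
    return kept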
@@ -75,7 +75,7 @@ def output_result(result_Lst, platform,
push_to_redis=False,
batch_str=None,
release_time_lower_bdr=None,
es_index=index_site_crawler):
es_index=index_site_crawler,**kwargs):
# write data into es crawler-raw index
if output_to_es_raw:
bulk_write_into_es(result_Lst, es_index)
......
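Adding **kwargs here lets callers that still pass the retired doc_type keyword keep working after the v_qq call site above dropped it. A stripped-down sketch of the effect, with a print standing in for the real ES bulk write:

def output_result_sketch(result_Lst, platform,
                         output_to_es_raw=False,
                         es_index='crawler-data-raw', **kwargs):
    # **kwargs silently absorbs legacy keywords such as doc_type, so older
    # call sites do not raise TypeError after the signature change.
    if output_to_es_raw:
        print("would bulk-write %d %s docs into %s" % (len(result_Lst), platform, es_index))

# Both calls are accepted; doc_type is simply ignored now.
output_result_sketch([{"title": "demo"}], "v_qq", output_to_es_raw=True, es_index='crawler-data-raw')
output_result_sketch([{"title": "demo"}], "v_qq", output_to_es_raw=True, es_index='crawler-data-raw', doc_type='doc')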