Commit 4dfed1b9 authored by litaolemo

update

parent d83f79a7
......@@ -8,7 +8,7 @@ Created on Mon Feb 26 17:57:38 2018
class Std_fields_video:
def __init__(self, data_provider=None):
if data_provider is None:
data_provider='BDD'
data_provider='gengmei'
self.video_data={
'platform': None,
'channel': None,
......
......@@ -13,16 +13,16 @@ import kdl,requests
from redis.sentinel import Sentinel
sentinel = Sentinel([('192.168.17.65', 26379),
('192.168.17.66', 26379),
('192.168.17.67', 26379)
], socket_timeout=0.5)
# look up the master node
master = sentinel.discover_master('ida_redis_master')
# look up the slave nodes
slave = sentinel.discover_slaves('ida_redis_master')
# connect to the database
rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
# sentinel = Sentinel([('192.168.17.65', 26379),
# ('192.168.17.66', 26379),
# ('192.168.17.67', 26379)
# ], socket_timeout=0.5)
# # look up the master node
# master = sentinel.discover_master('ida_redis_master')
# # look up the slave nodes
# slave = sentinel.discover_slaves('ida_redis_master')
# # connect to the database
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
# rds = redis.StrictRedis(host='192.168.17.60', port=6378, db=7, decode_responses=True)
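# rds (the sentinel-backed master client obtained above) is the shared Redis
# client that get_proxy_from_redis() below reads proxy entries from.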
def get_proxy_from_redis():
try:
......
......@@ -22,7 +22,7 @@ import urllib
try:
from crawler_sys.framework.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from write_data_into_es.func_get_releaser_id import *
import requests
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.framework.get_redirect_resp import get_redirected_resp
......@@ -32,8 +32,9 @@ from crawler.crawler_sys.site_crawler.toutiao_get_signature import getHoney
from crawler.crawler_sys.utils.output_results import output_result
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
# from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils.util_logging import logged
from write_data_into_es.func_cal_doc_id import cal_doc_id
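# Crawler_toutiao normalizes Toutiao feed/article data into the
# Std_fields_video schema and can write results out through the output
# helpers imported above (ES raw index, register index, or file).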
class Crawler_toutiao():
......@@ -48,7 +49,7 @@ class Crawler_toutiao():
self.video_data = std_fields.video_data
self.video_data['platform'] = self.platform
# remove fields that the crawled data does not have
pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
pop_key_Lst = ['channel', 'describe', 'isOriginal', "video_id"]
for popk in pop_key_Lst:
self.video_data.pop(popk)
self.releaser_url_pattern = 'http://www.365yg.com/c/user/[RELEASER_ID]/'
......@@ -124,6 +125,7 @@ class Crawler_toutiao():
"x-requested-with": "XMLHttpRequest",
}
# log_path = '/home/hanye/crawlersNew/crawler/crawler_log'
# current_day = str(datetime.datetime.now())[:10]
# info_log_file = log_path + '/all_' + current_day + '.log'
......@@ -765,6 +767,156 @@ class Crawler_toutiao():
return video_image_url
def get_web_article_info(self,article_id):
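# Fetch per-article stats from the m.toutiao.com info endpoint and return them
# keyed to the crawler's standard field names (title, play_count, comment_count,
# repost_count, favorite_count, releaser_followers_count, release_time, content).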
headers = {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh,zh-CN;q=0.9",
"Connection": "keep-alive",
# "Cookie": "tt_webid=6851461299689686542; SLARDAR_WEB_ID=568d391e-7f96-491b-9557-b045a55e9dd8",
"Host": "m.toutiao.com",
"Referer": "https://m.toutiao.com/i6851146167279944199/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
}
headers["Referer"] = "https://m.toutiao.com/i%s" % article_id
url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id,article_id)
requests_res = retry_get_url(url,headers=headers,proxies=0)
res_json = requests_res.json()
res_dic = {
"title":res_json["data"].get("title"),
'high_quality_flag':int(res_json["data"].get('high_quality_flag')),
"play_count": int(res_json["data"].get('impression_count')),
"comment_count": res_json["data"].get("comment_count"),
"repost_count": res_json["data"].get("repost_count"),
"favorite_count": res_json["data"].get("digg_count"),
'releaser_followers_count': res_json["data"].get("follower_count"),
'release_time': int(res_json["data"].get('publish_time')*1e3),
"content":res_json["data"].get("content"),
}
return res_dic
def web_releaser_page_article(self, releaserUrl,
releaser_page_num_max=50000,
proxies_num=None,**kwargs):
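# Generator: page through a releaser's article feed (category=profile_article)
# and yield one standardized dict per article, stopping when the API stops
# reporting has_more or releaser_page_num_max pages have been fetched.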
result_list = []
has_more = True
count = 1
releaser_id = self.find_releaser_id(releaserUrl)
count_false = 0
offset = "0"
headers = {"accept": "text/javascript, text/html, application/xml, text/xml, */*",
"accept-encoding": "gzip, deflate", "accept-language": "zh,zh-CN;q=0.9",
"content-type": "application/x-www-form-urlencoded",
# "cookie": "gftoken=NDAxNzc3NjcyM3wxNTk1MjI3MTU0ODh8fDAGBgYGBgY; SLARDAR_WEB_ID=0ddc45df-54ce-42c5-8dfd-27403ea3319e; s_v_web_id=verify_kcu52781_yF9Mw8Pu_VGOQ_4R2p_8AeG_NwGKWAkt7YLl; ttcid=df5933a4926945c68dde9bf5e5542f9730; tt_scid=KlhjcsMcR9m7a1GIqnzjDfr.XZ0-jnU4X-ZPLZFZ51vyyv6FmjCdmDwYVWtjq2JO18fd",
# "referer": "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id,
"sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
"x-requested-with": "XMLHttpRequest"}
# vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999)
# ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999)
# openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999)
# idfa = "E3FC9054-384B-485F-9B4C-936F33D7D%s" % random.randint(100, 999)
# iid = str(random.randint(104525900000, 104526000000))
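# Each loop iteration requests one feed page; offset is taken from the previous
# response, and count_false tracks consecutive bad or empty responses before
# giving up on this releaser.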
while has_more and count <= releaser_page_num_max:
# url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&{2}".format(
# random.choice(self.api_list), random.randint(5, 10), urllib.parse.urlencode(url_dic))
# url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_video&version_code=7.6.0&app_name=news_article&channel=App%20Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(random.choice(self.api_list), random.randint(1, 1),str(releaser_id),str(offset))
# url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&aid=35&app_name=news_article_lite&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format(
# random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset))
url = "https://profile.zjurl.cn/api/feed/profile/v2/?category=profile_article&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format(
str(releaser_id), str(offset), str(releaser_id))
try:
proxies = get_proxy(proxies_num)
if proxies:
# proxies = {
# "http": "http://127.0.0.1:80",
# "https": "http://127.0.0.1:443"
# }
get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=10)
else:
get_page = requests.get(url, headers=self.headers, timeout=10)
except:
continue
print("get_page %s on page %s" % (releaser_id, count))
page_dic = {}
try:
page_dic = get_page.json()
if page_dic.get("message") != "success":
count_false += 1
if count_false < 3:
continue
else:
print("unknow error")
break
data_list = page_dic.get('data')
has_more = page_dic.get('has_more')
offset = str(page_dic.get("offset"))
except:
if not page_dic:
count_false += 1
if count_false >= 3:
break
# retry this offset rather than fall through with data_list undefined or stale
continue
if data_list:
data_list = page_dic.get('data')
has_more = page_dic.get('has_more')
else:
data_list = []
has_more = False
# offset = page_dic.get('offset')
if has_more is None:
has_more = False
if not data_list:
print("toutiao no data in releaser %s page %s" % (releaser_id, count))
# print(page_dic)
# print(url)
count_false += 1
proxies = get_proxy(1)
if count_false >= 5:
has_more = False
break
continue
else:
count_false = 0
count += 1
for one_video in data_list:
# print(one_video)
# info_str = one_video.get('content')
info_dic = json.loads(one_video["content"])
video_dic = copy.deepcopy(self.video_data)
video_dic['title'] = info_dic.get('title')
video_dic['abstract'] = info_dic.get('abstract')
video_dic['url'] = info_dic.get('share_url')
video_dic['releaser'] = info_dic.get('source')
video_dic['releaserUrl'] = releaserUrl
release_time = info_dic.get('publish_time')
video_dic['release_time'] = int(release_time * 1e3)
video_dic['duration'] = info_dic.get('video_duration')
video_dic['play_count'] = info_dic.get('read_count')
video_dic['repost_count'] = (info_dic.get('forward_info') or {}).get('forward_count')
video_dic['comment_count'] = info_dic.get('comment_count')
video_dic['favorite_count'] = info_dic.get('digg_count')
video_dic['article_id'] = info_dic.get('tag_id')
video_dic['fetch_time'] = int(time.time() * 1e3)
video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
video_dic['video_img'] = self.get_video_image(info_dic)
video_dic['id'] = cal_doc_id(video_dic["platform"], url=video_dic["url"], doc_id_type='all-time-url', data_dict=video_dic)
try:
article_info = self.get_web_article_info(info_dic.get('tag_id'))
video_dic.update(article_info)
except Exception as e:
print("method get_web_article_info error %s" %e)
yield video_dic
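# A minimal usage sketch (assumes Crawler_toutiao() can be constructed with
# defaults and a proxy pool is reachable; URL and limits mirror the __main__
# block further below):
#   crawler = Crawler_toutiao()
#   for article in crawler.web_releaser_page_article(
#           'http://m.365yg.com/video/app/user/home/?to_user_id=58914711545&format=html',
#           releaser_page_num_max=2, proxies_num=0):
#       print(article['title'], article['play_count'])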
def App_releaser_page_video(self, releaserUrl,
output_to_file=False,
filepath=None,
......@@ -864,7 +1016,7 @@ class Crawler_toutiao():
# }
# url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&{2}".format(
# random.choice(self.api_list), random.randint(5, 10), urllib.parse.urlencode(url_dic))
#url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_video&version_code=7.6.0&app_name=news_article&channel=App%20Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(random.choice(self.api_list), random.randint(1, 1),str(releaser_id),str(offset))
# url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_video&version_code=7.6.0&app_name=news_article&channel=App%20Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(random.choice(self.api_list), random.randint(1, 1),str(releaser_id),str(offset))
# url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&aid=35&app_name=news_article_lite&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format(
# random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset))
url = "https://profile.zjurl.cn/api/feed/profile/v1/?category=profile_video&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format(
......@@ -968,7 +1120,8 @@ class Crawler_toutiao():
count_false = 0
count_no_data = 0
offset = "0"
self.headers["referer"] = "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id
self.headers[
"referer"] = "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id
# vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999)
# ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999)
# openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999)
......@@ -1545,7 +1698,7 @@ class Crawler_toutiao():
es_index=None,
doc_type=None,
proxies_num=None):
for res in self.App_releaser_page_video(releaserUrl, output_to_file=output_to_file, filepath=filepath,
for res in self.web_releaser_page_article(releaserUrl, output_to_file=output_to_file, filepath=filepath,
releaser_page_num_max=releaser_page_num_max,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
......@@ -1676,8 +1829,9 @@ class Crawler_toutiao():
if __name__ == '__main__':
data_lis = ["https://www.toutiao.com/c/user/5839829632/#mid=5839829632",
'http://m.365yg.com/video/app/user/home/?to_user_id=52299115946&format=html',
data_lis = [
# "https://www.toutiao.com/c/user/5839829632/#mid=5839829632",
'http://m.365yg.com/video/app/user/home/?to_user_id=58914711545&format=html',
'http://m.365yg.com/video/app/user/home/?to_user_id=50002654647&format=html',
'http://m.365yg.com/video/app/user/home/?to_user_id=72306985675&format=html',
......@@ -1740,7 +1894,7 @@ if __name__ == '__main__':
# res = test.video_page("https://www.ixigua.com/i6701478014242259463/")
# print(res)
for url in data_lis:
test.releaser_page_by_time(1582272540000, 1582964230998 , url, output_to_es_raw=True,
test.releaser_page_by_time(1582272540000, 1595302556249, url, output_to_es_raw=True,
es_index='crawler-data-raw',
doc_type='doc', releaser_page_num_max=2,
proxies_num=0
......
......@@ -15,7 +15,7 @@ from elasticsearch.helpers import scan
from func_find_week_num import find_week_belongs_to
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.utils import trans_format
from func_cal_doc_id import cal_doc_id
from write_data_into_es.func_cal_doc_id import cal_doc_id
hosts = '192.168.17.11'
port = 80
......
......@@ -6,26 +6,17 @@ Created on Wed Jun 20 09:19:12 2018
"""
import hashlib
try:
from write_data_into_es.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from write_data_into_es.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from write_data_into_es.func_calculate_v_qq_video_id import calculate_v_qq_video_id
#from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from write_data_into_es.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
from write_data_into_es.func_calculate_txxw_video_id import calculate_txxw_video_id
from write_data_into_es.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from write_data_into_es.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es.func_calculate_haokan_video_id import calculate_haokan_id
except:
from write_data_into_es_new.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from write_data_into_es_new.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from write_data_into_es_new.func_calculate_v_qq_video_id import calculate_v_qq_video_id
# from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from write_data_into_es_new.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
from write_data_into_es_new.func_calculate_txxw_video_id import calculate_txxw_video_id
from write_data_into_es_new.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from write_data_into_es_new.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es_new.func_calculate_haokan_video_id import calculate_haokan_id
from write_data_into_es.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from write_data_into_es.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from write_data_into_es.func_calculate_v_qq_video_id import calculate_v_qq_video_id
#from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from write_data_into_es.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
from write_data_into_es.func_calculate_txxw_video_id import calculate_txxw_video_id
from write_data_into_es.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from write_data_into_es.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es.func_calculate_haokan_video_id import calculate_haokan_id
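# vid_cal_func returns the platform-specific video-id calculator (one of the
# calculate_* functions imported above) for the given platform name.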
def vid_cal_func(platform):
vid_cal_func_dict = {
......
from func_get_releaser_id import get_releaser_id
from write_data_into_es.func_get_releaser_id import get_releaser_id
def calculate_txxw_video_id(data_dict):
try:
......
......@@ -7,11 +7,11 @@ from elasticsearch import Elasticsearch
import json, copy
from write_data_into_es.func_get_releaser_id import get_releaser_id
from write_data_into_es.func_cal_doc_id import cal_doc_id
from func_transfer_from_ftp import transfer_from_ftp
from write_data_into_es.func_transfer_from_ftp import transfer_from_ftp
import logging
from urllib.parse import parse_qs,urlparse
from elasticsearch.helpers import scan
from ReleaserMeta import ReleaseMeta
from write_data_into_es.ReleaserMeta import ReleaseMeta
hosts = '192.168.17.11'
port = 80
......