Commit 4dfed1b9 authored by litaolemo's avatar litaolemo

update

parent d83f79a7
...@@ -8,7 +8,7 @@ Created on Mon Feb 26 17:57:38 2018 ...@@ -8,7 +8,7 @@ Created on Mon Feb 26 17:57:38 2018
class Std_fields_video: class Std_fields_video:
def __init__(self, data_provider=None): def __init__(self, data_provider=None):
if data_provider==None: if data_provider==None:
data_provider='BDD' data_provider='gengmei'
self.video_data={ self.video_data={
'platform': None, 'platform': None,
'channel': None, 'channel': None,
......
...@@ -13,16 +13,16 @@ import kdl,requests ...@@ -13,16 +13,16 @@ import kdl,requests
from redis.sentinel import Sentinel from redis.sentinel import Sentinel
sentinel = Sentinel([('192.168.17.65', 26379), # sentinel = Sentinel([('192.168.17.65', 26379),
('192.168.17.66', 26379), # ('192.168.17.66', 26379),
('192.168.17.67', 26379) # ('192.168.17.67', 26379)
], socket_timeout=0.5) # ], socket_timeout=0.5)
# 查看master节点 # # 查看master节点
master = sentinel.discover_master('ida_redis_master') # master = sentinel.discover_master('ida_redis_master')
# 查看slave 节点 # # 查看slave 节点
slave = sentinel.discover_slaves('ida_redis_master') # slave = sentinel.discover_slaves('ida_redis_master')
# 连接数据库 # # 连接数据库
rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True) # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
# rds = redis.StrictRedis(host='192.168.17.60', port=6378, db=7, decode_responses=True) # rds = redis.StrictRedis(host='192.168.17.60', port=6378, db=7, decode_responses=True)
def get_proxy_from_redis(): def get_proxy_from_redis():
try: try:
......
...@@ -22,7 +22,7 @@ import urllib ...@@ -22,7 +22,7 @@ import urllib
try: try:
from crawler_sys.framework.func_get_releaser_id import * from crawler_sys.framework.func_get_releaser_id import *
except: except:
from func_get_releaser_id import * from write_data_into_es.func_get_releaser_id import *
import requests import requests
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.framework.get_redirect_resp import get_redirected_resp from crawler.crawler_sys.framework.get_redirect_resp import get_redirected_resp
...@@ -32,8 +32,9 @@ from crawler.crawler_sys.site_crawler.toutiao_get_signature import getHoney ...@@ -32,8 +32,9 @@ from crawler.crawler_sys.site_crawler.toutiao_get_signature import getHoney
from crawler.crawler_sys.utils.output_results import output_result from crawler.crawler_sys.utils.output_results import output_result
from crawler.crawler_sys.utils.output_results import retry_get_url from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
# from crawler.crawler_sys.utils import output_log from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils.util_logging import logged from crawler.crawler_sys.utils.util_logging import logged
from write_data_into_es.func_cal_doc_id import cal_doc_id
class Crawler_toutiao(): class Crawler_toutiao():
...@@ -48,32 +49,32 @@ class Crawler_toutiao(): ...@@ -48,32 +49,32 @@ class Crawler_toutiao():
self.video_data = std_fields.video_data self.video_data = std_fields.video_data
self.video_data['platform'] = self.platform self.video_data['platform'] = self.platform
# remove fields that crawled data don't have # remove fields that crawled data don't have
pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"] pop_key_Lst = ['channel', 'describe', 'isOriginal', "video_id"]
for popk in pop_key_Lst: for popk in pop_key_Lst:
self.video_data.pop(popk) self.video_data.pop(popk)
self.releaser_url_pattern = 'http://www.365yg.com/c/user/[RELEASER_ID]/' self.releaser_url_pattern = 'http://www.365yg.com/c/user/[RELEASER_ID]/'
self.list_page_url_dict = {'all_channel': ( self.list_page_url_dict = {'all_channel': (
'https://www.365yg.com/api/pc/feed/?max_behot_time=0' 'https://www.365yg.com/api/pc/feed/?max_behot_time=0'
'&category=video_new&utm_source=toutiao')} '&category=video_new&utm_source=toutiao')}
self.legal_list_page_urls = [] self.legal_list_page_urls = []
self.legal_channels = [] self.legal_channels = []
self.api_list = [ self.api_list = [
"ic", "ic",
"is", "is",
"api3-normal-c-hl", "api3-normal-c-hl",
"ib", "ib",
"api3-normal-c-lf", "api3-normal-c-lf",
"id", "id",
"ie", "ie",
"api3-normal-c-lq", "api3-normal-c-lq",
"ii", "ii",
"io", "io",
"it", "it",
"iu", "iu",
"lf", "lf",
"lg", "lg",
"lh", "lh",
] ]
for ch in self.list_page_url_dict: for ch in self.list_page_url_dict:
list_page_url = self.list_page_url_dict[ch] list_page_url = self.list_page_url_dict[ch]
...@@ -112,18 +113,19 @@ class Crawler_toutiao(): ...@@ -112,18 +113,19 @@ class Crawler_toutiao():
# } # }
self.headers = { self.headers = {
"accept": "text/javascript, text/html, application/xml, text/xml, */*", "accept": "text/javascript, text/html, application/xml, text/xml, */*",
"accept-encoding": "gzip, deflate", "accept-encoding": "gzip, deflate",
"accept-language": "zh,zh-CN;q=0.9", "accept-language": "zh,zh-CN;q=0.9",
"content-type": "application/x-www-form-urlencoded", "content-type": "application/x-www-form-urlencoded",
# "cookie": "gftoken=MjA4NTcyMDkyMXwxNTgyOTYxNjM3NjZ8fDAGBgYGBgY; SLARDAR_WEB_ID=9706fc8c-b8a6-4265-8a2e-e3f0739daaf2; UM_distinctid=1708fddb4c0466-04c756d28410e1-752c6c3c-51abc-1708fddb4c1790; CNZZDATA1274386066=608234173-1582960977-https%253A%252F%252Fwww.toutiao.com%252F%7C1582960977", # "cookie": "gftoken=MjA4NTcyMDkyMXwxNTgyOTYxNjM3NjZ8fDAGBgYGBgY; SLARDAR_WEB_ID=9706fc8c-b8a6-4265-8a2e-e3f0739daaf2; UM_distinctid=1708fddb4c0466-04c756d28410e1-752c6c3c-51abc-1708fddb4c1790; CNZZDATA1274386066=608234173-1582960977-https%253A%252F%252Fwww.toutiao.com%252F%7C1582960977",
# "referer": "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=50502346296&media_id=50502346296&request_source=1", # "referer": "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=50502346296&media_id=50502346296&request_source=1",
"sec-fetch-mode": "cors", "sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin", "sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
"x-requested-with": "XMLHttpRequest", "x-requested-with": "XMLHttpRequest",
} }
# log_path = '/home/hanye/crawlersNew/crawler/crawler_log' # log_path = '/home/hanye/crawlersNew/crawler/crawler_log'
# current_day = str(datetime.datetime.now())[:10] # current_day = str(datetime.datetime.now())[:10]
# info_log_file = log_path + '/all_' + current_day + '.log' # info_log_file = log_path + '/all_' + current_day + '.log'
...@@ -214,62 +216,62 @@ class Crawler_toutiao(): ...@@ -214,62 +216,62 @@ class Crawler_toutiao():
print(url) print(url)
return None return None
headers = { headers = {
"Accept-Encoding": "gzip", "Accept-Encoding": "gzip",
"X-SS-REQ-TICKET": str(int(datetime.datetime.now().timestamp()) * 1e3), "X-SS-REQ-TICKET": str(int(datetime.datetime.now().timestamp()) * 1e3),
"sdk-version": "1", "sdk-version": "1",
# "Cookie": "qh[360]=1; install_id=85200129335; ttreq=1$e8e97b875965bf4af4b5dbaaba4d4a5ec3441e47; history=JM89SDpxGAfw5%2F%2Bo%2F7tEz15%2FZ0tbUEN7Q8FhEYQQIdJ2oNFBgpagCA7BIbUFNUT0NjSkRIvl2AveOdr2XEVUuDS0FFnQEETEo%2BOH5%2Fvj9%2F0WyqF4xphMZNLJeD6aSBmk15Tt4nTWSGUaEHR0e%2BG9aqGfPFOgOXrZ%2BtQBJVI6QXPA89R9dzs2QCqC6eil7H3eQhcFiJOXE4NLgDL9q7FscXLM78Qv62rk0GuiRN511vlNRZioEEArGesNaKhQXxBmHd1q7ic19JNcb90Cu1ELfdQz11KkY4Ob%2BWZYex%2BRPCfFK6uaO12GkJ%2FEN%2BtofMgAVEg8s0qbw2ehgkKiwToovMVNdJP4ai%2Fqvw4vjlLXFi%2BqefWmhTKpUvum%2FoR3VBIvYDrgeYT5YtpNksxJe6WeA3SReODW1diayV1cq%2FzDhf2%2FoqFMognaHwAAAP%2F%2F; odin_tt=8cd4f07f6dc385b01edd52312dd29fbe7fdbfa059194493779de3fe408b8836bb9265292bb9335bc976037dd93e5d131de7acf894a805930417b4d3be7f308e0", # "Cookie": "qh[360]=1; install_id=85200129335; ttreq=1$e8e97b875965bf4af4b5dbaaba4d4a5ec3441e47; history=JM89SDpxGAfw5%2F%2Bo%2F7tEz15%2FZ0tbUEN7Q8FhEYQQIdJ2oNFBgpagCA7BIbUFNUT0NjSkRIvl2AveOdr2XEVUuDS0FFnQEETEo%2BOH5%2Fvj9%2F0WyqF4xphMZNLJeD6aSBmk15Tt4nTWSGUaEHR0e%2BG9aqGfPFOgOXrZ%2BtQBJVI6QXPA89R9dzs2QCqC6eil7H3eQhcFiJOXE4NLgDL9q7FscXLM78Qv62rk0GuiRN511vlNRZioEEArGesNaKhQXxBmHd1q7ic19JNcb90Cu1ELfdQz11KkY4Ob%2BWZYex%2BRPCfFK6uaO12GkJ%2FEN%2BtofMgAVEg8s0qbw2ehgkKiwToovMVNdJP4ai%2Fqvw4vjlLXFi%2BqefWmhTKpUvum%2FoR3VBIvYDrgeYT5YtpNksxJe6WeA3SReODW1diayV1cq%2FzDhf2%2FoqFMognaHwAAAP%2F%2F; odin_tt=8cd4f07f6dc385b01edd52312dd29fbe7fdbfa059194493779de3fe408b8836bb9265292bb9335bc976037dd93e5d131de7acf894a805930417b4d3be7f308e0",
# "X-Gorgon": "0300ddd08400675de6e75ad03849011c863306ddae2b0eb3cec4", # "X-Gorgon": "0300ddd08400675de6e75ad03849011c863306ddae2b0eb3cec4",
# "X-Khronos": str(int(datetime.datetime.now().timestamp())), # "X-Khronos": str(int(datetime.datetime.now().timestamp())),
# "Host": "xgapi.snssdk.com", # "Host": "xgapi.snssdk.com",
"Connection": "Keep-Alive", "Connection": "Keep-Alive",
"Authorization": "HMAC-SHA1:2.0:1573091168911407306:bab42eac5b9e4a8eb25a91fc371ad533:WTfDrhnIsymHfmHCgG9YvRSu2YY=", "Authorization": "HMAC-SHA1:2.0:1573091168911407306:bab42eac5b9e4a8eb25a91fc371ad533:WTfDrhnIsymHfmHCgG9YvRSu2YY=",
"User-Agent": "okhttp/3.10.0.1", "User-Agent": "okhttp/3.10.0.1",
"X-Pods": "", "X-Pods": "",
} }
print(vid) print(vid)
url_dic = { url_dic = {
"group_id": vid, "group_id": vid,
"item_id": vid, "item_id": vid,
"aggr_type": 0, "aggr_type": 0,
"context": 1, "context": 1,
"flags": 64, "flags": 64,
# "iid": "77627602260", # "iid": "77627602260",
# "device_id": random.randint(50000000000,59999999999), # "device_id": random.randint(50000000000,59999999999),
"ac": "wifi", "ac": "wifi",
"channel": "update", "channel": "update",
"aid": "13", "aid": "13",
"app_name": "news_article", "app_name": "news_article",
"version_code": "732", "version_code": "732",
"version_name": "7.3.2", "version_name": "7.3.2",
"device_platform": "android", "device_platform": "android",
"ab_version": "830855,947965,942635,662176,665176,674051,643894,919834,649427,677130,710077,801968,707372,661900,668775,990369,739390,662099,668774,765190,976875,857803,952277,757281,679101,660830,759657,661781,648315", "ab_version": "830855,947965,942635,662176,665176,674051,643894,919834,649427,677130,710077,801968,707372,661900,668775,990369,739390,662099,668774,765190,976875,857803,952277,757281,679101,660830,759657,661781,648315",
"ab_group": "100168", "ab_group": "100168",
"ab_feature": "94563,102749", "ab_feature": "94563,102749",
"ssmix": "a", "ssmix": "a",
# "device_type": "oppo R11s Plus", # "device_type": "oppo R11s Plus",
# "device_brand": "OPPO", # "device_brand": "OPPO",
"language": "zh", "language": "zh",
"os_api": "23", "os_api": "23",
"os_version": "9.0.1", "os_version": "9.0.1",
# "uuid": "250129616283002", # "uuid": "250129616283002",
# "openudid": "7313ae71df9e5367", # "openudid": "7313ae71df9e5367",
"manifest_version_code": "731", "manifest_version_code": "731",
"resolution": "810*1440", "resolution": "810*1440",
"dpi": "270", "dpi": "270",
"update_version_code": "75410", "update_version_code": "75410",
"_rticket": int(datetime.datetime.now().timestamp() * 1e3), "_rticket": int(datetime.datetime.now().timestamp() * 1e3),
# "rom_version": "coloros__v417ir release-keys", # "rom_version": "coloros__v417ir release-keys",
# "fp": "w2TZFzTqczmWFlwOLSU1J2xecSKO", # "fp": "w2TZFzTqczmWFlwOLSU1J2xecSKO",
"tma_jssdk_version": "1.24.0.1", "tma_jssdk_version": "1.24.0.1",
# "pos": "5r_x8vP69Ono-fi_p6ysq7Opra2kr6ixv_H86fTp6Pn4v6eupLOkra6vpajg", # "pos": "5r_x8vP69Ono-fi_p6ysq7Opra2kr6ixv_H86fTp6Pn4v6eupLOkra6vpajg",
# "plugin": "0", # "plugin": "0",
# "ts":int(datetime.datetime.now().timestamp()), # "ts":int(datetime.datetime.now().timestamp()),
# "as":"ab7f9fce505d1d7dbe7f9f", # "as":"ab7f9fce505d1d7dbe7f9f",
# "mas":"011993339399f959a359d379b98587814259a359d3997919d319b3" # "mas":"011993339399f959a359d379b98587814259a359d3997919d319b3"
} }
url = 'http://xgapi.snssdk.com/video/app/article/information/v25/?%s' % ( url = 'http://xgapi.snssdk.com/video/app/article/information/v25/?%s' % (
urllib.parse.urlencode(url_dic)) urllib.parse.urlencode(url_dic))
# get_page = get_redirected_resp(url) # get_page = get_redirected_resp(url)
res = retry_get_url(url, headers=headers, timeout=5, proxies=1) res = retry_get_url(url, headers=headers, timeout=5, proxies=1)
try: try:
...@@ -489,7 +491,7 @@ class Crawler_toutiao(): ...@@ -489,7 +491,7 @@ class Crawler_toutiao():
# self.loggererror.error('Got KeyError exception: %s at page %s' # self.loggererror.error('Got KeyError exception: %s at page %s'
# % (except_msg, releaserUrl)) # % (except_msg, releaserUrl))
print('Got KeyError exception: %s at page %s' % ( print('Got KeyError exception: %s at page %s' % (
except_msg, releaserUrl)) except_msg, releaserUrl))
try: try:
print(duration_str) print(duration_str)
except: except:
...@@ -698,14 +700,14 @@ class Crawler_toutiao(): ...@@ -698,14 +700,14 @@ class Crawler_toutiao():
def get_data_mediaid(self, releaserUrl, releaser_id): def get_data_mediaid(self, releaserUrl, releaser_id):
headers = { headers = {
"Host": "m.toutiao.com", "Host": "m.toutiao.com",
"Connection": "keep-alive", "Connection": "keep-alive",
"Cache-Control": "max-age=0", "Cache-Control": "max-age=0",
"Upgrade-Insecure-Requests": "1", "Upgrade-Insecure-Requests": "1",
"User-Agent": self.random_useragent(), "User-Agent": self.random_useragent(),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate", "Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9" "Accept-Language": "zh-CN,zh;q=0.9"
} }
releaserUrl = "http://m.toutiao.com/profile/%s/#mid=%s" % (releaser_id, releaser_id) releaserUrl = "http://m.toutiao.com/profile/%s/#mid=%s" % (releaser_id, releaser_id)
time.sleep(1) time.sleep(1)
...@@ -765,6 +767,156 @@ class Crawler_toutiao(): ...@@ -765,6 +767,156 @@ class Crawler_toutiao():
return video_image_url return video_image_url
def get_web_article_info(self, article_id):
    """Fetch per-article statistics from the m.toutiao.com info endpoint.

    Requests ``https://m.toutiao.com/i<article_id>/info/?i=<article_id>``
    through ``retry_get_url`` and maps selected fields of the ``data``
    object in the JSON response onto the crawler's field names.

    Args:
        article_id: Article identifier interpolated into the URL and the
            Referer header.

    Returns:
        dict with the keys ``title``, ``high_quality_flag``, ``play_count``,
        ``comment_count``, ``repost_count``, ``favorite_count``,
        ``releaser_followers_count``, ``release_time`` and ``content``.

    Raises:
        KeyError: if the response JSON has no ``data`` entry.
        TypeError: if ``high_quality_flag``, ``impression_count`` or
            ``publish_time`` is absent, because those values are converted
            without a fallback.
    """
    article_referer = "https://m.toutiao.com/i%s" % article_id
    info_url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id, article_id)
    request_headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Host": "m.toutiao.com",
        # The Referer points at the page of the article being queried.
        "Referer": article_referer,
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    }
    response = retry_get_url(info_url, headers=request_headers, proxies=0)
    payload = response.json()
    return {
        "title": payload["data"].get("title"),
        'high_quality_flag': int(payload["data"].get('high_quality_flag')),
        # The endpoint's impression counter is stored as the play count.
        "play_count": int(payload["data"].get('impression_count')),
        "comment_count": payload["data"].get("comment_count"),
        "repost_count": payload["data"].get("repost_count"),
        "favorite_count": payload["data"].get("digg_count"),
        'releaser_followers_count': payload["data"].get("follower_count"),
        # publish_time is scaled by 1e3 (seconds -> milliseconds, presumably).
        'release_time': int(payload["data"].get('publish_time') * 1e3),
        "content": payload["data"].get("content"),
    }
def web_releaser_page_article(self, releaserUrl,
                              releaser_page_num_max=50000,
                              proxies_num=None, **kwargs):
    """Yield one record per article on a releaser's profile feed.

    Pages through the ``profile.zjurl.cn`` ``profile_article`` feed using
    the ``offset`` cursor returned by each response. Every feed entry is
    turned into a copy of ``self.video_data`` filled from the entry's JSON
    ``content``, then enriched with ``get_web_article_info`` when that
    call succeeds.

    Args:
        releaserUrl: Releaser page URL; the releaser id is extracted with
            ``self.find_releaser_id`` and the URL is stored on each record.
        releaser_page_num_max: Maximum number of non-empty pages to read.
        proxies_num: Passed to ``get_proxy`` before each page request.
        **kwargs: Accepted for call compatibility and ignored.

    Yields:
        dict: one record per feed entry.

    Notes:
        * Requests are sent with ``self.headers``.
        * Paging stops after 5 consecutive request failures, 3 unusable
          responses, or 5 empty pages, so a dead endpoint or proxy cannot
          spin the loop forever.
        * A response without an ``offset`` ends paging after its entries
          are emitted, instead of sending the literal text "None" as the
          next offset.
        * Entries whose ``content`` cannot be decoded are skipped rather
          than aborting the whole generator.
    """
    max_request_failures = 5
    releaser_id = self.find_releaser_id(releaserUrl)
    has_more = True
    count = 1
    count_false = 0
    request_failures = 0
    offset = "0"

    while has_more and count <= releaser_page_num_max:
        url = "https://profile.zjurl.cn/api/feed/profile/v2/?category=profile_article&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format(
            str(releaser_id), str(offset), str(releaser_id))
        try:
            proxies = get_proxy(proxies_num)
            if proxies:
                get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=10)
            else:
                get_page = requests.get(url, headers=self.headers, timeout=10)
        except Exception as e:
            # Bounded retry: an unconditional `continue` here would loop
            # forever on a persistent network or proxy failure.
            request_failures += 1
            print("toutiao request error for releaser %s: %s" % (releaser_id, e))
            if request_failures >= max_request_failures:
                break
            continue
        request_failures = 0
        print("get_page %s on page %s" % (releaser_id, count))

        try:
            page_dic = get_page.json()
        except ValueError:
            page_dic = None
        if not isinstance(page_dic, dict):
            # Body was not a JSON object; retry a limited number of times.
            count_false += 1
            if count_false >= 3:
                break
            continue
        if page_dic.get("message") != "success":
            count_false += 1
            if count_false < 3:
                continue
            print("unknow error")
            break

        data_list = page_dic.get('data') or []
        has_more = page_dic.get('has_more') or False
        next_offset = page_dic.get("offset")
        if next_offset is None:
            # No cursor to advance with: finish after this page.
            has_more = False
        else:
            offset = str(next_offset)

        if not data_list:
            print("toutiao no data in releaser %s page %s" % (releaser_id, count))
            count_false += 1
            if count_false >= 5:
                break
            # The loop condition ends paging if has_more is false;
            # otherwise the next iteration retries with a fresh proxy.
            continue

        count_false = 0
        count += 1
        for one_video in data_list:
            try:
                info_dic = json.loads(one_video["content"])
            except (KeyError, TypeError, ValueError) as e:
                print("toutiao skip malformed item of releaser %s: %s" % (releaser_id, e))
                continue
            if not isinstance(info_dic, dict):
                continue
            forward_info = info_dic.get('forward_info') or {}
            release_time = info_dic.get('publish_time')

            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = info_dic.get('title')
            video_dic['abstract'] = info_dic.get('abstract')
            video_dic['url'] = info_dic.get('share_url')
            video_dic['releaser'] = info_dic.get('source')
            video_dic['releaserUrl'] = releaserUrl
            video_dic['release_time'] = (
                int(release_time * 1e3) if release_time is not None else None)
            video_dic['duration'] = info_dic.get('video_duration')
            video_dic['play_count'] = info_dic.get('read_count')
            video_dic['repost_count'] = forward_info.get('forward_count')
            video_dic['comment_count'] = info_dic.get('comment_count')
            video_dic['favorite_count'] = info_dic.get('digg_count')
            video_dic['article_id'] = info_dic.get('tag_id')
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
            video_dic['video_img'] = self.get_video_image(info_dic)
            video_dic['id'] = cal_doc_id(video_dic["platform"], url=video_dic["url"],
                                         doc_id_type='all-time-url', data_dict=video_dic)
            try:
                # Best-effort enrichment; the feed values above are kept
                # when the per-article lookup fails.
                article_info = self.get_web_article_info(info_dic.get('tag_id'))
                video_dic.update(article_info)
            except Exception as e:
                print("method get_web_article_info error %s" % e)
            yield video_dic
def App_releaser_page_video(self, releaserUrl, def App_releaser_page_video(self, releaserUrl,
output_to_file=False, output_to_file=False,
filepath=None, filepath=None,
...@@ -864,7 +1016,7 @@ class Crawler_toutiao(): ...@@ -864,7 +1016,7 @@ class Crawler_toutiao():
# } # }
# url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&{2}".format( # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&{2}".format(
# random.choice(self.api_list), random.randint(5, 10), urllib.parse.urlencode(url_dic)) # random.choice(self.api_list), random.randint(5, 10), urllib.parse.urlencode(url_dic))
#url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_video&version_code=7.6.0&app_name=news_article&channel=App%20Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(random.choice(self.api_list), random.randint(1, 1),str(releaser_id),str(offset)) # url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_video&version_code=7.6.0&app_name=news_article&channel=App%20Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(random.choice(self.api_list), random.randint(1, 1),str(releaser_id),str(offset))
# url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&aid=35&app_name=news_article_lite&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format( # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&aid=35&app_name=news_article_lite&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format(
# random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset)) # random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset))
url = "https://profile.zjurl.cn/api/feed/profile/v1/?category=profile_video&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format( url = "https://profile.zjurl.cn/api/feed/profile/v1/?category=profile_video&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format(
...@@ -968,7 +1120,8 @@ class Crawler_toutiao(): ...@@ -968,7 +1120,8 @@ class Crawler_toutiao():
count_false = 0 count_false = 0
count_no_data = 0 count_no_data = 0
offset = "0" offset = "0"
self.headers["referer"] = "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id self.headers[
"referer"] = "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id
# vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999) # vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999)
# ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999) # ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999)
# openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999) # openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999)
...@@ -1156,18 +1309,18 @@ class Crawler_toutiao(): ...@@ -1156,18 +1309,18 @@ class Crawler_toutiao():
video_url = info_dic.get("raw_data").get("origin_group").get('article_url') video_url = info_dic.get("raw_data").get("origin_group").get('article_url')
elif info_dic.get("raw_data").get("comment_base"): elif info_dic.get("raw_data").get("comment_base"):
video_url = info_dic.get("raw_data").get("comment_base").get('share').get( video_url = info_dic.get("raw_data").get("comment_base").get('share').get(
'share_url') 'share_url')
elif info_dic.get("raw_data").get("action"): elif info_dic.get("raw_data").get("action"):
video_url = "https://m.toutiaoimg.cn/group/%s/" % info_dic.get("raw_data").get( video_url = "https://m.toutiaoimg.cn/group/%s/" % info_dic.get("raw_data").get(
'group_id') 'group_id')
video_dic['video_id'] = info_dic.get("raw_data").get('group_id') video_dic['video_id'] = info_dic.get("raw_data").get('group_id')
video_dic['play_count'] = info_dic.get("raw_data").get("action").get("play_count") video_dic['play_count'] = info_dic.get("raw_data").get("action").get("play_count")
video_dic['repost_count'] = info_dic.get("raw_data").get("action").get( video_dic['repost_count'] = info_dic.get("raw_data").get("action").get(
"share_count") "share_count")
video_dic['comment_count'] = info_dic.get("raw_data").get("action").get( video_dic['comment_count'] = info_dic.get("raw_data").get("action").get(
'comment_count') 'comment_count')
video_dic['favorite_count'] = info_dic.get("raw_data").get("action").get( video_dic['favorite_count'] = info_dic.get("raw_data").get("action").get(
'digg_count') 'digg_count')
video_dic['duration'] = info_dic.get('raw_data').get('video').get("duration") video_dic['duration'] = info_dic.get('raw_data').get('video').get("duration")
video_dic['title'] = info_dic.get('raw_data').get("title") video_dic['title'] = info_dic.get('raw_data').get("title")
video_dic['releaser'] = info_dic.get('raw_data').get("user").get("info").get("name") video_dic['releaser'] = info_dic.get('raw_data').get("user").get("info").get("name")
...@@ -1176,7 +1329,7 @@ class Crawler_toutiao(): ...@@ -1176,7 +1329,7 @@ class Crawler_toutiao():
video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3) video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
video_dic['video_img'] = "http://p1-tt.bytecdn.cn/large/" + info_dic.get( video_dic['video_img'] = "http://p1-tt.bytecdn.cn/large/" + info_dic.get(
'raw_data').get('video').get("origin_cover").get("uri") 'raw_data').get('video').get("origin_cover").get("uri")
video_dic['release_time'] = int(info_dic.get("raw_data").get("create_time") * 1e3) video_dic['release_time'] = int(info_dic.get("raw_data").get("create_time") * 1e3)
video_url = None video_url = None
if video_url: if video_url:
...@@ -1221,16 +1374,16 @@ class Crawler_toutiao(): ...@@ -1221,16 +1374,16 @@ class Crawler_toutiao():
data_count = 0 data_count = 0
# print(as_cp_sign) # print(as_cp_sign)
headers = { headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"accept-encoding": "gzip, deflate, br", "accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9", "accept-language": "zh-CN,zh;q=0.9",
"content-type": "application/x-www-form-urlencoded", "content-type": "application/x-www-form-urlencoded",
"x-requested-with": "XMLHttpRequest", "x-requested-with": "XMLHttpRequest",
"Referer": "https://www.toutiao.com/c/user/%s/" % releaser_id, "Referer": "https://www.toutiao.com/c/user/%s/" % releaser_id,
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
# "cookie":'cookie: tt_webid=6673330506500982276; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=169c3156bb86b3-00d2e2a0ad50b2-7a1437-161398-169c3156bb9746; tt_webid=6673330506500982276; csrftoken=301d4862d95090ad520f8a54ae360b93; uuid="w:79cdae1ec41c48c9b9cd21255077f629"; CNZZDATA1259612802=281397494-1553752275-https%253A%252F%252Fwww.baidu.com%252F%7C1555306390', # "cookie":'cookie: tt_webid=6673330506500982276; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=169c3156bb86b3-00d2e2a0ad50b2-7a1437-161398-169c3156bb9746; tt_webid=6673330506500982276; csrftoken=301d4862d95090ad520f8a54ae360b93; uuid="w:79cdae1ec41c48c9b9cd21255077f629"; CNZZDATA1259612802=281397494-1553752275-https%253A%252F%252Fwww.baidu.com%252F%7C1555306390',
"cache-control": "max-age=0", "cache-control": "max-age=0",
"upgrade-insecure-requests": "1" "upgrade-insecure-requests": "1"
} }
user_page_url = "https://www.toutiao.com/c/user/%s/" % releaser_id user_page_url = "https://www.toutiao.com/c/user/%s/" % releaser_id
user_page = requests.get(user_page_url, headers=headers) user_page = requests.get(user_page_url, headers=headers)
...@@ -1239,7 +1392,7 @@ class Crawler_toutiao(): ...@@ -1239,7 +1392,7 @@ class Crawler_toutiao():
# print(str(releaser_id)+str(max_behot_time)) # print(str(releaser_id)+str(max_behot_time))
# js_head = json.loads(get_js(str(releaser_id)+str(max_behot_time))) # js_head = json.loads(get_js(str(releaser_id)+str(max_behot_time)))
get_as_cp_sign = requests.get( get_as_cp_sign = requests.get(
"http://127.0.0.1:3000/?id=%s&max_behot_time=%s" % (releaser_id, max_behot_time)) "http://127.0.0.1:3000/?id=%s&max_behot_time=%s" % (releaser_id, max_behot_time))
as_cp_sign = get_as_cp_sign.json() as_cp_sign = get_as_cp_sign.json()
url_dic = {"page_type": "0", url_dic = {"page_type": "0",
"user_id": releaser_id, "user_id": releaser_id,
...@@ -1434,12 +1587,12 @@ class Crawler_toutiao(): ...@@ -1434,12 +1587,12 @@ class Crawler_toutiao():
media_id = releaser_id media_id = releaser_id
headers = { headers = {
"accept-encoding": "gzip, deflate, br", "accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9", "accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0", "cache-control": "max-age=0",
"upgrade-insecure-requests": "1", "upgrade-insecure-requests": "1",
"Referer": "http://m.toutiao.com/profile/%s/" % releaser_id, "Referer": "http://m.toutiao.com/profile/%s/" % releaser_id,
"User-Agent": self.random_useragent(), "User-Agent": self.random_useragent(),
} }
while has_more == 1 and count <= releaser_page_num_max: while has_more == 1 and count <= releaser_page_num_max:
...@@ -1545,7 +1698,7 @@ class Crawler_toutiao(): ...@@ -1545,7 +1698,7 @@ class Crawler_toutiao():
es_index=None, es_index=None,
doc_type=None, doc_type=None,
proxies_num=None): proxies_num=None):
for res in self.App_releaser_page_video(releaserUrl, output_to_file=output_to_file, filepath=filepath, for res in self.web_releaser_page_article(releaserUrl, output_to_file=output_to_file, filepath=filepath,
releaser_page_num_max=releaser_page_num_max, releaser_page_num_max=releaser_page_num_max,
output_to_es_raw=output_to_es_raw, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register, output_to_es_register=output_to_es_register,
...@@ -1574,28 +1727,28 @@ class Crawler_toutiao(): ...@@ -1574,28 +1727,28 @@ class Crawler_toutiao():
@staticmethod @staticmethod
def random_useragent(): def random_useragent():
agent_lis = [ agent_lis = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50", "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36"
] ]
return agent_lis[random.randrange(0, len(agent_lis))] return agent_lis[random.randrange(0, len(agent_lis))]
...@@ -1676,8 +1829,9 @@ class Crawler_toutiao(): ...@@ -1676,8 +1829,9 @@ class Crawler_toutiao():
if __name__ == '__main__': if __name__ == '__main__':
data_lis = ["https://www.toutiao.com/c/user/5839829632/#mid=5839829632", data_lis = [
'http://m.365yg.com/video/app/user/home/?to_user_id=52299115946&format=html', # "https://www.toutiao.com/c/user/5839829632/#mid=5839829632",
'http://m.365yg.com/video/app/user/home/?to_user_id=58914711545&format=html', 'http://m.365yg.com/video/app/user/home/?to_user_id=58914711545&format=html',
'http://m.365yg.com/video/app/user/home/?to_user_id=50002654647&format=html', 'http://m.365yg.com/video/app/user/home/?to_user_id=50002654647&format=html',
'http://m.365yg.com/video/app/user/home/?to_user_id=72306985675&format=html', 'http://m.365yg.com/video/app/user/home/?to_user_id=72306985675&format=html',
...@@ -1740,7 +1894,7 @@ if __name__ == '__main__': ...@@ -1740,7 +1894,7 @@ if __name__ == '__main__':
# res = test.video_page("https://www.ixigua.com/i6701478014242259463/") # res = test.video_page("https://www.ixigua.com/i6701478014242259463/")
# print(res) # print(res)
for url in data_lis: for url in data_lis:
test.releaser_page_by_time(1582272540000, 1582964230998 , url, output_to_es_raw=True, test.releaser_page_by_time(1582272540000, 1595302556249, url, output_to_es_raw=True,
es_index='crawler-data-raw', es_index='crawler-data-raw',
doc_type='doc', releaser_page_num_max=2, doc_type='doc', releaser_page_num_max=2,
proxies_num=0 proxies_num=0
......
...@@ -15,7 +15,7 @@ from elasticsearch.helpers import scan ...@@ -15,7 +15,7 @@ from elasticsearch.helpers import scan
from func_find_week_num import find_week_belongs_to from func_find_week_num import find_week_belongs_to
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.utils import trans_format from crawler.crawler_sys.utils import trans_format
from func_cal_doc_id import cal_doc_id from write_data_into_es.func_cal_doc_id import cal_doc_id
hosts = '192.168.17.11' hosts = '192.168.17.11'
port = 80 port = 80
......
...@@ -6,26 +6,17 @@ Created on Wed Jun 20 09:19:12 2018 ...@@ -6,26 +6,17 @@ Created on Wed Jun 20 09:19:12 2018
""" """
import hashlib import hashlib
try:
from write_data_into_es.func_calculate_toutiao_video_id import calculate_toutiao_video_id from write_data_into_es.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from write_data_into_es.func_calculate_newTudou_video_id import calculate_newTudou_video_id from write_data_into_es.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from write_data_into_es.func_calculate_v_qq_video_id import calculate_v_qq_video_id from write_data_into_es.func_calculate_v_qq_video_id import calculate_v_qq_video_id
#from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data #from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from write_data_into_es.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url from write_data_into_es.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
from write_data_into_es.func_calculate_txxw_video_id import calculate_txxw_video_id from write_data_into_es.func_calculate_txxw_video_id import calculate_txxw_video_id
from write_data_into_es.func_calculate_wangyi_news_id import calculate_wangyi_news_id from write_data_into_es.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from write_data_into_es.func_calculate_douyin_id import calculate_douyin_id from write_data_into_es.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es.func_calculate_haokan_video_id import calculate_haokan_id from write_data_into_es.func_calculate_haokan_video_id import calculate_haokan_id
except:
from write_data_into_es_new.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from write_data_into_es_new.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from write_data_into_es_new.func_calculate_v_qq_video_id import calculate_v_qq_video_id
# from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from write_data_into_es_new.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
from write_data_into_es_new.func_calculate_txxw_video_id import calculate_txxw_video_id
from write_data_into_es_new.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from write_data_into_es_new.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es_new.func_calculate_haokan_video_id import calculate_haokan_id
def vid_cal_func(platform): def vid_cal_func(platform):
vid_cal_func_dict = { vid_cal_func_dict = {
......
from func_get_releaser_id import get_releaser_id from write_data_into_es.func_get_releaser_id import get_releaser_id
def calculate_txxw_video_id(data_dict): def calculate_txxw_video_id(data_dict):
try: try:
......
...@@ -7,11 +7,11 @@ from elasticsearch import Elasticsearch ...@@ -7,11 +7,11 @@ from elasticsearch import Elasticsearch
import json, copy import json, copy
from write_data_into_es.func_get_releaser_id import get_releaser_id from write_data_into_es.func_get_releaser_id import get_releaser_id
from write_data_into_es.func_cal_doc_id import cal_doc_id from write_data_into_es.func_cal_doc_id import cal_doc_id
from func_transfer_from_ftp import transfer_from_ftp from write_data_into_es.func_transfer_from_ftp import transfer_from_ftp
import logging import logging
from urllib.parse import parse_qs,urlparse from urllib.parse import parse_qs,urlparse
from elasticsearch.helpers import scan from elasticsearch.helpers import scan
from ReleaserMeta import ReleaseMeta from write_data_into_es.ReleaserMeta import ReleaseMeta
hosts = '192.168.17.11' hosts = '192.168.17.11'
port = 80 port = 80
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment