# -*- coding: utf-8 -*-
"""
Created on Fri Sep 28 10:28:45 2018
@author: fangyucheng
"""
import re
import urllib.request
import requests
import json
from bs4 import BeautifulSoup
from crawler_sys.utils.output_results import retry_get_url
from crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.util_logging import logged

try:
    from .func_get_releaser_id import *
except ImportError:
    from func_get_releaser_id import *


class Crawler_miaopai():
    # Passed to func_get_releaser_id; the original file reads self.platform
    # without ever defining it, so 'miaopai' is assumed here.
    platform = 'miaopai'

    def get_releaser_follower_num(self, releaserUrl):
if "www.yixia.com" in releaserUrl:
get_page = retry_get_url(releaserUrl)
get_page.encoding = 'utf-8'
page = get_page.text
soup = BeautifulSoup(page, 'html.parser')
try:
midstep_1 = soup.find('ul', {'class': 'bottomInfor'})
midstep_2 = midstep_1.find_all('li')
for line in midstep_2:
line_text = line.text
if '粉丝' in line_text:
follower_str = line_text.replace('粉丝', '')
follower_num = trans_play_count(follower_str)
print('%s follower number is %s' % (releaserUrl, follower_num))
return follower_num
except:
print("can't can followers")
elif "n.miaopai.com" in releaserUrl:
try:
split_url = releaserUrl.split("personal/")
suid = split_url[-1].replace('.htm', '').replace('.html', '').replace('htm', '')
url = "https://n.miaopai.com/api/aj_user/space.json?suid=%s" % suid
get_page = urllib.request.urlopen(url)
page_bytes = get_page.read()
page_str = page_bytes.decode("utf-8")
page_dic = json.loads(page_str)
follower_num = page_dic['data']['followers_count']
return follower_num
except:
print("can't can followers")

    def get_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    @logged
    def releaser_page(self, releaserUrl, releaser_page_num_max=30):
        headers = {'Host': 'n.miaopai.com',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                   'Accept-Encoding': 'gzip, deflate, br',
                   'Connection': 'keep-alive',
                   'Cookie': 'aliyungf_tc=AQAAAIVvfVl0CgQAysVBfBViNUJYGG5C; Hm_lvt_e8fa5926bca558076246d7fb7ca12071=1545124849; Hm_lpvt_e8fa5926bca558076246d7fb7ca12071=1545124849',
                   'Upgrade-Insecure-Requests': '1',
                   'Cache-Control': 'max-age=0'}
        releaser_id = self.get_releaser_id(releaserUrl)
        page_num = 1
        while page_num <= releaser_page_num_max:
            url = ('https://n.miaopai.com/api/aj_user/medias.json?suid=%s&page=%s'
                   % (releaser_id, page_num))
            get_page = requests.get(url, headers=headers)
            get_page.encoding = 'utf-8'
            page_dic = get_page.json()
            # Yield each raw JSON page; per-video parsing is left to the
            # caller, as the response structure is not handled in this file.
            yield page_dic
            page_num += 1


if __name__ == "__main__":
    releaserUrl = 'http://n.miaopai.com/personal/h~NjA~vSfoYLz1pchtm'
    test = Crawler_miaopai()
    p = test.get_releaser_follower_num(releaserUrl)
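    # Sketch of paging through the releaser's video list (assumes
    # func_get_releaser_id can resolve a suid for this URL): fetch and print
    # the first raw JSON page from medias.json.
    for page_dic in test.releaser_page(releaserUrl, releaser_page_num_max=1):
        print(page_dic)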