backend / crawler · Commits · 772fd8b7

Commit 772fd8b7 authored Nov 12, 2020 by litaolemo
update
parent dc18535b

Showing 1 changed file with 213 additions and 0 deletions
dev/xinyang_ask_tag/crawler_xinyang_ask_tag.py  +213 −0  (new file, mode 100644)
# -*- coding:utf-8 -*-
# @Time : 2019/12/27 15:49
# @Author : litao
"""
Crawl the Q&A entries listed under each tag on the SoYoung (新氧) page
https://www.soyoung.com/itemk//.
"""
import numpy as np
import random
import argparse
import json, redis, re, requests
from selenium.webdriver import ActionChains
import time, datetime, copy
from selenium import webdriver
# from PIL import Image
import os
from selenium.webdriver.support.ui import WebDriverWait
# import cv2
from fontTools.ttLib import *
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from concurrent.futures import ProcessPoolExecutor
from lxml import etree
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from bs4 import BeautifulSoup
# rds_list = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
# rds_single = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
# rds_get = redis.StrictRedis(host='127.0.0.1', port=6379, db=15, decode_responses=True)
# rds_copy = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
rds_list = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)
rds_single = redis.StrictRedis(host='192.168.17.60', port=6379, db=0, decode_responses=True)
rds_get = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)

parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--max_page', default=0, type=int, help='The max page numbers')
parser.add_argument('-t', '--style_tag', default="", type=str, help='style_tag')
parser.add_argument('-c', '--countries', default="", type=str, help='style_tag')
args = parser.parse_args()


def revise_data():
    scan_re = rds_list.scan_iter()
    for one_scan in scan_re:
        # print(one_scan)
        data = rds_list.hgetall(one_scan)
        # data["title"] = data["title"].replace("\r", "").replace("\n", "")
        # data["describe"] = data["describe"].replace("\r", "").replace("\n", "")
        if not data.get("directors"):
            rds_get.hmset(one_scan, data)
            # rds_list.hmset(one_scan,data)
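

# Note: revise_data() is a one-off maintenance helper. It scans every hash in
# rds_list (db 1) and copies any entry that has no "directors" field into
# rds_get (db 15). It is only referenced by the commented-out call at the very
# bottom of this file and plays no part in the crawl itself.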


class Crawler_main(object):
    def __init__(self):
        # self.chrome_options = webdriver.ChromeOptions()
        # # self.chrome_options.add_argument('--headless')
        # self.chrome_options.add_argument('--disable-gpu')
        # # self.chrome_options.add_argument("--start-maximized")
        # self.chrome_options.add_argument("--no-sandbox")
        # self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
        # prefs = {"profile.managed_default_content_settings.images": 2}
        # self.chrome_options.add_experimental_option("prefs", prefs)
        # self.driver = webdriver.Chrome(options=self.chrome_options)
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605150670",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        }
        self.one_video_dic = {
            "platform": "douban",
            "title": "",
            "url": "",
            "describe": "",
        }

    def __exit__(self):
        # self.driver.close()
        pass
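
    # The crawl in list_page() walks three levels: the first-level category
    # blocks on https://www.soyoung.com/itemk//, the second-level tag links
    # inside each 'product100' block, and the per-tag item list returned by
    # the itemList endpoint in get_third_tag_list(). For every item it then
    # visits the item's own page and yields one row per Q&A entry.
    # Note that get_proxy(0) is called but its result is never passed to
    # requests.get(), so as written the requests go out without a proxy.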
    def list_page(self, releaserUrl="https://www.soyoung.com/itemk//", tag_list_xpath=None):
        offset = 0
        count_false = 0
        proxies = get_proxy(0)
        requests_res = requests.get(releaserUrl, headers=self.headers, allow_redirects=False, timeout=5)
        page_obj = etree.HTML(requests_res.text)
        obj_list = page_obj.xpath("/html[1]/body[1]/div")
        for first_title_obj in obj_list:
            try:
                tag_id = first_title_obj.xpath("./@id")[0]
                print(tag_id)
                first_title = first_title_obj.xpath("./div[1]/div[1]/text()")[0].strip()
                print("first_title", first_title)
            except:
                continue
            second_title_str_obj_list = first_title_obj.xpath("./div[1]/div[2]/div[1]/div[1]/a")
            if 'product100' in tag_id:
                second_title_obj_list = first_title_obj.xpath("./div[2]/div")
                for count_tag, one_second_title_obj in enumerate(second_title_obj_list):
                    second_title = second_title_str_obj_list[count_tag].xpath("./text()")[0].strip()
                    second_id = second_title_str_obj_list[count_tag].xpath("./@data-id")[0].strip()
                    # second_obj_list = one_second_title_obj.xpath("./div[2]/div")
                    print("second_title", second_title)
                    for third_title_obj_product in self.get_third_tag_list(second_id):
                        # third_title_obj_list = one_third_title_obj.xpath("./div[2]/div")
                        # third_name = third_title_obj_product.xpath("./div[1]/text()")[0].strip()
                        # third_name_info = third_title_obj_product.xpath("./div[1]/span[1]/text()")[0].strip()
                        # third_name_des = third_title_obj_product.xpath("./p[1]/text()")[0].strip()
                        # third_name_url = "https:" + third_title_obj_product.xpath("./@data-url")[0].strip()
                        # print(third_title_obj_product)
                        third_name = third_title_obj_product.get("name")
                        third_name_info = third_title_obj_product.get("one_feature")
                        third_name_des = third_title_obj_product.get("summary")
                        try:
                            third_name_url = "https://www.soyoung.com/itemk/%s/" % third_title_obj_product.get("seo").get("pinyin")
                        except:
                            third_name_url = ""
                        print(first_title, second_title, third_name)
                        for qa_title, qa_answer in self.parse_single_data(third_name_url):
                            data_dict = {
                                "first_title": first_title,
                                "second_title": second_title,
                                "third_name": third_name,
                                "third_name_info": third_name_info,
                                "third_name_des": third_name_des,
                                "third_name_url": third_name_url,
                                "qa_title": qa_title,
                                "qa_answer": qa_answer,
                            }
                            yield data_dict
                        # break
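
    # Each row yielded by list_page() is a flat dict with the keys
    # first_title, second_title, third_name, third_name_info, third_name_des,
    # third_name_url, qa_title and qa_answer; these become the columns of the
    # result.csv / wrong.csv files written in the __main__ block below.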
    def parse_single_data(self, data_url):
        try:
            requests_res = requests.get(data_url, headers=self.headers, allow_redirects=False, timeout=5)
            page_obj = etree.HTML(requests_res.text)
            obj_list = page_obj.xpath("//section[@id='qa']/div")
            for qa_obj in obj_list:
                qa_title = qa_obj.xpath("./div[1]/p[1]/text()")[0].strip()
                qa_answer = qa_obj.xpath("./div[2]/p[1]/span[1]/text()")[0].strip()
                # print(qa_title,qa_answer)
                yield qa_title, qa_answer
        except:
            yield "", ""
    def get_third_tag_list(self, menu_id):
        headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605165197",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=%s" % menu_id
        requests_res = requests.get(url, headers=headers, allow_redirects=False, timeout=5)
        res_json = requests_res.json()
        return res_json
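
    # Assumption, based only on how list_page() consumes the response: the
    # itemList endpoint is taken to return an iterable of item dicts carrying
    # at least "name", "one_feature", "summary" and a nested "seo" -> "pinyin"
    # field. The payload is not documented here, so one item is assumed to
    # look roughly like
    #   {"name": "...", "one_feature": "...", "summary": "...",
    #    "seo": {"pinyin": "..."}}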


if __name__ == "__main__":
    # if args.style_tag or args.countries:
    #     Crawler_douban = Crawler_main()
    #     Crawler_douban.list_page(style_tags=args.style_tag, countries=args.countries)
    # else:
    #     executor = ProcessPoolExecutor(max_workers=5)
    #     futures = []
    #     for one_scan in range(5):
    #         Crawler_douban = Crawler_main()
    #         future = executor.submit(Crawler_douban.detail_page, task=one_scan)
    #         futures.append(future)
    #     executor.shutdown(True)
    import pandas as pd
    data_list = []
    Crawler_xinyang = Crawler_main()
    try:
        for data in Crawler_xinyang.list_page():
            data_list.append(data)
    except:
        res = pd.DataFrame(data_list)
        res.to_csv("wrong.csv", encoding="gb18030")
    finally:
        res = pd.DataFrame(data_list)
        res.to_csv("result.csv", encoding="gb18030")
    # revise_data()
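
# Usage sketch (assumed invocation; the -p/-t/-c flags are parsed above but the
# current __main__ path ignores them and simply crawls everything):
#   python dev/xinyang_ask_tag/crawler_xinyang_ask_tag.py
# On success the collected rows are written to result.csv (gb18030 encoding);
# if the crawl raises, the rows gathered so far are also written to wrong.csv.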