backend / crawler · Commit 772fd8b7
Authored 4 years ago by litaolemo — "update" (parent dc18535b)
Showing 1 changed file with 213 additions and 0 deletions:
dev/xinyang_ask_tag/crawler_xinyang_ask_tag.py (new file, mode 100644, +213 −0)
# -*- coding:utf-8 -*-
# @Time : 2019/12/27 15:49
# @Author : litao
"""
Q&A entries for each tag under the SoYoung page https://www.soyoung.com/itemk//
"""
import numpy as np
import random
import argparse
import json, redis, re, requests
from selenium.webdriver import ActionChains
import time, datetime, copy
from selenium import webdriver
# from PIL import Image
import os
from selenium.webdriver.support.ui import WebDriverWait
# import cv2
from fontTools.ttLib import *
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from concurrent.futures import ProcessPoolExecutor
from lxml import etree
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from bs4 import BeautifulSoup
# rds_list = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
# rds_single = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
# rds_get = redis.StrictRedis(host='127.0.0.1', port=6379, db=15, decode_responses=True)
# rds_copy = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
rds_list = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)
rds_single = redis.StrictRedis(host='192.168.17.60', port=6379, db=0, decode_responses=True)
rds_get = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)
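# Note (editorial): the three Redis handles above point at db 1, 0 and 15 on
# 192.168.17.60; within this file only rds_list and rds_get are referenced
# (by revise_data() below), while rds_single is currently unused.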
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--max_page', default=0, type=int, help=('The max page numbers'))
parser.add_argument('-t', '--style_tag', default="", type=str, help=('style_tag'))
parser.add_argument('-c', '--countries', default="", type=str, help=('countries'))
args = parser.parse_args()
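# Note (editorial): -p/--max_page, -t/--style_tag and -c/--countries are parsed here,
# but in this revision the __main__ block never reads args; the branch that would
# forward style_tag/countries into the crawler is commented out at the bottom of the file.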


def revise_data():
    scan_re = rds_list.scan_iter()
    for one_scan in scan_re:
        # print(one_scan)
        data = rds_list.hgetall(one_scan)
        # data["title"] = data["title"].replace("\r", "").replace("\n", "")
        # data["describe"] = data["describe"].replace("\r", "").replace("\n", "")
        if not data.get("directors"):
            rds_get.hmset(one_scan, data)
            # rds_list.hmset(one_scan,data)


class Crawler_main(object):

    def __init__(self):
        # self.chrome_options = webdriver.ChromeOptions()
        # # self.chrome_options.add_argument('--headless')
        # self.chrome_options.add_argument('--disable-gpu')
        # # self.chrome_options.add_argument("--start-maximized")
        # self.chrome_options.add_argument("--no-sandbox")
        # self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
        # prefs = {"profile.managed_default_content_settings.images": 2}
        # self.chrome_options.add_experimental_option("prefs", prefs)
        # self.driver = webdriver.Chrome(options=self.chrome_options)
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605150670",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        }
        self.one_video_dic = {
            "platform": "douban",
            "title": "",
            "url": "",
            "describe": "",
        }
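    # Note (editorial): one_video_dic ("platform": "douban") is never used elsewhere in
    # this file; it appears to be leftover scaffolding from a douban crawler template.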
    def __exit__(self):
        # self.driver.close()
        pass
    def list_page(self, releaserUrl="https://www.soyoung.com/itemk//", tag_list_xpath=None):
        offset = 0
        count_false = 0
        proxies = get_proxy(0)
        requests_res = requests.get(releaserUrl, headers=self.headers, allow_redirects=False, timeout=5)
        page_obj = etree.HTML(requests_res.text)
        obj_list = page_obj.xpath("/html[1]/body[1]/div")
        for first_title_obj in obj_list:
            try:
                tag_id = first_title_obj.xpath("./@id")[0]
                print(tag_id)
                first_title = first_title_obj.xpath("./div[1]/div[1]/text()")[0].strip()
                print("first_title", first_title)
            except:
                continue
            second_title_str_obj_list = first_title_obj.xpath("./div[1]/div[2]/div[1]/div[1]/a")
            if 'product100' in tag_id:
                second_title_obj_list = first_title_obj.xpath("./div[2]/div")
                for count_tag, one_second_title_obj in enumerate(second_title_obj_list):
                    second_title = second_title_str_obj_list[count_tag].xpath("./text()")[0].strip()
                    second_id = second_title_str_obj_list[count_tag].xpath("./@data-id")[0].strip()
                    # second_obj_list = one_second_title_obj.xpath("./div[2]/div")
                    print("second_title", second_title)
                    for third_title_obj_product in self.get_third_tag_list(second_id):
                        # third_title_obj_list = one_third_title_obj.xpath("./div[2]/div")
                        # third_name = third_title_obj_product.xpath("./div[1]/text()")[0].strip()
                        # third_name_info = third_title_obj_product.xpath("./div[1]/span[1]/text()")[0].strip()
                        # third_name_des = third_title_obj_product.xpath("./p[1]/text()")[0].strip()
                        # third_name_url = "https:" + third_title_obj_product.xpath("./@data-url")[0].strip()
                        # print(third_title_obj_product)
                        third_name = third_title_obj_product.get("name")
                        third_name_info = third_title_obj_product.get("one_feature")
                        third_name_des = third_title_obj_product.get("summary")
                        try:
                            third_name_url = "https://www.soyoung.com/itemk/%s/" % third_title_obj_product.get("seo").get("pinyin")
                        except:
                            third_name_url = ""
                        print(first_title, second_title, third_name)
                        for qa_title, qa_answer in self.parse_single_data(third_name_url):
                            data_dict = {
                                "first_title": first_title,
                                "second_title": second_title,
                                "third_name": third_name,
                                "third_name_info": third_name_info,
                                "third_name_des": third_name_des,
                                "third_name_url": third_name_url,
                                "qa_title": qa_title,
                                "qa_answer": qa_answer,
                            }
                            yield data_dict
                        # break
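    # Note (editorial): list_page() is a generator yielding one flat dict per Q&A entry,
    # keyed first_title / second_title / third_name / third_name_info / third_name_des /
    # third_name_url / qa_title / qa_answer; __main__ collects these into a DataFrame.

    # Note (editorial): parse_single_data() below yields (qa_title, qa_answer) pairs
    # scraped from the <section id="qa"> block of an item page; its bare except means
    # any request or parsing failure silently yields a single ("", "") pair instead of
    # raising, so the caller still emits a row for that item.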
    def parse_single_data(self, data_url):
        try:
            requests_res = requests.get(data_url, headers=self.headers, allow_redirects=False, timeout=5)
            page_obj = etree.HTML(requests_res.text)
            obj_list = page_obj.xpath("//section[@id='qa']/div")
            for qa_obj in obj_list:
                qa_title = qa_obj.xpath("./div[1]/p[1]/text()")[0].strip()
                qa_answer = qa_obj.xpath("./div[2]/p[1]/span[1]/text()")[0].strip()
                # print(qa_title,qa_answer)
                yield qa_title, qa_answer
        except:
            yield "", ""
    def get_third_tag_list(self, menu_id):
        headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605165197",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=%s" % menu_id
        requests_res = requests.get(url, headers=headers, allow_redirects=False, timeout=5)
        res_json = requests_res.json()
        return res_json
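    # Note (editorial assumption, not verified against the live endpoint): list_page()
    # iterates over this return value and calls .get("name"), .get("one_feature"),
    # .get("summary") and .get("seo").get("pinyin") on each element, so the
    # /items/itemList?_json=1 endpoint is assumed to return an iterable of product
    # dicts shaped roughly like:
    #   [{"name": "...", "one_feature": "...", "summary": "...",
    #     "seo": {"pinyin": "..."}}, ...]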


if __name__ == "__main__":
    # if args.style_tag or args.countries:
    #     Crawler_douban = Crawler_main()
    #     Crawler_douban.list_page(style_tags=args.style_tag,countries=args.countries)
    # else:
    #     executor = ProcessPoolExecutor(max_workers=5)
    #     futures = []
    #     for one_scan in range(5):
    #         Crawler_douban = Crawler_main()
    #         future = executor.submit(Crawler_douban.detail_page, task=one_scan)
    #         futures.append(future)
    #     executor.shutdown(True)
    import pandas as pd
    data_list = []
    Crawler_xinyang = Crawler_main()
    try:
        for data in Crawler_xinyang.list_page():
            data_list.append(data)
    except:
        res = pd.DataFrame(data_list)
        res.to_csv("wrong.csv", encoding="gb18030")
    finally:
        res = pd.DataFrame(data_list)
        res.to_csv("result.csv", encoding="gb18030")
    # revise_data()
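# Usage sketch (editorial assumption about the runtime environment: the crawler.*
# helper packages imported above must be importable from the working directory or
# PYTHONPATH):
#   python dev/xinyang_ask_tag/crawler_xinyang_ask_tag.py
# The collected Q&A rows are always written to result.csv (gb18030-encoded) by the
# finally block; if the crawl aborts with an exception, the partial rows gathered so
# far are additionally written to wrong.csv.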