Commit 772fd8b7 authored by litaolemo

update

parent dc18535b
# -*- coding:utf-8 -*-
# @Time : 2019/12/27 15:49
# @Author : litao
"""
Q&A entries under each tag of the SoYoung item page: https://www.soyoung.com/itemk//
"""
import numpy as np
import random
import argparse
import json, redis, re, requests
from selenium.webdriver import ActionChains
import time, datetime, copy
from selenium import webdriver
# from PIL import Image
import os
from selenium.webdriver.support.ui import WebDriverWait
# import cv2
from fontTools.ttLib import *
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from concurrent.futures import ProcessPoolExecutor
from lxml import etree
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from bs4 import BeautifulSoup
# rds_list = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
# rds_single = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
# rds_get = redis.StrictRedis(host='127.0.0.1', port=6379, db=15, decode_responses=True)
# rds_copy = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
rds_list = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)
rds_single = redis.StrictRedis(host='192.168.17.60', port=6379, db=0, decode_responses=True)
rds_get = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)
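# NOTE: these Redis handles point at an internal host and are only used by revise_data() below;
# rds_single is unused and the crawl itself does not touch Redis.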
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--max_page', default=0, type=int,
                    help='The max page numbers')
parser.add_argument('-t', '--style_tag', default="", type=str,
                    help='style_tag')
parser.add_argument('-c', '--countries', default="", type=str,
                    help='countries')
args = parser.parse_args()
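
# Example invocation (hypothetical script name):
#   python crawl_soyoung_qa.py -p 10 -t "" -c ""
# The parsed flags are only referenced in the commented-out branch under __main__;
# the active crawl in Crawler_main.list_page() does not read them.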


def revise_data():
    # Copy records that lack a "directors" field from the db=1 store (rds_list) into db=15 (rds_get).
    scan_re = rds_list.scan_iter()
    for one_scan in scan_re:
        # print(one_scan)
        data = rds_list.hgetall(one_scan)
        # data["title"] = data["title"].replace("\r", "").replace("\n", "")
        # data["describe"] = data["describe"].replace("\r", "").replace("\n", "")
        if not data.get("directors"):
            rds_get.hmset(one_scan, data)
            # rds_list.hmset(one_scan,data)


class Crawler_main(object):
    def __init__(self):
        # self.chrome_options = webdriver.ChromeOptions()
        # # self.chrome_options.add_argument('--headless')
        # self.chrome_options.add_argument('--disable-gpu')
        # # self.chrome_options.add_argument("--start-maximized")
        # self.chrome_options.add_argument("--no-sandbox")
        # self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
        # prefs = {"profile.managed_default_content_settings.images": 2}
        # self.chrome_options.add_experimental_option("prefs", prefs)
        # self.driver = webdriver.Chrome(options=self.chrome_options)
        # NOTE: the cookie below is a captured session (PHPSESSID, Hm_* values) and likely needs refreshing.
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605150670",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        }
        self.one_video_dic = {
            "platform": "douban",
            "title": "",
            "url": "",
            "describe": "",
        }

    def __exit__(self):
        # self.driver.close()
        pass

    def list_page(self, releaserUrl="https://www.soyoung.com/itemk//",
                  tag_list_xpath=None,
                  ):
        # Walk the tag tree on the landing page: first-level block -> second-level tag
        # -> third-level product, then yield one dict per Q&A entry on each product page.
        offset = 0
        count_false = 0
        proxies = get_proxy(0)  # fetched but not currently passed to requests.get
        requests_res = requests.get(releaserUrl, headers=self.headers, allow_redirects=False, timeout=5)
        page_obj = etree.HTML(requests_res.text)
        obj_list = page_obj.xpath("/html[1]/body[1]/div")
        for first_title_obj in obj_list:
            try:
                tag_id = first_title_obj.xpath("./@id")[0]
                print(tag_id)
                first_title = first_title_obj.xpath("./div[1]/div[1]/text()")[0].strip()
                print("first_title", first_title)
            except:
                continue
            second_title_str_obj_list = first_title_obj.xpath("./div[1]/div[2]/div[1]/div[1]/a")
            if 'product100' in tag_id:
                second_title_obj_list = first_title_obj.xpath("./div[2]/div")
                for count_tag, one_second_title_obj in enumerate(second_title_obj_list):
                    second_title = second_title_str_obj_list[count_tag].xpath("./text()")[0].strip()
                    second_id = second_title_str_obj_list[count_tag].xpath("./@data-id")[0].strip()
                    # second_obj_list = one_second_title_obj.xpath("./div[2]/div")
                    print("second_title", second_title)
                    for third_title_obj_product in self.get_third_tag_list(second_id):
                        # third_title_obj_list = one_third_title_obj.xpath("./div[2]/div")
                        # third_name = third_title_obj_product.xpath("./div[1]/text()")[0].strip()
                        # third_name_info = third_title_obj_product.xpath("./div[1]/span[1]/text()")[0].strip()
                        # third_name_des = third_title_obj_product.xpath("./p[1]/text()")[0].strip()
                        # third_name_url = "https:" + third_title_obj_product.xpath("./@data-url")[0].strip()
                        # print(third_title_obj_product)
                        third_name = third_title_obj_product.get("name")
                        third_name_info = third_title_obj_product.get("one_feature")
                        third_name_des = third_title_obj_product.get("summary")
                        try:
                            third_name_url = "https://www.soyoung.com/itemk/%s/" % third_title_obj_product.get("seo").get("pinyin")
                        except:
                            third_name_url = ""
                        print(first_title, second_title, third_name)
                        for qa_title, qa_answer in self.parse_single_data(third_name_url):
                            data_dict = {
                                "first_title": first_title,
                                "second_title": second_title,
                                "third_name": third_name,
                                "third_name_info": third_name_info,
                                "third_name_des": third_name_des,
                                "third_name_url": third_name_url,
                                "qa_title": qa_title,
                                "qa_answer": qa_answer,
                            }
                            yield data_dict
                            # break

    def parse_single_data(self, data_url):
        # Yield (question, answer) pairs from the "#qa" section of a product page.
        try:
            requests_res = requests.get(data_url, headers=self.headers, allow_redirects=False, timeout=5)
            page_obj = etree.HTML(requests_res.text)
            obj_list = page_obj.xpath("//section[@id='qa']/div")
            for qa_obj in obj_list:
                qa_title = qa_obj.xpath("./div[1]/p[1]/text()")[0].strip()
                qa_answer = qa_obj.xpath("./div[2]/p[1]/span[1]/text()")[0].strip()
                # print(qa_title,qa_answer)
                yield qa_title, qa_answer
        except:
            yield "", ""

    def get_third_tag_list(self, menu_id):
        # Fetch the third-level product list for a second-level tag via the itemList AJAX endpoint.
        headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605165197",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=%s" % menu_id
        requests_res = requests.get(url, headers=headers, allow_redirects=False, timeout=5)
        res_json = requests_res.json()
        return res_json
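
# Hypothetical defensive variant of get_third_tag_list's return (not in the original):
# list_page() iterates the return value and calls .get() on each element, so the endpoint
# is assumed to return a JSON list of product dicts ("name", "one_feature", "summary", "seo").
# Guarding against other response shapes could look like:
#
#     res_json = requests_res.json()
#     if not isinstance(res_json, list):
#         return []
#     return res_json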
if __name__ == "__main__":
# if args.style_tag or args.countries:
# Crawler_douban = Crawler_main()
# Crawler_douban.list_page(style_tags=args.style_tag,countries=args.countries)
# else:
# executor = ProcessPoolExecutor(max_workers=5)
# futures = []
# for one_scan in range(5):
# Crawler_douban = Crawler_main()
# future = executor.submit(Crawler_douban.detail_page, task=one_scan)
# futures.append(future)
# executor.shutdown(True)
import pandas as pd
data_list = []
Crawler_xinyang = Crawler_main()
try:
for data in Crawler_xinyang.list_page():
data_list.append(data)
except:
res = pd.DataFrame(data_list)
res.to_csv("wrong.csv", encoding="gb18030")
finally:
res = pd.DataFrame(data_list)
res.to_csv("result.csv", encoding="gb18030")
# revise_data()
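
# Usage sketch (hypothetical, assuming the module is imported rather than run directly):
#
#     crawler = Crawler_main()
#     for record in crawler.list_page():
#         print(record["third_name"], record["qa_title"])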