Commit 4d1d6e83 authored by 李小芳's avatar 李小芳

update

parent 626b43f5
......@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="jdk" jdkName="Python 3.6 (doris_env)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
......
......@@ -3,6 +3,41 @@
<component name="CsvFileAttributes">
<option name="attributeMap">
<map>
<entry key="/crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/dev/xinyang_ask_tag/soyoung_service.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/dev/xinyang_ask_tag/soyoung_service_1.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/dev/xinyang_ask_tag/soyoung_service_cika.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/dev/xinyang_ask_tag/soyoung_service_write_cika.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/tasks/yangjingshu.csv">
<value>
<Attribute>
......
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (doris_env)" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>
</project>
\ No newline at end of file
......@@ -60,7 +60,8 @@ def get_url():
if cat <= 5 and source < 14:
continue
yield cat,source,year,sort
yield cat, source, year, sort
def revise_data():
scan_re = rds_list.scan_iter()
......@@ -566,7 +567,7 @@ class Crawler_main(object):
time.sleep(0.1)
print("page ", offset)
url = "http://api.maoyan.com/mmdb/search/movie/tag/list.json?cityId=1&limit=100&offset={0}&catId={1}&sourceId={2}&yearId={3}&sortId={4}&token=7SJTJRCOW4fNMlp_xZDfgeI8qL0AAAAAsAkAADq-Y4OtjaaVeiysSdZtMsWTuGb0liEIqBPrkrC5QNJ0xOlFWRhf__Rj4D5cDS9L9g&utm_campaign=AmovieBmovieCD-1&movieBundleVersion=8012031&utm_source=meituan&utm_medium=android&utm_term=8.12.3&utm_content=440000000189785&ci=1&net=1&dModel=oneplus%20a5010&uuid=0000000000000A10631E76CD844099D6694316F7616BBA157797426456628307&channelId=1&lat=0.0&lng=0.0&refer=c_boybi6x4&version_name=8.12.3&machine_type=0".format(
str(offset),cat,source,year,sort)
str(offset), cat, source, year, sort)
proxies = get_proxy(4)
requests_res = requests.get(url, headers=headers, proxies=proxies, allow_redirects=False)
dev_list = requests_res.json()
......@@ -622,21 +623,21 @@ class Crawler_main(object):
pass
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh,zh-CN;q=0.9",
"Connection": "keep-alive",
# "Cookie": "_lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110,1577942292; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __utma=17099173.1331545914.1577942309.1577942309.1577942309.1; __utmc=17099173; __utmz=17099173.1577942309.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __mta=150368905.1577424190198.1578028660590.1578044222257.17; _lxsdk_s=16f6ac3e790-0de-cab-b27%7C265018624%7C6; uuid_n_v=v1; iuuid=9C5AAEF02E0E11EAB981AB68C7AB1D51622E552FC52545AE9F3D31A0EE1F6A4F; webp=true; selectci=; ci=1%2C%E5%8C%97%E4%BA%AC; theme=maoyan; _last_page=undefined; latlng=39.908589%2C116.397316%2C1578045092790; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1578045104",
"Host": "m.maoyan.com",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh,zh-CN;q=0.9",
"Connection": "keep-alive",
# "Cookie": "_lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110,1577942292; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __utma=17099173.1331545914.1577942309.1577942309.1577942309.1; __utmc=17099173; __utmz=17099173.1577942309.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __mta=150368905.1577424190198.1578028660590.1578044222257.17; _lxsdk_s=16f6ac3e790-0de-cab-b27%7C265018624%7C6; uuid_n_v=v1; iuuid=9C5AAEF02E0E11EAB981AB68C7AB1D51622E552FC52545AE9F3D31A0EE1F6A4F; webp=true; selectci=; ci=1%2C%E5%8C%97%E4%BA%AC; theme=maoyan; _last_page=undefined; latlng=39.908589%2C116.397316%2C1578045092790; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1578045104",
"Host": "m.maoyan.com",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
}
# keys = 1249366
proxies = get_proxy(4)
url = "http://m.maoyan.com/movie/{0}/box?_v_=yes&utm_campaign=AmovieBmovieD100&f=android&userid={1}".format(
keys,random.randint(265011000,265031000))
page_source = requests.get(url, headers=headers, proxies=proxies, timeout=5,allow_redirects=False)
keys, random.randint(265011000, 265031000))
page_source = requests.get(url, headers=headers, proxies=proxies, timeout=5, allow_redirects=False)
# print(page_source.text)
try:
page_json = re.findall('AppData = (.*?);</script>', page_source.text)[0]
......@@ -649,7 +650,7 @@ class Crawler_main(object):
box_office = res_json.get("summary").get("mbox").get("sumBox")
dic = {
"box_office": box_office,
"url":"https://maoyan.com/films/%s" % keys
"url": "https://maoyan.com/films/%s" % keys
}
print(dic)
self.parse_data(dic, keys)
......@@ -662,7 +663,7 @@ class Crawler_main(object):
if __name__ == "__main__":
if args.style_tag or args.countries:
Crawler_douban = Crawler_main()
Crawler_douban.list_page(style_tags=args.style_tag,countries=args.countries)
Crawler_douban.list_page(style_tags=args.style_tag, countries=args.countries)
else:
executor = ProcessPoolExecutor(max_workers=8)
futures = []
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment