# -*- coding:UTF-8 -*-
# @Time : 2020/7/28 8:53
# @File : cal_ni_and_put_to_backend.py
# @email : litao@igengmei.com
# @author : litao
import redis
import json
import datetime
import time
import sys
import six
from maintenance.func_send_email_with_file import send_file_email
from typing import Dict, Iterator, List
from crawler.gm_upload.gm_upload import upload, upload_file
import requests
import os
import copy
import re
# import HTMLParser
import pymysql
from crawler.crawler_sys.utils.output_results import retry_get_url
from lxml import html
from lxml.html.clean import Cleaner
import random
# from mistune import Renderer, InlineGrammar, InlineLexer, Markdown, escape
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
# conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work', passwd='Gengmei1',
# db='mimas_test', charset='utf8')
# rds.scan()
class push_rule(object):
    def __init__(self, repost_count_ni=None, comment_count_ni=None, favorite_count_ni=None, time_range=5, level=3):
        """
        Incremental-growth rule, e.g. "favorite count grows by 200 within 5 minutes":
            favorite_count_ni = 200
            time_range = 5
        :param repost_count_ni: Int repost-count increment
        :param comment_count_ni: Int comment-count increment
        :param favorite_count_ni: Int favorite-count increment
        :param time_range: Int interval in minutes
        :param level: Int content level assigned when the rule fires
        """
        self._repost_count_ni = repost_count_ni
        self._comment_count_ni = comment_count_ni
        self._favorite_count_ni = favorite_count_ni
        self._time_range = time_range
        self.level = level
        # A threshold left as None falls back to -100 per minute, which in
        # practice means "always satisfied" for that metric.
        try:
            self.repost_per_min = self._repost_count_ni / time_range
        except TypeError:
            self.repost_per_min = -100
        try:
            self.comment_per_min = self._comment_count_ni / time_range
        except TypeError:
            self.comment_per_min = -100
        try:
            self.favorite_per_min = self._favorite_count_ni / time_range
        except TypeError:
            self.favorite_per_min = -100
    def parse_data(self, fetch_time_last=None, repost_count_last=None, comment_count_last=None,
                   favorite_count_last=None, fetch_time=None, repost_count=None, comment_count=None,
                   favorite_count=None, parse_mode="and") -> bool:
        """
        Compare two snapshots of one document and report whether the per-minute
        growth satisfies this rule. All timestamps are in milliseconds.
        :param fetch_time_last: fetch time of the newer snapshot (ms)
        :param repost_count_last: repost count of the newer snapshot
        :param comment_count_last: comment count of the newer snapshot
        :param favorite_count_last: favorite count of the newer snapshot
        :param fetch_time: fetch time of the older snapshot (ms)
        :param repost_count: repost count of the older snapshot
        :param comment_count: comment count of the older snapshot
        :param favorite_count: favorite count of the older snapshot
        :param parse_mode: str "and" or "or" -- whether all thresholds must be met or any single one suffices
        :return: bool
        """
        if fetch_time_last and fetch_time:
            time_diff = (fetch_time_last - fetch_time) / 60 / 1e3  # ms -> minutes
        else:
            raise KeyError("time input error")
        if time_diff <= 0:
            return False  # guard against identical or out-of-order timestamps
        if isinstance(comment_count_last, int) and isinstance(comment_count, int):
            comment_diff = comment_count_last - comment_count
        else:
            comment_diff = -100
        if isinstance(favorite_count_last, int) and isinstance(favorite_count, int):
            favorite_diff = favorite_count_last - favorite_count
        else:
            favorite_diff = -100
        if isinstance(repost_count_last, int) and isinstance(repost_count, int):
            repost_diff = repost_count_last - repost_count
        else:
            repost_diff = -100
        if parse_mode == "and":
            return (comment_diff / time_diff >= self.comment_per_min
                    and favorite_diff / time_diff >= self.favorite_per_min
                    and repost_diff / time_diff >= self.repost_per_min)
        elif parse_mode == "or":
            return (comment_diff / time_diff >= self.comment_per_min
                    or favorite_diff / time_diff >= self.favorite_per_min
                    or repost_diff / time_diff >= self.repost_per_min)
        else:
            return False
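# A minimal, illustrative usage sketch of push_rule (not part of the pipeline):
# with comment_count_ni=20 and time_range=5 the threshold is 4 comments per
# minute, so two snapshots taken 10 minutes apart with 50 new comments fire
# the rule. Timestamps are in milliseconds, matching the crawled data.
def _example_push_rule_usage():
    rule = push_rule(comment_count_ni=20, time_range=5, level=3)
    old_ms = 1596012816514
    return rule.parse_data(
        fetch_time_last=old_ms + 10 * 60 * 1000,  # newer snapshot, 10 minutes later
        comment_count_last=461,                   # 50 new comments
        fetch_time=old_ms,
        comment_count=411,
        parse_mode="or",
    )  # -> True: 50 / 10 = 5 per minute >= 20 / 5 = 4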
def scan_from_redis(push_rule_class_list) -> Iterator[Dict]:
    """Pop doc ids off the "doc_id" queue and yield documents whose growth matches a rule."""
    set_name = "exists_doc_id_set_%s" % datetime.datetime.now().strftime("%Y-%m-%d")
    rds.sadd(set_name, "test")
    rds.expire(set_name, 259200)  # keep each daily dedup set for 3 days
    out_ts = datetime.datetime.now().timestamp() * 1e3 - 86400000  # skip documents released more than 24h ago
    while True:
        doc_id = rds.lpop("doc_id")
        if doc_id:
            res = rds.llen(doc_id)
            if res < 2:
                continue  # growth needs at least two snapshots
            res_list = rds.lrange(doc_id, 0, 1)
            fetch_time = 0
            repost_count = 0
            comment_count = 0
            favorite_count = 0
            # loop variable renamed from "re", which shadowed the re module
            for count, raw_data in enumerate(res_list):
                one_data = json.loads(raw_data)
                if count == 0:
                    # index 0 holds the newest snapshot; keep it as the "last" values
                    fetch_time = one_data.get("fetch_time")
                    repost_count = one_data.get("repost_count")
                    comment_count = one_data.get("comment_count")
                    favorite_count = one_data.get("favorite_count")
                    continue
                if one_data.get("article_type") != "article":
                    continue
                for push_bool in push_rule_class_list:
                    bool_res = push_bool.parse_data(fetch_time_last=fetch_time, repost_count_last=repost_count,
                                                    comment_count_last=comment_count,
                                                    favorite_count_last=favorite_count,
                                                    comment_count=one_data.get("comment_count"),
                                                    favorite_count=one_data.get("favorite_count"),
                                                    repost_count=one_data.get("repost_count"), parse_mode="and",
                                                    fetch_time=one_data.get("fetch_time"))
                    if bool_res:
                        one_data["level"] = push_bool.level
                        if one_data["release_time"] < out_ts:
                            continue
                        # dedup: each document may only be pushed once per release date
                        set_name = "exists_doc_id_set_%s" % datetime.datetime.fromtimestamp(
                            one_data["release_time"] / 1e3).strftime("%Y-%m-%d")
                        if rds.sismember(set_name, one_data["doc_id"]):
                            continue
                        rds.sadd(set_name, one_data["doc_id"])
                        yield one_data
                        # rules are ordered high priority first, so stop at the first hit
                        break
        else:
            time.sleep(1)  # queue drained; avoid busy-waiting
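# Illustrative only: the Redis layout scan_from_redis expects, inferred from
# the reads above. Each crawled snapshot is a JSON blob on a list keyed by its
# doc_id (newest first, since index 0 is read as the latest values), and the
# doc_id itself is queued on the "doc_id" list.
def _example_seed_redis(doc_id="douban_186707979"):
    now_ms = int(time.time() * 1000)
    snapshot = {
        "doc_id": doc_id, "article_type": "article", "platform": "douban",
        "fetch_time": now_ms, "release_time": now_ms,
        "repost_count": 1, "comment_count": 2, "favorite_count": 3,
    }
    rds.lpush(doc_id, json.dumps(snapshot))  # newest snapshot lands at index 0
    rds.rpush("doc_id", doc_id)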
WHITE_TAGS = {
    "basic": ["div", "p", "span", "img", "br", "video", 'a'],  # tentative: used by the mini-program and crawled data
    "all": [
        "div", "p", "span", "img", "br", "video", "audio", "a", "b", "strong", "i", "ul", "ol", "li", "em", "h1",
        "h2", "h3", "h4", "h5", "h6", "iframe",
    ]  # every whitelisted tag that may be displayed
}
# def _get_rich_text(rich_text):
#     """
#     Unescape rich-text entities back into real tags
#     :param rich_text:
#     :return:
#     """
#     try:
#         h = HTMLParser.HTMLParser()
#         rich_text = h.unescape(rich_text.decode("utf-8").replace("&amp;", "&").replace("\n", "<br>"))
#         return rich_text
#     except:
#         return rich_text
def gm_convert_html_tags(rich_text, all_tags=False, remove_tags=None):
    """
    Re-clean rich-text content and strip unwanted markup.
    :param rich_text: rich-text HTML
    :param all_tags: whether to allow every tag on the whitelist
    :param remove_tags: whitelisted tags to drop anyway, as a list
    :return: cleaned HTML string
    """
    if not rich_text:
        return ""
    # rich_text = _get_rich_text(rich_text)
    # Cleaner arguments: keep the whitelist, strip everything else
    tags = WHITE_TAGS["all"] if all_tags else WHITE_TAGS["basic"]
    if remove_tags:
        tags = [tag for tag in tags if tag not in remove_tags]
    kw = {
        "remove_unknown_tags": False,
        "allow_tags": tags,
        "safe_attrs": ["src", ],
    }
    if "a" in tags:
        kw["safe_attrs"].append("href")
    elif all_tags:
        kw["safe_attrs"].extend(["class", "style"])
    if "iframe" in kw["allow_tags"]:
        kw["embedded"] = False
    clear = Cleaner(**kw)
    rich_text = clear.clean_html(rich_text)
    # Re-apply styling
    element_obj = html.fromstring(rich_text)
    for element in element_obj.xpath(u"//img|//video"):
        if not all_tags:  # mini-program, ordinary users, crawled data
            element.attrib["width"] = "100%"  # images and videos stretch to full width
        if element.tag == "video" and all_tags:
            element.attrib["class"] = "js_richtext_video"
    # Remove <a> tags whose href does not start with the gengmei:// scheme
    for item in element_obj.xpath("//a[not(starts-with(@href, 'gengmei://'))]"):
        item.getparent().remove(item)
    # Append inline styling to the remaining <a> tags
    for item in element_obj.xpath("//a"):
        item.attrib["style"] = 'color:#3FB5AF'  # anchor color
    rich_text = html.tostring(element_obj, encoding="unicode")
    return rich_text
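# Illustrative only: what the cleaner above does to a typical crawled
# fragment. Non-whitelisted elements such as <script> are dropped, unsafe
# attributes like onclick are stripped, and with all_tags=False the
# surviving <img> is forced to width="100%".
def _example_gm_convert_html_tags():
    dirty = '<div><script>alert(1)</script><p onclick="x()"><img src="a.jpg"/></p></div>'
    return gm_convert_html_tags(dirty)
    # roughly: '<div><p><img src="a.jpg" width="100%"></p></div>'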
def push_data_to_user(res_data: Dict) -> Dict:
    """
    Massage crawled data into a shape that can be written to the database.
    :param res_data:
    :return: the enriched dict, or {} if any image download fails
    """
    qiniu_img_list = []
    if res_data["img_list"]:
        for img_url in res_data["img_list"]:
            try:
                img_wb = retry_get_url(img_url).content
                res = upload(img_wb)
                print(res)
                # "-imageinfo" asks Qiniu for image metadata; "-w" appears to be a bucket style alias
                img_info = retry_get_url(res + "-imageinfo")
                img_info_json = img_info.json()
                qiniu_img_list.append((res + "-w", img_info_json))
            except Exception as e:
                print("download img error %s" % e)
                return {}
    # Swap the original image URLs for the re-hosted ones
    if res_data["platform"] == "weibo":
        res_data["qiniu_img_list"] = qiniu_img_list
        if "http://t.cn/" in res_data["title"]:
            res_data["title"] = res_data["title"].split("http://t.cn/")[0]
        res_data["content"] = res_data["title"]
    elif res_data["platform"] == "douban":
        content = res_data.get("content")
        if content:
            for count, img_url in enumerate(res_data["img_list"]):
                content = content.replace(img_url, qiniu_img_list[count][0])
        res_data["qiniu_img_list"] = qiniu_img_list
        res_data["content"] = content
    if res_data["platform"] == "weibo":
        res_data["content"] = gm_convert_html_tags(res_data["title"], all_tags=True)
        res_data["title"] = ""
    elif res_data["platform"] == "douban":
        res_data["content"] = gm_convert_html_tags(res_data["content"], all_tags=True)
    return res_data
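# Illustrative only: the shape of one qiniu_img_list entry consumed by
# write_data_into_mysql below -- a (styled URL, imageinfo JSON) tuple. The
# URL is hypothetical; the "-w" suffix comes from the upload flow above.
_example_qiniu_entry = (
    "https://img.example.com/abc123-w",
    {"format": "jpeg", "width": 690, "height": 920},
)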
user_id_list = [33524704, 33524711, 33524716, 33524731, 33524740, 33524697, 33524707, 33524712, 33524717, 33524724,
33524755, 33524762, 33524779, 33524766, 33524782]
img_type = {
    "OTHER": 1,  # other image types
    "GIF": 2,    # GIF animation
    "JPG": 3,    # JPG image
    "JPEG": 4,   # JPEG image
    "PNG": 5,    # PNG image
    "BMP": 6,    # BMP bitmap
    "WEBP": 7,   # WEBP image
    "TIFF": 8,   # TIFF image
}
def write_data_into_mysql(res_data):
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Massage the crawled data into a writable shape
    data = push_data_to_user(res_data)
    if not data.get("content"):
        return None
    if not data.get("qiniu_img_list"):
        return None
    if data["platform"] == "weibo":
        platform_value = 11
    elif data["platform"] == "douban":
        platform_value = 12
    else:
        return None  # only weibo and douban documents are written for now
    conn = pymysql.connect(host='172.16.30.138', port=3306, user='mimas', passwd='GJL3UJe1Ck9ggL6aKnZCq4cRvM',
                           db='mimas_prod', charset='utf8mb4')
    cur = conn.cursor()
    tractate_id = 0
    try:
        # Parameterized query: content is raw HTML and may contain quotes
        sql_query = """insert into api_tractate
        (user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"""
        res = cur.execute(sql_query, (random.choice(user_id_list), data["content"], 0, 2, platform_value,
                                      data["level"], 0, now_str, now_str, 0, 0, 0, data["doc_id"], 0,
                                      data["title"]))
        tractate_id = int(conn.insert_id())
        if res:
            conn.commit()
    except Exception as e:
        print("commit error %s" % e)
        print(data)
        conn.rollback()
    if data.get("qiniu_img_list"):
        for img_info in data.get("qiniu_img_list"):
            # source 2: the image appears in the content; source 3: attached only
            if img_info[0] in data.get("content"):
                image_url_source = 2
            else:
                image_url_source = 3
            try:
                image_type = img_type.get(img_info[1]["format"].upper(), 1)
            except (KeyError, AttributeError):
                image_type = 1
            try:
                width = img_info[1]["width"]
                height = img_info[1]["height"]
            except KeyError:
                width = 0
                height = 0
            try:
                if image_type == 7:  # WEBP; the original compared the img_type dict itself, which never matched
                    sql_query = """
                    insert into api_tractate_images (tractate_id,image_url,width,height,image_webp,image_url_source,image_type,create_time,update_time)
                    values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
                    res = cur.execute(sql_query, (tractate_id, img_info[0], width, height, img_info[0],
                                                  image_url_source, image_type, now_str, now_str))
                else:
                    sql_query = """
                    insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,create_time,update_time)
                    values (%s,%s,%s,%s,%s,%s,%s,%s)"""
                    res = cur.execute(sql_query, (tractate_id, img_info[0], width, height,
                                                  image_url_source, image_type, now_str, now_str))
                if res:
                    conn.commit()
            except Exception as e:
                print("commit error %s" % e)
                conn.rollback()
    cur.close()
    conn.close()
    return tractate_id if tractate_id else None
Enum = None  # forward declaration so _find_allow_types_ can detect the base-class case
class EnumMeta(type):
    """Metaclass for Enum (trimmed copy of a six-based enum backport)"""
    @staticmethod
    def _find_allow_types_(cls, bases):
        all_types = set(six.integer_types) | {six.text_type, str}
        allow_types = set()
        if Enum is None:  # creating the Enum base class itself
            assert cls == 'Enum'
            return tuple(all_types)
        else:
            for base in bases:
                if not issubclass(base, Enum):
                    allow_types.add(base)
            if allow_types:
                return tuple(all_types & allow_types)
            else:
                return tuple(all_types)
class Enum(six.with_metaclass(EnumMeta, object)):
    """Generic enumeration.
    Derive from this class to define new enumerations. _name_, _value_ and
    _desc_ are set by the full metaclass implementation, which this trimmed
    copy does not include.
    """
def __repr__(self):
return "<%s.%s: %r>" % (
self.__class__.__name__, self._name_, self._value_)
def __str__(self):
if self._desc_:
return "%s.%s(%s)" % (self.__class__.__name__, self._name_, self._desc_)
else:
return "%s.%s" % (self.__class__.__name__, self._name_)
def __hash__(self):
return hash(self._name_)
class TRACTATE_PLATFORM(Enum):
"""
新帖子发布来源
"""
GM = ("1", u"更美")
HERA = ("2", u"HERA后台")
DOCTOR = ("3", u"医生端")
XIAOHONGSHU = ("4", u"小红书")
WEIBO = ("5", u"微博")
SOYOUNG = ("6", u"新氧")
MARK = ("7", u"站内打卡活动")
VARIETY_SHOW_YOUNG = ("8", "选秀节目(少年之名)打榜活动")
GROUP_DETAIL = ("9", "普通小组")
GROUP_TOPIC_DETAIL = ("10", "普通小组话题")
STRATEGY_WEIBO_HOTSPOT = ("11", "策略微博热点")
STRATEGY_DOUBAN_HOTSPOT = ("12", "策略豆瓣鹅组热点")
STRATEGY_TOUTIAO = ("13", "策略头条文章")
STRATEGY_ZHIHU = ("14", "策略知乎文章")
STRATEGY_XIAOHONGSHU = ("15", "策略小红书文章")
STRATEGY_SOYOUNG = ("16", "策略新氧文章")
STRATEGY_WEIBO = ("17", "策略微博文章")
def task_main():
    # Instantiate the matching rules. Order matters: higher priority (higher level) first
    push_rule_class1 = push_rule(comment_count_ni=20, time_range=5, level=3)
    push_rule_class2 = push_rule(comment_count_ni=5, time_range=5, level=2)
    rules_list = [
        push_rule_class1,
        push_rule_class2
    ]
    # Iterate over crawled data; scan_from_redis yields documents that should be pushed to the backend
    for res_data in scan_from_redis(rules_list):
        # Write rule-matching documents to the database
        try:
            tractate_id = write_data_into_mysql(res_data)
            print("write_data_into_mysql returned", tractate_id, res_data)
            if res_data["level"] >= 3 and tractate_id:
                title_str = res_data["platform"] + "帖子内容审核"
                body_str = """
                问好:
                有一篇新的{level}星内容需要审核,帖子号为{tractate_id}
                内容如下:
                {content}
                """.format(tractate_id=tractate_id, content=res_data["content"], level=res_data["level"])
                send_file_email("", "", email_group=["<hongxu@igengmei.com>", "<yangjiayue@igengmei.com>",
                                                     "<zhangweiwei@igengmei.com>", "<liuyiting@igengmei.com>"],
                                cc_group=["<duanyingrong@igengmei.com>", "<litao@igengmei.com>"],
                                email_msg_body_str=body_str, title_str=title_str)
                print("review email sent")
        except Exception as e:
            print("send email error %s" % e)
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# write_data_into_mysql(test)
if __name__ == "__main__":
from concurrent.futures import ProcessPoolExecutor
executor = ProcessPoolExecutor(max_workers=5)
futures = []
    for process_num in range(4):
        future = executor.submit(task_main)
        futures.append(future)
        print('Process %s start' % process_num)
executor.shutdown(True)
# task_main()