Commit 7a03edf5 authored by 段英荣

增加切词内容

parent 0096ff6e
......@@ -32,6 +32,9 @@ import os
import cv2
import copy
import traceback
import pymysql
import jieba
import jieba.analyse
......@@ -44,6 +47,13 @@ majia_user_list = [
"32269952","32269956","32269962","32269966","32269973","32269978","32269980","32269982","32269987","32269989","32270003","32270004","32270007","32270012","32270015","32270017","32270020","32270024","32270027","32270031","32270041","32270044","32270047","32270050","32270054","32270055","32270057","32270059","32270063","32270066","32269913","32269918","32269920","32269927","32269933","32269939","32269943","32269948","32269957","32269965","32269972","32269979","32269983","32269988","32269995","32270002","32270005","32270011","32270016","32270022","32270029","32270036","32270040","32270051","32270061","32270065","32270071","32270075","32270081","32270085","32270094","32270096","32270110","32270116","32270121","32270141","32270147","32270152","32270156","32270161","32270114","32270119","32270122","32270125","32270129","32270131","32270133","32270134","32270137","32270167","32270068","32270070","32270076","32270078","32270083","32270087","32270093","32270095","32270099","32270105","32269992","32270018","32270023","32270030","32270034","32270043","32270048","32270052","32270056","32270060"
]
# Connection settings for the "zhengxing" MySQL database.
# Each value may be overridden by an environment variable of the same name;
# the literals are fallbacks so existing deployments behave exactly as before.
# NOTE(review): a real password is committed to source here -- rotate it and
# remove the fallback once every deployment supplies ZHENGXING_PWD via the
# environment.
ZHENGXING_HOST = os.environ.get("ZHENGXING_HOST", "172.16.30.141")
ZHENGXING_USER = os.environ.get("ZHENGXING_USER", "work")
ZHENGXING_PWD = os.environ.get("ZHENGXING_PWD", "BJQaT9VzDcuPBqkd")
ZHENGXING_DATABASE = os.environ.get("ZHENGXING_DATABASE", "zhengxing")
class ZhihuAccount(object):
def __init__(self, username: str = None, password: str = None):
......@@ -70,6 +80,15 @@ class ZhihuAccount(object):
}
self.session.cookies = cookiejar.LWPCookieJar(filename='./cookies.txt')
self.zhengxing_conn = pymysql.connect(
host=ZHENGXING_HOST,
user=ZHENGXING_USER,
password=ZHENGXING_PWD,
database=ZHENGXING_DATABASE,
charset="utf8")
self.zhengxing_cursor = self.zhengxing_conn.cursor()
self.tag_words_set = set()
def login(self, captcha_lang: str = 'en', load_cookies: bool = True):
"""
模拟登录知乎
......@@ -222,6 +241,38 @@ class ZhihuAccount(object):
js = execjs.compile(f.read())
return js.call('Q', urlencode(form_data))
def add_jieba_tag_word(self):
    """Register every online tag name with jieba and remember it locally.

    Runs a query against ``api_tag`` through ``self.zhengxing_cursor`` and,
    for each returned row, adds the tag name to jieba's dictionary
    (``jieba.add_word``) and to ``self.tag_words_set``.

    Best effort: any error is printed as a traceback and not re-raised.
    """
    tag_sql = """
    select tag_type,name from api_tag where is_online=true;
    """
    try:
        self.zhengxing_cursor.execute(tag_sql)
        # Rows are (tag_type, name); only the name is used here.
        for _tag_type, tag_name in self.zhengxing_cursor.fetchall():
            # A NULL/empty name would make jieba.add_word raise and abort
            # the whole load, so skip such rows instead.
            if not tag_name:
                continue
            jieba.add_word(tag_name)
            self.tag_words_set.add(tag_name)
    except Exception:
        # Deliberately non-fatal, but no longer swallows KeyboardInterrupt
        # or SystemExit the way a bare ``except:`` did.
        print(traceback.format_exc())
def get_tfidf_words_from_content(self, content):
    """Return the known tag words found among the top keywords of ``content``.

    Extracts up to 20 TF-IDF keywords with ``jieba.analyse.extract_tags`` and
    keeps those present in ``self.tag_words_set``, in the order jieba
    returned them.

    Args:
        content: Text to analyse. Empty or ``None`` yields an empty list.

    Returns:
        A list of matching tag words; an empty list if nothing matches or
        if keyword extraction fails (the traceback is printed).
    """
    # Nothing to analyse; avoid going through the exception path.
    if not content:
        return []
    try:
        # Weights are never used, so request plain keyword strings.
        keywords = jieba.analyse.extract_tags(content, topK=20)
        return [word for word in keywords if word in self.tag_words_set]
    except Exception:
        print(traceback.format_exc())
        return []
# 知乎个人文章列表
def test_member_article(self):
......@@ -318,7 +369,8 @@ class ZhihuAccount(object):
"title": title,
"content": content,
"type": data_type,
"question_id": ""
"question_id": "",
"tags": self.get_tfidf_words_from_content(content)
}
zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
elif platform_id not in question_answer_dict[question_id]:
......@@ -336,7 +388,8 @@ class ZhihuAccount(object):
"title": title,
"content": content,
"type": data_type,
"question_id": question_id
"question_id": question_id,
"tags": self.get_tfidf_words_from_content(content)
}
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
......@@ -400,7 +453,8 @@ class ZhihuAccount(object):
"title": question_title,
"content": data_content,
"type": data_type,
"question_id": question_id
"question_id": question_id,
"tags": self.get_tfidf_words_from_content(data_content)
}
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
......@@ -416,6 +470,7 @@ if __name__ == '__main__':
account = ZhihuAccount('', '')
account.login(captcha_lang='en', load_cookies=True)
account.add_jieba_tag_word()
#account.test_member_article()
zhihu_spider_data_file = "./zhihu_spider_data.txt"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment