Commit 06a45e9d authored by 张彦钊's avatar 张彦钊

Merge branch 'zhao' into 'master'

新增把esmm排序结果重排

See merge request !32
parents 25637f3e 3bfe07d5
from itertools import chain, islice, cycle
import datetime
from collections import Counter
from gm_types.gaia import DIARY_ORDER_TYPE
from gm_types.doris import ANSWER_SORT_TYPE
from gm_types.doris import ARTICLE_SORT_TYPE
from gm_types.mimas import CONTENT_CLASS
from gm_types.doris import CARD_TYPE
from gm_types.gaia import CITY_LEVEL
from gm_rpcd.all import bind
import traceback
from search.utils.diary import recall_diary
from search.utils.answer import recall_answers
from search.utils.article import recall_articles
from gm_rpcd.all import context
from libs.algorithms import drop_dup
from libs.cache import redis_client
from libs.error import logging_exception
from extend.models.gaia import City, CityScale
from extend.models.gold import (
QAQueue,
WikiQueue,
IconQueue,
UserTopicQueue,
DoctorTopicQueue,
DiaryQueue,
ArticleQueue,
AnswerQueue,
DeviceQAQueue,
DeviceIconQueue,
DeviceUserTopicQueue,
DeviceDoctorTopicQueue,
DeviceAnswerQueue,
DeviceArticleQueue,
DeviceDiaryQueue,
QuestionQueue,
DeviceQuestionQueue
)
import logging
import redis
import json
from django.conf import settings
import traceback
# Size of the head window that get_diaries / get_answers load in one recall
# before de-duplicating; pages ending at or beyond it skip de-duplication.
MAX_LOAD = 200
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@bind("doris/recommend/get_diaries")
def get_diaries(tags, city, offset=0, size=10, city_tag_id=None):
    """Return one page of recommended diary ids.

    Args:
        tags: optional tag ids, passed as the ``closure_tag_ids`` filter.
        city: city id; used to look up the city's tag id when
            ``city_tag_id`` is not given.
        offset: index of the first item of the page.
        size: page length.
        city_tag_id: city tag id; takes precedence over ``city``.

    Returns:
        ``{"diaries_ids": [...]}``.
    """
    # NOTE: city as city id
    sort_params = {}
    if city_tag_id:
        sort_params["user_city_tag_id"] = city_tag_id
    elif city:
        try:
            sort_params["user_city_tag_id"] = City.objects.get(id=city).tag_id
        except City.DoesNotExist:
            # Unknown city: rank without a city preference.
            pass

    filters = {
        "is_sink": False,
        "has_before_cover": True,
        "has_after_cover": True,
        "content_level_is_good": True
    }
    if tags:
        filters["closure_tag_ids"] = tags

    tail = offset + size
    diaries_ids = []
    if tail < MAX_LOAD:
        # Load the whole head window once so it can be de-duplicated
        # (drop_dup presumably de-duplicates on the user id -- confirm).
        diaries = recall_diary(None, 0, MAX_LOAD, filters, DIARY_ORDER_TYPE.RECOMMEND,
                               sort_params, fields=["id", "user.id"])
        # Skip hits missing either field, as get_answers does, instead of
        # failing the whole request with a KeyError.
        diaries_items = [
            (diary["id"], diary["user"]["id"])
            for diary in diaries
            if "id" in diary and "id" in (diary.get("user") or {})
        ]
        drop_dup_diaries = drop_dup(diaries_items)
        if tail <= len(drop_dup_diaries):
            diaries_ids = [item[0] for item in drop_dup_diaries[offset:tail]]

    if not diaries_ids:
        # The de-duplicated head window is exhausted (or the page lies beyond
        # it): serve the page straight from the ranking, without de-duplication.
        diaries = recall_diary(None, offset, size, filters, DIARY_ORDER_TYPE.RECOMMEND,
                               sort_params, fields=["id"])
        diaries_ids = [diary['id'] for diary in diaries]
    return {"diaries_ids": diaries_ids}
@bind("doris/recommend/get_articles")
def get_articles(tags, offset=0, size=10):
    """Return one page of recommended article ids as ``{"article_ids": [...]}``.

    Only EXCELLENT and FINE content is recalled; ``tags``, when given, is
    passed as the ``tag_ids`` filter.
    """
    filters = {"content_level": [CONTENT_CLASS.EXCELLENT, CONTENT_CLASS.FINE]}
    if tags:
        filters["tag_ids"] = tags
    hits = recall_articles(None, offset, size, filters, ARTICLE_SORT_TYPE.RECOMMEND, {})
    return {"article_ids": [hit['id'] for hit in hits]}
@bind("doris/recommend/get_answers")
def get_answers(tags, offset=0, size=10):
    """Return one page of recommended answer ids as ``{"answer_ids": [...]}``.

    Pages that end inside the first ``MAX_LOAD`` results are cut from a
    de-duplicated head window; otherwise (or when that window is too short)
    the page is recalled directly, without de-duplication.
    """
    filters = {"content_level": [CONTENT_CLASS.EXCELLENT, CONTENT_CLASS.FINE]}
    if tags:
        filters["tag_ids"] = tags

    tail = offset + size
    answer_ids = []
    if tail < MAX_LOAD:
        hits = recall_answers(None, 0, MAX_LOAD, filters, ANSWER_SORT_TYPE.RECOMMEND, {},
                              fields=["id", "user_id"])
        # Hits lacking either field are ignored.
        pairs = [(hit["id"], hit["user_id"]) for hit in hits
                 if "id" in hit and "user_id" in hit]
        unique_pairs = drop_dup(pairs)
        if tail <= len(unique_pairs):
            answer_ids = [pair[0] for pair in unique_pairs[offset:tail]]

    if not answer_ids:
        hits = recall_answers(None, offset, size, filters, ANSWER_SORT_TYPE.RECOMMEND, {})
        answer_ids = [hit['id'] for hit in hits]
    return {"answer_ids": answer_ids}
@bind('doris/recommend/icon')
def fetch_icon(device_id, size):
    """Return the first ``size`` icon ids for a device as ``{"icon": [...]}``.

    The device-specific queue is used when one exists, otherwise the latest
    global ``IconQueue`` row. Any failure is logged and answered with an
    empty list.
    """
    try:
        try:
            que = DeviceIconQueue.objects.get(device_id=device_id)
        except DeviceIconQueue.DoesNotExist:
            que = IconQueue.objects.last()
        if not que:
            return {"icon": []}
        ids = [item for item in que.queue.split(',') if item]
        # The read position is always 0 here, so the page is simply the head
        # of the queue. An empty queue now yields [] instead of a logged
        # ZeroDivisionError.
        size = min(size, len(ids))
        return {"icon": [int(item) for item in islice(ids, size)]}
    except Exception:
        logging_exception()
        return {"icon": []}
@bind('doris/recommend/homepage_polymer')
def fetch_polymer_ids(device_id, size):
    """Return the first ``size`` ids as ``{"polymer_ids": [...]}``.

    Reads the same queues as the icon endpoint (``DeviceIconQueue`` with
    ``IconQueue`` as fallback). Any failure is logged and answered with an
    empty list.
    """
    try:
        try:
            que = DeviceIconQueue.objects.get(device_id=device_id)
        except DeviceIconQueue.DoesNotExist:
            que = IconQueue.objects.last()
        if not que:
            return {"polymer_ids": []}
        ids = [item for item in que.queue.split(',') if item]
        # The read position is always 0 here, so the page is simply the head
        # of the queue. An empty queue now yields [] instead of a logged
        # ZeroDivisionError.
        size = min(size, len(ids))
        return {"polymer_ids": [int(item) for item in islice(ids, size)]}
    except Exception:
        logging_exception()
        return {"polymer_ids": []}
@bind('doris/recommend/feed')
def recommend_feed(device_id, card_type, city_id, size):
    """RPC entry point: return ``{card_type: [ids]}`` for the requested feed.

    Delegates to ``RecommendFeed.dispatch``. Any error is logged and answered
    with an empty list so the caller always receives a well-formed payload.
    """
    try:
        return RecommendFeed.dispatch(device_id, card_type, city_id, size)
    except Exception:
        # Exception, not a bare except: SystemExit / KeyboardInterrupt must
        # still be able to stop the worker.
        logging_exception()
        return {card_type: []}
class RecommendFeed:
@classmethod
def dispatch(cls, device_id, card_type, city_id, size):
    """Route a feed request to the fetcher for ``card_type``.

    Returns ``{card_type: ids}``. Answer, article, question and doctor-topic
    ids are converted to ``int`` here; the other fetchers' results are passed
    through unchanged. An unknown card type yields an empty list.
    """
    if card_type == CARD_TYPE.QA:
        return {card_type: cls.fetch_qa(device_id, card_type, size)}
    if card_type == CARD_TYPE.ANSWER:
        return {card_type: [int(i) for i in cls.fetch_answer(device_id, card_type, size)]}
    if card_type == CARD_TYPE.ARTICLE:
        return {card_type: [int(i) for i in cls.fetch_article(device_id, card_type, size)]}
    if card_type == CARD_TYPE.QUESTION:
        return {card_type: [int(i) for i in cls.fetch_question(device_id, card_type, size)]}
    if card_type == CARD_TYPE.DIARY:
        return {card_type: cls.fetch_diary(device_id, card_type, city_id, size)}
    if card_type == CARD_TYPE.USERTOPIC:
        return {card_type: cls.fetch_user_topic(device_id, card_type, size)}
    if card_type == CARD_TYPE.DOCTORTOPIC:
        return {card_type: [int(i) for i in cls.fetch_doctor_topic(device_id, card_type, size)]}
    if card_type == CARD_TYPE.ENCYCLOPEDIA:
        return {card_type: cls.fetch_wiki(device_id, card_type, size)}
    return {card_type: []}
@staticmethod
def current_date():
return datetime.datetime.now().strftime('%Y-%m-%d')
@staticmethod
def fetch_question(device_id, card_type, size):
    """Return the next ``size`` question ids (as strings) for a device.

    The queue is read cyclically from a per-device, per-day cursor kept in
    redis; the cursor is advanced by the number of items served. Returns
    ``[]`` when no queue row exists or the queue is empty.
    """
    key = '{device_id}-{card_type}-{date}'.format(
        device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
    try:
        que = DeviceQuestionQueue.objects.get(device_id=device_id)
    except DeviceQuestionQueue.DoesNotExist:
        que = QuestionQueue.objects.last()
    if not que:
        # No global queue row either (previously an AttributeError).
        return []
    ids = [item for item in que.queue.split(',') if item]
    if not ids:
        # Avoid the modulo-by-zero below on an empty queue.
        return []
    # adjust args.
    cursor = int(redis_client.get(key) or 0) % len(ids)
    size = min(size, len(ids))
    redis_client.set(key, cursor + size, ex=24 * 60 * 60)
    return list(islice(cycle(ids), cursor, cursor + size))
@staticmethod
def fetch_icon(device_id, card_type, size):
    """Return the next ``size`` icon ids (as strings) for a device.

    The queue is read cyclically from a per-device, per-day cursor kept in
    redis; the cursor is advanced by the number of items served. Returns
    ``[]`` when no queue row exists or the queue is empty.
    """
    key = '{device_id}-{card_type}-{date}'.format(
        device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
    try:
        que = DeviceIconQueue.objects.get(device_id=device_id)
    except DeviceIconQueue.DoesNotExist:
        que = IconQueue.objects.last()
    if not que:
        # No global queue row either (previously an AttributeError).
        return []
    ids = [item for item in que.queue.split(',') if item]
    if not ids:
        # Avoid the modulo-by-zero below on an empty queue.
        return []
    # adjust args.
    cursor = int(redis_client.get(key) or 0) % len(ids)
    size = min(size, len(ids))
    redis_client.set(key, cursor + size, ex=24 * 60 * 60)
    return list(islice(cycle(ids), cursor, cursor + size))
@staticmethod
def fetch_wiki(device_id, card_type, size):
    """Return the next ``size`` entries of the latest ``WikiQueue`` row.

    The row's ``queue`` field is decoded as JSON and read cyclically from a
    per-device, per-day cursor kept in redis. Any failure is logged and
    answered with ``[]``.
    """
    try:
        key = '{device_id}-{card_type}-{date}'.format(
            device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
        que = WikiQueue.objects.last()
        if not que:
            return []
        entries = json.loads(que.queue)
        if not entries:
            # An empty queue is not an error; skip the modulo-by-zero below.
            return []
        # adjust args.
        cursor = int(redis_client.get(key) or 0) % len(entries)
        size = min(size, len(entries))
        redis_client.set(key, cursor + size, ex=24 * 60 * 60)
        return list(islice(cycle(entries), cursor, cursor + size))
    except Exception:
        logging_exception()
        return []
@staticmethod
def fetch_answer(device_id, card_type, size):
    """Return the next ``size`` answer ids (as strings) for a device.

    Uses the device queue when present, else the latest ``AnswerQueue`` row,
    read cyclically from a per-device, per-day redis cursor. Any failure is
    logged and answered with ``[]``.
    """
    try:
        key = '{device_id}-{card_type}-{date}'.format(
            device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
        try:
            que = DeviceAnswerQueue.objects.get(device_id=device_id)
        except DeviceAnswerQueue.DoesNotExist:
            que = AnswerQueue.objects.last()
        if not que:
            return []
        ids = [item for item in que.queue.split(',') if item]
        if not ids:
            # An empty queue is not an error; skip the modulo-by-zero below.
            return []
        # adjust args.
        cursor = int(redis_client.get(key) or 0) % len(ids)
        size = min(size, len(ids))
        redis_client.set(key, cursor + size, ex=24 * 60 * 60)
        return list(islice(cycle(ids), cursor, cursor + size))
    except Exception:
        logging_exception()
        return []
@staticmethod
def fetch_qa(device_id, card_type, size):
    """Return up to ``size`` answer ids (ints) for the QA feed of a device.

    For a real device (``device_id != '0'``) the result starts with at most
    one id from the search-driven queue, then continues from the device's
    filtered leftover queue in gmkv, or from the DB queue when no leftover
    exists. Ids in the device's dislike set are removed, served ids are
    recorded in the read-history sets, and a "tail" marker is set once the
    queue is used up. Any failure is logged and answered with ``[]``.
    """
    try:
        read_qa_key = "TS:recommend_answer_set:device_id:" + str(device_id)
        old_qa_key = "TS:recommend_answer_set:device_id:{}:{}" \
            .format(device_id, (datetime.date.today() - datetime.timedelta(days=14)).strftime("%Y-%m-%d"))
        today_qa_key = "TS:recommend_answer_set:device_id:{}:{}" \
            .format(device_id, datetime.date.today().strftime("%Y-%m-%d"))
        answer_queue_key = "qa_is_tail:" + str(device_id)
        after_filter_key = "device_qa_after_filter:device_id:" + str(device_id)
        # NOTE(review): hard-coded endpoint; other methods of this class take
        # their kv hosts from settings.GM_KV_HOSTS -- confirm this is intended.
        gmkv = redis.Redis(host="172.16.40.135", port=5379, db=2)
        anonymous = device_id == '0'
        tail_ttl = 6 * 60 * 60

        def get_after_filter_qa():
            try:
                return json.loads(gmkv.get(after_filter_key))
            except Exception:
                return []

        def write_after_filter_qa(cid_list):
            try:
                if gmkv.exists(after_filter_key):
                    # NOTE(review): on a standard redis server a SET without
                    # `ex` clears the existing TTL -- confirm this is wanted.
                    gmkv.set(after_filter_key, json.dumps(cid_list))
                else:
                    gmkv.set(after_filter_key, json.dumps(cid_list), ex=6 * 60 * 60)
            except Exception:
                logging_exception()
                logger.error("catch exception,err_log:%s" % traceback.format_exc())

        def filter_qa(cid_list):
            """Drop ids found in the device's dislike set (best effort)."""
            try:
                key = str(device_id) + "_dislike_qa"
                if not gmkv.exists(key):
                    return cid_list
                # smembers returns bytes on a default client; compare as text,
                # otherwise no id ever matches.
                disliked = {m.decode("utf-8") if isinstance(m, bytes) else str(m)
                            for m in gmkv.smembers(key)}
                return [i for i in cid_list if str(i) not in disliked]
            except Exception:
                return cid_list

        def read_history(cid_list):
            if redis_client.exists(today_qa_key):
                redis_client.sadd(today_qa_key, *cid_list)
            else:
                redis_client.sadd(today_qa_key, *cid_list)
                redis_client.expire(today_qa_key, 15 * 24 * 60 * 60)
            if redis_client.exists(read_qa_key) and redis_client.exists(old_qa_key):
                # Forget what was read 14 days ago.
                redis_client.sdiffstore(read_qa_key, read_qa_key, old_qa_key)
                redis_client.delete(old_qa_key)
            redis_client.expire(read_qa_key, time=13 * 24 * 60 * 60)
            redis_client.sadd(read_qa_key, *cid_list)

        def finish(items, record=True):
            """Convert to ints, record the history when non-empty, return."""
            items = list(map(int, items))
            # SADD with no members is an error, so skip empty results.
            if record and items:
                read_history(items)
            return items

        picked = []
        if not anonymous:
            search_qa_recommend_key = "TS:search_recommend_answer_queue:device_id:" + str(device_id)
            if redis_client.exists(search_qa_recommend_key):
                search_qa_recommend_dict = redis_client.hgetall(search_qa_recommend_key)
                queue_list = filter_qa(json.loads(search_qa_recommend_dict[b'answer_queue']))
                if len(queue_list) <= 1:
                    redis_client.delete(search_qa_recommend_key)
                else:
                    redis_client.hset(search_qa_recommend_key, "answer_queue",
                                      json.dumps(queue_list[1:]))
                if queue_list:
                    # At most one search-driven id per page.
                    picked = queue_list[:1]
                    size = size - 1
            # A negative size would turn the slices below into "all but the
            # last -size items".
            size = max(size, 0)

            if gmkv.exists(answer_queue_key):
                # Queue already used up: only the search-driven id is served.
                return finish(picked)
            if gmkv.exists(after_filter_key):
                que = filter_qa(get_after_filter_qa())
                if len(que) <= size:
                    picked.extend(que)
                    gmkv.set(answer_queue_key, "tail", ex=tail_ttl)
                else:
                    picked.extend(que[:size])
                    write_after_filter_qa(que[size:])
                return finish(picked)

        size = max(size, 0)
        try:
            que = DeviceQAQueue.objects.get(device_id=device_id)
        except DeviceQAQueue.DoesNotExist:
            que = AnswerQueue.objects.last()
        if not que:
            return finish(picked)

        qa = [item for item in que.queue.split(',') if item]
        if not anonymous:
            qa = filter_qa(qa)
        if len(qa) <= size:
            picked.extend(qa)
            if not anonymous:
                gmkv.set(answer_queue_key, "tail", ex=tail_ttl)
        else:
            picked.extend(qa[:size])
            if not anonymous:
                write_after_filter_qa(qa[size:])
        # The anonymous device '0' keeps no read history.
        return finish(picked, record=not anonymous)
    except Exception:
        logging_exception()
        return []
@staticmethod
def fetch_article(device_id, card_type, size):
    """Return the next ``size`` article ids (as strings) for a device.

    Uses the device queue when present, else the latest ``ArticleQueue`` row,
    read cyclically from a per-device, per-day redis cursor. Returns ``[]``
    when no queue row exists or the queue is empty.
    """
    key = '{device_id}-{card_type}-{date}'.format(
        device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
    try:
        que = DeviceArticleQueue.objects.get(device_id=device_id)
    except DeviceArticleQueue.DoesNotExist:
        que = ArticleQueue.objects.last()
    if not que:
        return []
    ids = [item for item in que.queue.split(',') if item]
    if not ids:
        # Avoid the modulo-by-zero below on an empty queue.
        return []
    # adjust args.
    cursor = int(redis_client.get(key) or 0) % len(ids)
    size = min(size, len(ids))
    redis_client.set(key, cursor + size, ex=24 * 60 * 60)
    return list(islice(cycle(ids), cursor, cursor + size))
@staticmethod
def fetch_user_topic(device_id, card_type, size):
    """Return up to ``size`` user-topic (tractate) ids (ints) for a device.

    For a real device asking for at least two items the result starts with
    at most two ids from the search-driven queue; the rest comes from the
    device's filtered leftover queue in gmkv, or from the DB queue when no
    leftover exists. Ids in the device's dislike set are removed, served ids
    are recorded in the read-history sets, and a "tail" marker is set once
    the queue is used up. Any failure is logged and answered with ``[]``.
    """
    try:
        dislike_key = str(device_id) + "_dislike_tractate"
        search_topic_recommend_key = "TS:search_recommend_tractate_queue:device_id:" + str(device_id)
        after_filter_key = "device_tractate_after_filter:device_id:" + str(device_id)
        tractate_key = "tractate_is_tail" + str(device_id)
        read_key = "TS:recommend_tractate_set:device_id:" + str(device_id)
        old_key = "TS:recommend_tractate_set:device_id:{}:{}" \
            .format(device_id, (datetime.date.today() - datetime.timedelta(days=14)).strftime("%Y-%m-%d"))
        today_key = "TS:recommend_tractate_set:device_id:{}:{}" \
            .format(device_id, datetime.date.today().strftime("%Y-%m-%d"))
        # NOTE(review): hard-coded endpoint; other methods of this class take
        # their kv hosts from settings.GM_KV_HOSTS -- confirm this is intended.
        gmkv = redis.Redis(host="172.16.40.135", port=5379, db=2)
        anonymous = device_id == '0'
        tail_ttl = 2 * 60 * 60

        def filter_topic(cid_list):
            """Drop ids found in the device's dislike set (best effort)."""
            try:
                if not gmkv.exists(dislike_key):
                    return cid_list
                # smembers returns bytes on a default client; compare as text,
                # otherwise no id ever matches.
                disliked = {m.decode("utf-8") if isinstance(m, bytes) else str(m)
                            for m in gmkv.smembers(dislike_key)}
                return [i for i in cid_list if str(i) not in disliked]
            except Exception:
                return cid_list

        def write_after_filter_tractate(cid_list):
            try:
                if gmkv.exists(after_filter_key):
                    # NOTE(review): on a standard redis server a SET without
                    # `ex` clears the existing TTL -- confirm this is wanted.
                    gmkv.set(after_filter_key, json.dumps(cid_list))
                else:
                    gmkv.set(after_filter_key, json.dumps(cid_list), ex=6 * 60 * 60)
            except Exception:
                logging_exception()
                logger.error("catch exception,err_log:%s" % traceback.format_exc())

        def get_filter_tractate():
            try:
                return json.loads(gmkv.get(after_filter_key))
            except Exception:
                return []

        def read_history(cid_list):
            if redis_client.exists(today_key):
                redis_client.sadd(today_key, *cid_list)
            else:
                redis_client.sadd(today_key, *cid_list)
                redis_client.expire(today_key, 15 * 24 * 60 * 60)
            if redis_client.exists(read_key) and redis_client.exists(old_key):
                # Forget what was read 14 days ago.
                redis_client.sdiffstore(read_key, read_key, old_key)
                redis_client.delete(old_key)
            redis_client.expire(read_key, time=13 * 24 * 60 * 60)
            redis_client.sadd(read_key, *cid_list)

        def finish(items, record=True):
            """Convert to ints, record the history when non-empty, return."""
            items = list(map(int, items))
            # SADD with no members is an error, so skip empty results.
            if record and items:
                read_history(items)
            return items

        search_list = []
        if (not anonymous) and size >= 2:
            if redis_client.exists(search_topic_recommend_key):
                search_topic_recommend_dict = redis_client.hgetall(search_topic_recommend_key)
                candidates = filter_topic(json.loads(search_topic_recommend_dict[b'tractate_queue']))
                if len(candidates) <= 2:
                    redis_client.delete(search_topic_recommend_key)
                else:
                    redis_client.hset(search_topic_recommend_key, 'tractate_queue',
                                      json.dumps(candidates[2:]))
                # At most two search-driven ids per page.
                search_list = candidates[:2]
                size = size - len(search_list)
        # A negative size would turn the slices below into "all but the last
        # -size items".
        size = max(size, 0)

        if gmkv.exists(tractate_key):
            # Queue already used up: only the search-driven ids are served.
            return finish(search_list)
        if gmkv.exists(after_filter_key):
            que = filter_topic(get_filter_tractate())
            if len(que) <= size:
                search_list.extend(que)
                gmkv.set(tractate_key, "tail", ex=tail_ttl)
            else:
                search_list.extend(que[:size])
                write_after_filter_tractate(que[size:])
            return finish(search_list)

        try:
            que = DeviceUserTopicQueue.objects.get(device_id=device_id)
        except DeviceUserTopicQueue.DoesNotExist:
            que = UserTopicQueue.objects.last()
        if not que:
            return finish(search_list)

        qa = [item for item in que.queue.split(',') if item]
        if not anonymous:
            qa = filter_topic(qa)
        if len(qa) <= size:
            search_list.extend(qa)
            if not anonymous:
                gmkv.set(tractate_key, "tail", ex=tail_ttl)
        else:
            search_list.extend(qa[:size])
            if not anonymous:
                write_after_filter_tractate(qa[size:])
        # The anonymous device '0' keeps no read history.
        return finish(search_list, record=not anonymous)
    except Exception:
        logging_exception()
        return []
@staticmethod
def fetch_doctor_topic(device_id, card_type, size):
    """Return the next ``size`` doctor-topic ids (as strings) for a device.

    Uses the device queue when present, else the latest ``DoctorTopicQueue``
    row, read cyclically from a per-device, per-day redis cursor. Any failure
    is logged and answered with ``[]``.
    """
    try:
        key = '{device_id}-{card_type}-{date}'.format(
            device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
        try:
            que = DeviceDoctorTopicQueue.objects.get(device_id=device_id)
        except DeviceDoctorTopicQueue.DoesNotExist:
            que = DoctorTopicQueue.objects.last()
        if not que:
            return []
        ids = [item for item in que.queue.split(',') if item]
        if not ids:
            # An empty queue is not an error; skip the modulo-by-zero below.
            return []
        # adjust args.
        cursor = int(redis_client.get(key) or 0) % len(ids)
        size = min(size, len(ids))
        redis_client.set(key, cursor + size, ex=24 * 60 * 60)
        return list(islice(cycle(ids), cursor, cursor + size))
    except Exception:
        logging_exception()
        return []
@classmethod
def get_gm_kv_ins(cls, redis_ip, redis_port, redis_db, redis_password=""):
    """Open a redis client (2 s socket timeout) and verify it with PING.

    Returns:
        The connected client, or ``None`` when the server cannot be reached.
    """
    try:
        options = dict(host=redis_ip, port=redis_port, db=redis_db, socket_timeout=2)
        # A missing password may arrive as None as well as "". The old
        # len() test raised on None and reported the host as unreachable.
        if redis_password:
            options["password"] = redis_password
        cli_ins = redis.Redis(**options)
        cli_ins.ping()
        return cli_ins
    except Exception:
        return None
@classmethod
def fetch_diary_queue_data(cls, city_id, device_id=None):
    """Load the four diary queues for a city, falling back to "world".

    The queues are read from the first reachable host in
    ``settings.GM_KV_HOSTS`` (items are bytes); if that fails for any reason
    the ``DiaryQueue`` table is used instead (items are str).

    Args:
        city_id: city identifier; concatenated into the redis key.
        device_id: when given, the device-specific key is tried first.

    Returns:
        ``(local, nearby, nation, megacity, use_city_id)`` where
        ``use_city_id`` is the city whose queues were actually used.
    """
    def split_queue(raw, sep):
        # Empty or missing value -> no items; blank fragments are dropped.
        return [item for item in raw.split(sep) if item] if raw else []

    use_city_id = city_id
    try:
        gm_kv_ins = None
        for gm_kv_host_item in settings.GM_KV_HOSTS:
            gm_kv_ins = cls.get_gm_kv_ins(redis_ip=gm_kv_host_item["host"],
                                          redis_port=gm_kv_host_item["port"],
                                          redis_db=gm_kv_host_item["db"],
                                          redis_password=gm_kv_host_item["password"])
            if gm_kv_ins:
                break
        if gm_kv_ins is None:
            # Say so explicitly instead of failing with an AttributeError on
            # None; the handler below falls back to the database.
            raise ConnectionError("no reachable gm_kv host")

        specify_city_id_key = "diary_queue:city_id:" + use_city_id
        world_city_id_key = "diary_queue:city_id:world"
        if device_id is not None:
            specify_city_id_key = "device_diary_queue:device_id:" + device_id + ":city_id:" + use_city_id
        city_val_dict = gm_kv_ins.hgetall(specify_city_id_key)
        if len(city_val_dict) == 0:
            city_val_dict = gm_kv_ins.hgetall(world_city_id_key)
            use_city_id = "world"
        return (split_queue(city_val_dict.get(b"native_queue"), b","),
                split_queue(city_val_dict.get(b"nearby_queue"), b","),
                split_queue(city_val_dict.get(b"nation_queue"), b","),
                split_queue(city_val_dict.get(b"megacity_queue"), b","),
                use_city_id)
    except Exception:
        logging_exception()
        logger.error("catch exception,err_log:%s" % traceback.format_exc())
        rows = list(DiaryQueue.objects.filter(city_id__in=[city_id, 'world']))
        if not rows:
            # The world queue is expected to exist; degrade to empty queues
            # rather than raising IndexError if it does not.
            return ([], [], [], [], city_id)
        if len(rows) == 1:
            obj = rows[0]
        else:
            # Prefer the city's own row over the world row.
            obj = rows[0] if rows[0].city_id == city_id else rows[1]
        return (split_queue(obj.native_queue, ','),
                split_queue(obj.nearby_queue, ','),
                split_queue(obj.nation_queue, ','),
                split_queue(obj.megacity_queue, ','),
                obj.city_id)
@classmethod
def fetch_device_diary_queue_data(cls, city_id, device_id):
    """Load the four device-specific diary queues for a city.

    The queues are read from the first reachable host in
    ``settings.GM_KV_HOSTS`` (items are bytes); if that fails for any reason
    the ``DeviceDiaryQueue`` table is used instead (items are str). Missing
    data yields empty lists.

    Returns:
        ``(local, nearby, nation, megacity, use_city_id)``.
    """
    def split_queue(raw, sep):
        # Empty or missing value -> no items; blank fragments are dropped.
        return [item for item in raw.split(sep) if item] if raw else []

    try:
        gm_kv_ins = None
        for gm_kv_host_item in settings.GM_KV_HOSTS:
            gm_kv_ins = cls.get_gm_kv_ins(redis_ip=gm_kv_host_item["host"],
                                          redis_port=gm_kv_host_item["port"],
                                          redis_db=gm_kv_host_item["db"],
                                          redis_password=gm_kv_host_item["password"])
            if gm_kv_ins:
                break
        if gm_kv_ins is None:
            # Say so explicitly instead of failing with an AttributeError on
            # None; the handler below falls back to the database.
            raise ConnectionError("no reachable gm_kv host")

        specify_city_id_key = "device_diary_queue:device_id:" + device_id + ":city_id:" + city_id
        city_val_dict = gm_kv_ins.hgetall(specify_city_id_key)
        return (split_queue(city_val_dict.get(b"native_queue"), b","),
                split_queue(city_val_dict.get(b"nearby_queue"), b","),
                split_queue(city_val_dict.get(b"nation_queue"), b","),
                split_queue(city_val_dict.get(b"megacity_queue"), b","),
                city_id)
    except Exception:
        logging_exception()
        logger.error("catch exception,err_log:%s" % traceback.format_exc())
        obj = DeviceDiaryQueue.objects.filter(device_id=device_id, city_id=city_id).first()
        if not obj:
            return ([], [], [], [], city_id)
        return (split_queue(obj.native_queue, ','),
                split_queue(obj.nearby_queue, ','),
                split_queue(obj.nation_queue, ','),
                split_queue(obj.megacity_queue, ','),
                obj.city_id)
@classmethod
def fetch_diary(cls, device_id, card_type, city_id, size):
    """Return up to ``size`` diary ids (ints) for the diary feed.

    For the anonymous device ``'0'`` the page comes straight from
    ``get_queue``. For a real device the page is assembled, in order, from:

    1. the user-portrait queue (may fill the whole page),
    2. up to 4 ids from the search-driven queue (only if more than 3 slots
       are left),
    3. up to 1 id from the click-driven queue,
    4. the local / nearby / megacity / nation queues, split by the city's
       scale weights, resuming from the per-device leftovers kept in gmkv.

    Ids in the device's dislike set are removed and served ids are recorded
    in the read-history sets.
    """
    def as_text(value):
        # Queue items are bytes (redis), str (DB) or int (JSON).
        return value.decode("utf-8") if isinstance(value, bytes) else str(value)

    def load_queues(for_city_id):
        """Device queues, else the city/world queues; 5-tuple with city id."""
        try:
            queues = cls.fetch_device_diary_queue_data(for_city_id, device_id)
            if not any(queues[:4]):
                queues = cls.fetch_diary_queue_data(for_city_id)
        except Exception:
            logging_exception()
            queues = cls.fetch_diary_queue_data(for_city_id)
        return queues

    if device_id == '0':
        local, nearby, nation, megacity, city_id = load_queues(city_id)
        x, y, m, z = cls.get_city_scale(city_id)
        return cls.get_queue(local, nearby, nation, megacity, device_id, city_id, size, x, y, z, m)

    click_diary_size = 1
    search_diary_size = 4
    read_key = "TS:recommend_diary_set:device_id:" + str(device_id)
    old_key = "TS:recommend_diary_set:device_id:{}:{}" \
        .format(device_id, (datetime.date.today() - datetime.timedelta(days=14)).strftime("%Y-%m-%d"))
    today_key = "TS:recommend_diary_set:device_id:{}:{}" \
        .format(device_id, datetime.date.today().strftime("%Y-%m-%d"))
    user_portrait_diary_key = 'user_portrait_recommend_diary_queue:device_id:%s:%s' % \
                              (device_id, datetime.datetime.now().strftime('%Y-%m-%d'))
    # NOTE(review): hard-coded endpoint; the queue loaders take their kv hosts
    # from settings.GM_KV_HOSTS -- confirm this is intended.
    gmkv = redis.Redis(host="172.16.40.135", port=5379, db=2)

    def read_history(cid_list):
        if redis_client.exists(today_key):
            redis_client.sadd(today_key, *cid_list)
        else:
            redis_client.sadd(today_key, *cid_list)
            redis_client.expire(today_key, 15 * 24 * 60 * 60)
        if redis_client.exists(read_key) and redis_client.exists(old_key):
            # Forget what was read 14 days ago.
            redis_client.sdiffstore(read_key, read_key, old_key)
            redis_client.delete(old_key)
        redis_client.expire(read_key, time=13 * 24 * 60 * 60)
        redis_client.sadd(read_key, *cid_list)

    def finish(items):
        """Convert to ints, record the history when non-empty, return."""
        items = list(map(int, items))
        # SADD with no members is an error, so skip empty results.
        if items:
            read_history(items)
        return items

    def dislike_cid_filter(device_id, cid_list):
        """Drop ids found in the device's dislike set (best effort)."""
        try:
            key = str(device_id) + "_dislike_diary"
            if gmkv.exists(key):
                # Compare as text: smembers returns bytes and the queue items
                # may be bytes too, so str()-based matching never hit.
                disliked = {as_text(member) for member in gmkv.smembers(key)}
                cid_list = [i for i in cid_list if as_text(i) not in disliked]
            return cid_list
        except Exception:
            return cid_list

    def fetch_after_filter_queue(device_id, city_id):
        """Read the leftovers as ``(local, nearby, nation, megacity)``."""
        local = list()
        nearby = list()
        nation = list()
        megacity = list()
        try:
            specify_city_id_key = "device_diary_queue_after_filter:device_id:" + device_id + ":city_id:" + city_id
            if gmkv.exists(specify_city_id_key):
                queue = gmkv.get(specify_city_id_key).split(b";")
                # Keep numeric ids only: values written before the bytes fix
                # look like b"b'123'" and would break the final int().
                local = [i for i in queue[0].split(b",") if i.isdigit()]
                nearby = [i for i in queue[1].split(b",") if i.isdigit()]
                nation = [i for i in queue[2].split(b",") if i.isdigit()]
                megacity = [i for i in queue[3].split(b",") if i.isdigit()]
            return local, nearby, nation, megacity
        except Exception:
            return local, nearby, nation, megacity

    def write_after_filter_queue(device_id, city_id, local, nearby, megacity, nation):
        """Persist the leftovers; stored order is local;nearby;nation;megacity."""
        try:
            specify_city_id_key = "device_diary_queue_after_filter:device_id:" + device_id + ":city_id:" + city_id
            queue = local + ";" + nearby + ";" + nation + ";" + megacity
            if gmkv.exists(specify_city_id_key):
                # NOTE(review): on a standard redis server a SET without `ex`
                # clears the existing TTL -- confirm this is wanted.
                gmkv.set(specify_city_id_key, queue)
            else:
                gmkv.set(specify_city_id_key, queue, ex=6 * 60 * 60)
        except Exception:
            logging_exception()
            logger.error("catch exception,err_log:%s" % traceback.format_exc())

    def get_data(local, nearby, nation, megacity, cx, cy, cm, cz, x, y, z, m, size):
        """Split ``size`` over the four queues by weight and take the items.

        ``cx/cy/cm/cz`` are the leftovers for local/nearby/megacity/nation;
        an empty leftover restarts from the full queue. A queue that cannot
        fill its share hands the shortfall to the next one (local -> nearby
        -> megacity -> nation). Returns the chained items and the four new
        leftovers as comma-joined strings (x, y, m, z order).
        """
        total = x + y + z + m
        nxyz = [int(round(weight * 1.0 / total * size)) for weight in (x, y, m, z)]
        xyz = [x, y, m, z]
        # Put the rounding remainder on the first empty share when exactly
        # two shares are empty, otherwise on the heaviest queue.
        if Counter(nxyz)[0] == 2:
            nxyz[nxyz.index(0)] += size - sum(nxyz)
        else:
            nxyz[xyz.index(max(xyz))] += size - sum(nxyz)
        nx, ny, nm, nz = nxyz

        def take(leftover, full_queue, count):
            pool = dislike_cid_filter(device_id, leftover)
            if len(pool) == 0:
                pool = dislike_cid_filter(device_id, full_queue)
            return pool[:count], pool[count:]

        slocal, have_x = take(cx, local, nx)
        ny += (nx - len(slocal))
        snearby, have_y = take(cy, nearby, ny)
        nm += (ny - len(snearby))
        smegacity, have_m = take(cm, megacity, nm)
        nz += (nm - len(smegacity))
        # The nation leftover used to be sliced from the already-truncated
        # page and was therefore always empty.
        snation, have_z = take(cz, nation, nz)

        x_str = ",".join(as_text(i) for i in have_x)
        y_str = ",".join(as_text(i) for i in have_y)
        m_str = ",".join(as_text(i) for i in have_m)
        z_str = ",".join(as_text(i) for i in have_z)
        return chain(slocal, snearby, smegacity, snation), x_str, y_str, m_str, z_str

    # 1. user-portrait queue
    portrait_list = list()
    if redis_client.exists(user_portrait_diary_key):
        user_portrait_diary_dict = redis_client.hgetall(user_portrait_diary_key)
        user_portrait_cursor = str(user_portrait_diary_dict[b'cursor'], encoding='utf-8')
        if user_portrait_cursor == '0' and b'len_cursor' in user_portrait_diary_dict:
            candidates = dislike_cid_filter(
                device_id, json.loads(user_portrait_diary_dict[b'diary_queue']))
            if len(candidates) > size:
                redis_client.hset(user_portrait_diary_key, 'diary_queue',
                                  json.dumps(candidates[size:]))
                return finish(candidates[:size])
            size = size - len(candidates)
            portrait_list = candidates
            redis_client.delete(user_portrait_diary_key)

    # 2. search-driven queue
    search_diary_recommend_key = "TS:search_recommend_diary_queue:device_id:" + str(device_id)
    search_list = list()
    if redis_client.exists(search_diary_recommend_key) and size > 3:
        search_diary_recommend_dict = redis_client.hgetall(search_diary_recommend_key)
        search_diary_recommend_list = dislike_cid_filter(
            device_id, json.loads(search_diary_recommend_dict[b'diary_queue']))
        if len(search_diary_recommend_list) <= search_diary_size:
            redis_client.delete(search_diary_recommend_key)
        else:
            redis_client.hset(search_diary_recommend_key, 'diary_queue',
                              json.dumps(search_diary_recommend_list[search_diary_size:]))
        search_list = search_diary_recommend_list[:search_diary_size]
        size = size - len(search_list)
    if size <= 0:
        portrait_list.extend(search_list)
        return finish(portrait_list)

    # 3. click-driven queue
    diary_recommend_key = "TS:recommend_diary_queue:device_id:" + str(device_id)
    ts_recommend_list = list()
    if redis_client.exists(diary_recommend_key):
        diary_recommend_dict = redis_client.hgetall(diary_recommend_key)
        diary_recommend_list = dislike_cid_filter(
            device_id, json.loads(diary_recommend_dict[b'diary_queue']))
        if len(diary_recommend_list) <= click_diary_size:
            redis_client.delete(diary_recommend_key)
        else:
            redis_client.hset(diary_recommend_key, 'diary_queue',
                              json.dumps(diary_recommend_list[click_diary_size:]))
        ts_recommend_list = diary_recommend_list[:click_diary_size]
        size = size - len(ts_recommend_list)
    if size <= 0:
        portrait_list.extend(search_list)
        portrait_list.extend(ts_recommend_list)
        return finish(portrait_list)

    # 4. city queues (size > 0 from here on)
    local, nearby, nation, megacity, city_id = load_queues(city_id)
    x, y, m, z = cls.get_city_scale(city_id)
    # The stored order is local, nearby, nation, megacity: unpack nation
    # into cz and megacity into cm (they used to be swapped here).
    cx, cy, cz, cm = fetch_after_filter_queue(device_id, city_id)
    data, x_str, y_str, m_str, z_str = get_data(
        local, nearby, nation, megacity,
        cx, cy, cm, cz,
        x, y, z, m, size)
    write_after_filter_queue(device_id, city_id, x_str, y_str, m_str, z_str)
    portrait_list.extend(search_list)
    portrait_list.extend(ts_recommend_list)
    portrait_list.extend(data)

    if len(portrait_list) == 0:
        # Everything was filtered out: fall back to the cursor-based queue.
        local, nearby, nation, megacity, city_id = cls.fetch_diary_queue_data(city_id)
        portrait_list = cls.get_queue(local, nearby, nation, megacity,
                                      device_id, city_id, size, x, y, z, m)
    return finish(portrait_list)
@classmethod
def get_queue(cls, local, nearby, nation, megacity, device_id, city_id, size, x, y, z, m):
    """Return the next ``size`` diary ids for a device, resuming from a Redis cursor.

    A per device/city/day cursor (``<key>-cursor_v1``, four dash-separated
    offsets) records how far each of the four queues has been consumed.
    The cursor is advanced after every call and reset to zero when neither
    the local nor the nearby offset moved.

    :param local: local diary queue
    :param nearby: nearby diary queue
    :param nation: nation diary queue
    :param megacity: megacity diary queue
    :param device_id: device the cursor belongs to
    :param city_id: city the cursor belongs to
    :param size: number of diaries requested
    :param x: local scale factor
    :param y: nearby scale factor
    :param z: nation scale factor
    :param m: megacity scale factor
    :return: list of diary ids converted to ``int``
    """
    one_day = 24 * 60 * 60
    base_key = '{device_id}-{city_id}-{date}'.format(
        device_id=device_id, city_id=city_id, date=RecommendFeed.current_date())

    # Request counter; the expiry is only armed by the first hit of the day.
    hits_key = base_key + '-counter_v1'
    if redis_client.incr(hits_key) == 1:
        redis_client.expire(hits_key, one_day)

    cursor_key = base_key + '-cursor_v1'
    stored = redis_client.get(cursor_key) or b'0-0-0-0'
    cx, cy, cm, cz = map(int, stored.split(b'-'))

    def get_scale(local, nearby, nation, megacity, cx, cy, cm, cz, x, y, z, m, size):
        # Tier order used below: local, nearby, megacity, nation.
        weights = [x, y, m, z]
        total_weight = x + y + z + m
        quotas = [int(round(w * 1.0 / total_weight * size)) for w in weights]
        if quotas.count(0) == 2:
            slot = quotas.index(0)
        else:
            slot = weights.index(max(weights))
        quotas[slot] += size - sum(quotas)

        pieces, offsets, carry = [], [], 0
        tiers = zip((local, nearby, megacity, nation), (cx, cy, cm, cz), quotas)
        for queue, start, quota in tiers:
            want = quota + carry
            piece = queue[start:start + want]
            pieces.append(piece)
            offsets.append(min(start + want, len(queue)))
            # Whatever this tier could not supply is requested from the next one.
            carry = want - len(piece)
        return chain(*pieces), offsets[0], offsets[1], offsets[2], offsets[3]

    data, ncx, ncy, ncm, ncz = get_scale(
        local, nearby, nation, megacity,
        cx, cy, cm, cz,
        x, y, z, m, size)

    if ncx == cx and ncy == cy:  # native queue and nearby queue
        logger.info("diary queue reach end,cx:%d,cy:%d,cm:%d,cz:%d", cx, cy, cm, cz)
        ncx = ncy = ncm = ncz = 0

    new_cursor = '-'.join(str(offset) for offset in (ncx, ncy, ncm, ncz))
    redis_client.set(cursor_key, new_cursor, ex=one_day)
    return [int(item) for item in data]
@staticmethod
def get_city_scale(city_id):
    """Return the queue scale factors for a city as ``(x, y, m, z)``.

    The tuple is (native, nearby, megacity, nation). A ``CityScale`` row
    wins when present; otherwise the factors are chosen from the city's
    level, and a missing city yields nation-only factors.

    :param city_id: id used to look up ``CityScale`` and then ``City``
    :return: 4-tuple ``(x, y, m, z)``
    """
    try:
        scale = CityScale.objects.get(city_id=city_id)
        native, nearby, nation, megacity = (
            scale.native, scale.nearby, scale.nation, scale.megacity)
        return native, nearby, megacity, nation
    except CityScale.DoesNotExist:
        pass

    try:
        level = City.objects.get(id=city_id).level
        if level in (CITY_LEVEL.SUPER, CITY_LEVEL.ONE):
            return 4, 3, 0, 3
        if level == CITY_LEVEL.TWO:
            return 3, 3, 0, 3
        if level == CITY_LEVEL.THREE:
            return 1, 4, 0, 5
        return 0, 0, 0, 10
    except City.DoesNotExist:
        return 0, 0, 0, 10
@staticmethod
def get_scale_data(local, nearby, nation, megacity, cx, cy, cm, cz, x, y, z, m, size):
    """
    Slice the four diary queues in proportion to their scale factors.

    :param local: local diary queue
    :param nearby: nearby diary queue
    :param nation: nation diary queue
    :param megacity: megacity diary queue
    :param cx: seen local diary offset
    :param cy: seen nearby diary offset
    :param cz: seen nation diary offset
    :param cm: seen megacity diary offset
    :param x: local diary scale factor
    :param y: nearby diary scale factor
    :param z: nation diary scale factor
    :param m: megacity diary scale factor
    :param size: number of diaries
    :return: (iterator over the selected diaries, new cx, new cy, new cm, new cz)
    """
    # Each tier (local, nearby, megacity, nation) gets a rounded share of
    # ``size``. The difference between ``size`` and the rounded total is
    # then assigned to a single tier:
    # 1. exactly two tiers rounded to zero -> the first zero tier in
    #    local/nearby/megacity/nation order receives it;
    # 2. otherwise (no zero tier, or any other number of zero tiers) -> the
    #    tier with the largest scale factor receives it.
    weights = [x, y, m, z]
    total_weight = x + y + z + m
    quotas = [int(round(w * 1.0 / total_weight * size)) for w in weights]

    if quotas.count(0) == 2:
        slot = quotas.index(0)
    else:
        slot = weights.index(max(weights))
    quotas[slot] += size - sum(quotas)

    pieces = []
    offsets = []
    carry = 0
    tiers = zip((local, nearby, megacity, nation), (cx, cy, cm, cz), quotas)
    for queue, start, quota in tiers:
        want = quota + carry
        piece = queue[start:start + want]
        pieces.append(piece)
        offsets.append(min(start + want, len(queue)))
        # A tier that runs short hands its shortfall to the next tier.
        carry = want - len(piece)

    cx, cy, cm, cz = offsets
    return chain(*pieces), cx, cy, cm, cz
# -*- coding: UTF-8 -*-
import pymysql
import redis
import datetime
import pandas as pd
import json
def get_yesterday_date():
    """Return yesterday's date as a ``YYYY-MM-DD`` string and echo it to stdout."""
    one_day = datetime.timedelta(days=1)
    stamp = (datetime.date.today() - one_day).strftime("%Y-%m-%d")
    print(stamp)
    return stamp
def get_black_user():
    """Return the distinct ``device_id`` values stored in the ``blacklist`` table.

    The connection and cursor are released even when the query fails, and
    an empty table yields ``[]`` instead of raising ``KeyError``.

    :return: list of blacklisted device ids
    """
    # NOTE(review): database credentials are hard-coded here; they should be
    # moved to configuration / a secret store.
    conn2db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    try:
        cursor = conn2db.cursor()
        try:
            cursor.execute("select distinct device_id from blacklist")
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn2db.close()
    # Each row is a 1-tuple; keep only the device id.
    return [row[0] for row in rows]
def get_data():
    """Return experiment devices (ids ending in 5 or 6) that are not blacklisted.

    Reads distinct ``device_id`` values from ``ffm_diary_queue_temp`` and
    removes every id found in the module-level ``black`` collection. The
    connection is released even when the query fails, and an empty result
    yields ``()`` instead of raising ``KeyError``.

    :return: tuple of device ids
    """
    # NOTE(review): database credentials are hard-coded here; they should be
    # moved to configuration / a secret store.
    conn2db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
    try:
        cursor = conn2db.cursor()
        try:
            # NOTE(review): inside a character class '|' is a literal, so
            # this pattern also matches ids ending in '|'.
            cursor.execute("select distinct device_id from ffm_diary_queue_temp where device_id regexp '[5|6]$'")
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn2db.close()
    return tuple({row[0] for row in rows} - set(black))
def ctr_all():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
cursor = db.cursor()
sql_active = "select distinct device_id from data_feed_exposure " \
"where cid_type = 'diary'" \
"and device_id regexp'[5|6]$' and stat_date = '{}';".format(date)
cursor.execute(sql_active)
result = cursor.fetchall()
tail56 = pd.DataFrame(list(result))[0].values.tolist()
tail56 = set(tail56)-set(black)
print("当天尾号5或6活跃用户总数:")
print(len(tail56))
cover = len(tail56&set(device_id))
print("当天尾号5或6活跃用户覆盖数:")
print(cover)
cover_percent = format(cover / len(tail56), ".6f")
print("当天尾号5或6活跃用户覆盖率:")
print(cover_percent)
if __name__ == "__main__":
device_id = "D17A3770-1CC7-4AFB-A9EA-6E667EE051FF"
search_qa_recommend_key = "TS:search_recommend_answer_queue:device_id:" + str(device_id)
r = redis.StrictRedis.from_url("redis://redis.paas-test.env:6379/1")
cids = list(range(529405,529408))
cids = [str(i) for i in cids]
return len(tail56),cover,cover_percent
value = json.dumps(cids)
r.hset(search_qa_recommend_key,'answer_queue',value)
def ctr():
    """Compute click, exposure and click-through rate for the experiment devices.

    Reads the module-level ``date`` (stat date string) and ``device_id``
    (collection of device ids). The device list is passed as bound
    parameters, so one-element and empty collections no longer produce
    invalid SQL, and a zero exposure count yields a rate of 0 instead of
    ``ZeroDivisionError``.

    :return: ``(click, exp, rate)`` where ``rate`` is formatted with six decimals
    """
    devices = tuple(device_id)
    click = exp = 0
    if devices:
        placeholders = ",".join(["%s"] * len(devices))
        sql_click = ("select count(cid) from data_feed_click "
                     "where (cid_type = 'diary' or cid_type = 'diary_video') "
                     "and stat_date = %s and device_id in ({});").format(placeholders)
        sql_exp = ("select count(cid) from data_feed_exposure "
                   "where cid_type = 'diary' "
                   "and stat_date = %s and device_id in ({});").format(placeholders)
        params = (date,) + devices
        # NOTE(review): database credentials are hard-coded here.
        db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
        try:
            cursor = db.cursor()
            cursor.execute(sql_click, params)
            click = cursor.fetchone()[0]
            cursor.execute(sql_exp, params)
            exp = cursor.fetchone()[0]
        finally:
            db.close()
    rate = click / exp if exp else 0.0
    print("实验用户点击数:"+str(click))
    print("实验用户曝光数:"+str(exp))
    print("实验用户点击率:"+str(rate))
    return click, exp, format(rate, ".6f")
print(1)
def rate2file():
    """Append one CSV row with the day's experiment and overall metrics.

    Uses the module-level ``DIRECTORY_PATH``, ``date``, ``temp_data`` and
    ``data``; the row is the date without dashes followed by the first
    three items of ``temp_data`` and of ``data``.
    """
    output_path = DIRECTORY_PATH + "56ctr.csv"
    with open(output_path, 'a+') as f:
        fields = [date.replace('-', '')]
        fields.extend(str(temp_data[i]) for i in range(3))
        fields.extend(str(data[i]) for i in range(3))
        f.write(','.join(fields) + '\n')
if __name__ == "__main__":
    # The functions above read these names as module-level globals, so the
    # assignment order matters: `black` must exist before get_data(), and
    # `date`/`device_id` before ctr() and ctr_all().
    DIRECTORY_PATH = "/data/ffm/"
    date = get_yesterday_date()
    black = get_black_user()
    device_id = get_data()
    # Metrics for the experiment devices.
    temp_data = ctr()
    # Coverage figures from ctr_all().
    data = ctr_all()
    # Append both result tuples as one CSV row.
    rate2file()
......@@ -2,114 +2,220 @@ import pandas as pd
import pymysql
from datetime import datetime
from datetime import timedelta
import pickle
import time
from kafka import KafkaProducer
import json
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark import SparkConf
import redis
import sys
import os
import json
import pymysql
import numpy as np
import time
import datetime
import tensorflow as tf
import msgpack
import smtplib
import requests
def get_city():
sql = "select distinct city_id from data_feed_exposure where stat_date >= '2018-10-01' order by city_id"
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
cursor = db.cursor()
print("开始获取")
cursor.execute(sql)
print("成功获取")
result = cursor.fetchall()
db.close()
user = pd.DataFrame(list(result))[0].values.tolist()
print(user)
sql = "select distinct name from api_tag where tag_type = 4"
db = pymysql.connect(host='rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com', port=3306,
user='work', passwd='BJQaT9VzDcuPBqkd', db='zhengxing')
cursor = db.cursor()
print("开始获取")
from email.mime.text import MIMEText
from email.utils import formataddr
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
# sys.path.append('/srv/apps/ftrl/Bandist_Streaming')
def send_email(app, id, e, extra_information=''):
    """Mail a failure report for a Spark streaming application.

    The text of ``e`` followed by ``extra_information`` is written to
    ``error.txt`` in the working directory and attached to the message.

    :param app: application name, used in the subject
    :param id: application id, used in the message body
    :param e: error (any object; converted with ``str``)
    :param extra_information: additional text appended to the attachment
    :return: None; SMTP failures are reported on stdout, not raised
    """
    # Third-party SMTP service.
    mail_host = 'smtp.exmail.qq.com'
    mail_user = "huangkai@igengmei.com"
    # NOTE(review): the mailbox password is hard-coded; move it to configuration.
    mail_pass = "UyhVobmDHa4r4ecV"
    sender = 'huangkai@igengmei.com'
    receivers = ['huangkai@igengmei.com']

    report = str(e) + str(extra_information)

    msg = MIMEMultipart()
    # str() keeps non-string ids/app names from breaking the concatenation.
    msg.attach(MIMEText('app_id:' + str(id) + ':fail', 'plain', 'utf-8'))
    msg['From'] = formataddr(["huangkai", sender])
    msg['To'] = ";".join(receivers)
    msg['Subject'] = 'spark streaming:app_name:' + str(app)

    with open('error.txt', 'w', encoding='utf-8') as f:
        f.write(report)
    # Attach from memory rather than re-opening the file through a handle
    # that was never closed.
    attachment = MIMEApplication(report.encode('utf-8'))
    attachment.add_header('Content-Disposition', 'attachment', filename="error.txt")
    msg.attach(attachment)

    try:
        # The context manager sends QUIT and closes the socket on exit.
        with smtplib.SMTP_SSL(mail_host, 465) as smtp:
            smtp.login(mail_user, mail_pass)
            smtp.sendmail(sender, receivers, msg.as_string())
    except smtplib.SMTPException:
        print('error')
def ts_cal():
    """Placeholder that always returns 0."""
    return 0
def cal_ctr(data):
a1 = datetime.datetime.now()
device_data = data[1]
device_id = device_data['device']['device_id']
db_eagle = pymysql.connect(host="172.16.40.158", port=4000, user="root", password="3SYz54LS9#^9sBvC",
db="eagle",
cursorclass=pymysql.cursors.DictCursor)
cursor = db_eagle.cursor()
sql = 'select id from online_api_service'
cursor.execute(sql)
print("成功获取")
result = cursor.fetchall()
db.close()
user = pd.DataFrame(list(result))[0].values.tolist()
print(user)
# def get_tail8():
# sql = "select distinct device_id from data_feed_click \
# where stat_date='{}' \
# and cid_type='{}' \
# and device_id regexp '8$';".format(stat_date,cid_type)
# db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# cursor = db.cursor()
# print("开始获取")
# cursor.execute(sql)
# print("成功获取")
# result = cursor.fetchall()
# db.close()
# user = pd.DataFrame(list(result))[0].values.tolist()
# user = tuple(user)
# print("尾号是8的用户个数")
# print(len(user))
# return user
# def get_ctr(user_tuple):
# sql = "select count(device_id) from data_feed_click \
# where stat_date='{}' \
# and cid_type='{}' \
# and device_id in {}".format(stat_date, cid_type, user_tuple)
# db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# cursor = db.cursor()
# print("开始获取")
# cursor.execute(sql)
# click = cursor.fetchall()[0][0]
# print(click)
#
# sql = "select count(device_id) from data_feed_exposure \
# where stat_date='{}' \
# and cid_type='{}' \
# and device_id in {}".format(stat_date, cid_type, user_tuple)
# cursor = db.cursor()
# print("开始获取")
# cursor.execute(sql)
# exp = cursor.fetchall()[0][0]
# db.close()
# print(exp)
# print(click / exp)
# def get_tail6():
# df = pd.read_csv(path+"{}predictTail6Unique.csv".format(stat_date))
# pre_list = tuple(eval(df.loc[0,"list"]))
# print(len(pre_list))
# print(pre_list[:2])
# db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# sql = "select distinct device_id from data_feed_click \
# where stat_date='{}' \
# and cid_type='{}' \
# and device_id in {}".format(stat_date,cid_type,pre_list)
# cursor = db.cursor()
# print("开始获取")
# cursor.execute(sql)
# print("成功获取")
# result = cursor.fetchall()
# db.close()
# print(pd.DataFrame(list(result)).empty)
# user = pd.DataFrame(list(result))[0].values.tolist()
# user = tuple(user)
# print("用户个数")
# print(len(user))
# return user
if __name__ == "__main__":
get_city()
# path = "/data/models/"
# cid_type = "diary"
# now = datetime.now()
# year = now.year
# month = now.month
# day = now.day
# stat_date = datetime(year, month, day)
# stat_date = (stat_date - timedelta(days=1)).strftime("%Y-%m-%d")
# print(stat_date)
# tail6 = get_tail6()
# get_ctr(tail6)
# tail8 = get_tail8()
# get_ctr(tail8)
results = cursor.fetchall()
device_meigou_ctr_key = 'device_meigou_ctr:device_id:'+str(device_id)
device_meigou_params_key = 'device_meigou_params:device_id:'+str(device_id)
redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN6@172.16.40.133:6379')
meigou_index_dict = dict()
meigou_new_params_dict = dict()
index_value = 0
init_params_value = 1
model_param_a = list()
model_param_b = list()
if redis_client.exists(device_meigou_params_key):
meigou_params_dict = redis_client.hgetall(device_meigou_params_key)
for result in results:
if result['id'] in meigou_params_dict.keys():
meigou_index_dict.update({index_value:result['id']})
meigou_new_params_dict.update({result['id']:meigou_index_dict[result['id']]})
model_param_a.append(meigou_params_dict[result['id']]['a'])
model_param_b.append(meigou_params_dict[result['id']]['b'])
index_value += 1
else:
meigou_index_dict.update({index_value: result['id']})
meigou_new_params_dict.update({result['id']:{"a":init_params_value,"b":init_params_value}})
model_param_a.append(init_params_value)
model_param_b.append(init_params_value)
index_value +=1
else:
for result in results:
meigou_new_params_dict.update({result['id']:{"a":init_params_value,"b":init_params_value}})
meigou_index_dict.update({index_value: result['id']})
model_param_a.append(init_params_value)
model_param_b.append(init_params_value)
index_value += 1
a2 = datetime.datetime.now()
num_actions = len(results)
user_feature = np.array([1])
# hparams_nlinear = tf.contrib.training.HParams(num_actions=num_actions,
# context_dim=1,
# init_scale=0.3,
# activation=tf.nn.relu,
# layer_sizes=[1],
# batch_size=1,
# activate_decay=True,
# initial_lr=0.1,
# max_grad_norm=5.0,
# show_training=False,
# freq_summary=1000,
# buffer_s=-1,
# initial_pulls=0,
# reset_lr=True,
# lr_decay_rate=0.5,
# training_freq=1,
# training_freq_network=10000,
# training_epochs=100,
# a0=model_param_a,
# b0=model_param_b,
# lambda_prior=0.25)
# inital model
model = NeuralLinearPosteriorSampling('NeuralLinear',num_actions,model_param_a,model_param_b)
a2 =datetime.datetime.now()
vals = model.action(user_feature)
# model.update(user_feature,0,np.array(1))
max =vals.max()
min = vals.min()
ctr_0_1 = (vals-min)/(max-min)
meigou_ctr_dict = dict()
a3 =datetime.datetime.now()
for i in range(len(ctr_0_1)):
meigou_ctr_dict.update({meigou_index_dict[i]:ctr_0_1[i]})
redis_client.set(device_meigou_ctr_key,json.dumps(meigou_ctr_dict))
a4 = datetime.datetime.now()
send_email(str(a1),str(a2),str(a3),str(a4))
def choose_action():
    """Placeholder that always returns 0."""
    return 0
def Filter_Data(data):
    """Keep only ``device_opened`` events from one hard-coded device.

    :param data: pair whose second item is the decoded event mapping
    :return: ``False`` when the mapping has a ``b'content'`` key, ``True``
        for a ``device_opened`` event from the hard-coded device id, and
        ``None`` otherwise
    """
    event = data[1]
    if b'content' in event:
        return False
    if 'type' not in event:
        return None
    if event['type'] == 'device_opened' \
            and event['device']['device_id'] == '8E699605-DC2A-46B6-8B47-E9E809353055':
        return True
    return None
def write_to_kafka():
    """Send one test record (key "hello", value "world") to ``test_topic``.

    Prints "send ok" once the broker acknowledges the record, or the error
    text when sending fails. The producer is always closed afterwards.
    """
    # Imported here because the file only imports KafkaProducer; the original
    # handler referenced an undefined name and raised NameError on failure.
    from kafka.errors import KafkaError

    producer = KafkaProducer(bootstrap_servers=["172.16.44.25:9092","172.16.44.31:9092","172.16.44.45:9092"],
                             key_serializer=lambda k: json.dumps(k).encode('utf-8'),
                             value_serializer=lambda v: json.dumps(v).encode('utf-8'))
    try:
        future = producer.send(topic="test_topic", key="hello", value="world")
        # Block until the broker acknowledges, or the timeout expires.
        future.get(timeout=10)
        print("send ok")
    except KafkaError as e:
        print(str(e))
    finally:
        producer.close()
def Ctr(rdd):
    """Publish the Kafka test record, then hand ``rdd`` back unchanged.

    :param rdd: value returned untouched on success
    :return: ``rdd``, or ``None`` (after printing "fail") when publishing raises
    """
    try:
        write_to_kafka()
    except:
        # Deliberately broad, as before: any failure is reduced to a log line.
        print("fail")
        return None
    return rdd
def m_decoder(s):
    """Decode a Kafka key/value that is either JSON or MessagePack.

    :param s: raw payload, or ``None``
    :return: the decoded object, or ``None`` when ``s`` is ``None``
    """
    if s is None:
        return None
    try:
        return json.loads(s)
    except (ValueError, TypeError):
        # Not JSON (JSONDecodeError and UnicodeDecodeError are ValueErrors):
        # fall back to MessagePack. ``raw=False`` decodes strings as UTF-8
        # and replaces the ``encoding`` keyword removed in msgpack 1.0.
        return msgpack.loads(s, raw=False)
if __name__ == '__main__':
    # Spark-Streaming-Kafka
    sc = SparkContext(conf=SparkConf().setMaster("spark://nvwa01:7077").setAppName("kafka_test")
                      .set("spark.io.compression.codec", "lzf"))
    # NOTE(review): this SQLContext is immediately replaced by the
    # StreamingContext bound to the same name on the next line.
    ssc = SQLContext(sc)
    # Second argument is the batch duration passed to StreamingContext.
    ssc = StreamingContext(sc, 10)
    sc.setLogLevel("WARN")
    kafkaParams = {"metadata.broker.list": "172.16.44.25:9092,172.16.44.31:9092,172.16.44.45:9092",
                   "group.id": "kafka_test",
                   "socket.timeout.ms": "600000",
                   "auto.offset.reset": "largest"}
    # Keys and values are decoded by m_decoder.
    stream = KafkaUtils.createDirectStream(ssc, ["test_topic"], kafkaParams,
                                           keyDecoder=m_decoder, valueDecoder=m_decoder)
    # Ctr() is applied to every batch RDD.
    transformstream = stream.transform(lambda x: Ctr(x))
    transformstream.pprint()
    ssc.start()
    ssc.awaitTermination()
......
import time
from prepareData import fetch_data
from read_filter import fetch_data
from utils import *
import pandas as pd
from config import *
......
import pymysql
import pandas as pd
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# 从数据库获取数据,并将数据转化成DataFrame
def get_data(sql):
    """Run ``sql`` on the module-level ``db`` connection and return a DataFrame.

    Rows containing missing values are dropped.

    :param sql: query text to execute
    :return: ``pandas.DataFrame`` with integer column labels
    """
    cursor = db.cursor()
    cursor.execute(sql)
    rows = list(cursor.fetchall())
    return pd.DataFrame(rows).dropna()
# Fetch the nationwide top-2000 diaries by click count.
# The original statement had no FROM clause; data_feed_click is assumed to
# be the intended table because the per-city query below reads the same
# columns from it.
sql = "select city_id,cid from data_feed_click " \
      "where cid_type = 'diary' order by click_count_choice desc limit 2000"
allCitiesTop2000 = get_data(sql)
allCitiesTop2000 = allCitiesTop2000.rename(columns={0:"city_id",1:"cid"})
# Forward slashes: in the backslash form "\a" was parsed as the BEL character.
allCitiesTop2000.to_csv("/home/zhangyanzhao/diaryTestSet/allCitiesTop2000.csv")
print("成功获取全国日记点击量TOP2000")

# Fetch the list of all cities.
sql = "select distinct city_id from data_feed_click"
cityList = get_data(sql)
cityList.to_csv("/home/zhangyanzhao/diaryTestSet/cityList.csv")
cityList = cityList[0].values.tolist()
print("成功获取城市列表")

# For every city take its top-2000 diaries by click count; when a city has
# fewer than 2000, pad with nationwide top-2000 diaries.
for i in cityList:
    sql = "select city_id,cid from data_feed_click " \
          "where cid_type = 'diary' and city_id = {0} " \
          "order by click_count_choice desc limit 2000".format(i)
    data = get_data(sql)
    data = data.rename(columns={0:"city_id",1:"cid"})
    if data.shape[0] < 2000:
        n = 2000 - data.shape[0]
        # Exclude this city's own diaries, then take the first n rows by
        # position. A label slice (.loc[:n-1]) on the filtered frame selects
        # by index label and so returns fewer than n rows.
        temp = allCitiesTop2000[allCitiesTop2000["city_id"] != i].head(n)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        data = pd.concat([data, temp])
    file_name = "/home/zhangyanzhao/diaryTestSet/{0}DiaryTop2000.csv".format(i)
    data.to_csv(file_name)
print("end")
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark import SparkConf
import redis
import sys
import os
import json
import pymysql
import numpy as np
import pandas as pd
import time
import datetime
def Json(x):
    """Tell whether a Kafka record is a home-feed feedback click to process.

    :param x: pair whose second item is the JSON text of the event
    :return: ``True`` for an ``on_click_button`` event on the home page,
        tab "精选", button ``user_feedback_type``, whose first extra
        parameter is a diary/qa/user_post card with "1" or "2" in its
        feedback type; ``False`` for other events; ``None`` (after
        printing the error) when parsing or field access fails
    """
    try:
        data = json.loads(x[1])
        if not ('type' in data and 'device' in data):
            return False
        if data['type'] != 'on_click_button':
            return False
        params = data['params']
        if params['page_name'] != 'home' or params['tab_name'] != '精选':
            return False
        if params['button_name'] != 'user_feedback_type':
            return False
        card = params['extra_param'][0]
        if card["card_content_type"] not in ("diary", "qa", "user_post"):
            return False
        feedback = card["feedback_type"]
        return "1" in feedback or "2" in feedback
    except Exception as e:
        print("filter fail")
        print(e)
def model(rdd):
    """Build the per-batch pipeline: filter feedback events, parse them, write to Redis.

    :param rdd: batch RDD of Kafka records
    :return: the transformed RDD, or ``None`` (after printing "fail") when
        building the pipeline raises
    """
    try:
        feedback = rdd.filter(lambda x: Json(x))
        parsed = feedback.repartition(10).map(lambda x: get_data(x))
        return parsed.map(lambda x: write_redis(x[0], x[1], x[2]))
    except:
        # Deliberately broad, as before: any failure is reduced to a log line.
        print("fail")
def get_data(x):
    """Extract ``(device_id, card_id, card_content_type)`` from a feedback event.

    :param x: pair whose second item is the JSON text of the event
    :return: the 3-tuple, or ``None`` (after printing "get_data fail") when
        parsing or field access fails
    """
    try:
        event = json.loads(x[1])
        card = event['params']['extra_param'][0]
        return event['device']['device_id'], card["card_id"], card["card_content_type"]
    except Exception as e:
        print("get_data fail")
        # send_email("get_data", "get_data", e)
def write_redis(device_id, cid, card):
    """Dispatch a disliked card to the writer for its content type.

    :param device_id: device that gave the feedback
    :param cid: id of the disliked card
    :param card: "diary", "qa" or "user_post"; other values are ignored
    :return: None
    """
    if card == "diary":
        diary_write(device_id, cid)
        return
    if card == "qa":
        question_write(device_id, cid)
        return
    if card == "user_post":
        tractate_write(device_id, cid)
def tractate_write(device_id, cid):
    """Add every online tractate sharing the disliked tractate's tag to the device's dislike list.

    The first type-'3' tag of tractate ``cid`` is looked up; all online
    tractates with that tag are merged into the JSON list stored under
    ``<device_id>_dislike_tractate`` in Redis.

    Compared with the string-built version: ``cid`` is bound as a query
    parameter (it comes from a Kafka event), the DB connection is closed on
    every path, and the 7-day TTL is applied on every write (a plain SET on
    an existing key discards its TTL).

    :param device_id: device that gave the feedback
    :param cid: id of the disliked tractate
    :return: None; errors are printed, not raised
    """
    ttl_seconds = 7 * 24 * 60 * 60
    try:
        related = ()
        # NOTE(review): database credentials are hard-coded here.
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
        try:
            cursor = db.cursor()
            cursor.execute(
                "select b.id from src_mimas_prod_api_tractate_tag a left join src_zhengxing_api_tag b "
                "on a.tag_id = b.id where b.tag_type = '3' and a.tractate_id = %s", (cid,))
            result = cursor.fetchall()
            if len(result) > 0 and result[0][0] is not None:
                cursor.execute(
                    "select a.id from src_mimas_prod_api_tractate a left join src_mimas_prod_api_tractate_tag b "
                    "on a.id=b.tractate_id left join src_zhengxing_api_tag c on b.tag_id=c.id "
                    "where a.is_online = 1 and c.id = %s and c.tag_type = '3'", (result[0][0],))
                related = cursor.fetchall()
        finally:
            db.close()
        if len(related) > 0:
            cids = [str(i[0]) for i in related]
            r = redis.Redis(host="172.16.40.135", port=5379, password="",db = 2)
            key = str(device_id) + "_dislike_tractate"
            existing = r.get(key)
            if existing is not None:
                value = json.loads(existing.decode('utf-8'))
                value.extend(cids)
                r.set(key, json.dumps(list(set(value))), ex=ttl_seconds)
                print("cunza")
            else:
                r.set(key, json.dumps(cids), ex=ttl_seconds)
    except Exception as e:
        print("tractate insert redis fail")
        print(e)
def question_write(device_id, cid):
    """Add every online question sharing the disliked question's tag to the device's dislike list.

    The first type-'3' tag of question ``cid`` is looked up; all online
    questions with that tag are merged into the JSON list stored under
    ``<device_id>_dislike_qa`` in Redis.

    Compared with the string-built version: ``cid`` is bound as a query
    parameter (it comes from a Kafka event), the DB connection is closed on
    every path, and the 7-day TTL is applied on every write (a plain SET on
    an existing key discards its TTL).

    :param device_id: device that gave the feedback
    :param cid: id of the disliked question
    :return: "question good" after a Redis write, otherwise None; errors
        are printed, not raised
    """
    ttl_seconds = 7 * 24 * 60 * 60
    try:
        related = ()
        # NOTE(review): database credentials are hard-coded here.
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
        try:
            cursor = db.cursor()
            cursor.execute(
                "select b.id from src_mimas_prod_api_questiontag a left join src_zhengxing_api_tag b "
                "on a.tag_id = b.id where b.tag_type = '3' and a.question_id = %s", (cid,))
            result = cursor.fetchall()
            if len(result) > 0 and result[0][0] is not None:
                cursor.execute(
                    "select a.id from src_mimas_prod_api_question a left join src_mimas_prod_api_questiontag b "
                    "on a.id=b.question_id left join src_zhengxing_api_tag c on b.tag_id=c.id "
                    "where a.is_online = 1 and c.tag_type = '3' and c.id = %s", (result[0][0],))
                related = cursor.fetchall()
        finally:
            db.close()
        if len(related) > 0:
            cids = [str(i[0]) for i in related]
            r = redis.Redis(host="172.16.40.135", port=5379, password="", db=2)
            key = str(device_id) + "_dislike_qa"
            existing = r.get(key)
            if existing is not None:
                value = json.loads(existing.decode('utf-8'))
                value.extend(cids)
                r.set(key, json.dumps(list(set(value))), ex=ttl_seconds)
                print("cunza")
            else:
                r.set(key, json.dumps(cids), ex=ttl_seconds)
                print("bucunza")
            # NOTE(review): the original indentation was lost, so the exact
            # scope of this return is assumed; the value is only displayed
            # by the stream's pprint().
            return "question good"
    except Exception as e:
        print("question insert redis fail")
        print(e)
def diary_write(device_id, cid):
    """Add every online diary sharing the disliked diary's tag to the device's dislike list.

    The first type-'3' tag of diary ``cid`` is looked up; all online
    diaries with that tag and ``content_level >= '3'`` are merged into the
    JSON list stored under ``<device_id>_dislike_diary`` in Redis.

    Compared with the string-built version: ``cid`` is bound as a query
    parameter (it comes from a Kafka event), the DB connection is closed on
    every path, and the 7-day TTL is applied on every write (a plain SET on
    an existing key discards its TTL).

    :param device_id: device that gave the feedback
    :param cid: id of the disliked diary
    :return: None; errors are printed, not raised
    """
    ttl_seconds = 7 * 24 * 60 * 60
    try:
        related = ()
        # NOTE(review): database credentials are hard-coded here.
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
        try:
            cursor = db.cursor()
            cursor.execute(
                "select b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b "
                "on a.tag_id = b.id where b.tag_type = '3' and a.diary_id = %s", (cid,))
            result = cursor.fetchall()
            if len(result) > 0 and result[0][0] is not None:
                cursor.execute(
                    "select a.id from src_mimas_prod_api_diary a left join src_mimas_prod_api_diary_tags b "
                    "on a.id=b.diary_id left join src_zhengxing_api_tag c on b.tag_id=c.id "
                    "where a.is_online = 1 and a.content_level >= '3' "
                    "and c.id = %s and c.tag_type = '3'", (result[0][0],))
                related = cursor.fetchall()
        finally:
            db.close()
        if len(related) > 0:
            cids = [str(i[0]) for i in related]
            r = redis.Redis(host="172.16.40.135", port=5379, password="", db=2)
            key = str(device_id) + "_dislike_diary"
            existing = r.get(key)
            if existing is not None:
                value = json.loads(existing.decode('utf-8'))
                value.extend(cids)
                r.set(key, json.dumps(list(set(value))), ex=ttl_seconds)
            else:
                r.set(key, json.dumps(cids), ex=ttl_seconds)
    except Exception as e:
        print("diary insert redis fail")
        print(e)
# Module-level entry point of the "dislike" streaming job: runs on import,
# there is no __main__ guard.
sc = SparkContext(conf=SparkConf().setMaster("spark://nvwa01:7077").setAppName("dislike").set("spark.io.compression.codec", "lzf"))
# Second argument is the batch duration passed to StreamingContext.
ssc = StreamingContext(sc,4)
sc.setLogLevel("WARN")
kafkaParams = {"metadata.broker.list": "172.16.44.25:9092,172.16.44.31:9092,172.16.44.45:9092",
"group.id": "dislike",
"socket.timeout.ms": "600000",
"auto.offset.reset": "largest"}
# No decoders are passed here; Json() and get_data() parse x[1] themselves.
stream = KafkaUtils.createDirectStream(ssc, ["gm-maidian-data"], kafkaParams)
# model() filters, parses and writes each batch.
transformstream = stream.transform(lambda x:model(x))
transformstream.pprint()
ssc.start()
ssc.awaitTermination()
# -*- coding: utf-8 -*-
import pymysql
from pyspark.conf import SparkConf
import pytispark.pytispark as pti
from pyspark.sql import SparkSession
import datetime
import pandas as pd
import time
from pyspark import StorageLevel
def app_list_func(x, l):
    """Map a comma-separated string to a list of ids.

    :param x: value converted with ``str`` and split on ","
    :param l: mapping from token to id
    :return: list with one id per token, 0 for tokens missing from ``l``
    """
    return [l[token] if token in l.keys() else 0 for token in str(x).split(",")]
def get_list(db, sql, n):
    """Build an id map for all distinct tokens in a comma-separated column.

    Runs ``sql``, splits the first column of every distinct row on ",",
    and assigns the distinct tokens consecutive ids starting at ``n``.
    The connection is closed before returning.

    :param db: open database connection
    :param sql: query whose first column holds comma-separated tokens
    :param n: first id to assign
    :return: ``(number_of_tokens, {token: id})``
    """
    cursor = db.cursor()
    cursor.execute(sql)
    rows = cursor.fetchall()
    distinct_values = list(set([row[0] for row in rows]))
    tokens = [token for value in distinct_values for token in str(value).split(",")]
    distinct_tokens = list(set(tokens))
    number = len(distinct_tokens)
    app_list_map = dict(zip(distinct_tokens, list(range(n, number + n))))
    db.close()
    return number, app_list_map
def get_map():
    """Build the id maps for app lists, level-2 tags and level-3 tags.

    Ids are assigned in consecutive ranges: apps start at 16, level-2 tags
    follow the apps, and level-3 tags follow the level-2 tags. Elapsed
    minutes for each step are printed.

    :return: ``(apps_number, app_list_map, leve2_number, leve2_map,
        leve3_number, leve3_map)``
    """
    def _connect():
        # get_list() closes the connection it is given, so each step opens its own.
        return pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')

    conn = _connect()
    started = time.time()
    apps_number, app_list_map = get_list(conn, "select app_list from device_app_list", 16)
    print("applist")
    print((time.time()-started)/60)

    conn = _connect()
    started = time.time()
    leve2_number, leve2_map = get_list(conn, "select level2_ids from diary_feat", 16+apps_number)
    print("leve2")
    print((time.time() - started) / 60)

    conn = _connect()
    started = time.time()
    leve3_number, leve3_map = get_list(conn, "select level3_ids from diary_feat", 16+leve2_number+apps_number)
    print((time.time() - started) / 60)

    return apps_number, app_list_map,leve2_number, leve2_map,leve3_number, leve3_map
def get_unique(db, sql):
    """Return the distinct first-column values of ``sql`` and close ``db``.

    The query text and the number of distinct values are printed.

    :param db: open database connection
    :param sql: query to run
    :return: list of distinct values
    """
    cursor = db.cursor()
    cursor.execute(sql)
    distinct = list(set([row[0] for row in cursor.fetchall()]))
    db.close()
    print(sql)
    print(len(distinct))
    return distinct
def con_sql(db, sql):
    """Run ``sql`` and return all rows as a DataFrame; ``db`` is closed afterwards.

    :param db: open database connection
    :param sql: query to run
    :return: ``pandas.DataFrame`` with integer column labels
    """
    cursor = db.cursor()
    cursor.execute(sql)
    frame = pd.DataFrame(list(cursor.fetchall()))
    db.close()
    return frame
def get_pre_number():
    """Print the row count of the ``esmm_pre_data`` table."""
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    cursor = db.cursor()
    cursor.execute("select count(*) from esmm_pre_data")
    total = cursor.fetchone()[0]
    print("预测集数量:")
    print(total)
    db.close()
def feature_engineer():
    """Build the id maps, encode the training rows and write train/validation TFRecords.

    Uses ``spark`` and ``path``, which are not defined in this function
    (presumably module-level globals — confirm).

    :return: ``(validate_date, value_map, app_list_map, leve2_map, leve3_map)``
    """
    apps_number, app_list_map, level2_number, leve2_map, level3_number, leve3_map = get_map()
    # Fixed ids for the column-name placeholders that stand in for null values
    # (see the na.fill call below).
    # NOTE(review): get_map() starts assigning app-token ids at 16, so these
    # fixed ids 16-27 fall inside ranges it already uses — confirm this
    # overlap is intended.
    app_list_map["app_list"] = 16
    leve3_map["level3_ids"] = 17
    leve3_map["search_tag3"] = 18
    leve2_map["level2_ids"] = 19
    leve2_map["tag1"] = 20
    leve2_map["tag2"] = 21
    leve2_map["tag3"] = 22
    leve2_map["tag4"] = 23
    leve2_map["tag5"] = 24
    leve2_map["tag6"] = 25
    leve2_map["tag7"] = 26
    leve2_map["search_tag2"] = 27
    # Collect the distinct values of every single-valued feature column.
    # get_unique() closes the connection it receives, hence one connect per query.
    unique_values = []
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct stat_date from esmm_train_bias"
    unique_values.extend(get_unique(db,sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct ucity_id from esmm_train_bias"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct ccity_name from esmm_train_bias"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct bias from esmm_train_bias"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct device_type from user_feature"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct manufacturer from user_feature"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct channel from user_feature"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct price_min from knowledge"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct treatment_method from knowledge"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct price_max from knowledge"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct treatment_time from knowledge"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct maintain_time from knowledge"
    unique_values.extend(get_unique(db, sql))
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct recover_time from knowledge"
    unique_values.extend(get_unique(db, sql))
    # unique_values.append("video")
    # The latest stat_date becomes the validation date; training data starts
    # 180 days before it.
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select max(stat_date) from esmm_train_bias"
    validate_date = con_sql(db, sql)[0].values.tolist()[0]
    print("validate_date:" + validate_date)
    temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
    start = (temp - datetime.timedelta(days=180)).strftime("%Y-%m-%d")
    print(start)
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "stat_date",
                "treatment_method", "price_min", "price_max", "treatment_time", "maintain_time", "recover_time",
                "app_list", "level3_ids", "level2_ids","bias","search_tag2", "search_tag3"]
    # The column names themselves are added as values so the null
    # placeholders get an id in value_map.
    unique_values.extend(features)
    print("unique_values length")
    print(len(unique_values))
    print("特征维度:")
    print(apps_number + level2_number + level3_number + len(unique_values))
    # Ids for single-valued features start at 29 plus the sizes of the three token maps.
    temp = list(range(29 + apps_number + level2_number + level3_number,
                      29 + apps_number + level2_number + level3_number + len(unique_values)))
    value_map = dict(zip(unique_values, temp))
    sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.bias,feat.level2_ids,e.ccity_name,u.device_type,u.manufacturer," \
          "u.channel,dl.app_list,feat.level3_ids,doctor.hospital_id," \
          "doris.search_tag2,doris.search_tag3," \
          "k.treatment_method,k.price_min,k.price_max,k.treatment_time,k.maintain_time,k.recover_time," \
          "e.device_id,e.cid_id " \
          "from jerry_test.esmm_train_bias e left join jerry_test.user_feature u on e.device_id = u.device_id " \
          "left join jerry_test.device_app_list dl on e.device_id = dl.device_id " \
          "left join jerry_test.diary_feat feat on e.cid_id = feat.diary_id " \
          "left join jerry_test.knowledge k on feat.level2 = k.level2_id " \
          "left join eagle.src_zhengxing_api_service service on e.diary_service_id = service.id " \
          "left join eagle.src_zhengxing_api_doctor doctor on service.doctor_id = doctor.id " \
          "left join jerry_test.search_doris doris on e.device_id = doris.device_id and e.stat_date = doris.get_date " \
          "where e.stat_date >= '{}'".format(start)
    df = spark.sql(sql)
    df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
                             "channel", "stat_date", "app_list", "hospital_id",
                             "level3_ids","bias","search_tag2", "search_tag3"])
    # Nulls in each feature column are replaced by that column's own name.
    df = df.na.fill(dict(zip(features, features)))
    # Row layout after select: 0 stat_date, 1 y, 2 z, 3 app_list, 4 level2_ids,
    # 5 level3_ids, 6 ucity_id ... 17 bias, 18 search_tag2, 19 search_tag3,
    # 20 cid_id, 21 device_id. Values missing from value_map fall back to
    # the small default ids 1-13.
    rdd = df.select("stat_date", "y", "z", "app_list", "level2_ids", "level3_ids",
                    "ucity_id", "ccity_name", "device_type", "manufacturer", "channel",
                    "treatment_method", "price_min", "price_max", "treatment_time",
                    "maintain_time", "recover_time","bias","search_tag2", "search_tag3","cid_id","device_id")\
        .rdd.repartition(200).map(
        lambda x: (x[0], float(x[1]), float(x[2]), app_list_func(x[3], app_list_map), app_list_func(x[4], leve2_map),
                   app_list_func(x[5], leve3_map),
                   [value_map.get(x[0], 1), value_map.get(x[6], 2), value_map.get(x[7], 3), value_map.get(x[8], 4),
                    value_map.get(x[9], 5), value_map.get(x[10], 6), value_map.get(x[11], 7), value_map.get(x[12], 8),
                    value_map.get(x[13], 9), value_map.get(x[14], 10),
                    value_map.get(x[15], 11), value_map.get(x[16], 12), value_map.get(x[17], 13)],
                   app_list_func(x[18], leve2_map), app_list_func(x[19], leve3_map),x[6],x[20],x[21]
                   ))
    rdd.persist(storageLevel= StorageLevel.MEMORY_ONLY_SER)
    # TODO: after launch, remove the train filter below, because the most
    # recent day's data should also be used for training.
    # The training set drops only the leading stat_date field; no rows are filtered here.
    train = rdd.map(
        lambda x: (x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9],
                   x[10], x[11]))
    f = time.time()
    spark.createDataFrame(train).toDF("y", "z", "app_list", "level2_list", "level3_list","ids",
                                      "search_tag2_list","search_tag3_list","city","cid_id","uid") \
        .repartition(1).write.format("tfrecords").save(path=path + "tr/", mode="overwrite")
    h = time.time()
    print("train tfrecord done")
    print((h - f) / 60)
    print("训练集样本总量:")
    print(rdd.count())
    # get_pre_number()
    # The validation set is the rows of the latest stat_date.
    test = rdd.filter(lambda x: x[0] == validate_date).map(
        lambda x: (x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9],
                   x[10], x[11]))
    spark.createDataFrame(test).toDF("y", "z", "app_list", "level2_list", "level3_list","ids",
                                     "search_tag2_list","search_tag3_list","city","cid_id","uid") \
        .repartition(1).write.format("tfrecords").save(path=path + "va/", mode="overwrite")
    print("va tfrecord done")
    rdd.unpersist()
    return validate_date, value_map, app_list_map, leve2_map, leve3_map
def _export_prediction_subset(rdd, label, queue):
    """Write the rows of ``rdd`` whose label equals ``label`` as TFRecords.

    ``rdd`` holds the tuples built by ``get_predict``; ``queue`` ("native" or
    "nearby") names both the log messages and the output sub-directory under
    the module-level ``path``.

    Returns True when rows were written and False when the subset is empty.
    """
    subset = rdd.filter(lambda x: x[0] == label)
    if subset.count() == 0:
        print("预测集{}为空".format(queue))
        return False
    print("预测集{}有数据".format(queue))
    # Reorder each tuple into the TFRecord column layout named in toDF below.
    spark.createDataFrame(
        subset.map(lambda x: (x[1], x[2], x[6], x[7], x[8], x[9], x[10], x[11],
                              x[12], x[13], x[14], x[15], x[16], x[17], x[18], x[3], x[4], x[5]))) \
        .toDF("y", "z", "app_list", "level2_list", "level3_list", "tag1_list", "tag2_list", "tag3_list",
              "tag4_list", "tag5_list", "tag6_list", "tag7_list", "ids", "search_tag2_list",
              "search_tag3_list", "city", "uid", "cid_id") \
        .repartition(1).write.format("tfrecords").save(path=path + queue + "/", mode="overwrite")
    print("{} tfrecord done".format(queue))
    return True


def get_predict(date, value_map, app_list_map, leve2_map, leve3_map):
    """Build the prediction set and export it as TFRecords.

    Joins ``jerry_test.esmm_pre_data`` with the user, diary and tag feature
    tables, encodes every feature through the supplied lookup maps and writes
    the rows with label 0 to ``path + "native/"`` and the rows with label 1 to
    ``path + "nearby/"``.

    Args:
        date: value looked up in ``value_map`` for the first slot of ``ids``.
        value_map: lookup from raw single-valued feature to feature id.
        app_list_map: lookup passed to ``app_list_func`` for the app list.
        leve2_map: lookup passed to ``app_list_func`` for level-2 style lists.
        leve3_map: lookup passed to ``app_list_func`` for level-3 style lists.

    Relies on the module-level ``spark`` session and ``path`` prefix.
    """
    # NOTE(review): the WHERE clause pins the query to one device id, which
    # looks like a debugging leftover -- confirm before using this export.
    sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time," \
          "dl.app_list,e.hospital_id,feat.level3_ids," \
          "wiki.tag as tag1,question.tag as tag2,search.tag as tag3,budan.tag as tag4," \
          "ot.tag as tag5,sixin.tag as tag6,cart.tag as tag7,doris.search_tag2,doris.search_tag3," \
          "k.treatment_method,k.price_min,k.price_max,k.treatment_time,k.maintain_time,k.recover_time " \
          "from jerry_test.esmm_pre_data e " \
          "left join jerry_test.user_feature u on e.device_id = u.device_id " \
          "left join jerry_test.cid_type_top c on e.device_id = c.device_id " \
          "left join jerry_test.cid_time_cut cut on e.cid_id = cut.cid " \
          "left join jerry_test.device_app_list dl on e.device_id = dl.device_id " \
          "left join jerry_test.diary_feat feat on e.cid_id = feat.diary_id " \
          "left join jerry_test.wiki_tag wiki on e.device_id = wiki.device_id " \
          "left join jerry_test.question_tag question on e.device_id = question.device_id " \
          "left join jerry_test.search_tag search on e.device_id = search.device_id " \
          "left join jerry_test.budan_tag budan on e.device_id = budan.device_id " \
          "left join jerry_test.order_tag ot on e.device_id = ot.device_id " \
          "left join jerry_test.sixin_tag sixin on e.device_id = sixin.device_id " \
          "left join jerry_test.cart_tag cart on e.device_id = cart.device_id " \
          "left join jerry_test.knowledge k on feat.level2 = k.level2_id " \
          "left join jerry_test.search_doris doris on e.device_id = doris.device_id and e.stat_date = doris.get_date " \
          "where e.device_id = 'C33E2C8E-86E9-4C91-8458-526FB81E4C78'"
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "hospital_id",
                "treatment_method", "price_min", "price_max", "treatment_time", "maintain_time", "recover_time",
                "app_list", "level3_ids", "level2_ids", "tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7",
                "search_tag2", "search_tag3"]
    df = spark.sql(sql)
    df = df.drop_duplicates(["ucity_id", "device_id", "cid_id"])
    # Missing values are replaced by the column's own name.
    df = df.na.fill(dict(zip(features, features)))
    f = time.time()

    def encode_row(x):
        # Output layout: 0 label, 1 y, 2 z, 3 ucity_id, 4 device_id, 5 cid_id,
        # 6 app list, 7 level2, 8 level3, 9-15 tag1..tag7, 16 ids,
        # 17 search_tag2, 18 search_tag3.
        return (x[0], float(x[1]), float(x[2]), x[3], x[4], x[5],
                app_list_func(x[6], app_list_map), app_list_func(x[7], leve2_map),
                app_list_func(x[8], leve3_map), app_list_func(x[9], leve2_map),
                app_list_func(x[10], leve2_map), app_list_func(x[11], leve2_map),
                app_list_func(x[12], leve2_map), app_list_func(x[13], leve2_map),
                app_list_func(x[14], leve2_map), app_list_func(x[15], leve2_map),
                [value_map.get(date, 1), value_map.get(x[16], 2),
                 value_map.get(x[17], 3), value_map.get(x[18], 4),
                 value_map.get(x[19], 5), value_map.get(x[20], 6),
                 value_map.get(x[21], 7), value_map.get(x[22], 8),
                 value_map.get(x[23], 9), value_map.get(x[24], 10),
                 value_map.get(x[25], 11), value_map.get(x[26], 12),
                 value_map.get(x[27], 13), value_map.get(x[28], 14),
                 value_map.get(x[29], 15)],
                app_list_func(x[30], leve2_map), app_list_func(x[31], leve3_map))

    rdd = df.select("label", "y", "z", "ucity_id", "device_id", "cid_id", "app_list", "level2_ids", "level3_ids",
                    "tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7",
                    "ucity_id", "ccity_name", "device_type", "manufacturer", "channel", "top", "time",
                    "hospital_id", "treatment_method", "price_min", "price_max", "treatment_time",
                    "maintain_time", "recover_time", "search_tag2", "search_tag3") \
        .rdd.repartition(200).map(encode_row)
    rdd.persist(storageLevel=StorageLevel.MEMORY_ONLY_SER)
    try:
        print("预测集样本大小:")
        print(rdd.count())
        if _export_prediction_subset(rdd, 0, "native"):
            # Elapsed minutes since the feature encoding started.
            print((time.time() - f) / 60)
        _export_prediction_subset(rdd, 1, "nearby")
    finally:
        # Release the cached RDD even when an export fails.
        rdd.unpersist()
if __name__ == '__main__':
    # Spark / TiSpark options, applied in the order listed.
    spark_settings = [
        ("spark.hive.mapred.supports.subdirectories", "true"),
        ("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true"),
        ("spark.tispark.plan.allow_index_double_read", "false"),
        ("spark.tispark.plan.allow_index_read", "true"),
        ("spark.sql.extensions", "org.apache.spark.sql.TiExtensions"),
        ("spark.tispark.pd.addresses", "172.16.40.158:2379"),
        ("spark.io.compression.codec", "lzf"),
        ("spark.driver.maxResultSize", "8g"),
        ("spark.sql.avro.compression.codec", "snappy"),
    ]
    sparkConf = SparkConf()
    for option, setting in spark_settings:
        sparkConf = sparkConf.set(option, setting)

    spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
    ti = pti.TiContext(spark)
    for database in ("jerry_test", "eagle"):
        ti.tidbMapDatabase(database)
    spark.sparkContext.setLogLevel("WARN")

    # Output locations read by the functions above through module globals.
    path = "hdfs:///strategy/esmm/"
    local_path = "/home/gmuser/esmm/"

    validate_date, value_map, app_list_map, leve2_map, leve3_map = feature_engineer()
    # The prediction export is currently disabled:
    # get_predict(validate_date, value_map, app_list_map, leve2_map, leve3_map)
    spark.stop()
import pymysql
import datetime
import json
import redis
import pandas as pd
from sqlalchemy import create_engine
def get_mysql_data(host, port, user, passwd, db, sql):
    """Run ``sql`` on a MySQL-protocol server and return every row.

    Args:
        host, port, user, passwd, db: connection settings handed to
            ``pymysql.connect``.
        sql: the complete statement to execute.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the statement.

    The connection is closed even when the statement raises; the exception
    itself is left for the caller to handle.
    """
    connection = pymysql.connect(host=host, port=port, user=user, passwd=passwd, db=db)
    try:
        cursor = connection.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        connection.close()
def get_esmm_users():
    """Return yesterday's distinct ``(device_id, city_id)`` exposure rows.

    Reads ``data_feed_exposure_precise`` in ``jerry_prod`` for the previous
    calendar day. This is best effort: any failure is printed and an empty
    list is returned so the caller simply processes no users.
    """
    try:
        stat_date = (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        sql = "select distinct device_id,city_id from data_feed_exposure_precise " \
              "where stat_date = '{}'".format(stat_date)
        rows = get_mysql_data('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC', 'jerry_prod', sql)
        return list(rows)
    except Exception as e:
        # Keep the fallback, but do not hide why the lookup failed.
        print("get_esmm_users fail")
        print(e)
        return []
def get_user_profile(device_id, top_k=5):
    """Return up to ``top_k`` tags for a device, highest score first.

    The portrait is read from the Redis key
    ``user:portrait_tags:cl_id:<device_id>``, which holds a JSON list of
    entries with ``type``, ``content`` and ``score``. Entries of type "tag"
    contribute ``content`` directly; other entries contribute only when
    ``content`` is a key of the module-level ``name_tag`` mapping, in which
    case the mapped value is used.

    Returns an empty list when the key is missing or anything fails.
    """
    try:
        r = redis.Redis(host="172.16.40.135", port=5379, password="", db=2)
        key = "user:portrait_tags:cl_id:" + str(device_id)
        # A single GET replaces EXISTS + GET, so the key cannot vanish in between.
        raw = r.get(key)
        if raw is None:
            return []
        tag_score = {}
        for entry in json.loads(raw.decode('utf-8')):
            if entry["type"] == "tag":
                tag_score[entry["content"]] = entry["score"]
            elif entry["content"] in name_tag:
                tag_score[name_tag[entry["content"]]] = entry["score"]
        ranked = sorted(tag_score.items(), key=lambda pair: pair[1], reverse=True)
        return [tag for tag, _ in ranked[:max(top_k, 0)]]
    except Exception as e:
        print("get_user_profile fail")
        print(e)
        return []
def get_searchworlds_to_tagid():
    """Map tag name to tag id for online tags whose ``tag_type`` is below 4.

    Rows come from ``api_tag`` in the ``zhengxing`` database. Any failure
    yields an empty mapping.
    """
    query = 'select id, name from api_tag where is_online = 1 and tag_type < 4'
    try:
        rows = get_mysql_data('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', query)
        # Each row is (id, name); later rows win when a name repeats.
        return {row[1]: row[0] for row in rows}
    except Exception:
        return {}
def get_queues(device_id, city_id):
    """Fetch the stored ESMM diary queues for one device / city pair.

    Returns:
        ``[native_queue, nearby_queue, nation_queue, megacity_queue]`` from
        ``esmm_device_diary_queue``, or an empty list when no row exists or
        the lookup fails.
    """
    try:
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root',
                             passwd='3SYz54LS9#^9sBvC', db='jerry_test')
        try:
            cursor = db.cursor()
            sql = "select native_queue, nearby_queue, nation_queue, megacity_queue from esmm_device_diary_queue " \
                  "where device_id = %s and city_id = %s"
            # Bound parameters keep quotes in the ids from breaking the query.
            # str() preserves the old textual comparison semantics.
            cursor.execute(sql, (str(device_id), str(city_id)))
            row = cursor.fetchone()
        finally:
            db.close()
        return list(row) if row is not None else []
    except Exception as e:
        print("get_queues fail")
        print(e)
        return []
def _sql_in_list(values):
    """Render ``values`` as a quoted SQL list such as ``('1','2')``."""
    quoted = []
    for value in values:
        text = str(value).replace("\\", "\\\\").replace("'", "\\'")
        quoted.append("'" + text + "'")
    return "(" + ",".join(quoted) + ")"


def _interleave_by_tag(cids, buckets, tag_order):
    """Round-robin ``cids`` across tag buckets, then drop duplicates.

    Each round takes the next diary of every tag in ``tag_order`` followed by
    the next diary that matched no tag. Rounds stop once every tag bucket is
    used up; the remaining untagged diaries are appended in their original
    order.
    """
    tagged = set()
    for members in buckets.values():
        tagged.update(members)
    untagged = [cid for cid in cids if cid not in tagged]
    rounds = max((len(buckets[tag]) for tag in tag_order), default=0)

    merged = []
    for position in range(rounds):
        for tag in tag_order:
            if position < len(buckets[tag]):
                merged.append(buckets[tag][position])
        if position < len(untagged):
            merged.append(untagged[position])
    merged.extend(untagged[rounds:])

    # A diary may carry several tags; keep only its first appearance.
    seen = set()
    unique = []
    for cid in merged:
        if cid not in seen:
            seen.add(cid)
            unique.append(cid)
    return unique


def tag_boost(cid_str, tag_list):
    """Reorder a comma-separated diary queue so preferred tags surface first.

    Args:
        cid_str: comma-separated diary ids, or None / "".
        tag_list: the user's preferred tag ids, most preferred first. The
            list is not modified.

    Returns:
        The reordered comma-separated queue. ``cid_str`` is returned
        unchanged when it is empty, has six or fewer ids, ``tag_list`` is
        empty, no diary matches a tag, or the lookup fails.
    """
    if cid_str is None or cid_str == "":
        return cid_str
    cids = cid_str.split(",")
    if len(cids) <= 6 or len(tag_list) == 0:
        return cid_str
    try:
        # Explicit IN lists stay valid for one element, unlike tuple formatting.
        sql = "select id,group_concat(diary_id) from " \
              "(select a.diary_id,b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b " \
              "on a.tag_id = b.id where b.tag_type < '4' and a.diary_id in {}) tmp " \
              "where id in {} group by id".format(_sql_in_list(cids), _sql_in_list(tag_list))
        result = get_mysql_data('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC', 'eagle', sql)
        if len(result) == 0:
            return cid_str

        # NOTE(review): group_concat output may be truncated by the server's
        # length limit -- confirm long queues are not cut short.
        buckets = {}
        for tag_id, diary_ids in result:
            wanted = set(diary_ids.split(","))
            members = [cid for cid in cids if cid in wanted]
            if members:
                # Keys are text so "12" in tag_list matches id 12 from the DB.
                buckets[str(tag_id)] = members

        tag_order = []
        for tag in tag_list:
            tag_key = str(tag)
            if tag_key in buckets and tag_key not in tag_order:
                tag_order.append(tag_key)

        return ",".join(_interleave_by_tag(cids, buckets, tag_order))
    except Exception as e:
        # TODO: report to Sentry and also write a local log entry.
        print("tag_boost fail")
        print(e)
        return cid_str
def to_data_base(df):
    """Replace the rows of ``esmm_resort_diary_queue`` for the devices in ``df``.

    Existing rows whose ``device_id`` appears in ``df`` are deleted, then
    ``df`` is appended to the table in chunks of 200 rows.

    Args:
        df: pandas DataFrame with a ``device_id`` column and the other table
            columns.
    """
    sql = "select distinct device_id from esmm_resort_diary_queue"
    result = get_mysql_data('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC', 'jerry_test', sql)
    existing = {str(row[0]) for row in result}
    stale = [uid for uid in {str(value) for value in df["device_id"].values} if uid in existing]
    if stale:
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root',
                             passwd='3SYz54LS9#^9sBvC', db='jerry_test')
        try:
            # One placeholder per id: valid for a single id and safely quoted.
            placeholders = ",".join(["%s"] * len(stale))
            cursor = db.cursor()
            cursor.execute(
                "delete from esmm_resort_diary_queue where device_id in ({})".format(placeholders), stale)
            db.commit()
            cursor.close()
        finally:
            db.close()
    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@172.16.40.158:4000/jerry_test?charset=utf8')
    pd.io.sql.to_sql(df, "esmm_resort_diary_queue", yconnect, schema='jerry_test', if_exists='append', index=False,
                     chunksize=200)
    print("insert done")
if __name__ == "__main__":
    # Re-rank the stored native / nearby queues of yesterday's users by their
    # preferred tags and store the result, 500 users per database write.
    users_list = get_esmm_users()
    print("user number")
    print(len(users_list))
    name_tag = get_searchworlds_to_tagid()
    n = 500
    split_users_list = [users_list[i:i + n] for i in range(0, len(users_list), n)]
    queue_columns = ["device_id", "city_id", "native_queue", "nearby_queue",
                     "nation_queue", "megacity_queue", "time"]
    for child_users_list in split_users_list:
        total_samples = list()
        for uid_city in child_users_list:
            tag_list = get_user_profile(uid_city[0])
            queues = get_queues(uid_city[0], uid_city[1])
            if len(queues) > 0 and len(tag_list) > 0:
                # Each call gets its own copy so one boost cannot change the
                # tag list seen by the next.
                new_native = tag_boost(queues[0], list(tag_list))
                new_nearby = tag_boost(queues[1], list(tag_list))
                insert_time = str(datetime.datetime.now().strftime('%Y%m%d%H%M'))
                sample = [uid_city[0], uid_city[1], new_native, new_nearby, queues[2], queues[3], insert_time]
                total_samples.append(sample)
        if len(total_samples) > 0:
            df = pd.DataFrame(total_samples, columns=queue_columns)
            to_data_base(df)
#coding=utf-8
import pymysql
import os
import json
from datetime import date, timedelta
import tensorflow as tf
import time
import pandas as pd
import datetime
#################### CMD Arguments ####################
# Command-line flags shared by the functions below.
FLAGS = tf.app.flags.FLAGS
# --- distributed execution ---
tf.app.flags.DEFINE_integer("dist_mode", 0, "distribuion mode {0-loacal, 1-single_dist, 2-multi_dist}")
tf.app.flags.DEFINE_string("ps_hosts", '', "Comma-separated list of hostname:port pairs")
# set_dist_env() reads FLAGS.chief_hosts when dist_mode is 1; without this
# definition that read fails with an unknown-flag error.
tf.app.flags.DEFINE_string("chief_hosts", '', "Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("worker_hosts", '', "Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("job_name", '', "One of 'ps', 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
tf.app.flags.DEFINE_integer("num_threads", 16, "Number of threads")
# --- model shape and training hyperparameters ---
tf.app.flags.DEFINE_integer("feature_size", 0, "Number of features")
tf.app.flags.DEFINE_integer("field_size", 0, "Number of common fields")
tf.app.flags.DEFINE_integer("embedding_size", 32, "Embedding size")
tf.app.flags.DEFINE_integer("num_epochs", 10, "Number of epochs")
tf.app.flags.DEFINE_integer("batch_size", 64, "Number of batch size")
tf.app.flags.DEFINE_integer("log_steps", 1000, "save summary every steps")
tf.app.flags.DEFINE_float("learning_rate", 0.0005, "learning rate")
tf.app.flags.DEFINE_float("l2_reg", 0.0001, "L2 regularization")
tf.app.flags.DEFINE_string("loss_type", 'log_loss', "loss type {square_loss, log_loss}")
tf.app.flags.DEFINE_float("ctr_task_wgt", 0.5, "loss weight of ctr task")
tf.app.flags.DEFINE_string("optimizer", 'Adam', "optimizer type {Adam, Adagrad, GD, Momentum}")
tf.app.flags.DEFINE_string("deep_layers", '256,128,64', "deep layers")
tf.app.flags.DEFINE_string("dropout", '0.5,0.5,0.5', "dropout rate")
tf.app.flags.DEFINE_boolean("batch_norm", False, "perform batch normaization (True or False)")
tf.app.flags.DEFINE_float("batch_norm_decay", 0.9, "decay for the moving average(recommend trying decay=0.9)")
# --- data and model locations, task selection ---
tf.app.flags.DEFINE_string("hdfs_dir", '', "hdfs dir")
tf.app.flags.DEFINE_string("local_dir", '', "local dir")
tf.app.flags.DEFINE_string("dt_dir", '', "data dt partition")
tf.app.flags.DEFINE_string("model_dir", '', "model check point dir")
tf.app.flags.DEFINE_string("servable_model_dir", '', "export servable model for TensorFlow Serving")
tf.app.flags.DEFINE_string("task_type", 'train', "task type {train, infer, eval, export}")
tf.app.flags.DEFINE_boolean("clear_existing_model", False, "clear existing model or not")
def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False):
    """Read TFRecord files and return one batch of ``(features, labels)``.

    Args:
        filenames: a file name / pattern or a list of them.
        batch_size: number of examples per batch.
        num_epochs: accepted for interface compatibility.
        perform_shuffle: accepted for interface compatibility.

    Returns:
        ``(batch_features, batch_labels)`` from a one-shot iterator, where
        ``batch_labels`` is ``{"y": ..., "z": ...}`` and ``batch_features``
        holds every other parsed field.
    """
    print('Parsing', filenames)

    def _parse_fn(record):
        schema = {
            "y": tf.FixedLenFeature([], tf.float32),
            "z": tf.FixedLenFeature([], tf.float32),
            "ids": tf.FixedLenFeature([FLAGS.field_size], tf.int64),
            "app_list": tf.VarLenFeature(tf.int64),
            "level2_list": tf.VarLenFeature(tf.int64),
            "level3_list": tf.VarLenFeature(tf.int64),
            "search_tag2_list": tf.VarLenFeature(tf.int64),
            "search_tag3_list": tf.VarLenFeature(tf.int64),
            "uid": tf.VarLenFeature(tf.string),
            "city": tf.VarLenFeature(tf.string),
            "cid_id": tf.VarLenFeature(tf.string)
        }
        example = tf.parse_single_example(record, schema)
        # The two labels are split off; the rest stays in the feature dict.
        click = example.pop('y')
        conversion = example.pop('z')
        return example, {"y": click, "z": conversion}

    # NOTE(review): num_epochs and perform_shuffle are never applied -- the
    # dataset is read once, unshuffled. Confirm this is intended.
    shards = tf.data.Dataset.list_files(filenames)
    records = shards.apply(
        tf.data.experimental.parallel_interleave(
            lambda shard: tf.data.TFRecordDataset(shard),
            cycle_length=8
        )
    )
    batches = records.apply(
        tf.data.experimental.map_and_batch(map_func=_parse_fn, batch_size=batch_size, num_parallel_calls=8))
    batches = batches.prefetch(10000)

    batch_features, batch_labels = batches.make_one_shot_iterator().get_next()
    return batch_features, batch_labels
def _build_tower(inputs, prefix, layers, dropout, l2_reg, mode):
    """Stack the fully connected layers of one task tower and return its logits.

    ``prefix`` ("cvr" or "ctr") selects the layer scope names
    ``<prefix>_mlp<i>``, ``<prefix>_bn_<i>`` and ``<prefix>_out``.
    """
    train_phase = mode == tf.estimator.ModeKeys.TRAIN
    net = inputs
    for i in range(len(layers)):
        net = tf.contrib.layers.fully_connected(
            inputs=net, num_outputs=layers[i],
            weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg), scope='%s_mlp%d' % (prefix, i))
        if FLAGS.batch_norm:
            # Batch norm is applied after the activation, see
            # https://github.com/ducha-aiki/caffenet-benchmark/blob/master/batchnorm.md#bn----before-or-after-relu
            net = batch_norm_layer(net, train_phase=train_phase, scope_bn='%s_bn_%d' % (prefix, i))
        if train_phase:
            # Dropout comes after batch norm and only while training.
            net = tf.nn.dropout(net, keep_prob=dropout[i])
    logits = tf.contrib.layers.fully_connected(
        inputs=net, num_outputs=1, activation_fn=tf.identity,
        weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg), scope='%s_out' % prefix)
    return tf.reshape(logits, shape=[-1])


def model_fn(features, labels, mode, params):
    """Build the two-tower CTR / CVR model function for the Estimator.

    Args:
        features: dict with ``ids``, the sparse ``*_list`` features and the
            pass-through string features ``uid``, ``city`` and ``cid_id``.
        labels: dict with ``y`` and ``z``; read unless the task type is
            "infer".
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict with ``field_size``, ``feature_size``,
            ``embedding_size``, ``l2_reg``, ``learning_rate``,
            ``deep_layers``, ``dropout`` and ``ctr_task_wgt``.

    Returns:
        The ``tf.estimator.EstimatorSpec`` for ``mode``.

    Raises:
        ValueError: if ``FLAGS.optimizer`` names no supported optimizer.
    """
    # ------ hyperparameters ------
    field_size = params["field_size"]
    feature_size = params["feature_size"]
    embedding_size = params["embedding_size"]
    l2_reg = params["l2_reg"]
    learning_rate = params["learning_rate"]
    layers = list(map(int, params["deep_layers"].split(',')))
    dropout = list(map(float, params["dropout"].split(',')))
    ctr_task_wgt = params["ctr_task_wgt"]
    common_dims = field_size * embedding_size

    # ------ build weights ------
    Feat_Emb = tf.get_variable(name='embeddings', shape=[feature_size, embedding_size],
                               initializer=tf.glorot_normal_initializer())

    feat_ids = features['ids']
    app_list = features['app_list']
    level2_list = features['level2_list']
    level3_list = features['level3_list']
    search_tag2_list = features['search_tag2_list']
    search_tag3_list = features['search_tag3_list']
    uid = features['uid']
    city = features['city']
    cid_id = features['cid_id']

    if FLAGS.task_type != "infer":
        y = labels['y']
        z = labels['z']

    # ------ build f(x) ------
    with tf.variable_scope("Shared-Embedding-layer"):
        embedding_id = tf.nn.embedding_lookup(Feat_Emb, feat_ids)
        app_id = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=app_list, sp_weights=None, combiner="sum")
        level2 = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=level2_list, sp_weights=None, combiner="sum")
        level3 = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=level3_list, sp_weights=None, combiner="sum")
        search_tag2 = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=search_tag2_list, sp_weights=None,
                                                    combiner="sum")
        search_tag3 = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=search_tag3_list, sp_weights=None,
                                                    combiner="sum")
        # Fixed fields are flattened to (F * K) and joined with the summed
        # embedding of every variable-length feature.
        x_concat = tf.concat([tf.reshape(embedding_id, shape=[-1, common_dims]), app_id, level2, level3,
                              search_tag2, search_tag3], axis=1)
        uid = tf.sparse.to_dense(uid, default_value="")
        city = tf.sparse.to_dense(city, default_value="")
        cid_id = tf.sparse.to_dense(cid_id, default_value="")

    with tf.name_scope("CVR_Task"):
        y_cvr = _build_tower(x_concat, 'cvr', layers, dropout, l2_reg, mode)

    with tf.name_scope("CTR_Task"):
        y_ctr = _build_tower(x_concat, 'ctr', layers, dropout, l2_reg, mode)

    with tf.variable_scope("MTL-Layer"):
        pctr = tf.sigmoid(y_ctr)
        pcvr = tf.sigmoid(y_cvr)
        pctcvr = pctr * pcvr

    predictions = {"pctcvr": pctcvr, "uid": uid, "city": city, "cid_id": cid_id}
    export_outputs = {
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            tf.estimator.export.PredictOutput(predictions)}
    # Provide an estimator spec for `ModeKeys.PREDICT`
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs=export_outputs)

    if FLAGS.task_type != "infer":
        # ------ build loss ------
        # NOTE(review): the layer weights_regularizer terms are not added to
        # this loss; only the embedding table is penalised -- confirm.
        ctr_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_ctr, labels=y))
        cvr_loss = tf.reduce_mean(tf.losses.log_loss(predictions=pctcvr, labels=z))
        loss = ctr_task_wgt * ctr_loss + (1 - ctr_task_wgt) * cvr_loss + l2_reg * tf.nn.l2_loss(Feat_Emb)

        tf.summary.scalar('ctr_loss', ctr_loss)
        tf.summary.scalar('cvr_loss', cvr_loss)

        # Provide an estimator spec for `ModeKeys.EVAL`
        eval_metric_ops = {
            "CTR_AUC": tf.metrics.auc(y, pctr),
            "CVR_AUC": tf.metrics.auc(z, pcvr),
            "CTCVR_AUC": tf.metrics.auc(z, pctcvr)
        }
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            loss=loss,
            eval_metric_ops=eval_metric_ops)

    # ------ build optimizer ------
    if FLAGS.optimizer == 'Adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
    elif FLAGS.optimizer == 'Adagrad':
        optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=1e-8)
    elif FLAGS.optimizer == 'Momentum':
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.95)
    elif FLAGS.optimizer == 'ftrl':
        optimizer = tf.train.FtrlOptimizer(learning_rate)
    elif FLAGS.optimizer == 'GD':
        # Advertised by the --optimizer help text but previously unhandled.
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
        raise ValueError("unsupported optimizer: %s" % FLAGS.optimizer)

    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    # Provide an estimator spec for `ModeKeys.TRAIN` modes
    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            loss=loss,
            train_op=train_op)
def batch_norm_layer(x, train_phase, scope_bn):
    """Apply batch normalisation to ``x`` under the variable scope ``scope_bn``.

    Two normalisation ops sharing the same variables are built, one with
    ``is_training=True`` and one with ``is_training=False``; ``train_phase``
    selects which output is returned.
    """
    shared = dict(decay=FLAGS.batch_norm_decay, center=True, scale=True,
                  updates_collections=None, scope=scope_bn)
    # The training op creates the variables; the inference op reuses them.
    train_branch = tf.contrib.layers.batch_norm(x, is_training=True, reuse=None, **shared)
    infer_branch = tf.contrib.layers.batch_norm(x, is_training=False, reuse=True, **shared)
    # NOTE(review): both branches are built outside tf.cond, so both may run
    # on every step -- confirm this is acceptable.
    return tf.cond(tf.cast(train_phase, tf.bool), lambda: train_branch, lambda: infer_branch)
def set_dist_env():
    """Export ``TF_CONFIG`` for the distributed mode chosen by ``FLAGS.dist_mode``.

    Mode 1 describes a cluster with chief and ps only. Mode 2 uses the first
    worker host as chief, treats worker task 1 as evaluator and the remaining
    workers as regular workers. Any other mode leaves the environment alone.
    """
    if FLAGS.dist_mode == 1:
        # Local distributed test: 1 chief, 1 ps, 1 evaluator (no workers).
        # NOTE(review): this reads FLAGS.chief_hosts -- verify the flag is defined.
        ps_hosts = FLAGS.ps_hosts.split(',')
        chief_hosts = FLAGS.chief_hosts.split(',')
        role, index = FLAGS.job_name, FLAGS.task_index
        print('ps_host', ps_hosts)
        print('chief_hosts', chief_hosts)
        print('job_name', role)
        print('task_index', str(index))
        cluster = {'chief': chief_hosts, 'ps': ps_hosts}
    elif FLAGS.dist_mode == 2:
        # Cluster mode.
        ps_hosts = FLAGS.ps_hosts.split(',')
        all_workers = FLAGS.worker_hosts.split(',')
        chief_hosts = all_workers[0:1]   # first worker acts as chief
        worker_hosts = all_workers[2:]   # hosts after the first two stay workers
        role, index = FLAGS.job_name, FLAGS.task_index
        print('ps_host', ps_hosts)
        print('worker_host', worker_hosts)
        print('chief_hosts', chief_hosts)
        print('job_name', role)
        print('task_index', str(index))
        if role == "worker":
            if index == 0:
                role = "chief"
            elif index == 1:
                role, index = 'evaluator', 0
            elif index > 1:
                # Shift down past the chief and evaluator slots.
                index -= 2
        cluster = {'chief': chief_hosts, 'worker': worker_hosts, 'ps': ps_hosts}
    else:
        return

    serialized = json.dumps({'cluster': cluster, 'task': {'type': role, 'index': index}})
    print(serialized)
    os.environ['TF_CONFIG'] = serialized
def main(file_path):
    """Run the task selected by ``FLAGS.task_type`` on ``file_path``.

    Args:
        file_path: TFRecord file(s) used for training or inference.

    Returns:
        For "infer", a list of ``[uid, city, cid_id, pctcvr]`` string rows;
        otherwise None.

    Side effects: fills ``FLAGS.dt_dir`` with yesterday's date when empty and
    appends it to ``FLAGS.model_dir``.
    """
    # ------ check arguments ------
    if FLAGS.dt_dir == "":
        FLAGS.dt_dir = (date.today() + timedelta(-1)).strftime('%Y%m%d')
    FLAGS.model_dir = FLAGS.model_dir + FLAGS.dt_dir

    va_files = ["hdfs://172.16.32.4:8020/strategy/esmm/va/part-r-00000"]
    # Clearing an existing model directory and set_dist_env() are disabled here.

    # ------ build tasks ------
    model_params = {
        "field_size": FLAGS.field_size,
        "feature_size": FLAGS.feature_size,
        "embedding_size": FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "l2_reg": FLAGS.l2_reg,
        "deep_layers": FLAGS.deep_layers,
        "dropout": FLAGS.dropout,
        "ctr_task_wgt": FLAGS.ctr_task_wgt
    }
    config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(device_count={'GPU': 0, 'CPU': FLAGS.num_threads}),
        log_step_count_steps=FLAGS.log_steps, save_summary_steps=FLAGS.log_steps)
    Estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=FLAGS.model_dir, params=model_params,
                                       config=config)

    if FLAGS.task_type == 'train':
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: input_fn(file_path, num_epochs=FLAGS.num_epochs, batch_size=FLAGS.batch_size))
        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: input_fn(va_files, num_epochs=1, batch_size=FLAGS.batch_size),
            steps=None, start_delay_secs=1000, throttle_secs=1200)
        result = tf.estimator.train_and_evaluate(Estimator, train_spec, eval_spec)
        # The return value may be missing or carry no metrics; skip printing
        # in that case instead of crashing after training.
        metrics = result[0] if result else None
        if metrics:
            for key, value in sorted(metrics.items()):
                print('%s: %s' % (key, value))
    elif FLAGS.task_type == 'eval':
        result = Estimator.evaluate(input_fn=lambda: input_fn(va_files, num_epochs=1, batch_size=FLAGS.batch_size))
        for key, value in sorted(result.items()):
            print('%s: %s' % (key, value))
    elif FLAGS.task_type == 'infer':
        preds = Estimator.predict(
            input_fn=lambda: input_fn(file_path, num_epochs=1, batch_size=FLAGS.batch_size),
            predict_keys=["pctcvr", "uid", "city", "cid_id"])
        result = []
        for prob in preds:
            # Every field is stringified; the id fields are byte strings here.
            result.append([str(prob["uid"][0]), str(prob["city"][0]), str(prob["cid_id"][0]),
                           str(prob['pctcvr'])])
        return result
    elif FLAGS.task_type == 'export':
        print("Not Implemented, Do It Yourself!")
def trans(x):
    """Strip the ``b'...'`` wrapper left by ``str()`` on a bytes value.

    Returns the inner text when ``str(x)`` looks like a bytes repr
    (``b'...'`` or ``b"..."``); any other value is returned unchanged. Plain
    text that merely starts with "b", and the empty string, are left alone.
    """
    text = str(x)
    is_bytes_repr = (len(text) >= 3 and text[0] == 'b'
                     and text[1] in "'\"" and text[-1] == text[1])
    return text[2:-1] if is_bytes_repr else x
def set_join(lst):
    """Join the first 500 distinct values of ``lst`` with commas.

    ``lst`` must offer ``unique().tolist()`` (for example a pandas Series);
    values keep their order of first appearance.
    """
    distinct = lst.unique().tolist()
    return ','.join(str(item) for item in distinct[:500])
def df_sort(result, queue_name):
    """Turn inference rows into one ranked diary queue per device and city.

    Args:
        result: rows of ``[uid, city, cid_id, pctcvr]`` as produced by
            ``main`` in infer mode (every field is a string).
        queue_name: name given to the queue column.

    Returns:
        DataFrame with ``device_id``, ``city_id``, ``queue_name`` (diary ids
        joined by commas, highest ``pctcvr`` first) and a ``time`` stamp.
    """
    df = pd.DataFrame(result, columns=["uid", "city", "cid_id", "pctcvr"])
    df['uid1'] = df['uid'].apply(trans)
    df['city1'] = df['city'].apply(trans)
    df['cid_id1'] = df['cid_id'].apply(trans)
    # Scores arrive as text; compare them as numbers so that "9e-05" does not
    # outrank "0.5".
    df['pctcvr'] = df['pctcvr'].astype(float)
    # A stable sort followed by groupby keeps each group's rows in score order.
    ranked = df.sort_values(by="pctcvr", ascending=False, kind="mergesort")
    df2 = ranked.groupby(by=["uid1", "city1"]).agg({'cid_id1': set_join}).reset_index(drop=False)
    df2.columns = ["device_id", "city_id", queue_name]
    df2["time"] = str(datetime.datetime.now().strftime('%Y%m%d%H%M'))
    return df2
def update_or_insert(df2, queue_name):
    """Upsert one queue column of ``esmm_device_diary_queue`` from ``df2``.

    Args:
        df2: DataFrame with ``device_id``, ``city_id``, ``time`` and a column
            named ``queue_name``.
        queue_name: table column receiving the queue; must be a plain
            identifier because it is inserted into the statement text.

    Failures are printed rather than raised, and the connection is always
    closed.
    """
    con = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
                          db='jerry_test', charset='utf8')
    try:
        if not queue_name.isidentifier():
            raise ValueError("invalid queue column name: %r" % (queue_name,))
        # Only the column name is formatted in; all values are bound parameters.
        query = ("INSERT INTO esmm_device_diary_queue (device_id, city_id, time, {col}) "
                 "VALUES (%s, %s, %s, %s) "
                 "ON DUPLICATE KEY UPDATE device_id=%s, city_id=%s, time=%s, {col}=%s").format(col=queue_name)
        cur = con.cursor()
        rows = zip(df2["device_id"], df2["city_id"], df2["time"], df2[queue_name])
        for device_id, city_id, stamp, queue in rows:
            values = (str(device_id), str(city_id), str(stamp), str(queue))
            # The same four values feed both the INSERT and the UPDATE part.
            cur.execute(query, values + values)
        con.commit()
        print("insert or update sucess")
    except Exception as e:
        print(e)
    finally:
        con.close()
if __name__ == "__main__":
    # Entry point: train the model, or score one prediction set and store the
    # resulting queue, then report the elapsed minutes.
    b = time.time()
    path = "hdfs://172.16.32.4:8020/strategy/esmm/"
    tf.logging.set_verbosity(tf.logging.INFO)

    task = FLAGS.task_type
    if task == 'train':
        print("train task")
        tr_files = ["hdfs://172.16.32.4:8020/strategy/esmm/tr/part-r-00000"]
        main(tr_files)
    elif task == 'infer':
        te_files = ["%s/part-r-00000" % FLAGS.hdfs_dir]
        # The queue is named after the directory that holds the part file.
        parent_dir = te_files[0].split('/')[-2]
        queue_name = parent_dir + "_queue"
        print(queue_name + " task")
        result = main(te_files)
        df = df_sort(result, queue_name)
        update_or_insert(df, queue_name)

    print("耗时(分钟):")
    print((time.time() - b) / 60)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark import SparkConf
import json
import msgpack
import pymysql
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
import redis
import datetime
# filter logging
def gbk_decoder(s):
    """Decode a Kafka key or value: msgpack first, JSON as the fallback.

    Returns None for None input. If the JSON fallback fails as well, its
    exception propagates.
    """
    if s is None:
        return None
    try:
        # NOTE(review): newer msgpack releases may reject the ``encoding``
        # keyword, in which case every message takes the JSON path -- confirm
        # against the installed version.
        return msgpack.loads(s, encoding='utf-8')
    except Exception:
        return json.loads(s)
def maidian(x):
    """Tell whether a tracking event is a diary "dislike" click on the home feed.

    Args:
        x: ``(key, value)`` pair; ``value`` is the event as a dict or as a
            JSON string.

    Returns:
        True for an ``on_click_button`` event on page ``home``, tab ``精选``,
        button ``user_feedback_type`` whose first ``extra_param`` entry is a
        diary card with feedback type "1" or "2"; False otherwise, including
        when the event cannot be parsed.
    """
    try:
        payload = x[1]
        # The stream's value decoder already yields parsed objects; only raw
        # text still needs json.loads.
        data = payload if isinstance(payload, dict) else json.loads(payload)
        if 'type' not in data or 'device' not in data:
            return False
        if data['type'] != 'on_click_button':
            return False
        params = data['params']
        if (params['page_name'] != 'home' or params['tab_name'] != '精选'
                or params['button_name'] != 'user_feedback_type'):
            return False
        card = params['extra_param'][0]
        if card["card_content_type"] != "diary":
            return False
        feedback = card["feedback_type"]
        return "1" in feedback or "2" in feedback
    except Exception as e:
        print("filter fail")
        print(e)
        return False
def get_data(x):
    """Extract ``(device_id, diary_id)`` from a ``(key, event)`` pair.

    The diary id is the ``card_id`` of the first ``extra_param`` entry. On
    failure a message is printed, ``send_email`` is called and None is
    returned implicitly.
    """
    try:
        event = x[1]
        first_card = event['params']['extra_param'][0]
        return event['device']['device_id'], first_card["card_id"]
    except Exception as e:
        print("get_data fail")
        # NOTE(review): send_email is not defined in the visible part of this
        # script -- confirm it exists, otherwise this handler raises NameError.
        send_email("get_data", "get_data", e)
def write_redis(device_id, cid_list):
    """Extend a device's disliked-diary list with diaries that share tags.

    The type-3 tags of the disliked diaries in ``cid_list`` are looked up,
    then every online diary with content level >= 3 carrying one of those
    tags is collected. The ids are merged into the Redis key
    ``<device_id>_dislike_diary`` as a JSON list that expires after 7 days.

    Failures are printed, never raised. Returns None.
    """
    try:
        diary_ids = [str(cid) for cid in cid_list]
        if not diary_ids:
            return
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
        try:
            cursor = db.cursor()
            # One placeholder per value keeps the IN list valid for a single
            # element and leaves quoting to the driver.
            sql = "select b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b " \
                  "on a.tag_id = b.id where b.tag_type = '3' and a.diary_id in ({})" \
                .format(",".join(["%s"] * len(diary_ids)))
            cursor.execute(sql, diary_ids)
            tags = list(set(row[0] for row in cursor.fetchall()))
            if not tags:
                # No tag to expand on; an empty IN list would be invalid SQL.
                return
            sql = "select a.id from src_mimas_prod_api_diary a left join src_mimas_prod_api_diary_tags b " \
                  "on a.id=b.diary_id left join src_zhengxing_api_tag c on b.tag_id=c.id " \
                  "where a.is_online = 1 and a.content_level >= '3' " \
                  "and c.id in ({}) and c.tag_type = '3'".format(",".join(["%s"] * len(tags)))
            cursor.execute(sql, tags)
            cids = list(set(row[0] for row in cursor.fetchall()))
        finally:
            db.close()

        r = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN6@172.16.40.133:6379')
        key = str(device_id) + "_dislike_diary"
        stored = r.get(key)
        if stored is not None:
            # Stored data is parsed as JSON, never evaluated as code.
            previous = json.loads(stored)
            if isinstance(previous, str):
                # Values written by the old code were JSON-encoded twice.
                previous = json.loads(previous)
            cids = list(set(previous) | set(cids))
        r.set(key, json.dumps(cids))
        # SET clears any previous TTL, so the expiry is renewed on every write.
        r.expire(key, 7 * 24 * 60 * 60)
    except Exception as e:
        print("insert redis fail")
        print(e)
def model(rdd):
    """Transform one micro-batch: group disliked diaries by device and store them.

    Events are filtered with ``maidian``, turned into ``(device_id,
    diary_id)`` pairs with ``get_data``, grouped per device and handed to
    ``write_redis``.

    Returns:
        The resulting RDD (one ``write_redis`` result per device), or an
        empty RDD when the pipeline cannot be built.
    """
    try:
        pairs = rdd.filter(maidian).map(get_data) \
            .filter(lambda pair: pair is not None)  # get_data yields None on failure
        # NOTE(review): the map below is lazy; the writes only happen for the
        # partitions that a downstream action actually evaluates -- confirm
        # the caller's pprint() covers the whole batch.
        return pairs.groupByKey().map(lambda kv: write_redis(kv[0], list(kv[1])))
    except Exception as e:
        print("fail")
        print(e)
        # transform() needs an RDD back, so hand over an empty one.
        return rdd.context.emptyRDD()
if __name__ == '__main__':
    # Streaming job: consume tracking events from Kafka in 10-second batches
    # and pass every batch through model().
    spark_conf = SparkConf().setMaster("spark://nvwa01:7077").setAppName("dislike_filter")
    spark_conf = spark_conf.set("spark.io.compression.codec", "lzf")
    sc = SparkContext(conf=spark_conf)
    ssc = StreamingContext(sc, 10)
    sc.setLogLevel("WARN")

    kafkaParams = {"metadata.broker.list": "172.16.44.25:9092,172.16.44.31:9092,172.16.44.45:9092",
                   "group.id": "dislike",
                   "socket.timeout.ms": "600000",
                   "auto.offset.reset": "largest"}
    topics = ["gm-maidian-data"]
    try:
        stream = KafkaUtils.createDirectStream(ssc, topics, kafkaParams,
                                               keyDecoder=gbk_decoder, valueDecoder=gbk_decoder)
        transformstream = stream.transform(lambda batch: model(batch))
        transformstream.pprint()

        ssc.start()
        ssc.awaitTermination()
    except Exception as e:
        print(e)
        # E-mail alerting is currently disabled:
        # send_email(sc.appName, sc.applicationId, e)
import os
import time
def check():
    """Report whether ``python diaryQueueUpdate.py`` shows up in ``ps aux``.

    Returns 2 when at least one listed process line contains that command,
    otherwise 1.
    """
    listing = os.popen("ps aux | grep diaryQueueUpdate.py").read()
    running = any('python diaryQueueUpdate.py' in row for row in listing.splitlines())
    return 2 if running else 1
if __name__ == "__main__":
    # TODO: after the official launch, run this check inside `while True:`
    # with `time.sleep(300)` between rounds.
    worker_missing = check() == 1
    if worker_missing:
        os.popen('python diaryQueueUpdate.py')
        print("成功重启diaryQueueUpdate")
\ No newline at end of file
from itertools import chain, islice, cycle
import datetime
from collections import Counter
from gm_types.gaia import DIARY_ORDER_TYPE
from gm_types.doris import ANSWER_SORT_TYPE
from gm_types.doris import ARTICLE_SORT_TYPE
from gm_types.mimas import CONTENT_CLASS
from gm_types.doris import CARD_TYPE
from gm_types.gaia import CITY_LEVEL
from gm_rpcd.all import bind
import traceback
from search.utils.diary import recall_diary
from search.utils.answer import recall_answers
from search.utils.article import recall_articles
from gm_rpcd.all import context
from libs.algorithms import drop_dup
from libs.cache import redis_client
from libs.error import logging_exception
from extend.models.gaia import City, CityScale
from extend.models.gold import (
QAQueue,
WikiQueue,
IconQueue,
UserTopicQueue,
DoctorTopicQueue,
DiaryQueue,
ArticleQueue,
AnswerQueue,
DeviceQAQueue,
DeviceIconQueue,
DeviceUserTopicQueue,
DeviceDoctorTopicQueue,
DeviceAnswerQueue,
DeviceArticleQueue,
DeviceDiaryQueue,
QuestionQueue,
DeviceQuestionQueue
)
import logging
import redis
import json
from django.conf import settings
import traceback
MAX_LOAD = 200
logger = logging.getLogger(__name__)
@bind("doris/recommend/get_diaries")
def get_diaries(tags, city, offset=0, size=10, city_tag_id=None):
    """Return one page of recommended diary ids.

    :param tags: optional tag ids used as the ``closure_tag_ids`` filter.
    :param city: city id; only used to look up the city tag when
        ``city_tag_id`` is not given.
    :param offset: index of the first item of the page.
    :param size: page size.
    :param city_tag_id: city tag id passed to the sorter as
        ``user_city_tag_id``.
    :return: ``{"diaries_ids": [...]}``.
    """
    sort_params = {}
    if city_tag_id:
        sort_params["user_city_tag_id"] = city_tag_id
    elif city:
        try:
            sort_params["user_city_tag_id"] = City.objects.get(id=city).tag_id
        except City.DoesNotExist:
            pass

    filters = dict(
        is_sink=False,
        has_before_cover=True,
        has_after_cover=True,
        content_level_is_good=True,
    )
    if tags:
        filters["closure_tag_ids"] = tags

    end = offset + size
    picked = []
    if end < MAX_LOAD:
        # Pages inside the first 200 results are served from a list
        # de-duplicated by drop_dup() over (diary id, user id) pairs.
        head = recall_diary(None, 0, 200, filters, DIARY_ORDER_TYPE.RECOMMEND, sort_params,
                            fields=["id", "user.id"])
        pairs = drop_dup([(entry['id'], entry['user']['id']) for entry in head])
        if end <= len(pairs):
            picked = [pair[0] for pair in pairs[offset:end]]

    if not picked:
        # Once the de-duplicated first 200 are exhausted, later pages are
        # served in plain ranking order without de-duplication.
        page = recall_diary(None, offset, size, filters, DIARY_ORDER_TYPE.RECOMMEND, sort_params,
                            fields=["id"])
        picked = [entry['id'] for entry in page]
    return {"diaries_ids": picked}
@bind("doris/recommend/get_articles")
def get_articles(tags, offset=0, size=10):
    """Return one page of recommended article ids.

    Only articles whose content level is EXCELLENT or FINE are recalled;
    ``tags``, when truthy, is added as the ``tag_ids`` filter.

    :return: ``{"article_ids": [...]}``.
    """
    filters = dict(content_level=[CONTENT_CLASS.EXCELLENT, CONTENT_CLASS.FINE])
    if tags:
        filters["tag_ids"] = tags
    page = recall_articles(None, offset, size, filters, ARTICLE_SORT_TYPE.RECOMMEND, {})
    return {"article_ids": [entry['id'] for entry in page]}
@bind("doris/recommend/get_answers")
def get_answers(tags, offset=0, size=10):
    """Return one page of recommended answer ids.

    Pages that end inside the first ``MAX_LOAD`` results are taken from a
    list de-duplicated by ``drop_dup`` over (answer id, user id) pairs;
    otherwise, or when that yields nothing, the page is recalled directly.

    :return: ``{"answer_ids": [...]}``.
    """
    filters = dict(content_level=[CONTENT_CLASS.EXCELLENT, CONTENT_CLASS.FINE])
    if tags:
        filters["tag_ids"] = tags

    end = offset + size
    picked = []
    if end < MAX_LOAD:
        head = recall_answers(None, 0, MAX_LOAD, filters, ANSWER_SORT_TYPE.RECOMMEND, {},
                              fields=["id", "user_id"])
        # Records lacking either field are ignored.
        pairs = [(entry["id"], entry["user_id"])
                 for entry in head
                 if "id" in entry and "user_id" in entry]
        deduped = drop_dup(pairs)
        if end <= len(deduped):
            picked = [pair[0] for pair in deduped[offset:end]]

    if not picked:
        page = recall_answers(None, offset, size, filters, ANSWER_SORT_TYPE.RECOMMEND, {})
        picked = [entry['id'] for entry in page]
    return {"answer_ids": picked}
@bind('doris/recommend/icon')
def fetch_icon(device_id, size):
    """Return up to ``size`` icon ids from the start of the device's queue.

    The device-specific ``DeviceIconQueue`` row is used when it exists,
    otherwise ``IconQueue.objects.last()``.  Any exception is reported via
    ``logging_exception()`` and an empty result is returned.

    :return: ``{"icon": [int, ...]}``.
    """
    try:
        try:
            record = DeviceIconQueue.objects.get(device_id=device_id)
        except DeviceIconQueue.DoesNotExist:
            record = IconQueue.objects.last()
        if not record:
            return {"icon": []}

        ids = [item for item in record.queue.split(',') if item]
        # The cursor is fixed at 0; the modulo is kept so that an empty
        # queue still takes the exception path.
        start = int(0) % len(ids)
        count = min(size, len(ids))
        window = list(islice(cycle(ids), start, start + count))
        return {"icon": [int(item) for item in window]}
    except:
        logging_exception()
        return {"icon": []}
@bind('doris/recommend/homepage_polymer')
def fetch_polymer_ids(device_id, size):
    """Return up to ``size`` ids under the ``"polymer_ids"`` key.

    The ids are read from the same ``DeviceIconQueue`` / ``IconQueue`` rows
    that the icon endpoint uses.  Any exception is reported via
    ``logging_exception()`` and an empty result is returned.

    :return: ``{"polymer_ids": [int, ...]}``.
    """
    try:
        try:
            record = DeviceIconQueue.objects.get(device_id=device_id)
        except DeviceIconQueue.DoesNotExist:
            record = IconQueue.objects.last()
        if not record:
            return {"polymer_ids": []}

        ids = [item for item in record.queue.split(',') if item]
        # The cursor is fixed at 0; the modulo is kept so that an empty
        # queue still takes the exception path.
        start = int(0) % len(ids)
        count = min(size, len(ids))
        window = list(islice(cycle(ids), start, start + count))
        return {"polymer_ids": [int(item) for item in window]}
    except:
        logging_exception()
        return {"polymer_ids": []}
@bind('doris/recommend/feed')
def recommend_feed(device_id, card_type, city_id, size):
    """RPC entry point: delegate to ``RecommendFeed.dispatch``.

    If the dispatch raises, the exception is reported via
    ``logging_exception()`` and ``{card_type: []}`` is returned instead.
    """
    try:
        result = RecommendFeed.dispatch(device_id, card_type, city_id, size)
    except:
        logging_exception()
        result = {card_type: []}
    return result
class RecommendFeed:
@classmethod
def dispatch(cls, device_id, card_type, city_id, size):
    """Route a feed request to the fetcher for ``card_type``.

    Answer, article, question and doctor-topic ids are converted to ``int``
    here; the other fetchers' results are returned as they are.  An
    unrecognised card type yields an empty list.

    :return: ``{card_type: data}``.
    """
    def as_ints(items):
        return list(map(int, items))

    data = []
    if card_type == CARD_TYPE.QA:
        data = cls.fetch_qa(device_id, card_type, size)
    elif card_type == CARD_TYPE.ANSWER:
        data = as_ints(cls.fetch_answer(device_id, card_type, size))
    elif card_type == CARD_TYPE.ARTICLE:
        data = as_ints(cls.fetch_article(device_id, card_type, size))
    elif card_type == CARD_TYPE.QUESTION:
        data = as_ints(cls.fetch_question(device_id, card_type, size))
    elif card_type == CARD_TYPE.DIARY:
        data = cls.fetch_diary(device_id, card_type, city_id, size)
    elif card_type == CARD_TYPE.USERTOPIC:
        data = cls.fetch_user_topic(device_id, card_type, size)
    elif card_type == CARD_TYPE.DOCTORTOPIC:
        data = as_ints(cls.fetch_doctor_topic(device_id, card_type, size))
    elif card_type == CARD_TYPE.ENCYCLOPEDIA:
        data = cls.fetch_wiki(device_id, card_type, size)
    return {card_type: data}
@staticmethod
def current_date():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    return "{:%Y-%m-%d}".format(datetime.datetime.now())
@staticmethod
def fetch_question(device_id, card_type, size):
    """Return the next ``size`` question ids for a device, cycling the queue.

    The device-specific ``DeviceQuestionQueue`` row is used when it exists,
    otherwise ``QuestionQueue.objects.last()``.  The read position is kept
    in Redis under a per-device, per-card-type, per-day key (24h expiry).

    Unlike before, a missing queue row or an empty queue now yields ``[]``
    (as ``fetch_article`` / ``fetch_answer`` already do for a missing row)
    instead of raising ``AttributeError`` / ``ZeroDivisionError``.

    :return: list of id strings taken from the queue.
    """
    key = '{device_id}-{card_type}-{date}'.format(device_id=device_id,
                                                  card_type=card_type,
                                                  date=RecommendFeed.current_date())
    try:
        que = DeviceQuestionQueue.objects.get(device_id=device_id)
    except DeviceQuestionQueue.DoesNotExist:
        que = QuestionQueue.objects.last()
    if not que:
        return []

    que = list(filter(None, que.queue.split(',')))
    if not que:
        # Nothing to cycle through; also avoids the modulo by zero below.
        return []

    # Wrap the stored cursor into the queue and never serve more than one
    # full pass over it.
    cursor = int(redis_client.get(key) or 0) % len(que)
    size = min(size, len(que))
    redis_client.set(key, cursor + size, ex=24 * 60 * 60)
    return list(islice(cycle(que), cursor, cursor + size))
@staticmethod
def fetch_icon(device_id, card_type, size):
    """Return the next ``size`` icon ids for a device, cycling the queue.

    The device-specific ``DeviceIconQueue`` row is used when it exists,
    otherwise ``IconQueue.objects.last()``.  The read position is kept in
    Redis under a per-device, per-card-type, per-day key (24h expiry).

    Unlike before, a missing queue row or an empty queue now yields ``[]``
    (as ``fetch_article`` / ``fetch_answer`` already do for a missing row)
    instead of raising ``AttributeError`` / ``ZeroDivisionError``.

    :return: list of id strings taken from the queue.
    """
    key = '{device_id}-{card_type}-{date}'.format(device_id=device_id,
                                                  card_type=card_type,
                                                  date=RecommendFeed.current_date())
    try:
        que = DeviceIconQueue.objects.get(device_id=device_id)
    except DeviceIconQueue.DoesNotExist:
        que = IconQueue.objects.last()
    if not que:
        return []

    que = list(filter(None, que.queue.split(',')))
    if not que:
        # Nothing to cycle through; also avoids the modulo by zero below.
        return []

    # Wrap the stored cursor into the queue and never serve more than one
    # full pass over it.
    cursor = int(redis_client.get(key) or 0) % len(que)
    size = min(size, len(que))
    redis_client.set(key, cursor + size, ex=24 * 60 * 60)
    return list(islice(cycle(que), cursor, cursor + size))
@staticmethod
def fetch_wiki(device_id, card_type, size):
    """Return the next ``size`` entries of the wiki queue, cycling it.

    The queue is the JSON document stored in ``WikiQueue.objects.last()``;
    the read position is kept in Redis under a per-device, per-card-type,
    per-day key (24h expiry).  Any exception is reported via
    ``logging_exception()`` and ``[]`` is returned.
    """
    try:
        cursor_key = '{device_id}-{card_type}-{date}'.format(
            device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
        record = WikiQueue.objects.last()
        if not record:
            return []
        entries = json.loads(record.queue)

        total = len(entries)
        start = int(redis_client.get(cursor_key) or 0) % total
        count = min(size, total)
        stop = start + count
        redis_client.set(cursor_key, stop, ex=24 * 60 * 60)
        return list(islice(cycle(entries), start, stop))
    except:
        logging_exception()
        return []
@staticmethod
def fetch_answer(device_id, card_type, size):
    """Return the next ``size`` answer ids for a device, cycling the queue.

    The device-specific ``DeviceAnswerQueue`` row is used when it exists,
    otherwise ``AnswerQueue.objects.last()``.  The read position is kept in
    Redis under a per-device, per-card-type, per-day key (24h expiry).  Any
    exception is reported via ``logging_exception()`` and ``[]`` is
    returned.
    """
    try:
        cursor_key = '{device_id}-{card_type}-{date}'.format(
            device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
        try:
            record = DeviceAnswerQueue.objects.get(device_id=device_id)
        except DeviceAnswerQueue.DoesNotExist:
            record = AnswerQueue.objects.last()
        if not record:
            return []

        ids = [item for item in record.queue.split(',') if item]
        total = len(ids)
        start = int(redis_client.get(cursor_key) or 0) % total
        count = min(size, total)
        stop = start + count
        redis_client.set(cursor_key, stop, ex=24 * 60 * 60)
        return list(islice(cycle(ids), start, stop))
    except:
        logging_exception()
        return []
@staticmethod
def fetch_qa(device_id, card_type, size):
# Build one page of QA card ids for a device.
# The bulk of the page comes from the device's DeviceQAQueue row (or, when that
# row does not exist, from AnswerQueue.objects.last()); for device ids other
# than '0' one id from the Redis hash
# "TS:search_recommend_answer_queue:device_id:<device_id>" may be put in front.
# Returns a list of ids; returns [] when no queue row is found or when any
# exception is raised (the exception is reported through logging_exception()).
# NOTE(review): indentation was lost in this copy of the file, so the exact
# nesting of the conditionals below cannot be read from here -- confirm against
# the original before changing control flow.
try:
# Per-device, per-card-type, per-day key holding the queue cursor.
key = '{device_id}-{card_type}-{date}'.format(device_id=device_id,
card_type=card_type, date=RecommendFeed.current_date())
# presumably '0' stands for an unidentified device -- TODO confirm
if (device_id != '0'):
search_qa_recommend_key = "TS:search_recommend_answer_queue:device_id:" + str(device_id)
search_qa_recommend_list = list()
search_cursor_ts = 0
if redis_client.exists(search_qa_recommend_key):
search_qa_recommend_dict = redis_client.hgetall(search_qa_recommend_key)
if b'cursor' in search_qa_recommend_dict:
search_cursor_ts = json.loads(search_qa_recommend_dict[b'cursor'])
# The search queue is only loaded while its cursor is below 10.
if search_cursor_ts < 10:
search_qa_recommend_list = json.loads(search_qa_recommend_dict[b'answer_queue'])
# Reserve one slot of the page for the search-recommended id.
if search_cursor_ts < len(search_qa_recommend_list):
size = size - 1
try:
que = DeviceQAQueue.objects.get(device_id=device_id)
except DeviceQAQueue.DoesNotExist:
que = AnswerQueue.objects.last()
if not que:
return []
# Comma-separated ids; empty fragments are dropped.
que = list(filter(None, que.queue.split(',')))
# adjust args.
cursor = redis_client.get(key) or 0
cursor = int(cursor) % len(que)
size = min(size, len(que))
# redis_client.set(key, cursor + size, ex=24 * 60 * 60)
data = list(islice(cycle(que), cursor, cursor + size))
data = list(map(int, data))
# Advance the stored cursor while cursor + 2*size is still inside the queue;
# otherwise log the reset and store cursor 0.  If that attempt raises, the
# advanced cursor is stored instead.
if cursor + 2*size < len(que):
redis_client.set(key, cursor + size, ex=24 * 60 * 60)
else:
try:
context.request_logger.app(reset_answer_queue=True)
cursor = 0
redis_client.set(key, cursor, ex=24 * 60 * 60)
except:
redis_client.set(key, cursor + size, ex=24 * 60 * 60)
if device_id != '0':
if len(search_qa_recommend_list) > 0 and search_cursor_ts < len(search_qa_recommend_list):
# Put one search-recommended id in front of the page and move the search
# cursor on by one; the hash expiry is set to 30 days.
queue = search_qa_recommend_list[search_cursor_ts:search_cursor_ts + 1]
queue.extend(data)
data = queue
new_search_cursor = search_cursor_ts + 1
redis_client.hset(search_qa_recommend_key, 'cursor', new_search_cursor)
redis_client.expire(search_qa_recommend_key, 30 * 24 * 60 * 60)
# Record the returned ids in a per-device Redis set.
read_qa_key = "TS:recommend_answer_set:device_id:" + str(device_id)
if len(data) > 0:
redis_client.sadd(read_qa_key, *data)
return data
except:
logging_exception()
return []
@staticmethod
def fetch_article(device_id, card_type, size):
    """Return the next ``size`` article ids for a device, cycling the queue.

    The device-specific ``DeviceArticleQueue`` row is used when it exists,
    otherwise ``ArticleQueue.objects.last()``; ``[]`` is returned when no
    row is found.  The read position is kept in Redis under a per-device,
    per-card-type, per-day key (24h expiry).  Exceptions are not handled
    here.
    """
    cursor_key = '{device_id}-{card_type}-{date}'.format(
        device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
    try:
        record = DeviceArticleQueue.objects.get(device_id=device_id)
    except DeviceArticleQueue.DoesNotExist:
        record = ArticleQueue.objects.last()
    if not record:
        return []

    ids = [item for item in record.queue.split(',') if item]
    total = len(ids)
    start = int(redis_client.get(cursor_key) or 0) % total
    count = min(size, total)
    stop = start + count
    redis_client.set(cursor_key, stop, ex=24 * 60 * 60)
    return list(islice(cycle(ids), start, stop))
@staticmethod
def fetch_user_topic(device_id, card_type, size):
# Build one page of user-topic ids for a device.
# The bulk of the page comes from the device's DeviceUserTopicQueue row (or,
# when that row does not exist, from UserTopicQueue.objects.last()); for device
# ids other than '0' and pages of at least two items, up to two ids from the
# Redis hash "TS:search_recommend_tractate_queue:device_id:<device_id>" may be
# put in front.
# Returns a list of ids; returns [] when no queue row is found or when any
# exception is raised (the exception is reported through logging_exception()).
# NOTE(review): indentation was lost in this copy of the file, so the exact
# nesting of the conditionals below cannot be read from here -- confirm against
# the original before changing control flow.
try:
# Per-device, per-card-type, per-day key holding the queue cursor.
key = '{device_id}-{card_type}-{date}'.format(device_id=device_id,card_type=card_type, date=RecommendFeed.current_date())
# presumably '0' stands for an unidentified device -- TODO confirm
if (device_id != '0') and size >= 2:
search_topic_recommend_key = "TS:search_recommend_tractate_queue:device_id:" + str(device_id)
search_topic_recommend_list = list()
search_cursor_ts = 0
if redis_client.exists(search_topic_recommend_key):
search_topic_recommend_dict = redis_client.hgetall(search_topic_recommend_key)
if b'cursor' in search_topic_recommend_dict:
search_cursor_ts = json.loads(search_topic_recommend_dict[b'cursor'])
# The search queue is only loaded while its cursor is below 30.
if search_cursor_ts < 30:
search_topic_recommend_list = json.loads(search_topic_recommend_dict[b'tractate_queue'])
# Reserve two slots of the page for search-recommended ids.
if search_cursor_ts < len(search_topic_recommend_list):
size = size - 2
try:
que = DeviceUserTopicQueue.objects.get(device_id=device_id)
except DeviceUserTopicQueue.DoesNotExist:
que = UserTopicQueue.objects.last()
if not que:
return []
# Comma-separated ids; empty fragments are dropped.
que = list(filter(None, que.queue.split(',')))
# adjust args.
cursor = redis_client.get(key) or 0
cursor = int(cursor) % len(que)
size = min(size, len(que))
data = list(islice(cycle(que), cursor, cursor + size))
data = list(map(int, data))
# Advance the stored cursor while cursor + 2*size is still inside the queue;
# otherwise log the reset and store cursor 0.  If that attempt raises, the
# advanced cursor is stored instead.
if cursor + 2*size < len(que):
redis_client.set(key, cursor + size, ex=24 * 60 * 60)
else:
try:
context.request_logger.app(reset_queue=True)
cursor = 0
redis_client.set(key, cursor, ex=24 * 60 * 60)
except:
redis_client.set(key, cursor + size, ex=24 * 60 * 60)
# NOTE(review): size may already have been reduced by 2 above, so this check
# sees the reduced value -- confirm that this is intended.
if device_id != '0' and size >= 2:
if len(search_topic_recommend_list) > 0 and search_cursor_ts < len(search_topic_recommend_list):
# Put up to two search-recommended ids in front of the page and move the
# search cursor on by two; the hash expiry is set to 30 days.
queue = search_topic_recommend_list[search_cursor_ts:search_cursor_ts + 2]
queue.extend(data)
data = queue
new_search_cursor = search_cursor_ts + 2
redis_client.hset(search_topic_recommend_key, 'cursor', new_search_cursor)
redis_client.expire(search_topic_recommend_key, 30 * 24 * 60 * 60)
# Record the returned ids in a per-device Redis set.
read_topic_key = "TS:recommend_tractate_set:device_id:" + str(device_id)
if len(data) > 0:
redis_client.sadd(read_topic_key, *data)
return data
except:
logging_exception()
return []
@staticmethod
def fetch_doctor_topic(device_id, card_type, size):
    """Return the next ``size`` doctor-topic ids for a device, cycling the queue.

    The device-specific ``DeviceDoctorTopicQueue`` row is used when it
    exists, otherwise ``DoctorTopicQueue.objects.last()``.  The read
    position is kept in Redis under a per-device, per-card-type, per-day
    key (24h expiry).  Any exception is reported via
    ``logging_exception()`` and ``[]`` is returned.
    """
    try:
        cursor_key = '{device_id}-{card_type}-{date}'.format(
            device_id=device_id, card_type=card_type, date=RecommendFeed.current_date())
        try:
            record = DeviceDoctorTopicQueue.objects.get(device_id=device_id)
        except DeviceDoctorTopicQueue.DoesNotExist:
            record = DoctorTopicQueue.objects.last()
        if not record:
            return []

        ids = [item for item in record.queue.split(',') if item]
        total = len(ids)
        start = int(redis_client.get(cursor_key) or 0) % total
        count = min(size, total)
        stop = start + count
        redis_client.set(cursor_key, stop, ex=24 * 60 * 60)
        return list(islice(cycle(ids), start, stop))
    except:
        logging_exception()
        return []
@classmethod
def get_gm_kv_ins(cls, redis_ip, redis_port, redis_db, redis_password=""):
    """Open a Redis client and verify it with ``ping()``.

    The password is only passed to the client when it is non-empty; the
    socket timeout is 2 seconds.

    :return: the connected client, or ``None`` if anything fails.
    """
    try:
        options = dict(host=redis_ip, port=redis_port, db=redis_db, socket_timeout=2)
        if len(redis_password) != 0:
            options["password"] = redis_password
        client = redis.Redis(**options)
        client.ping()
        return client
    except:
        return None
@classmethod
def fetch_diary_queue_data(cls, city_id, device_id=None):
    """Load the four diary queues for a city, from Redis or the database.

    The first host of ``settings.GM_KV_HOSTS`` that answers is queried for
    the hash ``diary_queue:city_id:<city_id>`` (or the device-specific hash
    when ``device_id`` is given); an empty hash falls back to
    ``diary_queue:city_id:world``.  If anything in that path raises, the
    error is logged and the queues are read from ``DiaryQueue`` rows for
    the city or ``'world'`` instead.

    :return: ``(local, nearby, nation, megacity, use_city_id)`` where
        ``use_city_id`` is the city id the data was actually taken from.
    """
    names = ("native_queue", "nearby_queue", "nation_queue", "megacity_queue")
    # One shared container so that values parsed before a failure survive
    # into the fallback path, as they did with separate local variables.
    queues = dict((name, list()) for name in names)
    use_city_id = city_id

    def result():
        return (queues["native_queue"], queues["nearby_queue"],
                queues["nation_queue"], queues["megacity_queue"], use_city_id)

    try:
        kv = None
        for host_conf in settings.GM_KV_HOSTS:
            kv = cls.get_gm_kv_ins(redis_ip=host_conf["host"],
                                   redis_port=host_conf["port"],
                                   redis_db=host_conf["db"],
                                   redis_password=host_conf["password"])
            if kv:
                break

        city_key = "diary_queue:city_id:" + use_city_id
        world_key = "diary_queue:city_id:world"
        if device_id is not None:
            city_key = "device_diary_queue:device_id:" + device_id + ":city_id:" + use_city_id

        stored = kv.hgetall(city_key)
        if len(stored) == 0:
            stored = kv.hgetall(world_key)
            use_city_id = "world"

        for name in names:
            field = name.encode()
            if field in stored and stored[field]:
                queues[name] = list(filter(None, stored[field].split(b",")))
        return result()
    except:
        logging_exception()
        logger.error("catch exception,err_log:%s" % traceback.format_exc())

        rows = DiaryQueue.objects.filter(city_id__in=[city_id, 'world'])
        # Assume that world queue must exist.
        if len(rows) == 1:
            row = rows[0]
        elif rows[0].city_id == city_id:
            row = rows[0]
        else:
            row = rows[1]

        for name in names:
            raw = getattr(row, name)
            if raw:
                queues[name] = list(filter(None, raw.split(',')))
        use_city_id = row.city_id if row else use_city_id
        return result()
@classmethod
def fetch_device_diary_queue_data(cls, city_id, device_id):
    """Load the four device-specific diary queues, from Redis or the database.

    The first host of ``settings.GM_KV_HOSTS`` that answers is queried for
    the hash ``device_diary_queue:device_id:<device_id>:city_id:<city_id>``.
    If anything in that path raises, the error is logged and the queues are
    read from the matching ``DeviceDiaryQueue`` row, when there is one.

    :return: ``(local, nearby, nation, megacity, use_city_id)``; the queues
        are empty lists when nothing is stored.
    """
    names = ("native_queue", "nearby_queue", "nation_queue", "megacity_queue")
    # One shared container so that values parsed before a failure survive
    # into the fallback path, as they did with separate local variables.
    queues = dict((name, list()) for name in names)
    use_city_id = city_id

    def result():
        return (queues["native_queue"], queues["nearby_queue"],
                queues["nation_queue"], queues["megacity_queue"], use_city_id)

    try:
        kv = None
        for host_conf in settings.GM_KV_HOSTS:
            kv = cls.get_gm_kv_ins(redis_ip=host_conf["host"],
                                   redis_port=host_conf["port"],
                                   redis_db=host_conf["db"],
                                   redis_password=host_conf["password"])
            if kv:
                break

        device_key = "device_diary_queue:device_id:" + device_id + ":city_id:" + use_city_id
        stored = kv.hgetall(device_key)
        for name in names:
            field = name.encode()
            if field in stored and stored[field]:
                queues[name] = list(filter(None, stored[field].split(b",")))
        return result()
    except:
        logging_exception()
        logger.error("catch exception,err_log:%s" % traceback.format_exc())

        row = DeviceDiaryQueue.objects.filter(device_id=device_id, city_id=city_id).first()
        for name in names:
            if row and getattr(row, name):
                queues[name] = list(filter(None, getattr(row, name).split(',')))
        use_city_id = row.city_id if row else use_city_id
        return result()
@classmethod
def fetch_diary(cls, device_id, card_type, city_id, size):
# Build one page of diary ids for a device and city.
# The page is assembled from several sources, in this order in the returned
# list: the day's user-portrait queue, one id from the click-based recommend
# queue, four ids from the search-based recommend queue, and finally the tiered
# city queues (local / nearby / megacity / nation) mixed by get_scale_data().
# Every returned id is also added to a per-device Redis set.
# Unlike the other fetchers there is no outer try/except in this method.
# NOTE(review): indentation was lost in this copy of the file, so the exact
# nesting of the conditionals below cannot be read from here -- confirm against
# the original before changing control flow.
# first, we fetch data from personal-queue city-queue, if not both, get data
# from world queue.
user_portrait_diary_part_list = list()
# Slots reserved for the click-based and the search-based recommend queues.
click_diary_size = 1
search_diary_size = 4
# presumably '0' stands for an unidentified device -- TODO confirm
if device_id != '0':
# Per-device, per-day hash with the user-portrait queue and its cursors.
user_portrait_diary_key = 'user_portrait_recommend_diary_queue:device_id:%s:%s' % (device_id, datetime.datetime.now().strftime('%Y-%m-%d'))
if redis_client.exists(user_portrait_diary_key):
user_portrait_diary_dict = redis_client.hgetall(user_portrait_diary_key)
user_portrait_cursor = str(user_portrait_diary_dict[b'cursor'],encoding='utf-8')
if user_portrait_cursor == '0':
if b'len_cursor' in user_portrait_diary_dict.keys():
user_portrait_diary_list = json.loads(user_portrait_diary_dict[b'diary_queue'])
len_cursor = str(user_portrait_diary_dict[b'len_cursor'],encoding='utf-8')
len_cursor = int(len_cursor)
# Enough portrait ids left to fill the whole page: take `size` of them and
# leave nothing for the other sources.
if len(user_portrait_diary_list) - len_cursor >size:
user_portrait_diary_part_list = user_portrait_diary_list[len_cursor:len_cursor+size]
redis_client.hset(user_portrait_diary_key,'len_cursor',len_cursor+size)
size = 0
else:
# NOTE(review): len_cursor is only assigned after the b'len_cursor' membership
# check above; depending on which `if` this `else` belongs to it may be unbound
# here -- confirm against the original indentation.
user_portrait_diary_list = json.loads(user_portrait_diary_dict[b'diary_queue'])
diary_list_len = len(user_portrait_diary_list) - len_cursor
size = size - diary_list_len
user_portrait_diary_part_list = user_portrait_diary_list[len_cursor:len_cursor + diary_list_len]
redis_client.hset(user_portrait_diary_key, 'len_cursor', len_cursor + diary_list_len)
user_portrait_cursor = int(user_portrait_cursor) + 1
redis_client.hset(user_portrait_diary_key, 'cursor', user_portrait_cursor)
else:
user_portrait_diary_part_list = json.loads(user_portrait_diary_dict[b'diary_queue'])
# NOTE(review): nothing here keeps size from becoming negative -- confirm that
# the code below copes with that.
size = size - len(user_portrait_diary_part_list)
user_portrait_cursor = int(user_portrait_cursor) + 1
redis_client.hset(user_portrait_diary_key, 'cursor', user_portrait_cursor)
try:
# obj = DeviceDiaryQueue.objects.filter(device_id=device_id, city_id=city_id).first()
# Device-specific queues first; when all four are empty use the city queues.
(local, nearby, nation, megacity, city_id) = cls.fetch_device_diary_queue_data(city_id, device_id)
if len(local) == 0 and len(nearby) == 0 and len(nation) == 0 and len(megacity) == 0:
(local, nearby, nation, megacity, city_id) = cls.fetch_diary_queue_data(city_id)
# if not obj:
# (local, nearby, nation, megacity,city_id) = cls.fetch_diary_queue_data(city_id)
# else:
# local = list(filter(None, obj.native_queue.split(','))) if obj.native_queue else []
# nearby = list(filter(None, obj.nearby_queue.split(','))) if obj.nearby_queue else []
# nation = list(filter(None, obj.nation_queue.split(','))) if obj.nation_queue else []
# megacity = list(filter(None, obj.megacity_queue.split(','))) if obj.megacity_queue else []
except:
logging_exception()
(local, nearby, nation, megacity, city_id) = cls.fetch_diary_queue_data(city_id)
if(device_id!='0'):
search_diary_recommend_key = "TS:search_recommend_diary_queue:device_id:" + str(device_id)
search_diary_recommend_list = list()
search_cursor_ts = 0
# The search queue is only consulted when more than 3 slots are still open.
if redis_client.exists(search_diary_recommend_key) and size >3:
search_diary_recommend_dict = redis_client.hgetall(search_diary_recommend_key)
if b'cursor' in search_diary_recommend_dict:
search_cursor_ts = json.loads(search_diary_recommend_dict[b'cursor'])
search_diary_recommend_list = json.loads(search_diary_recommend_dict[b'diary_queue'])
# Reserve search_diary_size slots for the search-recommended ids.
if search_cursor_ts +search_diary_size < len(search_diary_recommend_list) :
size = size - search_diary_size
if (device_id != '0') :
diary_recommend_key = "TS:recommend_diary_queue:device_id:" + str(device_id)
diary_recommend_list = list()
if redis_client.exists(diary_recommend_key) and size > 0:
diary_recommend_dict = redis_client.hgetall(diary_recommend_key)
diary_recommend_list = json.loads(diary_recommend_dict[b'diary_queue'])
# Reserve click_diary_size slots for the click-recommended id.
if len(diary_recommend_list)>0:
size = size -click_diary_size
# Per-device, per-city, per-day key prefix for the counter and the cursors.
key = '{device_id}-{city_id}-{date}'.format(device_id=device_id,
city_id=city_id, date=RecommendFeed.current_date())
# strategy rule: when user refresh over 30 loadings, reset native nearby nation queue cursor.
counter_key = key + '-counter_v1'
counter = redis_client.incr(counter_key)
# The counter expiry is set when the counter is first created.
if counter == 1:
redis_client.expire(counter_key, 24 * 60 * 60)
cursor_key = key + '-cursor_v1'
# Stored as "local-nearby-megacity-nation" offsets.
cursor = redis_client.get(cursor_key) or b'0-0-0-0'
# if counter > 30:
# cursor = b'0-0-0-0'
# redis_client.delete(counter_key)
cx, cy, cm, cz = map(int, cursor.split(b'-'))
# get_city_scale() returns (x, y, m, z); get_scale_data() takes the weights in
# (x, y, z, m) order, hence the reordering in the call below.
x, y, m, z = cls.get_city_scale(city_id)
data, ncx, ncy, ncm, ncz = cls.get_scale_data(
local, nearby, nation, megacity,
cx, cy, cm, cz,
x, y, z, m, size
)
# When neither the local nor the nearby cursor moved, all four cursors are
# reset to 0.
if ncx == cx and ncy == cy: # native queue and nearby queue
logger.info("diary queue reach end,cx:%d,cy:%d,cm:%d,cz:%d",cx,cy,cm,cz)
# redis_client.delete(counter_key)
# data, ncx, ncy, ncm, ncz = cls.get_scale_data(
# local, nearby, nation, megacity,
# 0, 0, 0, 0,
# x, y, z, m, size
# )
ncx = ncy = ncm = ncz = 0
val = '-'.join(map(str, [ncx, ncy, ncm, ncz]))
redis_client.set(cursor_key, val, ex=24 * 60 * 60)
data = list(map(int, data))
if device_id != '0':
# Put search_diary_size search-recommended ids in front of the page and move
# the search cursor on; the hash expiry is set to 30 days.
if search_cursor_ts<len(search_diary_recommend_list)-search_diary_size:
queue = search_diary_recommend_list[search_cursor_ts:search_cursor_ts+search_diary_size]
queue.extend(data)
data = queue
new_search_cursor = search_cursor_ts +search_diary_size
redis_client.hset(search_diary_recommend_key,'cursor',new_search_cursor)
redis_client.expire(search_diary_recommend_key,30*24*60*60)
# Pop one click-recommended id to the very front; the shortened queue is written
# back, or the key is deleted once the queue is empty.
if len(diary_recommend_list) >0:
diary_id = diary_recommend_list.pop(0)
data.insert(0,diary_id)
if len(diary_recommend_list)>0:
diary_recommend_list_json = json.dumps(diary_recommend_list)
redis_client.hset(diary_recommend_key,'diary_queue',diary_recommend_list_json)
redis_client.expire(diary_recommend_key,30*24*60*60)
else:
redis_client.delete(diary_recommend_key)
# User-portrait ids, when present, go before everything else.
if len(user_portrait_diary_part_list)>0:
user_portrait_diary_part_list.extend(data)
data = user_portrait_diary_part_list
# Already-read set: record the returned ids per device.
read_diary_key = "TS:recommend_diary_set:device_id:" + str(device_id)
if len(data)>0:
redis_client.sadd(read_diary_key,*data)
return data
@staticmethod
def get_scale_data(local, nearby, nation, megacity, cx, cy, cm, cz, x, y, z, m, size):
    """
    Pick ``size`` diary ids from the four tiered queues according to their weights.

    :param local: local diary queue
    :param nearby: nearby diary queue
    :param nation: nation diary queue
    :param megacity: megacity diary queue
    :param cx: seen local diary offset
    :param cy: seen nearby diary offset
    :param cm: seen megacity diary offset
    :param cz: seen nation diary offset
    :param x: local diary scale factor
    :param y: nearby diary scale factor
    :param z: nation diary scale factor
    :param m: megacity diary scale factor
    :param size: number of diaries wanted; negative values are treated as 0
    :return: ``(ids, cx, cy, cm, cz)`` -- an iterator over the chosen ids in
        local, nearby, megacity, nation order, followed by the advanced
        offsets, each capped at the length of its queue
    """
    # A caller may have subtracted more reserved slots than the page holds.
    # A negative size used to produce negative slice ends (returning almost
    # a whole queue) and negative cursors, so clamp it.
    size = max(size, 0)

    # Every tier's share of the page is rounded to an integer.  With all
    # weights at zero there is nothing to divide by: start every tier at 0
    # and let the remainder rule below hand out the slots.
    total = x + y + z + m
    if total:
        nx = int(round(x * 1.0 / total * size))
        ny = int(round(y * 1.0 / total * size))
        nz = int(round(z * 1.0 / total * size))
        nm = int(round(m * 1.0 / total * size))
    else:
        nx = ny = nz = nm = 0

    # Rounding can leave the sum of the shares different from size:
    # 1. exactly two tiers at zero: the first zero tier, in local, nearby,
    #    megacity, nation order, receives the difference;
    # 2. otherwise the tier with the largest weight receives it.
    quotas = [nx, ny, nm, nz]
    weights = [x, y, m, z]
    difference = size - sum(quotas)
    if quotas.count(0) == 2:
        quotas[quotas.index(0)] += difference
    else:
        quotas[weights.index(max(weights))] += difference
    nx, ny, nm, nz = quotas

    # Take each tier's share; whatever a queue cannot supply is handed on to
    # the next tier (local -> nearby -> megacity -> nation).
    slocal = local[cx:cx + nx]
    cx = min(cx + nx, len(local))
    ny += nx - len(slocal)

    snearby = nearby[cy:cy + ny]
    cy = min(cy + ny, len(nearby))
    nm += ny - len(snearby)

    smegacity = megacity[cm:cm + nm]
    cm = min(cm + nm, len(megacity))
    nz += nm - len(smegacity)

    snation = nation[cz:cz + nz]
    cz = min(cz + nz, len(nation))

    return chain(slocal, snearby, smegacity, snation), cx, cy, cm, cz
@staticmethod
def get_city_scale(city_id):
    """Return the queue weights for a city as ``(x, y, m, z)``.

    ``x`` is the local weight, ``y`` nearby, ``m`` megacity and ``z``
    nation.  A ``CityScale`` row wins when it exists; otherwise the weights
    are derived from the city's level, and a city that cannot be found gets
    nation-only weights.
    """
    try:
        scale = CityScale.objects.get(city_id=city_id)
        return scale.native, scale.nearby, scale.megacity, scale.nation
    except CityScale.DoesNotExist:
        pass

    try:
        city = City.objects.get(id=city_id)
        if city.level in (CITY_LEVEL.SUPER, CITY_LEVEL.ONE):
            return 4, 3, 0, 3
        if city.level == CITY_LEVEL.TWO:
            return 3, 3, 0, 3
        if city.level == CITY_LEVEL.THREE:
            return 1, 4, 0, 5
        return 0, 0, 0, 10
    except City.DoesNotExist:
        return 0, 0, 0, 10
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark import SparkConf
import redis
import sys
import os
import json
import pymysql
import numpy as np
import pandas as pd
import time
import datetime
def Json(x):
    """Tell whether a Kafka record is a card click from the debug device.

    ``x[1]`` is parsed as JSON.  The record is accepted when it carries
    ``type``, ``device`` and ``params.card_content_type``, its type is
    ``on_click_card`` and its device id equals the hard-coded device id.

    :return: ``True`` or ``False``.
    """
    event = json.loads(x[1])
    has_fields = ('type' in event and 'device' in event and 'params' in event
                  and 'card_content_type' in event['params'])
    if not has_fields:
        return False
    return (event['type'] == 'on_click_card'
            and event["device"]["device_id"] == "E417C286-40A4-42F6-BDA9-AEEBD8FEC3B6")
def model(rdd):
    """Build the per-batch pipeline: filter, repartition, extract, write.

    Records accepted by ``Json`` are spread over 10 partitions, turned into
    ``(device_id, card_id, card_type)`` by ``get_data`` and passed to
    ``write_redis``.

    :return: the transformed RDD, or ``None`` (after printing ``fail``) if
        building the chain raises.
    """
    try:
        clicks = rdd.filter(lambda record: Json(record))
        fields = clicks.repartition(10).map(lambda record: get_data(record))
        written = fields.map(lambda item: write_redis(item[0], item[1], item[2]))
        return written
    except:
        print("fail")
def get_data(x):
    """Extract ``(device_id, card_id, card_content_type)`` from a Kafka record.

    ``x[1]`` is parsed as JSON.  If parsing or any lookup fails,
    ``get_data fail`` is printed and ``None`` is returned.
    """
    try:
        event = json.loads(x[1])
        params = event['params']
        return event['device']['device_id'], params["card_id"], params['card_content_type']
    except Exception:
        print("get_data fail")
        # send_email("get_data", "get_data", e)
def write_redis(device_id, cid, card):
    """Send a clicked card to the writer for its content type.

    ``diary`` goes to ``diary_write``, ``qa`` to ``question_write`` and
    ``user_post`` to ``tractate_write``; any other type is ignored.  The
    writers' return values are not passed on.
    """
    if card == "diary":
        diary_write(device_id, cid)
        return
    if card == "qa":
        question_write(device_id, cid)
        return
    if card == "user_post":
        tractate_write(device_id, cid)
def tractate_write(device_id, cid):
    """Store in Redis every online tractate sharing the clicked one's tag.

    The first ``tag_type = '3'`` tag of tractate ``cid`` is looked up, all
    online tractates carrying that tag are collected, and their ids are
    merged into the JSON list stored under ``<device_id>_dislike_tractate``
    with a 7 day expiry.

    Changes from the previous version:

    * ``cid`` comes from a tracking event, so it is bound as a query
      parameter instead of being formatted into the SQL text;
    * the database connection is always closed, including on early exit
      and on errors;
    * the stored value is read with a single ``GET`` (no exists/get race)
      and the expiry is set on every write, because a plain ``SET`` on an
      existing key drops its TTL.

    :param device_id: device the list belongs to.
    :param cid: id of the clicked tractate.
    :return: ``None``; failures are printed, not raised.
    """
    db = None
    try:
        # NOTE(review): credentials are hard-coded here -- move them to
        # configuration.
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
        cursor = db.cursor()
        cursor.execute(
            "select b.id from src_mimas_prod_api_tractate_tag a left join src_zhengxing_api_tag b "
            "on a.tag_id = b.id where b.tag_type = '3' and a.tractate_id = %s",
            (cid,))
        result = cursor.fetchall()
        if len(result) == 0 or result[0][0] is None:
            return
        tag_id = result[0][0]

        cursor.execute(
            "select a.id from src_mimas_prod_api_tractate a left join src_mimas_prod_api_tractate_tag b "
            "on a.id=b.tractate_id left join src_zhengxing_api_tag c on b.tag_id=c.id "
            "where a.is_online = 1 and c.id = %s and c.tag_type = '3'",
            (tag_id,))
        result = cursor.fetchall()
        if len(result) == 0:
            return

        cids = [str(i[0]) for i in result]
        r = redis.Redis(host="172.16.40.135", port=5379, password="", db=2)
        key = str(device_id) + "_dislike_tractate"
        ttl = 7 * 24 * 60 * 60
        stored = r.get(key)
        if stored is not None:
            value = json.loads(stored.decode('utf-8'))
            value.extend(cids)
            r.set(key, json.dumps(list(set(value))), ex=ttl)
            print("cunza")
        else:
            r.set(key, json.dumps(cids), ex=ttl)
    except Exception as e:
        print("tractate insert redis fail")
        print(e)
    finally:
        if db is not None:
            db.close()
def question_write(device_id, cid):
    """Store in Redis every online question sharing the clicked one's tag.

    The first ``tag_type = '3'`` tag of question ``cid`` is looked up, all
    online questions carrying that tag are collected, and their ids are
    merged into the JSON list stored under ``<device_id>_dislike_qa`` with a
    7 day expiry.

    Changes from the previous version:

    * ``cid`` comes from a tracking event, so it is bound as a query
      parameter instead of being formatted into the SQL text;
    * the database connection is always closed, including on early exit
      and on errors;
    * the stored value is read with a single ``GET`` (no exists/get race)
      and the expiry is set on every write, because a plain ``SET`` on an
      existing key drops its TTL.

    :param device_id: device the list belongs to.
    :param cid: id of the clicked question.
    :return: ``"question good"`` after a Redis write, otherwise ``None``;
        failures are printed, not raised.
    """
    db = None
    try:
        # NOTE(review): credentials are hard-coded here -- move them to
        # configuration.
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
        cursor = db.cursor()
        cursor.execute(
            "select b.id from src_mimas_prod_api_questiontag a left join src_zhengxing_api_tag b "
            "on a.tag_id = b.id where b.tag_type = '3' and a.question_id = %s",
            (cid,))
        result = cursor.fetchall()
        if len(result) == 0 or result[0][0] is None:
            return None
        tag_id = result[0][0]

        cursor.execute(
            "select a.id from src_mimas_prod_api_question a left join src_mimas_prod_api_questiontag b "
            "on a.id=b.question_id left join src_zhengxing_api_tag c on b.tag_id=c.id "
            "where a.is_online = 1 and c.tag_type = '3' and c.id = %s",
            (tag_id,))
        result = cursor.fetchall()
        if len(result) == 0:
            return None

        cids = [str(i[0]) for i in result]
        r = redis.Redis(host="172.16.40.135", port=5379, password="", db=2)
        key = str(device_id) + "_dislike_qa"
        ttl = 7 * 24 * 60 * 60
        stored = r.get(key)
        if stored is not None:
            value = json.loads(stored.decode('utf-8'))
            value.extend(cids)
            r.set(key, json.dumps(list(set(value))), ex=ttl)
            print("cunza")
        else:
            r.set(key, json.dumps(cids), ex=ttl)
            print("bucunza")
        # NOTE(review): the original indentation is not recoverable, so it is
        # unclear whether this value was returned for both branches -- the
        # only visible caller (write_redis) ignores it.
        return "question good"
    except Exception as e:
        print("question insert redis fail")
        print(e)
    finally:
        if db is not None:
            db.close()
def diary_write(device_id, cid):
    """Store in Redis every online diary sharing the clicked one's tag.

    The first ``tag_type = '3'`` tag of diary ``cid`` is looked up, all
    online diaries with ``content_level >= '3'`` carrying that tag are
    collected, and their ids are merged into the JSON list stored under
    ``<device_id>_dislike_diary`` with a 7 day expiry.

    Changes from the previous version:

    * ``cid`` comes from a tracking event, so it is bound as a query
      parameter instead of being formatted into the SQL text;
    * the database connection is always closed, including on early exit
      and on errors;
    * the stored value is read with a single ``GET`` (no exists/get race)
      and the expiry is set on every write, because a plain ``SET`` on an
      existing key drops its TTL.

    :param device_id: device the list belongs to.
    :param cid: id of the clicked diary.
    :return: ``None``; failures are printed, not raised.
    """
    db = None
    try:
        # NOTE(review): credentials are hard-coded here -- move them to
        # configuration.
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
        cursor = db.cursor()
        cursor.execute(
            "select b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b "
            "on a.tag_id = b.id where b.tag_type = '3' and a.diary_id = %s",
            (cid,))
        result = cursor.fetchall()
        if len(result) == 0 or result[0][0] is None:
            return
        tag_id = result[0][0]

        cursor.execute(
            "select a.id from src_mimas_prod_api_diary a left join src_mimas_prod_api_diary_tags b "
            "on a.id=b.diary_id left join src_zhengxing_api_tag c on b.tag_id=c.id "
            "where a.is_online = 1 and a.content_level >= '3' "
            "and c.id = %s and c.tag_type = '3'",
            (tag_id,))
        result = cursor.fetchall()
        if len(result) == 0:
            return

        cids = [str(i[0]) for i in result]
        r = redis.Redis(host="172.16.40.135", port=5379, password="", db=2)
        key = str(device_id) + "_dislike_diary"
        ttl = 7 * 24 * 60 * 60
        stored = r.get(key)
        if stored is not None:
            value = json.loads(stored.decode('utf-8'))
            value.extend(cids)
            r.set(key, json.dumps(list(set(value))), ex=ttl)
        else:
            r.set(key, json.dumps(cids), ex=ttl)
    except Exception as e:
        print("diary insert redis fail")
        print(e)
    finally:
        if db is not None:
            db.close()
# sc = SparkContext(conf=SparkConf().setMaster("spark://nvwa01:7077").setAppName("dislike").set("spark.io.compression.codec", "lzf"))
# ssc = StreamingContext(sc,4)
# sc.setLogLevel("WARN")
# kafkaParams = {"metadata.broker.list": "172.16.44.25:9092,172.16.44.31:9092,172.16.44.45:9092",
# "group.id": "dislike",
# "socket.timeout.ms": "600000",
# "auto.offset.reset": "largest"}
#
#
# stream = KafkaUtils.createDirectStream(ssc, ["gm-maidian-data"], kafkaParams)
# transformstream = stream.transform(lambda x:model(x))
# transformstream.pprint()
#
# ssc.start()
# ssc.awaitTermination()
def make_data(device_id, city_id, key_head):
    """Write the four module-level fixture queues into one Redis hash and print it.

    The hash name is ``key_head + device_id + ":city_id:" + city_id``; the field
    values are read from the module globals ``native``, ``nearby``, ``nation``
    and ``megacity``.
    """
    client = redis.StrictRedis.from_url("redis://redis.paas-test.env:6379/2")
    hash_name = key_head + device_id + ":city_id:" + city_id
    fixture_fields = (
        ("native_queue", native),
        ("nearby_queue", nearby),
        ("nation_queue", nation),
        ("megacity_queue", megacity),
    )
    for field, queue in fixture_fields:
        client.hset(name=hash_name, key=field, value=queue)
    print(client.hgetall(hash_name))
# Fixture queues read by make_data: comma-separated runs of consecutive ids.
native = ",".join(map(str, range(100, 102)))
nearby = ",".join(map(str, range(102, 106)))
nation = ",".join(map(str, range(106, 110)))
megacity = ",".join(map(str, range(110, 118)))

# Redis key prefix and the device / city the fixtures are written for.
key_head = "device_diary_queue_rerank:device_id:"
# key_head = "device_diary_queue:device_id:"
device_id = "868663038800471"
# make_data(device_id, "beijing", key_head)
# device_id = "868663038800476"
city_id = "beijing"
if __name__ == "__main__":
    # Dry run of the batching loop: ids 1..89 are processed in chunks of ``n`` and
    # each chunk is turned into a one-column frame. The tag-boost step and the
    # database write (to_data_base) are disabled here.
    users_list = list(range(1, 90))
    n = 3
    split_users_list = [users_list[start:start + n] for start in range(0, len(users_list), n)]
    for child_users_list in split_users_list:
        total_samples = [uid_city for uid_city in child_users_list]
        if total_samples:
            df = pd.DataFrame(total_samples).rename(columns={0: "device_id"})
            print("df numbers")
            print(df.shape[0])
            # to_data_base(df)
from utils import con_sql
import datetime
import time
import pymysql
def fetch_data(start_date, end_date):
    """Load click and exposure rows for a date range from the ``jerry_test`` database.

    :param start_date: inclusive lower bound for ``stat_date``.
    :param end_date: inclusive upper bound for ``stat_date``.
    :return: ``(exposure, click, click_device_id)`` - the two frames have their
        ``time`` column replaced by ``hour`` and ``minute`` columns, and
        ``click_device_id`` is the list of distinct device ids in the click table.
    """
    def open_db():
        # A fresh connection per query: the original comments say con_sql closes the one it is given.
        return pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
                               db='jerry_test')

    def split_time(frame):
        # Derive hour / minute from the unix timestamp column, then drop the raw column.
        frame["hour"] = frame["time_date"].apply(lambda ts: datetime.datetime.fromtimestamp(ts).hour)
        frame["minute"] = frame["time_date"].apply(lambda ts: datetime.datetime.fromtimestamp(ts).minute)
        return frame.drop("time_date", axis=1)

    column_names = {0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"}

    # Distinct device ids present in the click table.
    device_sql = "select distinct device_id from data_feed_click"
    click_device_id = con_sql(open_db(), device_sql)[0].values.tolist()
    print("成功获取点击表里的device_id")

    # Click rows inside the date range.
    click_sql = "select cid,device_id,time,stat_date from data_feed_click " \
                "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    click = con_sql(open_db(), click_sql).rename(columns=column_names)
    print("成功获取点击表里的数据")
    click = split_time(click)

    # Exposure rows inside the date range; the query is timed.
    exposure_sql = "select cid,device_id,time,stat_date from data_feed_exposure " \
                   "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    started = time.time()
    exposure = con_sql(open_db(), exposure_sql)
    finished = time.time()
    print("获取曝光表耗时{}分".format((finished - started) / 60))
    exposure = exposure.rename(columns=column_names)
    print("成功获取曝光表里的数据")
    exposure = split_time(exposure)

    return exposure, click, click_device_id
import redis
import datetime
import json
def filter_history(device_id,cid_list):
"""Record ``cid_list`` as tractates shown to ``device_id`` today (test Redis, db 1).

The ids are added to a per-day set and to an aggregate set for the device; when
both the aggregate set and the per-day set from 14 days ago exist, the old day's
members are subtracted from the aggregate and the old set is deleted.
"""
r = redis.StrictRedis.from_url("redis://redis.paas-test.env:6379/1")
# Aggregate key has no quotes around the device id; the dated keys below do.
all_key = "TS:recommend_tractate_set:device_id:" + str(device_id)
# Per-day key for the day 14 days before today.
old_key = "TS:recommend_tractate_set:device_id:'{}':'{}'"\
.format(device_id,(datetime.date.today() - datetime.timedelta(days=14)).strftime("%Y-%m-%d"))
# Per-day key for today.
today_key = "TS:recommend_tractate_set:device_id:'{}':'{}'"\
.format(device_id,datetime.date.today().strftime("%Y-%m-%d"))
# Both branches add the ids; presumably only a newly created set gets the 15-day expiry - TODO confirm nesting.
if r.exists(today_key):
r.sadd(today_key, *cid_list)
else:
r.sadd(today_key, *cid_list)
r.expire(today_key,15*24*60*60)
# Drop the 14-day-old ids from the aggregate set.
if r.exists(all_key) and r.exists(old_key):
r.sdiffstore(all_key, all_key, old_key)
r.delete(old_key)
# NOTE(review): expire is issued before the sadd below; on a key that does not exist yet
# EXPIRE has no effect, so a first-time aggregate set may end up without a TTL - confirm.
r.expire(all_key, time=13*24*60*60)
r.sadd(all_key, *r.smembers(today_key))
def get_dairy():
    """Seed the test Redis (db 0) with diary fixtures for one hard-coded device.

    Writes, and prints back, a dislike set, the user-portrait queue for today,
    the search-recommend queue, the click-recommend queue and the four
    per-city personal queues.
    """
    device_id = "868080041007173"
    client = redis.StrictRedis.from_url("redis://redis.paas-test.env:6379/0")

    def id_strings(start, stop):
        # Consecutive ids rendered as strings.
        return [str(n) for n in range(start, stop)]

    # Disliked diaries.
    dislike_key = str(device_id) + "_dislike_diary"
    client.sadd(dislike_key, *[2, 20, 40, 61, 81, 101, 121])
    print("不喜欢")
    print(client.smembers(dislike_key))

    # User-portrait queue, keyed by device and today's date.
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    user_portrait_diary_key = 'user_portrait_recommend_diary_queue:device_id:%s:%s' % (device_id, today)
    client.hset(user_portrait_diary_key, 'diary_queue', json.dumps(id_strings(2, 6)))
    client.hset(user_portrait_diary_key, 'cursor', "0")
    client.hset(user_portrait_diary_key, 'len_cursor', "0")
    print("画像")
    print(client.hgetall(user_portrait_diary_key))

    # Search-recommend queue.
    search_diary_recommend_key = "TS:search_recommend_diary_queue:device_id:" + str(device_id)
    client.hset(search_diary_recommend_key, 'diary_queue', json.dumps(id_strings(20, 26)))
    print("search")
    print(client.hgetall(search_diary_recommend_key))

    # Click-recommend queue.
    diary_recommend_key = "TS:recommend_diary_queue:device_id:" + str(device_id)
    client.hset(diary_recommend_key, 'diary_queue', json.dumps(id_strings(40, 46)))
    print("ts")
    print(client.hgetall(diary_recommend_key))

    # Personal per-city queues, stored as comma-separated strings.
    use_city_id = "beijing"
    personal_key = "device_diary_queue:device_id:" + device_id + ":city_id:" + use_city_id
    personal_queues = (
        ("native_queue", ",".join(id_strings(60, 80))),
        ("nearby_queue", ",".join(id_strings(80, 100))),
        ("nation_queue", ",".join(id_strings(120, 130))),
        ("megacity_queue", ",".join(id_strings(100, 120))),
    )
    for field, queue in personal_queues:
        client.hset(personal_key, field, queue)
    print("personnal ")
    print(client.hgetall(personal_key))
def get_qa():
    """Seed the test Redis (db 0) with answer fixtures for one hard-coded device.

    Writes a dislike set and a search-recommend answer queue, printing each back.
    """
    device_id = "868080041007173"
    client = redis.StrictRedis.from_url("redis://redis.paas-test.env:6379/0")

    # Disliked answers.
    dislike_key = str(device_id) + "_dislike_qa"
    client.sadd(dislike_key, 529401, 529412, 529403)
    print("不喜欢")
    print(client.smembers(dislike_key))

    # Search-recommend queue: only the queue field is written, no cursor.
    search_qa_recommend_key = "TS:search_recommend_answer_queue:device_id:" + str(device_id)
    answer_ids = list(range(529401, 529408))
    client.hset(search_qa_recommend_key, 'answer_queue', json.dumps(answer_ids))
    print(client.hgetall(search_qa_recommend_key))
def get_topic():
    """Seed the test Redis (db 0) with tractate fixtures for one hard-coded device.

    Writes a dislike set and a search-recommend tractate queue, printing each back.
    """
    device_id = "868080041007173"
    client = redis.StrictRedis.from_url("redis://redis.paas-test.env:6379/0")

    # Disliked tractates.
    dislike_key = str(device_id) + "_dislike_tractate"
    client.sadd(dislike_key, 2, 37)
    print("不喜欢")
    print(client.smembers(dislike_key))

    # Search-recommend queue: only the queue field is written, no cursor.
    search_topic_recommend_key = "TS:search_recommend_tractate_queue:device_id:" + str(device_id)
    tractate_ids = list(range(1, 4))
    client.hset(search_topic_recommend_key, 'tractate_queue', json.dumps(tractate_ids))
    print(client.hgetall(search_topic_recommend_key))
def yanzheng():
    """Build a Redis client for a fixed device id; neither value is used or returned."""
    connection_args = dict(host="172.16.40.135", port=5379, db=2, socket_timeout=2000)
    device_id = "E417C286-40A4-42F6-BDA9-AEEBD8FEC3B6"
    r = redis.Redis(**connection_args)
# Script entry point: only the tractate fixtures are seeded; the filter_history
# smoke test below is disabled.
if __name__ == "__main__":
    # cid = [16,18,20]
    # filter_history("hello",cid)
    get_topic()
import pymysql
import datetime
import json
import redis
import pandas as pd
from sqlalchemy import create_engine
def get_mysql_data(host, port, user, passwd, db, sql):
    """Run one SQL statement on a fresh MySQL connection and return all rows.

    :param host: server host name or address.
    :param port: server port.
    :param user: login user.
    :param passwd: login password.
    :param db: name of the database to select.
    :param sql: complete SQL text to execute (no parameters are bound).
    :return: whatever ``cursor.fetchall()`` yields for the statement.
    :raises: any pymysql error from connecting or executing; the connection is
        closed before the error propagates.
    """
    connection = pymysql.connect(host=host, port=port, user=user, passwd=passwd, db=db)
    try:
        cursor = connection.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Previously the connection leaked whenever execute/fetchall raised.
        connection.close()
def get_esmm_users():
    """Return yesterday's distinct ``(device_id, city_id)`` rows from ``data_feed_exposure_precise``.

    :return: list of rows, or ``[]`` when the query fails (the error is printed).
    """
    try:
        stat_date = (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        sql = "select distinct device_id,city_id from data_feed_exposure_precise " \
              "where stat_date = '{}'".format(stat_date)
        return list(get_mysql_data('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC', 'jerry_prod', sql))
    except Exception as e:
        # Still best-effort, but no longer silent and no longer catching KeyboardInterrupt/SystemExit.
        print("get esmm users fail")
        print(e)
        return []
def get_user_profile(device_id, top_k=5):
    """Return the device's highest-scoring portrait tags.

    Reads the JSON list stored at ``"user:portrait_tags:cl_id:<device_id>"``.
    Entries whose ``type`` is ``"tag"`` are keyed by their ``content``; other
    entries are kept only when their ``content`` is a key of the module-level
    ``name_tag`` mapping, and are keyed by the mapped value.

    :param device_id: device identifier used to build the Redis key.
    :param top_k: maximum number of tags to return.
    :return: tag keys ordered by descending score, at most ``top_k`` of them;
        ``[]`` when the key is missing or anything fails (the error is printed).
    """
    try:
        r = redis.Redis(host="172.16.40.135", port=5379, password="", db=2)
        # A single GET replaces EXISTS + GET, so the key cannot vanish between the two calls.
        raw = r.get("user:portrait_tags:cl_id:" + str(device_id))
        if raw is None:
            return []
        tag_score = {}
        for entry in json.loads(raw.decode('utf-8')):
            if entry["type"] == "tag":
                tag_score[entry["content"]] = entry["score"]
            elif entry["content"] in name_tag:
                tag_score[name_tag[entry["content"]]] = entry["score"]
        ranked = sorted(tag_score.items(), key=lambda item: item[1], reverse=True)
        # max() keeps the old result (an empty list) for a negative top_k.
        return [tag for tag, _ in ranked[:max(top_k, 0)]]
    except Exception as e:
        # Still best-effort, but no longer silent and no longer catching KeyboardInterrupt/SystemExit.
        print("get user profile fail")
        print(e)
        return []
def get_searchworlds_to_tagid():
    """Map tag name to tag id for online tags with ``tag_type < 4``.

    :return: ``{name: id}`` built from ``api_tag``; ``{}`` when the lookup fails.
    """
    query = 'select id, name from api_tag where is_online = 1 and tag_type < 4'
    try:
        rows = get_mysql_data('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', query)
        # Later rows win when two tags share a name, as with sequential assignment.
        return {row[1]: row[0] for row in rows}
    except Exception:
        return {}
def get_queues(device_id, city_id):
    """Fetch the stored ESMM diary queues for one device / city pair.

    :param device_id: device identifier to look up.
    :param city_id: city identifier to look up.
    :return: ``[native_queue, nearby_queue, nation_queue, megacity_queue]`` from the
        first matching row of ``esmm_device_diary_queue``; ``[]`` when there is no
        row or the lookup fails (the error is printed).
    """
    sql = "select native_queue, nearby_queue, nation_queue, megacity_queue from esmm_device_diary_queue " \
          "where device_id = %s and city_id = %s"
    try:
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root',
                             passwd='3SYz54LS9#^9sBvC', db='jerry_test')
        try:
            cursor = db.cursor()
            # Bound parameters instead of formatting the ids into the SQL text.
            cursor.execute(sql, (device_id, city_id))
            result = cursor.fetchone()
        finally:
            # Previously the connection leaked whenever the query raised.
            db.close()
    except Exception as e:
        print("get queues fail")
        print(e)
        return []
    return list(result) if result is not None else []
def tag_boost(cid_str, tag_list):
if cid_str is not None and cid_str != "":
cids = cid_str.split(",")
try:
if len(cids) > 6 and len(tag_list) > 0:
sql = "select id,group_concat(diary_id) from " \
"(select a.diary_id,b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b " \
"on a.tag_id = b.id where b.tag_type < '4' and a.diary_id in {}) tmp " \
"where id in {} group by id".format(tuple(cids), tuple(tag_list))
result = get_mysql_data('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC','eagle',sql)
if len(result) > 0:
tag_cids = {}
left_cids = []
for i in result:
tmp = i[1].split(",")
tmp = [i for i in cids if i in tmp]
tag_cids[i[0]] = tmp
left_cids.extend(tmp)
left_cids = list(set(left_cids))
right_cids = [i for i in cids if i not in left_cids]
tag_cids["right"] = right_cids
tag_list.append("right")
sort_cids = []
n = 0
while n != len(tag_cids) - 1:
for i in tag_list:
if i in tag_cids.keys():
if len(tag_cids[i]) > 0:
sort_cids.append(tag_cids[i][0])
value = tag_cids[i]
value.pop(0)
tag_cids[i] = value
if len(value) == 0 and i != "right":
n = n + 1
if len(tag_cids["right"]) > 0:
sort_cids.extend(tag_cids["right"])
news_ids = []
for id in sort_cids:
if id not in news_ids:
news_ids.append(id)
new_str = ",".join([str(i) for i in news_ids])
return new_str
else:
return cid_str
else:
return cid_str
except:
#TODO 往sentry发,并且在本地也要打出日志
return cid_str
else:
return cid_str
def to_data_base(df):
    """Replace the rows of ``esmm_resort_diary_queue`` for the devices in ``df``.

    Rows already stored for any ``device_id`` present in ``df`` are deleted, then
    ``df`` is appended to the table.

    :param df: frame with a ``device_id`` column plus the other table columns.
    :return: ``None``; prints ``"insert done"`` after the append.
    """
    sql = "select distinct device_id from esmm_resort_diary_queue"
    result = get_mysql_data('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC', 'jerry_test', sql)
    incoming = set(df["device_id"].values)
    # Use the values as returned by the database so they are plain Python objects.
    old_uid = [row[0] for row in result if row[0] in incoming]
    if old_uid:
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root',
                             passwd='3SYz54LS9#^9sBvC', db='jerry_test')
        try:
            # One placeholder per id: formatting tuple(old_uid) produced "('x',)" for a
            # single id, which is a SQL syntax error.
            placeholders = ", ".join(["%s"] * len(old_uid))
            sql = "delete from esmm_resort_diary_queue where device_id in ({})".format(placeholders)
            cursor = db.cursor()
            cursor.execute(sql, old_uid)
            db.commit()
            cursor.close()
        finally:
            # Previously the connection leaked whenever the delete raised.
            db.close()

    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@172.16.40.158:4000/jerry_test?charset=utf8')
    pd.io.sql.to_sql(df, "esmm_resort_diary_queue", yconnect, schema='jerry_test', if_exists='append', index=False,
                     chunksize=200)
    print("insert done")
def get_all_users():
    """Return every distinct ``(device_id, city_id)`` row of ``esmm_device_diary_queue``.

    :return: list of rows, or ``[]`` when the query fails (the error is printed).
    """
    try:
        sql = "select distinct device_id,city_id from esmm_device_diary_queue"
        return list(get_mysql_data('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC', 'jerry_test', sql))
    except Exception as e:
        # Still best-effort, but no longer silent and no longer catching KeyboardInterrupt/SystemExit.
        print("get all users fail")
        print(e)
        return []
if __name__ == "__main__":
    # Re-rank the stored ESMM queues of every known device by its portrait tags and
    # write the result back in batches of ``n`` devices.
    # users_list = get_esmm_users()
    # print("user number")
    # print(len(users_list))
    users_list = get_all_users()
    # Module-level lookup read by get_user_profile.
    name_tag = get_searchworlds_to_tagid()
    n = 500
    column_names = {0: "device_id", 1: "city_id", 2: "native_queue", 3: "nearby_queue",
                    4: "nation_queue", 5: "megacity_queue", 6: "time"}
    for batch_start in range(0, len(users_list), n):
        child_users_list = users_list[batch_start:batch_start + n]
        total_samples = []
        for uid_city in child_users_list:
            tag_list = get_user_profile(uid_city[0])
            queues = get_queues(uid_city[0], uid_city[1])
            if len(queues) == 0 or len(tag_list) == 0:
                continue
            # Only the native and nearby queues are boosted; nation and megacity pass through.
            new_native = tag_boost(queues[0], tag_list)
            new_nearby = tag_boost(queues[1], tag_list)
            insert_time = str(datetime.datetime.now().strftime('%Y%m%d%H%M'))
            total_samples.append(
                [uid_city[0], uid_city[1], new_native, new_nearby, queues[2], queues[3], insert_time])
        if total_samples:
            df = pd.DataFrame(total_samples).rename(columns=column_names)
            to_data_base(df)
......@@ -6,15 +6,16 @@
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import redis
import json
def test():
conf = SparkConf().setAppName("My App").set("spark.io.compression.codec", "lzf")
sc = SparkContext(conf = conf)
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# ti = pti.TiContext(spark)
# ti.tidbMapDatabase("jerry_test")
ti = pti.TiContext(spark)
ti.tidbMapDatabase("jerry_test")
spark = SparkSession.builder.appName("hello test").enableHiveSupport().getOrCreate()
......@@ -24,11 +25,11 @@ def test():
spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
# hive_context.sql("SET mapreduce.job.queuename=data")
# hive_context.sql("SET mapred.input.dir.recursive=true")
# hive_context.sql("SET hive.mapred.supports.subdirectories=true")
sql = "select user_id from online.tl_hdfs_maidian_view where partition_date = '20190412' limit 10"
spark.sql(sql).show(6)
sql = """select cl_id as device_id,params["business_id"] as cid_id,
(params["out"]-params["in"]) as dur_time from online.bl_hdfs_maidian_updates where action="page_view"
and params["page_name"]="diary_detail" and partition_date = '20190801'
"""
df = spark.sql(sql)
......@@ -40,39 +41,37 @@ def con_sql(db,sql):
db.close()
return df
def write_redis(device_id, cid_list):
    """Add every diary sharing a tag with the given diaries to a device's dislike list.

    Collects the ``tag_type = '3'`` tags of the diaries in ``cid_list``, then the
    ids of all online diaries with ``content_level >= '3'`` carrying any of those
    tags, and merges them into the JSON list stored under the Redis key
    ``"<device_id>_dislike_diary"``.

    :param device_id: device identifier used to build the Redis key.
    :param cid_list: ids of the diaries the user disliked.
    :return: ``None``. Nothing is written when ``cid_list`` is empty or none of the
        diaries has a matching tag.
    """
    cid_list = list(cid_list)
    if not cid_list:
        return
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
    try:
        cursor = db.cursor()
        # One placeholder per value: formatting a tuple into the SQL broke on 0 or 1 elements.
        sql = "select b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b " \
              "on a.tag_id = b.id where b.tag_type = '3' and a.diary_id in ({})" \
            .format(", ".join(["%s"] * len(cid_list)))
        cursor.execute(sql, cid_list)
        tags = list({row[0] for row in cursor.fetchall()})
        if not tags:
            return
        sql = "select a.id from src_mimas_prod_api_diary a left join src_mimas_prod_api_diary_tags b " \
              "on a.id=b.diary_id left join src_zhengxing_api_tag c on b.tag_id=c.id " \
              "where a.is_online = 1 and a.content_level >= '3' " \
              "and c.id in ({}) and c.tag_type = '3'".format(", ".join(["%s"] * len(tags)))
        cursor.execute(sql, tags)
        cids = {row[0] for row in cursor.fetchall()}
    finally:
        # The connection used to be left open.
        db.close()

    r = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN6@172.16.40.133:6379')
    key = str(device_id) + "_dislike_diary"
    raw = r.get(key)
    if raw is not None:
        # json.loads instead of eval: the stored value comes from outside this process.
        stored = json.loads(raw.decode('utf-8'))
        if isinstance(stored, str):
            # Values written by the earlier code were JSON-encoded twice; unwrap them.
            stored = json.loads(stored)
        cids.update(stored)
    # Encode exactly once (the merged list used to be dumped twice).
    r.set(key, json.dumps(list(cids)))
if __name__ == '__main__':
    # Copy the knowledge-network training rows into the ``knowledge`` table, then run
    # a write_redis smoke test. The TiSpark session setup that used to precede this
    # is disabled.
    sql = "select level2_id,concat('t',treatment_method)," \
          "concat('min',price_min),concat('max',price_max)," \
          "concat('tr',treatment_time),concat('m',maintain_time)," \
          "concat('r',recover_time) from jerry_test.train_Knowledge_network_data"
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    df = con_sql(db, sql)
    # Positional column labels -> names, in select order.
    column_names = dict(enumerate(["level2_id", "treatment_method", "price_min", "price_max",
                                   "treatment_time", "maintain_time", "recover_time"]))
    df = df.rename(columns=column_names)
    print(df.head(6))

    # Connection settings for the SQLAlchemy engine (``db`` is rebound to the database name).
    host = '172.16.40.158'
    port = 4000
    user = 'root'
    password = '3SYz54LS9#^9sBvC'
    db = 'jerry_test'
    charset = 'utf8'
    engine_url = "mysql+pymysql://{}:{}@{}:{}/{}".format(user, password, host, port, db)
    engine = create_engine(engine_url)
    df.to_sql('knowledge', con=engine, if_exists='append', index=False, chunksize=8000)
    print("insert done")

    # Smoke test: dislike a handful of diaries for one device.
    a = [15202811, 15825403, 16480766, 15432195, 15759876]
    d = "E417C286-40A4-42F6-BDA9-AEEBD8FEC3B6"
    write_redis(d, a)
......@@ -8,138 +8,320 @@ import pymysql
import time
# 统计尾号6的活跃用户数
def unique_user_count(file_path, temp_list, now):
    """Accumulate active ids ending in "6" in a one-row CSV and print the unique count.

    :param file_path: CSV holding the running list in its ``list`` column; created
        on first use.
    :param temp_list: ids seen in the current window.
    :param now: timestamp whose first 16 characters of ``str(now)`` are stored in
        the ``time`` column.
    """
    # NOTE(review): eval() on file contents. The file is written by this function, but
    # ast.literal_eval would be the safer reader - confirm before swapping.
    seen = eval(pd.read_csv(file_path).loc[0, "list"]) if os.path.exists(file_path) else []
    seen.extend(uid for uid in temp_list if str(uid)[-1] == "6")
    if seen != []:
        snapshot = pd.DataFrame({"number": [len(set(seen))], "time": [str(now)[:16]],
                                 "list": [list(set(seen))]})
        snapshot.to_csv(file_path, index=None)
    # NOTE(review): the summary is printed even when nothing was written - confirm intended.
    print("截止现在尾号是6的独立活跃数:{}".format(len(set(seen))))
# 统计预测过的独立用户数
def predict_user_count(predict_file_path, device_list, now):
    """Accumulate predicted device ids in a one-row CSV and print the unique count.

    :param predict_file_path: CSV holding the running list in its ``list`` column;
        created on first use.
    :param device_list: device ids predicted in the current window.
    :param now: timestamp whose first 16 characters of ``str(now)`` are stored in
        the ``time`` column.
    """
    # NOTE(review): eval() on file contents. The file is written by this function, but
    # ast.literal_eval would be the safer reader - confirm before swapping.
    predicted = eval(pd.read_csv(predict_file_path).loc[0, "list"]) if os.path.exists(predict_file_path) else []
    predicted.extend(device_list)
    if predicted != []:
        snapshot = pd.DataFrame({"number": [len(set(predicted))], "time": [str(now)[:16]],
                                 "list": [list(set(predicted))]})
        snapshot.to_csv(predict_file_path, index=None)
    # NOTE(review): the summary is printed even when nothing was written - confirm intended.
    print("截止现在预测过尾号是6的独立活跃数:{}".format(len(set(predicted))))
# 获取当下一分钟内活跃用户
def get_active_users(flag,path,differ):
if differ == 0:
end = time.time()
start = time.time()-60
elif 0 < differ < 10:
time.sleep(30)
differ += 30
end = time.time()
start = end - differ
else:
end = time.time()
start = end - differ
end_datetime = str(datetime.fromtimestamp(end))
start_datetime = str(datetime.fromtimestamp(start))
if flag:
sql = "select device_id,city_id from user_active_time " \
"where active_time <= '{}' and active_time >= '{}'".format(end_datetime, start_datetime)
db = pymysql.connect(host=ACTIVE_USER_DB_ONLINE["host"], port=ACTIVE_USER_DB_ONLINE["port"],
user=ACTIVE_USER_DB_ONLINE["user"], passwd=ACTIVE_USER_DB_ONLINE["passwd"],
db=ACTIVE_USER_DB_ONLINE["db"])
df = con_sql(db,sql)
else:
db = pymysql.connect(host=ACTIVE_USER_DB_LOCAL["host"], port=ACTIVE_USER_DB_LOCAL["port"],
user=ACTIVE_USER_DB_LOCAL["user"], db=ACTIVE_USER_DB_LOCAL["db"])
sql = "select device_id,city_id from user_active_time"
df = con_sql(db, sql)
if df.empty:
print("当下没有活跃用户数")
def fetch_qa(device_id, card_type, size):
"""Return answer ids for one feed request and advance the stored cursors.

Ids come from the device's DeviceQAQueue row (falling back to the last
AnswerQueue row) and are read cyclically starting at a cursor kept in Redis
under ``"<device_id>-<card_type>-<date>"``. For devices other than ``'0'`` one
id from the search-recommend answer queue may be placed in front. The returned
ids are also added to the device's "read" set.

:param device_id: device identifier; ``'0'`` skips the search-recommend part.
:param card_type: used only to build the cursor key.
:param size: number of ids requested.
:return: list of ids; ``[]`` when no queue exists or any exception occurs
    (the exception is reported through ``logging_exception``).
"""
try:
key = '{device_id}-{card_type}-{date}'.format(device_id=device_id,
card_type=card_type, date=RecommendFeed.current_date())
# Decide whether one slot must be reserved for a search-recommended answer.
if (device_id != '0'):
search_qa_recommend_key = "TS:search_recommend_answer_queue:device_id:" + str(device_id)
search_qa_recommend_list = list()
search_cursor_ts = 0
if redis_client.exists(search_qa_recommend_key):
search_qa_recommend_dict = redis_client.hgetall(search_qa_recommend_key)
if b'cursor' in search_qa_recommend_dict:
search_cursor_ts = json.loads(search_qa_recommend_dict[b'cursor'])
# The search queue is only consulted while its cursor is below 10.
if search_cursor_ts < 10:
search_qa_recommend_list = json.loads(search_qa_recommend_dict[b'answer_queue'])
if search_cursor_ts < len(search_qa_recommend_list):
size = size - 1
# Per-device queue first, global queue as fallback.
try:
que = DeviceQAQueue.objects.get(device_id=device_id)
except DeviceQAQueue.DoesNotExist:
que = AnswerQueue.objects.last()
if not que:
return []
# The queue is stored as a comma-separated string; empty items are dropped.
que = list(filter(None, que.queue.split(',')))
# adjust args.
cursor = redis_client.get(key) or 0
# NOTE(review): an empty queue makes len(que) zero here; the resulting error would be
# caught by the outer except and turned into [] - confirm that is acceptable.
cursor = int(cursor) % len(que)
size = min(size, len(que))
# redis_client.set(key, cursor + size, ex=24 * 60 * 60)
# Read `size` items starting at `cursor`, wrapping around the end of the queue.
data = list(islice(cycle(que), cursor, cursor + size))
data = list(map(int, data))
# Advance the cursor while at least two more pages fit; otherwise try to reset it.
if cursor + 2 * size < len(que):
redis_client.set(key, cursor + size, ex=24 * 60 * 60)
else:
try:
context.request_logger.app(reset_answer_queue=True)
cursor = 0
redis_client.set(key, cursor, ex=24 * 60 * 60)
except:
redis_client.set(key, cursor + size, ex=24 * 60 * 60)
# Put the reserved search-recommended answer in front and advance its cursor.
if device_id != '0':
if len(search_qa_recommend_list) > 0 and search_cursor_ts < len(search_qa_recommend_list):
queue = search_qa_recommend_list[search_cursor_ts:search_cursor_ts + 1]
queue.extend(data)
data = queue
new_search_cursor = search_cursor_ts + 1
redis_client.hset(search_qa_recommend_key, 'cursor', new_search_cursor)
redis_client.expire(search_qa_recommend_key, 30 * 24 * 60 * 60)
# Remember what was returned in the device's read set.
read_qa_key = "TS:recommend_answer_set:device_id:" + str(device_id)
if len(data) > 0:
redis_client.sadd(read_qa_key, *data)
return data
except:
logging_exception()
return []
# 统计活跃用户中尾号是6的用户数
else:
temp_list = df[0].values.tolist()
now = datetime.now()
tail6_file_path = path + "{}tail6Unique.csv".format(str(now)[:10])
unique_user_count(tail6_file_path, temp_list, now)
# if os.path.exists(tail6_file_path):
# # 尾号是6的活跃用户数
# tail_6_list = eval(pd.read_csv(tail6_file_path).loc[0, "list"])
# else:
# tail_6_list = []
#
# tail_6_list.extend(list(filter(lambda x: (str(x)[-1] == "6"), temp_list)))
# if tail_6_list != []:
# df_tail_6 = pd.DataFrame({"number": [len(set(tail_6_list))], "time": [str(now)[:16]],
# "list": [list(set(tail_6_list))]})
# df_tail_6.to_csv(tail6_file_path, index=None)
#
# print("截止现在尾号是6的独立活跃数:{}".format(len(set(tail_6_list))))
old_device_id_list = pd.read_csv(path + "data_set_device_id.csv")["device_id"].values.tolist()
# 求活跃用户和老用户的交集,也就是只预测老用户
df = df.loc[df[0].isin(old_device_id_list)]
if df.empty:
print("该列表是新用户,不需要预测")
def fetch_user_topic(device_id, card_type, size):
try:
key = '{device_id}-{card_type}-{date}'.format(device_id=device_id, card_type=card_type,
date=RecommendFeed.current_date())
if (device_id != '0') and size >= 2:
search_topic_recommend_key = "TS:search_recommend_tractate_queue:device_id:" + str(device_id)
search_topic_recommend_list = list()
search_cursor_ts = 0
if redis_client.exists(search_topic_recommend_key):
search_topic_recommend_dict = redis_client.hgetall(search_topic_recommend_key)
if b'cursor' in search_topic_recommend_dict:
search_cursor_ts = json.loads(search_topic_recommend_dict[b'cursor'])
if search_cursor_ts < 30:
search_topic_recommend_list = json.loads(search_topic_recommend_dict[b'tractate_queue'])
if search_cursor_ts < len(search_topic_recommend_list):
size = size - 2
try:
que = DeviceUserTopicQueue.objects.get(device_id=device_id)
except DeviceUserTopicQueue.DoesNotExist:
que = UserTopicQueue.objects.last()
if not que:
return []
que = list(filter(None, que.queue.split(',')))
# adjust args.
cursor = redis_client.get(key) or 0
cursor = int(cursor) % len(que)
size = min(size, len(que))
data = list(islice(cycle(que), cursor, cursor + size))
data = list(map(int, data))
if cursor + 2 * size < len(que):
redis_client.set(key, cursor + size, ex=24 * 60 * 60)
else:
# TODO 正式上线后注释下面的只预测尾号是6的代码
# 只预测尾号是6的ID,这块是测试要求的
device_temp_list = df[0].values.tolist()
predict_list = list(filter(lambda x: (str(x)[-1] == "6") or (str(x)=="358035085192742")
or str(x)=="AB20292B-5D15-4C44-9429-1C2FF5ED26F6",
device_temp_list))
if predict_list == []:
print('没有尾号是6和目标用户')
return []
else:
df = df.loc[df[0].isin(predict_list)]
device_list = df[0].values.tolist()
city_list = df[1].values.tolist()
device_city_list = list(zip(device_list, city_list))
print("当下这一分钟预测用户数量:{}".format(len(device_city_list)))
#统计尾号6的预测用户
predict_file_path = path + "{}predictTail6Unique.csv".format(str(now)[:10])
predict_user_count(predict_file_path,device_list,now)
# if os.path.exists(predict_file_path):
# # 预测过尾号是6的用户数
# all_predict_list = eval(pd.read_csv(predict_file_path).loc[0, "list"])
# else:
# all_predict_list = []
# all_predict_list.extend(device_list)
# if all_predict_list != []:
# df_predict = pd.DataFrame({"number": [len(set(all_predict_list))], "time": [str(now)[:16]],
# "list": [list(set(all_predict_list))]})
# df_predict.to_csv(predict_file_path, index=None)
return device_city_list
def fetch_user_profile(device_id):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,city_id from data_feed_click where device_id = '{0}' limit 1".format(device_id)
user_profile = con_sql(db,sql)
if user_profile.empty:
print("没有获取到该用户对应的city_id")
return None,True
try:
context.request_logger.app(reset_queue=True)
cursor = 0
redis_client.set(key, cursor, ex=24 * 60 * 60)
except:
redis_client.set(key, cursor + size, ex=24 * 60 * 60)
if device_id != '0' and size >= 2:
if len(search_topic_recommend_list) > 0 and search_cursor_ts < len(search_topic_recommend_list):
queue = search_topic_recommend_list[search_cursor_ts:search_cursor_ts + 2]
queue.extend(data)
data = queue
new_search_cursor = search_cursor_ts + 2
redis_client.hset(search_topic_recommend_key, 'cursor', new_search_cursor)
redis_client.expire(search_topic_recommend_key, 30 * 24 * 60 * 60)
read_topic_key = "TS:recommend_tractate_set:device_id:" + str(device_id)
if len(data) > 0:
redis_client.sadd(read_topic_key, *data)
return data
except:
logging_exception()
return []
def fetch_diary(cls, device_id, card_type, city_id, size):
"""Return diary ids for one feed request, merging several recommendation sources.

For devices other than ``'0'`` the result may contain, in front of the regular
queue data: ids from today's user-portrait queue, one id from the
click-recommend queue and ``search_diary_size`` ids from the search-recommend
queue. The remaining slots are filled by ``cls.get_scale_data`` from the
local / nearby / nation / megacity queues, using a four-part cursor kept in
Redis for 24 hours.

:param device_id: device identifier; ``'0'`` disables the per-device sources.
:param card_type: not referenced in the visible body.
:param city_id: city used to pick the queues and the scale factors.
:param size: number of ids requested.
:return: list of diary ids.
"""
# first, we fetch data from personal-queue city-queue, if not both, get data
# from world queue.
user_portrait_diary_part_list = list()
# Slots reserved for the click-recommend and search-recommend sources.
click_diary_size = 1
search_diary_size = 4
# Source 1: today's user-portrait queue; it is only read while its 'cursor' field is '0'.
if device_id != '0':
user_portrait_diary_key = 'user_portrait_recommend_diary_queue:device_id:%s:%s' % (device_id, datetime.datetime.now().strftime('%Y-%m-%d'))
if redis_client.exists(user_portrait_diary_key):
user_portrait_diary_dict = redis_client.hgetall(user_portrait_diary_key)
user_portrait_cursor = str(user_portrait_diary_dict[b'cursor'],encoding='utf-8')
if user_portrait_cursor == '0':
if b'len_cursor' in user_portrait_diary_dict.keys():
user_portrait_diary_list = json.loads(user_portrait_diary_dict[b'diary_queue'])
len_cursor = str(user_portrait_diary_dict[b'len_cursor'],encoding='utf-8')
len_cursor = int(len_cursor)
# Enough portrait diaries left to fill the whole page.
if len(user_portrait_diary_list) - len_cursor >size:
user_portrait_diary_part_list = user_portrait_diary_list[len_cursor:len_cursor+size]
redis_client.hset(user_portrait_diary_key,'len_cursor',len_cursor+size)
size = 0
else:
# Take whatever is left of the portrait queue and bump its 'cursor'.
user_portrait_diary_list = json.loads(user_portrait_diary_dict[b'diary_queue'])
diary_list_len = len(user_portrait_diary_list) - len_cursor
size = size - diary_list_len
user_portrait_diary_part_list = user_portrait_diary_list[len_cursor:len_cursor + diary_list_len]
redis_client.hset(user_portrait_diary_key, 'len_cursor', len_cursor + diary_list_len)
user_portrait_cursor = int(user_portrait_cursor) + 1
redis_client.hset(user_portrait_diary_key, 'cursor', user_portrait_cursor)
else:
# No 'len_cursor' field: the whole stored queue is used at once.
user_portrait_diary_part_list = json.loads(user_portrait_diary_dict[b'diary_queue'])
size = size - len(user_portrait_diary_part_list)
user_portrait_cursor = int(user_portrait_cursor) + 1
redis_client.hset(user_portrait_diary_key, 'cursor', user_portrait_cursor)
# Regular queues: per-device data first, city-level data when all four are empty or on error.
try:
# obj = DeviceDiaryQueue.objects.filter(device_id=device_id, city_id=city_id).first()
(local, nearby, nation, megacity, city_id) = cls.fetch_device_diary_queue_data(city_id, device_id)
if len(local) == 0 and len(nearby) == 0 and len(nation) == 0 and len(megacity) == 0:
(local, nearby, nation, megacity, city_id) = cls.fetch_diary_queue_data(city_id)
# if not obj:
# (local, nearby, nation, megacity,city_id) = cls.fetch_diary_queue_data(city_id)
# else:
# local = list(filter(None, obj.native_queue.split(','))) if obj.native_queue else []
# nearby = list(filter(None, obj.nearby_queue.split(','))) if obj.nearby_queue else []
# nation = list(filter(None, obj.nation_queue.split(','))) if obj.nation_queue else []
# megacity = list(filter(None, obj.megacity_queue.split(','))) if obj.megacity_queue else []
except:
logging_exception()
(local, nearby, nation, megacity, city_id) = cls.fetch_diary_queue_data(city_id)
# Source 2: search-recommend queue; slots are reserved only when more than 3 are still free.
if(device_id!='0'):
search_diary_recommend_key = "TS:search_recommend_diary_queue:device_id:" + str(device_id)
search_diary_recommend_list = list()
search_cursor_ts = 0
if redis_client.exists(search_diary_recommend_key) and size >3:
search_diary_recommend_dict = redis_client.hgetall(search_diary_recommend_key)
if b'cursor' in search_diary_recommend_dict:
search_cursor_ts = json.loads(search_diary_recommend_dict[b'cursor'])
search_diary_recommend_list = json.loads(search_diary_recommend_dict[b'diary_queue'])
if search_cursor_ts +search_diary_size < len(search_diary_recommend_list) :
size = size - search_diary_size
# Source 3: click-recommend queue; one slot is reserved when it is non-empty.
if (device_id != '0') :
diary_recommend_key = "TS:recommend_diary_queue:device_id:" + str(device_id)
diary_recommend_list = list()
if redis_client.exists(diary_recommend_key) and size > 0:
diary_recommend_dict = redis_client.hgetall(diary_recommend_key)
diary_recommend_list = json.loads(diary_recommend_dict[b'diary_queue'])
if len(diary_recommend_list)>0:
size = size -click_diary_size
key = '{device_id}-{city_id}-{date}'.format(device_id=device_id,
city_id=city_id, date=RecommendFeed.current_date())
# strategy rule: when user refresh over 30 loadings, reset native nearby nation queue cursor.
counter_key = key + '-counter_v1'
counter = redis_client.incr(counter_key)
if counter == 1:
redis_client.expire(counter_key, 24 * 60 * 60)
# Four dash-separated offsets: local, nearby, megacity, nation.
cursor_key = key + '-cursor_v1'
cursor = redis_client.get(cursor_key) or b'0-0-0-0'
# if counter > 30:
# cursor = b'0-0-0-0'
# redis_client.delete(counter_key)
cx, cy, cm, cz = map(int, cursor.split(b'-'))
x, y, m, z = cls.get_city_scale(city_id)
data, ncx, ncy, ncm, ncz = cls.get_scale_data(
local, nearby, nation, megacity,
cx, cy, cm, cz,
x, y, z, m, size
)
# Neither the local nor the nearby offset moved: treat the queues as exhausted and restart.
if ncx == cx and ncy == cy: # native queue and nearby queue
logger.info("diary queue reach end,cx:%d,cy:%d,cm:%d,cz:%d",cx,cy,cm,cz)
# redis_client.delete(counter_key)
# data, ncx, ncy, ncm, ncz = cls.get_scale_data(
# local, nearby, nation, megacity,
# 0, 0, 0, 0,
# x, y, z, m, size
# )
ncx = ncy = ncm = ncz = 0
val = '-'.join(map(str, [ncx, ncy, ncm, ncz]))
redis_client.set(cursor_key, val, ex=24 * 60 * 60)
data = list(map(int, data))
# Prepend the reserved items from the per-device sources.
if device_id != '0':
if search_cursor_ts<len(search_diary_recommend_list)-search_diary_size:
queue = search_diary_recommend_list[search_cursor_ts:search_cursor_ts+search_diary_size]
queue.extend(data)
data = queue
new_search_cursor = search_cursor_ts +search_diary_size
redis_client.hset(search_diary_recommend_key,'cursor',new_search_cursor)
redis_client.expire(search_diary_recommend_key,30*24*60*60)
# The click-recommend queue is consumed from the front; it is deleted once empty.
if len(diary_recommend_list) >0:
diary_id = diary_recommend_list.pop(0)
data.insert(0,diary_id)
if len(diary_recommend_list)>0:
diary_recommend_list_json = json.dumps(diary_recommend_list)
redis_client.hset(diary_recommend_key,'diary_queue',diary_recommend_list_json)
redis_client.expire(diary_recommend_key,30*24*60*60)
else:
redis_client.delete(diary_recommend_key)
if len(user_portrait_diary_part_list)>0:
user_portrait_diary_part_list.extend(data)
data = user_portrait_diary_part_list
# Mark as read
read_diary_key = "TS:recommend_diary_set:device_id:" + str(device_id)
if len(data)>0:
redis_client.sadd(read_diary_key,*data)
return data
def get_scale_data(local, nearby, nation, megacity, cx, cy, cm, cz, x, y, z, m, size):
"""
Split ``size`` slots between four diary queues in proportion to their scale
factors and slice each queue from its current offset.

Slots a queue cannot fill are passed on to the next one, in the order
local -> nearby -> megacity -> nation.

:param local: local diary queue
:param nearby: nearby diary queue
:param nation: nation diary queue
:param megacity: megacity diary queue
:param cx: seen local diary offset
:param cy: seen nearby diary offset
:param cm: seen megacity diary offset
:param cz: seen nation diary offset
:param x: local diary scale factor
:param y: nearby diary scale factor
:param z: nation diary scale factor
:param m: megacity diary scale factor
:param size: number of diary
:return: ``(items, cx, cy, cm, cz)`` - an iterator chaining the local, nearby,
    megacity and nation slices, followed by the four advanced offsets
"""
# All four levels (local, nearby, megacity, nation) get their quota by rounding.
# To address the problems that came up, this revision adjusts as follows:
# 1. If two levels come out as zero and slots remain, give a slot to the zero level
#    with the higher priority, in the order local, nearby, nation.
# 2. If every level is non-zero and slots remain, give them to the level with the largest weight.
# 3. If only one level is zero and slots remain, give them to the level with the largest weight.
# NOTE(review): x + y + z + m == 0 would divide by zero here - confirm callers never pass that.
nx = int(round(x * 1.0 / (x + y + z + m) * size))
ny = int(round(y * 1.0 / (x + y + z + m) * size))
nz = int(round(z * 1.0 / (x + y + z + m) * size))
nm = int(round(m * 1.0 / (x + y + z + m) * size))
# Quotas and weights in the same order: local, nearby, megacity, nation.
nxyz = [nx, ny, nm, nz]
xyz = [x, y, m, z]
counter = Counter([nx, ny, nm, nz])
# Exactly two zero quotas: the first zero quota absorbs the rounding remainder.
if counter[0] == 2:
nxyz[nxyz.index(0)] += size - sum(nxyz)
else:
# NOTE(review): the next five lines use user_profile, which this function never defines;
# they look like stray lines from another function - confirm and remove.
user_profile = user_profile.rename(columns={0:"device_id",1:"city_id"})
user_profile_dict = {}
for i in user_profile.columns:
user_profile_dict[i] = user_profile.loc[0, i]
return user_profile_dict, False
# Otherwise the level with the largest weight absorbs the rounding remainder.
nxyz[xyz.index(max(xyz))] += size - sum(nxyz)
nx, ny, nm, nz = nxyz
# Slice each queue; whatever it cannot supply is added to the next level's quota.
slocal = local[cx:cx + nx]
cx = min(cx + nx, len(local))
ny += (nx - len(slocal))
snearby = nearby[cy:cy + ny]
cy = min(cy + ny, len(nearby))
nm += (ny - len(snearby))
smegacity = megacity[cm: cm + nm]
cm = min(cm + nm, len(megacity))
nz += (nm - len(smegacity))
snation = nation[cz:cz + nz]
cz = min(cz + nz, len(nation))
return chain(slocal, snearby, smegacity, snation), cx, cy, cm, cz
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment