# -*- coding: utf-8 -*-
"""
Helpers for parsing minute-coordinate strings and building weighted
average viewing vectors per channel and people type.

Created on Mon Feb 25 15:11:06 2019

@author: hanye
"""
import datetime

import numpy as np

# Length of every vector handled here (24 * 60 slots).
VECTOR_LEN = 1440

# Date layouts accepted by PIV_model_gen, tried in this order.
_DATE_FORMATS = ('%y%m%d', '%Y%m%d', '%Y-%m-%d')


def _parseLeadingCors(cor_str, sep='__'):
    """Return the integer coordinates in cor_str up to the first bad token.

    Parsing stops silently at the first token that is not an integer, which
    mirrors the tolerance of corToVector.
    """
    cors = []
    for token in cor_str.split(sep):
        try:
            cors.append(int(token))
        except ValueError:
            break
    return cors


def strToVector(vector_str, sep='__'):
    """Convert a string of vector values into a float vector.

    Args:
        vector_str: all vector values joined by ``sep``.
        sep: separator between values.

    Returns:
        A float ndarray of length VECTOR_LEN; positions beyond the supplied
        values stay 0.

    Raises:
        ValueError: if a value cannot be converted to float.
        IndexError: if more than VECTOR_LEN values are supplied.
    """
    # builtin float replaces the np.float alias removed in NumPy 1.24
    vector = np.zeros(VECTOR_LEN, dtype=float)
    for idx, value in enumerate(vector_str.split(sep)):
        vector[idx] = float(value)
    return vector


def vectorToCorLst(vector, sep='__'):
    """Return the sorted list of indices at which ``vector`` is non-zero.

    Args:
        vector: a one-dimensional sequence of numbers.
        sep: unused; kept so existing callers keep working.

    Returns:
        A list of ints in ascending order; empty for an all-zero vector.
    """
    # enumerate already yields the indices in ascending order
    return [idx for idx, value in enumerate(vector) if value != 0]


def vectorToCor(vector, sep='__'):
    """Return the non-zero indices of ``vector`` joined by ``sep``.

    An all-zero vector gives an empty string.
    """
    return sep.join(str(idx) for idx, value in enumerate(vector) if value != 0)


def corStrToCorLst(vector_str, sep='__'):
    """Parse a coordinate string into a sorted list of ints.

    Raises:
        ValueError: if any token is not an integer (including an empty
            string).
    """
    return sorted(int(cor) for cor in vector_str.split(sep))


def corToVector(vector_str, sep='__'):
    """Convert a coordinates string to a 0/1 vector.

    Args:
        vector_str: the vector indices (coordinates) to set, joined by
            ``sep``.
        sep: separator between coordinates.

    Returns:
        An int ndarray of length VECTOR_LEN with 1 at every listed
        coordinate. Parsing stops at the first token that is not an
        integer; coordinates before it are kept.

    Raises:
        IndexError: if a coordinate falls outside the vector.
    """
    # builtin int replaces the np.int alias removed in NumPy 1.24
    vector = np.zeros(VECTOR_LEN, dtype=int)
    for cor in _parseLeadingCors(vector_str, sep):
        vector[cor] = 1
    return vector


def timeStrToCor(timeStr):
    """Convert a time of day into a vector coordinate.

    00:00:00 to 25:59:59 is legal. Coordinate 0 corresponds to 02:00;
    hours 0 and 1 wrap around by 24 * 60. Seconds, when present, round the
    minute up from 30 seconds on.

    Args:
        timeStr: ``'HH:MM'`` or ``'HH:MM:SS'`` string, a ``datetime.time``,
            or None.

    Returns:
        The integer coordinate, or None when ``timeStr`` is None or out of
        the legal range (a message is printed in the latter case).
    """
    illegalMsg = '00:00:00 to 25:59:59 is leagel, otherwise is illegal.'
    if timeStr is None:
        return None
    # datetime.time has no split(), so normalise it to a string first
    if isinstance(timeStr, datetime.time):
        timeStr = timeStr.strftime('%H:%M:%S')
    parts = timeStr.split(':')

    hourNum = int(parts[0])
    if hourNum > 25:
        print(illegalMsg)
        return None

    minuteNum = int(parts[1])
    if minuteNum > 59:
        print(illegalMsg)
        return None

    if len(parts) == 3:
        secondNum = int(parts[2])
        if secondNum > 59:
            print(illegalMsg)
            return None
        if secondNum >= 30:
            minuteNum += 1

    # NOTE(review): 25:59:30 and later rounds up to 1440, one past the last
    # index of a VECTOR_LEN vector; confirm how callers should treat it.
    vectCordinate = (hourNum - 2) * 60 + minuteNum
    if vectCordinate < 0:
        vectCordinate += 24 * 60
    return vectCordinate


def form_people_type_str(sex, aged, edu):
    """Join sex, aged and edu into a ``'sex__aged__edu'`` key.

    Int arguments are converted to str; other values are joined as given.
    """
    fields = [str(val) if isinstance(val, int) else val
              for val in (sex, aged, edu)]
    return '__'.join(fields)


def form_house_people_type(sex, aged, edu, house_type,
                           house_count=None, pid=None):
    """Build a ``'house_count__sex__aged__edu'`` key.

    Args:
        sex, aged, edu: people attributes, int or str.
        house_type: mapping from house id (first 8 characters of pid) to
            the house count; only consulted when ``house_count`` is None.
        house_count: explicit house count; takes precedence when given.
        pid: person id, required when ``house_count`` is None.

    Raises:
        TypeError: if both ``house_count`` and ``pid`` are None.
        KeyError: if the house id is not in ``house_type``.
    """
    if house_count is None:
        if pid is None:
            raise TypeError('pid is required when house_count is not given')
        house_count = house_type[pid[0:8]]
    fields = [str(house_count)]
    fields.extend(str(val) if isinstance(val, int) else val
                  for val in (sex, aged, edu))
    return '__'.join(fields)


def format_raw_line(raw_line, headerLst=None, sep='\t'):
    """Parse one raw text line into a record dict.

    Args:
        raw_line: one line of the raw file.
        headerLst: field names; defaults to
            ``['pid', 'wei', 'sex', 'aged', 'edu', 'ch', 'minutes']``.
        sep: field separator.

    Returns:
        A dict with the raw fields plus:
        ``minutes`` / ``vector`` (0/1 vector built from the ``minutes`` or
        ``st`` field, None when neither field is present),
        ``minutes_rawStr`` (the unparsed coordinate string),
        ``minutes_first`` / ``minutes_last`` (smallest and largest
        coordinate, 0 when there is none), ``weight`` (float, taken from
        ``wei`` or ``weight`` when present), ``hid`` (first 8 characters of
        pid), ``cityid`` (first 3 characters of pid), and empty-string
        defaults for ``ch``, ``BrandID`` and ``platform``.

    Raises:
        KeyError: if the line has no ``pid`` field.
    """
    if headerLst is None:
        headerLst = ['pid', 'wei', 'sex', 'aged', 'edu', 'ch', 'minutes']
    # zip truncates, so trailing fields missing from the line are simply
    # absent from the dict
    line_dict = dict(zip(headerLst, raw_line.strip().split(sep)))

    if 'minutes' in line_dict:
        rawVectStr = line_dict['minutes']
    elif 'st' in line_dict:
        rawVectStr = line_dict['st']
    else:
        rawVectStr = None

    if rawVectStr is None:
        rawVectStr = ''
        vector = None
        corLst = []
    else:
        vector = corToVector(rawVectStr)
        # same tolerant parse as corToVector, and read from whichever field
        # supplied the coordinates ('minutes' or 'st')
        corLst = _parseLeadingCors(rawVectStr)

    line_dict['minutes_first'] = min(corLst) if corLst else 0
    line_dict['minutes_last'] = max(corLst) if corLst else 0
    line_dict.setdefault('ch', '')
    line_dict['minutes_rawStr'] = rawVectStr
    line_dict['minutes'] = vector
    line_dict['vector'] = vector

    if 'wei' in line_dict:
        line_dict['weight'] = float(line_dict['wei'])
    elif 'weight' in line_dict:
        line_dict['weight'] = float(line_dict['weight'])
    line_dict.pop('wei', None)

    # house_id is the first 8 digits of pid
    line_dict['hid'] = line_dict['pid'][:8]
    line_dict['cityid'] = line_dict['pid'][:3]
    line_dict.setdefault('BrandID', '')
    line_dict.setdefault('platform', '')
    return line_dict


def format_lines_from_one_file(raw_file_path, raw_file_name, sep='\t'):
    """Yield one record dict per data line of a raw file.

    The first line is treated as a header when it contains ``'pid'``;
    otherwise it is parsed as data with the default header. Blank lines are
    skipped.

    Args:
        raw_file_path: directory holding the file.
        raw_file_name: file name inside that directory.
        sep: field separator, used for the header and for every data line.

    Yields:
        The dict produced by format_raw_line for each data line.
    """
    with open(raw_file_path + '/' + raw_file_name, 'r',
              encoding='utf-8') as rf:
        first_l = rf.readline()
        if 'pid' in first_l:
            headerLst = first_l.strip().split(sep)
        else:
            headerLst = None
            if first_l.strip():
                yield format_raw_line(first_l, headerLst, sep=sep)
        for raw_line in rf:
            if raw_line.strip():
                yield format_raw_line(raw_line, headerLst, sep=sep)


def PIV_from_one_file(raw_file_path, raw_file_name, house_type=None):
    """Compute weighted average vectors per channel and people type.

    Args:
        raw_file_path: directory holding the raw file.
        raw_file_name: raw file name.
        house_type: optional mapping from house id to house count; when
            given, the people type key is prefixed with the house count.

    Returns:
        A tuple ``(averages, weight_sum)`` where ``averages`` maps
        channel -> people type -> vector, and ``weight_sum`` is the total
        weight over distinct pids, scaled by 1e3 and cast to int.
    """
    channel_peopleLst_dict = {}
    weight_sum_dict = {}
    for lined in format_lines_from_one_file(raw_file_path, raw_file_name):
        # pid has to be known before the house lookup below
        pid = lined['pid']
        if house_type is not None:
            people_type = form_house_people_type(
                sex=lined['sex'], aged=lined['aged'], edu=lined['edu'],
                house_type=house_type, pid=pid)
        else:
            people_type = form_people_type_str(
                lined['sex'], lined['aged'], lined['edu'])
        channel = lined['ch']
        vector = lined['minutes']
        weight = lined['weight']

        # each pid contributes its weight once (first occurrence wins)
        if pid not in weight_sum_dict:
            weight_sum_dict[pid] = weight
        if 'ch' not in lined or vector is None:
            # skip pid without view behavior, after weight sum
            continue

        channel_peopleLst_dict.setdefault(channel, {}).setdefault(
            people_type, []).append(
                {'vector': vector, 'weight': weight, 'pid': pid})

    # NOTE(review): the sum is rounded to a whole number *before* scaling by
    # 1e3; kept as found -- confirm whether round(sum * 1e3) was intended.
    weight_sum = int(round(sum(weight_sum_dict.values())) * 1e3)

    channel_people_averageVector_dict = {}
    for ch, people in channel_peopleLst_dict.items():
        channel_people_averageVector_dict[ch] = {}
        for peo, vect_metas in people.items():
            weighted = sum(meta['vector'] * meta['weight'] * 1e3
                           for meta in vect_metas)
            channel_people_averageVector_dict[ch][peo] = weighted / weight_sum
    return (channel_people_averageVector_dict, weight_sum)


def PIV_model_gen(date_str, raw_file_path, house_type=None):
    """Build the per-date model from ``<yymmdd>_Minutes.csv``.

    Args:
        date_str: date as ``yymmdd``, ``yyyymmdd`` or ``yyyy-mm-dd``.
        raw_file_path: directory holding the raw file.
        house_type: passed through to PIV_from_one_file.

    Returns:
        A tuple ``({iso_date: averages}, weight_sum)``.

    Raises:
        ValueError: if ``date_str`` matches none of the accepted layouts.
    """
    date_T = None
    for fmt in _DATE_FORMATS:
        try:
            date_T = datetime.datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        break
    if date_T is None:
        raise ValueError('Ill formatted date string %s' % date_str)

    date_str_iso = date_T.isoformat()[:10]
    # the raw file name uses the two-digit-year form, e.g. 180102
    date_str = date_str_iso.replace('-', '')[2:]
    raw_fn = '%s_Minutes.csv' % date_str
    piv_dict, weight_sum = PIV_from_one_file(raw_file_path, raw_fn,
                                             house_type=house_type)
    date_channel_people_averVect = {date_str_iso: piv_dict}
    return date_channel_people_averVect, weight_sum


# test
if __name__ == '__main__':
    date_str = '2018-01-02'
    raw_file_path = 'D:\\CSM\\code_repo\\PIV\\data\\PIV_CD'
    channel_people_averageVector_dict, weight_sum = PIV_model_gen(
        date_str, raw_file_path)