Commit ca8dbaa3 authored by 张彦钊

Modify test file

parent 44fd59df
@@ -62,52 +62,43 @@ def feature_engineer():
     df = spark.sql(sql)
-    url = "jdbc:mysql://172.16.30.143:3306/zhengxing"
-    jdbcDF = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("url", url) \
-        .option("dbtable", "api_service").option("user", 'work').option("password", 'BJQaT9VzDcuPBqkd').load()
-    jdbcDF.createOrReplaceTempView("api_service")
-    jdbc = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("url", url) \
-        .option("dbtable", "api_doctor").option("user", 'work').option("password", 'BJQaT9VzDcuPBqkd').load()
-    jdbc.createOrReplaceTempView("api_doctor")
-
-    sql = "select s.id as diary_service_id,d.hospital_id " \
-          "from api_service s left join api_doctor d on s.doctor_id = d.id"
-    hospital = spark.sql(sql)
-    df = df.join(hospital,"diary_service_id","left_outer").fillna("na")
-    df = df.drop("level2").drop("diary_service_id")
-    df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
-                             "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
-    df = df.fillna("na")
-    v = df.select("app_list").rdd.map(lambda x:x[0]).collect()
-    print(type(v))
-    print(v[:2])
-    # app_list_value = [i.split(",") for i in v]
+    # url = "jdbc:mysql://172.16.30.143:3306/zhengxing"
+    # jdbcDF = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("url", url) \
+    #     .option("dbtable", "api_service").option("user", 'work').option("password", 'BJQaT9VzDcuPBqkd').load()
+    # jdbcDF.createOrReplaceTempView("api_service")
+    # jdbc = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("url", url) \
+    #     .option("dbtable", "api_doctor").option("user", 'work').option("password", 'BJQaT9VzDcuPBqkd').load()
+    # jdbc.createOrReplaceTempView("api_doctor")
     #
-    # app_list_unique = []
-    # for i in app_list_value:
-    #     app_list_unique.extend(i)
-    # app_list_unique = list(set(app_list_unique))
-    # number = len(app_list_unique)
-    # app_list_map = dict(zip(app_list_unique, list(range(1, number + 1))))
+    # sql = "select s.id as diary_service_id,d.hospital_id " \
+    #       "from api_service s left join api_doctor d on s.doctor_id = d.id"
+    # hospital = spark.sql(sql)
     #
-    # df = df.select("app_list","ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer","channel",
-    #                "top", "time", "stat_date", "hospital_id", "level3_ids","y","z",
-    #                "treatment_method","price_min","price_max","treatment_time","maintain_time","recover_time")\
-    #     .map(lambda x :app_list_func(x[0],app_list_map))
+    # df = df.join(hospital,"diary_service_id","left_outer").fillna("na")
+    # df = df.drop("level2").drop("diary_service_id")
+    # df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
+    #                          "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
     #
-    # df.show(6)
+    # df = df.fillna("na")
+    v = set(df.select("app_list").rdd.map(lambda x: x[0]).collect())
+    app_list_value = [i.split(",") for i in v]
+    app_list_unique = []
+    for i in app_list_value:
+        app_list_unique.extend(i)
+    app_list_unique = list(set(app_list_unique))
+    number = len(app_list_unique)
+    app_list_map = dict(zip(app_list_unique, list(range(1, number + 1))))
+    print(df.select("app_list","stat_date").rdd.map(lambda x:(app_list_func(x[0],app_list_map),x[1])).first())
     # app_list_number, app_list_map = multi_hot(df, "app_list", 2)
     # level2_number, level2_map = multi_hot(df, "clevel2_id", 2 + app_list_number)
     # level3_number, level3_map = multi_hot(df, "level3_ids", 2 + app_list_number + level2_number)
     #
...
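For context, the block this commit activates builds a global vocabulary over every app id that appears in the comma-separated app_list column, assigns each a 1-based integer index, and then encodes rows against that map via app_list_func (defined elsewhere in this file). A minimal self-contained sketch of the same logic, with plain Python standing in for the Spark RDD and a hypothetical reconstruction of app_list_func, might look like this:

    def app_list_func(app_list, app_list_map):
        # Hypothetical reconstruction: map a comma-separated app_list string
        # to integer ids, falling back to 0 for unseen apps (an assumption,
        # not taken from this repo).
        return [app_list_map.get(app, 0) for app in app_list.split(",")]

    # Stand-in for: v = set(df.select("app_list").rdd.map(lambda x: x[0]).collect())
    v = {"wechat,taobao", "taobao,alipay", "wechat"}

    # Same steps as the diff: split each row, flatten, dedupe, index from 1.
    app_list_value = [i.split(",") for i in v]
    app_list_unique = []
    for i in app_list_value:
        app_list_unique.extend(i)
    app_list_unique = list(set(app_list_unique))
    app_list_map = dict(zip(app_list_unique, range(1, len(app_list_unique) + 1)))

    print(app_list_map)                                    # e.g. {'taobao': 1, 'alipay': 2, 'wechat': 3}
    print(app_list_func("wechat,some_new_app", app_list_map))  # e.g. [3, 0]

Indexing starts at 1, which presumably reserves 0 for padding or unknown apps; the still-commented multi_hot calls at the end of the hunk suggest the same mapping is meant to be generalized so that each multi-hot field (app_list, clevel2_id, level3_ids) gets its own offset into a shared id space.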