本篇是 Udacity“数据分析进阶”课程的项目1:分析 OSM 地图数据所使用的代码。
1. 清理数据
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import io
import re
from collections import Counter
import copy
import pprint
import codecs
import json
# Tally of normalised width strings; audit_est_width increments it for every width value it sees.
est_width_counter = Counter()
# Name of the OSM XML file to process; the JSON output file name is derived from it.
osm_file_name = "kawasaki1.osm"
# Opened as UTF-8 text as soon as this statement runs; passed to read_and_process_XML by the main block.
osm_file = open(osm_file_name,"r",encoding="utf8")
# Normalise a postal code
def clean_zipcode(zipcode):
    """Return *zipcode* reduced to exactly seven ASCII digits, or "" if invalid.

    Every character that is not a decimal digit (hyphens, postal marks,
    spaces, letters, ...) is removed.  The regex digit class also accepts
    non-ASCII decimal digits such as full-width ones, so each remaining digit
    is converted to its ASCII form.

    Args:
        zipcode: Raw postal code string.

    Returns:
        The seven-digit code, or an empty string when the number of digits
        found is not exactly seven.
    """
    # Raw string: a backslash-D inside a normal literal is an invalid escape.
    digits_only = re.sub(r"\D", "", zipcode)
    # int() understands any decimal digit character, so this maps e.g. a
    # full-width "2" to the ASCII "2".
    ascii_digits = "".join(str(int(ch)) for ch in digits_only)
    # Anything that does not have exactly seven digits is blanked out.
    return ascii_digits if len(ascii_digits) == 7 else ""
# Clean the postal codes attached to an element
def audit_zipcode(elem):
    """Replace every ``addr:postcode`` value below *elem* with its cleaned form.

    Only ``node``, ``way`` and ``relation`` elements are processed; any other
    element is left untouched.  The ``v`` attribute of each matching ``tag``
    element is overwritten in place with the result of ``clean_zipcode``.

    Args:
        elem: An ElementTree element.

    Returns:
        None.
    """
    if elem.tag not in ("node", "way", "relation"):
        return
    for tag in elem.iter("tag"):
        # .get() lets tags without a "k" or "v" attribute be skipped
        # explicitly instead of relying on a bare except to hide the KeyError.
        if tag.get("k") != "addr:postcode":
            continue
        zipcode = tag.get("v")
        if zipcode is None:
            continue
        tag.set("v", clean_zipcode(zipcode))
# Fix medical facilities whose name says "dentist" but whose amenity does not
ori_amenity = ["hospital","doctors","clinic"]
excepted_amenity = 'dentist'
dentist_str = "歯科"
def audit_dentist(elem):
    """Retag *elem* as a dentist when its name contains ``dentist_str``.

    An element qualifies when it is a ``node``, ``way`` or ``relation`` that
    has an ``amenity`` tag whose value is listed in ``ori_amenity`` and a
    ``name`` tag whose value contains ``dentist_str``.  For a qualifying
    element every ``amenity`` tag is set to ``excepted_amenity`` in place.

    Tags lacking a ``k`` or ``v`` attribute are ignored rather than aborting
    the whole element.

    Args:
        elem: An ElementTree element.

    Returns:
        True if the element was retagged, False otherwise.
    """
    if elem.tag not in ("node", "way", "relation"):
        return False

    amenity_tags = []
    has_listed_amenity = False
    has_dentist_name = False
    for tag in elem.iter("tag"):
        key = tag.get("k")
        value = tag.get("v")
        if key == "amenity":
            # Remember the tag so no second pass over the children is needed.
            amenity_tags.append(tag)
            if value in ori_amenity:
                has_listed_amenity = True
        elif key == "name" and value is not None and dentist_str in value:
            has_dentist_name = True

    # Both conditions must hold: the name says dentist, the category does not.
    if not (has_listed_amenity and has_dentist_name):
        return False
    for tag in amenity_tags:
        tag.set("v", excepted_amenity)
    return True
# Split width tags into numeric minimum / maximum tags
def audit_est_width(elem):
    """Turn each width tag of *elem* into a ``_min`` and a ``_max`` tag.

    For every ``tag`` child whose key is ``est_width`` or ``yh:WIDTH`` the
    value is stripped of ASCII letters, ``-`` is replaced by ``〜``, and the
    result is counted in the module-level ``est_width_counter``.  The tag is
    then renamed to ``<key>_min`` and a copy named ``<key>_max`` is appended
    to *elem*.  Both values are stored as floats.

    A value without a range separator is used for both bounds.  A value that
    cannot be parsed as numbers is counted but otherwise left unchanged.

    Args:
        elem: An ElementTree element; only ``node``, ``way`` and ``relation``
            elements are processed.

    Returns:
        None.
    """
    if elem.tag not in ("node", "way", "relation"):
        return
    # Snapshot the tags first: new tags are appended to elem inside the loop.
    for item in list(elem.iter("tag")):
        if item.attrib.get("k") not in ("est_width", "yh:WIDTH"):
            continue
        width = item.attrib.get("v", "")
        new_width = re.sub(r"[a-zA-Z]", "", width).replace("-", "〜")
        est_width_counter[new_width] += 1

        bounds = new_width.split("〜")
        try:
            lower = float(bounds[0])
            # A single value has no second part; reuse it as the upper bound.
            upper = float(bounds[1]) if len(bounds) > 1 else lower
        except ValueError:
            # Not numeric (e.g. empty after stripping): keep the original tag.
            continue

        # The existing tag becomes the minimum, a copy becomes the maximum.
        item_max = copy.deepcopy(item)
        item.attrib["k"] = item.attrib["k"] + "_min"
        item.attrib["v"] = lower
        item_max.attrib["k"] = item_max.attrib["k"] + "_max"
        item_max.attrib["v"] = upper
        elem.append(item_max)
# Full-width to half-width conversion
# Maps the full-width digits U+FF10..U+FF19 to ASCII digits, plus the two
# dash look-alikes below to an ASCII hyphen.
_FULL_TO_HALF_TABLE = {chr(0xFF10 + value): str(value) for value in range(10)}
_FULL_TO_HALF_TABLE.update({"ー": "-", "－": "-"})


def digit_full2half(fullnum):
    """Return the half-width equivalent of a single character.

    Full-width digits become ASCII digits; the katakana long-vowel mark and
    the full-width hyphen-minus become ``-``.  Anything else, including ASCII
    digits, is returned unchanged.

    Args:
        fullnum: A single character.

    Returns:
        The converted character, or *fullnum* itself when no mapping exists.
    """
    try:
        return _FULL_TO_HALF_TABLE[fullnum]
    except (KeyError, TypeError):
        # KeyError: no mapping; TypeError: unhashable input. Both pass through.
        return fullnum
# Normalise full-width characters in house and block numbers
def audit_number_full_half(elem):
    """Normalise ``addr:housenumber`` / ``addr:block_number`` values in place.

    Each character of the value is passed through ``digit_full2half``.  If
    the converted value consists only of digits followed by a trailing "号",
    that suffix is removed.  The result is written back to the tag's ``v``
    attribute.

    Args:
        elem: An ElementTree element; only ``node``, ``way`` and ``relation``
            elements are processed.

    Returns:
        None.
    """
    if elem.tag not in ("node", "way", "relation"):
        return
    for item in elem.iter("tag"):
        if item.attrib.get("k") not in ("addr:housenumber", "addr:block_number"):
            continue
        number = item.attrib.get("v")
        if number is None:
            continue
        new_number = "".join(map(digit_full2half, number))
        # Test the converted value so both clean-ups apply to the same string.
        if re.fullmatch(r"\d+号", new_number):
            new_number = new_number[:-1]
        # Store the result; without this the clean-up has no effect.
        item.attrib["v"] = new_number
2. 读取数据,并写入 JSON 文件
# Step 1: copy the identifying attributes of a node/way/relation into the dict
def set_common_info(dict_xml,elem):
    """Fill *dict_xml* with the attributes shared by all element kinds.

    Copies ``id``, ``version``, ``timestamp``, ``changeset``, ``uid`` and
    ``user`` from the element's attributes, stores the element's tag name
    under ``"type"`` and, for nodes only, stores ``(lat, lon)`` under
    ``"pos"``.

    Args:
        dict_xml: Dict to fill in place.
        elem: An ElementTree element.

    Raises:
        KeyError: If one of the copied attributes is missing on *elem*.
    """
    attributes = elem.attrib
    for field in ("id", "version", "timestamp", "changeset", "uid", "user"):
        dict_xml[field] = attributes[field]
    # Element kind
    kind = elem.tag
    dict_xml["type"] = kind
    # The position is read for nodes only
    if kind == "node":
        dict_xml["pos"] = (attributes["lat"], attributes["lon"])
# Step 2: collect the element's tag data into a dict
def set_tag_info(dict_xml,elem):
    """Store the key/value pairs of *elem*'s ``tag`` elements in *dict_xml*.

    ``dict_xml["tag"]`` becomes a dict mapping each tag's ``k`` attribute to
    its ``v`` attribute; it is an empty dict when there are no tags.

    Args:
        dict_xml: Dict to fill in place.
        elem: An ElementTree element.
    """
    dict_xml["tag"] = {}
    dict_xml["tag"] = {
        child.attrib["k"]: child.attrib["v"] for child in elem.iter("tag")
    }
# Step 3: collect the nd references
def set_nd_info(dict_xml,elem):
    """Store the ``ref`` attribute of every ``nd`` element under ``"nd"``.

    Args:
        dict_xml: Dict to fill in place; ``dict_xml["nd"]`` becomes a list of
            ``ref`` values in document order.
        elem: An ElementTree element.
    """
    refs = dict_xml["nd"] = []
    refs.extend(nd.attrib["ref"] for nd in elem.iter("nd"))
# Step 4: collect the relation members
def set_member_info(dict_xml,elem):
    """Store every ``member`` element as a ``[type, ref, role]`` list.

    Args:
        dict_xml: Dict to fill in place; ``dict_xml["member"]`` becomes a
            list with one three-item list per member, in document order.
        elem: An ElementTree element.
    """
    members = dict_xml["member"] = []
    for member in elem.iter("member"):
        attrs = member.attrib
        members.append([attrs["type"], attrs["ref"], attrs["role"]])
# Read the XML data and convert it into a list of dicts
def read_and_process_XML(osm_file):
    """Parse an OSM XML file, clean it and return one dict per element.

    Every ``node``, ``way`` and ``relation`` is run through the audit
    functions (postal code, dentist, width, full-width numbers) and then
    converted to a dict holding its common attributes, its tags and, for
    ways, the ``nd`` references or, for relations, the members.

    Args:
        osm_file: File name or file object accepted by ``ET.iterparse``.

    Returns:
        List of dicts in document order.
    """
    dict_xml_list = []
    # Use "end" events: at a "start" event the children of an element are not
    # guaranteed to have been parsed yet, so tag/nd/member data could be lost.
    for _event, elem in ET.iterparse(osm_file, events=("end",)):
        if elem.tag not in ("node", "way", "relation"):
            continue
        audit_zipcode(elem)
        audit_dentist(elem)
        audit_est_width(elem)
        audit_number_full_half(elem)

        dict_xml = {}
        # 1. id, version, ... and the element kind
        set_common_info(dict_xml, elem)
        # 2. tag data
        set_tag_info(dict_xml, elem)
        # 3. nd references, ways only
        if elem.tag == "way":
            set_nd_info(dict_xml, elem)
        # 4. members, relations only
        if elem.tag == "relation":
            set_member_info(dict_xml, elem)
        dict_xml_list.append(dict_xml)
        # Everything needed has been copied into dict_xml; drop the element's
        # attributes and children so the parsed tree does not keep them alive.
        elem.clear()
    return dict_xml_list
# Write the list of dicts to a JSON file
def write_dictlist_json(dict_xml_list,osm_file_name):
    """Write each dict as one JSON document per line to ``<name>.json``.

    The output file name is printed before writing.  The file is written as
    UTF-8.

    Args:
        dict_xml_list: Iterable of JSON-serialisable dicts.
        osm_file_name: Name the output file name is derived from.
    """
    target = "{0}.json".format(osm_file_name)
    print(target)
    with codecs.open(target, "w", encoding="utf-8") as sink:
        for record in dict_xml_list:
            # No indent keeps the file compact; ensure_ascii=False writes
            # non-ASCII characters as themselves instead of \uXXXX escapes.
            line = json.dumps(record, ensure_ascii=False)
            sink.write(line + "\n")
if __name__ == "__main__":
    # Parse and clean the OSM file, then dump one JSON document per line.
    try:
        dict_xml_list = read_and_process_XML(osm_file)
    finally:
        # osm_file is opened at module level; close it once parsing is done,
        # whether or not parsing succeeded.
        osm_file.close()
    write_dictlist_json(dict_xml_list,osm_file_name)
kawasaki1.osm.json
3. 利用MongoDB进行数据分析
3.1 用户贡献度
from pymongo import MongoClient
import pprint
# Connect to the MongoDB server at localhost, port 27017.
client = MongoClient("mongodb://localhost:27017")
# Database "mydb"; the queries in this file read its "kawasaki" collection.
db = client.mydb
# Filled by the main block below with one {"_id", "count"} dict per user; reused for the pie chart.
dict_list = []
def most_user():
    """Return the eight users with the most documents in ``db.kawasaki``.

    Returns:
        The aggregation result; each row has ``_id`` (the user) and
        ``count`` (number of documents), sorted by count, largest first.
    """
    per_user = {"$group": {"_id": "$user", "count": {"$sum": 1}}}
    largest_first = {"$sort": {"count": -1}}
    top_eight = {"$limit": 8}
    return db.kawasaki.aggregate([per_user, largest_first, top_eight])
if __name__ == "__main__":
    # Collect the top contributors and how many documents they account for.
    top_total = 0  # named so the builtin sum() is not shadowed
    for row in most_user():
        dict_list.append({"_id": row["_id"], "count": row["count"]})
        top_total += row["count"]
    # Everyone outside the top contributors is lumped into one "others" row.
    # The collection size is queried rather than hard-coded (it was 488500
    # for the original data set), so the figure stays right if the data changes.
    total_documents = db.kawasaki.count_documents({})
    dict_list.append({"_id": "others", "count": total_documents - top_total})
    pprint.pprint(dict_list)
[{'_id': 'futurumspes', 'count': 265385},
{'_id': 'kawah64', 'count': 37233},
{'_id': 'Ryo-a', 'count': 20083},
{'_id': 'Nuko', 'count': 14077},
{'_id': 'hayashi', 'count': 13189},
{'_id': 'ribbon', 'count': 12811},
{'_id': 'kurauchi', 'count': 10178},
{'_id': 'indyKK', 'count': 8356},
{'_id': 'others', 'count': 107188}]
def get_labels_values_from_dictlist(dict_list):
    """Split rows into a list of labels and a list of sizes.

    Args:
        dict_list: Iterable of dicts that each have ``"_id"`` and ``"count"``.

    Returns:
        Tuple ``(labels, sizes)``: the ``"_id"`` values and the ``"count"``
        values, both in input order.
    """
    # Read both fields in a single pass over the input.
    pairs = [(entry["_id"], entry["count"]) for entry in dict_list]
    labels = [label for label, _ in pairs]
    sizes = [size for _, size in pairs]
    return labels, sizes
import matplotlib.pyplot as plt
import numpy as np
# Split the collected rows into pie-chart labels and wedge sizes, and print both.
labels,sizes = get_labels_values_from_dictlist(dict_list)
print(labels)
print(sizes)
# startangle is the angle at which the first wedge starts, rotated counter-clockwise
plt.pie(sizes,labels=labels,autopct='%1.1f%%',
shadow=True,startangle=90)
plt.show()
['futurumspes', 'kawah64', 'Ryo-a', 'Nuko', 'hayashi', 'ribbon', 'kurauchi', 'indyKK', 'others']
[265385, 37233, 20083, 14077, 13189, 12811, 10178, 8356, 107188]
# Contribution also differs per element type, so the type is added to the grouping key
def most_user():
    """Return the ten largest (user, type) groups in ``db.kawasaki``.

    Returns:
        The aggregation result; each row has ``_id`` (a dict with ``user``
        and ``type``) and ``count``, sorted by count, largest first.
    """
    per_user_and_type = {
        "$group": {
            "_id": {"user": "$user", "type": "$type"},
            "count": {"$sum": 1},
        }
    }
    largest_first = {"$sort": {"count": -1}}
    top_ten = {"$limit": 10}
    return db.kawasaki.aggregate([per_user_and_type, largest_first, top_ten])
if __name__ == "__main__":
    # Print each (user, type) group on its own line.
    result = most_user()
    for row in result:
        pprint.pprint(row)
{'_id': {'type': 'node', 'user': 'futurumspes'}, 'count': 222374}
{'_id': {'type': 'way', 'user': 'futurumspes'}, 'count': 43011}
{'_id': {'type': 'node', 'user': 'kawah64'}, 'count': 31345}
{'_id': {'type': 'node', 'user': 'Ryo-a'}, 'count': 16568}
{'_id': {'type': 'node', 'user': 'Nuko'}, 'count': 12198}
{'_id': {'type': 'node', 'user': 'ribbon'}, 'count': 11116}
{'_id': {'type': 'node', 'user': 'hayashi'}, 'count': 11070}
{'_id': {'type': 'node', 'user': 'kurauchi'}, 'count': 8353}
{'_id': {'type': 'node', 'user': 'indyKK'}, 'count': 7120}
{'_id': {'type': 'node', 'user': 'nyampire'}, 'count': 6656}
3.2 更新时间分布
from collections import Counter
# Document counts per extracted time component; filled by the main block below.
time_counter = Counter()
def process_time(timestamp,key="year"):
    """Extract one component from a timestamp by character position.

    Args:
        timestamp: Timestamp text; the positions used fit a layout such as
            ``YYYY-MM-DDTHH:MM:SSZ``.
        key: ``"year"`` (characters 0-3), ``"month"`` (5-6) or ``"hour"``
            (11-12).

    Returns:
        The selected part of *timestamp*, or None for ``"weekday"`` (not
        implemented) and for any other key.
    """
    component_slices = (
        ("year", slice(None, 4)),
        ("month", slice(5, 7)),
        ("hour", slice(11, 13)),
    )
    for name, span in component_slices:
        if key == name:
            return timestamp[span]
    # "weekday" is accepted but has no implementation, so it yields None too.
    return None
def most_time():
    """Count the documents in ``db.kawasaki`` per distinct timestamp.

    Returns:
        The aggregation result; each row has ``_id`` (the timestamp) and
        ``count``, sorted by count, largest first.
    """
    per_timestamp = {"$group": {"_id": "$timestamp", "count": {"$sum": 1}}}
    largest_first = {"$sort": {"count": -1}}
    return db.kawasaki.aggregate([per_timestamp, largest_first])
if __name__ == "__main__":
    # Which timestamp component to tally: "year", "month" or "hour".
    judge_key = "month"
    total = 0  # named so the builtin sum() is not shadowed
    for row in most_time():
        # Fold the per-timestamp counts into one bucket per component value.
        time_counter[process_time(row["_id"], judge_key)] += row["count"]
        total += row["count"]
    # From here on time_counter is a list of (value, count) pairs, most
    # frequent first; the plotting code after this block expects that shape.
    time_counter = time_counter.most_common()
    print(time_counter)
    print(total)
[('02', 108736), ('03', 107897), ('01', 77523), ('12', 30395), ('05', 30153), ('06', 28304), ('07', 22837), ('04', 19932), ('08', 18515), ('09', 17435), ('11', 13534), ('10', 13239)]
488500
def get_label_values_from_list(time_counter):
    """Split ``(label, size)`` pairs into a list of labels and a list of sizes.

    Args:
        time_counter: Iterable of indexable items; item ``[0]`` is the label
            and item ``[1]`` the size.

    Returns:
        Tuple ``(labels, sizes)`` in input order.
    """
    # Read both positions in a single pass over the input.
    pairs = [(item[0], item[1]) for item in time_counter]
    labels = [label for label, _ in pairs]
    sizes = [size for _, size in pairs]
    return labels, sizes
# Split the (value, count) pairs into pie-chart labels and wedge sizes.
labels,sizes = get_label_values_from_list(time_counter)
# startangle is the angle at which the first wedge starts, rotated counter-clockwise
plt.pie(sizes,labels=labels,autopct='%1.1f%%',shadow=True,startangle=90)
plt.show()
3.3 餐馆种类
def most_food():
    """Return the ten most common cuisines among restaurants and fast food.

    Matches documents in ``db.kawasaki`` whose ``tag.amenity`` is
    ``fast_food`` or ``restaurant`` and that have a ``tag.cuisine`` field,
    then counts them per cuisine.

    Returns:
        The aggregation result; each row has ``_id`` (the cuisine) and
        ``count``, sorted by count, largest first.
    """
    pipeline = [
        {"$match": {
            # One entry per key: the former extra "tag.amenity": {"$exists": 1}
            # entry was a duplicate key in the dict literal, so Python dropped
            # it; the $in test can only match when the field is present.
            "tag.amenity": {"$in": ["fast_food", "restaurant"]},
            "tag.cuisine": {"$exists": True},
        }},
        {"$group": {"_id": "$tag.cuisine", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 10},
    ]
    return db.kawasaki.aggregate(pipeline)
# Number of matching documents per (merged) cuisine label.
food_counter = Counter()
# Raw cuisine values that are merged into a broader label; any value not
# listed here is used as its own label.
dict_label = {
    "ramen": "noodle",
    "noodle;ramen": "noodle",
    "chicken": "broiled meat",
    "barbecue": "broiled meat",
}
if __name__ == "__main__":
    for row in most_food():
        cuisine = row["_id"]
        # dict.get gives the fallback directly instead of catching every
        # exception around the lookup.
        food_counter[dict_label.get(cuisine, cuisine)] += row["count"]
    # From here on food_counter is a list of (label, count) pairs, most
    # frequent first; the plotting code after this block expects that shape.
    food_counter = food_counter.most_common()
    print(food_counter)
[('noodle', 61), ('japanese', 50), ('chinese', 40), ('sushi', 27), ('burger', 26), ('broiled meat', 16), ('italian', 11)]
def get_label_values_from_list(food_counter):
    """Split ``(label, size)`` pairs into a list of labels and a list of sizes.

    Args:
        food_counter: Iterable of indexable items; item ``[0]`` is the label
            and item ``[1]`` the size.

    Returns:
        Tuple ``(labels, sizes)`` in input order.
    """
    # Read both positions in a single pass over the input.
    pairs = [(item[0], item[1]) for item in food_counter]
    labels = [label for label, _ in pairs]
    sizes = [size for _, size in pairs]
    return labels, sizes
# Split the (label, count) pairs into pie-chart labels and wedge sizes, and print both.
labels,sizes = get_label_values_from_list(food_counter)
print(labels)
print(sizes)
# startangle is the angle at which the first wedge starts, rotated counter-clockwise
plt.pie(sizes,labels=labels,autopct='%1.1f%%',shadow=True,startangle=90)
plt.show()
['noodle', 'japanese', 'chinese', 'sushi', 'burger', 'broiled meat', 'italian']
[61, 50, 40, 27, 26, 16, 11]
3.4 地物种类分布
def most_cat():
    """Return the ten most common ``tag.amenity`` values in ``db.kawasaki``.

    Only documents that have a ``tag.amenity`` field are counted.

    Returns:
        The aggregation result; each row has ``_id`` (the amenity) and
        ``count``, sorted by count, largest first.
    """
    has_amenity = {"$match": {"tag.amenity": {"$exists": 1}}}
    per_amenity = {"$group": {"_id": "$tag.amenity", "count": {"$sum": 1}}}
    largest_first = {"$sort": {"count": -1}}
    top_ten = {"$limit": 10}
    return db.kawasaki.aggregate([has_amenity, per_amenity, largest_first, top_ten])
# Reset to an empty Counter; the code below does not read it.
food_counter = Counter()
if __name__ == "__main__":
    # Print one row per amenity value, most frequent first.
    # (The former "sum = 0" was never read and shadowed the builtin sum().)
    result = most_cat()
    for row in result:
        print(row)
{'_id': 'parking', 'count': 3635}
{'_id': 'restaurant', 'count': 376}
{'_id': 'drinking_water', 'count': 321}
{'_id': 'school', 'count': 290}
{'_id': 'toilets', 'count': 261}
{'_id': 'kindergarten', 'count': 230}
{'_id': 'place_of_worship', 'count': 207}
{'_id': 'social_facility', 'count': 186}
{'_id': 'fast_food', 'count': 183}
{'_id': 'pub', 'count': 126}
3.5 中国餐馆
def most_cat():
    """Count documents with ``tag.cuisine == "chinese"`` per ``tag.name``.

    Returns:
        The aggregation result, limited to 100 rows; each row has ``_id``
        (the name) and ``count``, sorted by count, largest first.
    """
    chinese_only = {"$match": {"tag.cuisine": "chinese"}}
    per_name = {"$group": {"_id": "$tag.name", "count": {"$sum": 1}}}
    largest_first = {"$sort": {"count": -1}}
    first_hundred = {"$limit": 100}
    return db.kawasaki.aggregate([chinese_only, per_name, largest_first, first_hundred])
# Reset to an empty Counter; the code below does not read it.
food_counter = Counter()
if __name__ == "__main__":
    # Print one row per restaurant name, most frequent first.
    # (The former "sum = 0" was never read and shadowed the builtin sum().)
    result = most_cat()
    for row in result:
        print(row)
{'_id': 'バーミヤン', 'count': 3}
{'_id': 'GYOZA', 'count': 1}
{'_id': '如家飯店', 'count': 1}
{'_id': '珍味楼', 'count': 1}
{'_id': '粥菜坊', 'count': 1}
{'_id': '珍々亭 (Chin Chin Tei)', 'count': 1}
{'_id': '満園', 'count': 1}
{'_id': 'バーミヤン 北加瀬店', 'count': 1}
...