Haste makes waste

Uda-DataAnalysis-项目1-分析OpenStreetMap的OSM数据(代码)

Posted on By lijun

本篇是 Udacity 中 数据分析进阶 的项目1:分析OSM地图数据所使用的代码。

1. 清理数据

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.cElementTree as ET
import io
import re
from collections import Counter
import copy
import pprint
import codecs
import json

# Tally of raw width strings observed by audit_est_width (for inspection).
est_width_counter = Counter()
osm_file_name = "kawasaki1.osm"
# NOTE(review): opened at import time and never explicitly closed —
# consider opening inside read_and_process_XML with a context manager.
osm_file = open(osm_file_name,"r",encoding="utf8")

# Normalize a Japanese postal code.
def clean_zipcode(zipcode):
    """Strip non-digit characters and return the result, or "" if it is
    not exactly 7 digits (Japanese postal codes are 7 digits,
    e.g. "210-0001" -> "2100001")."""
    # Raw string: "\D" without r-prefix is an invalid escape sequence
    # and raises a DeprecationWarning on modern Python.
    digits = re.sub(r"\D", "", zipcode)
    # Anything other than exactly 7 digits is treated as invalid.
    return digits if len(digits) == 7 else ""

# Clean postal codes in place.
def audit_zipcode(elem):
    """Replace every addr:postcode tag value on a node/way/relation
    element with its cleaned form (see clean_zipcode); invalid codes
    become ""."""
    if elem.tag in ("node", "way", "relation"):
        for tag in elem.iter("tag"):
            # Only a missing "k"/"v" attribute is expected here; the
            # original bare `except:` could also hide real bugs.
            try:
                if tag.attrib["k"] == "addr:postcode":
                    tag.attrib["v"] = clean_zipcode(tag.attrib["v"])
            except KeyError:
                pass

# Amenity values under which dentists are sometimes mistagged.
ori_amenity = ["hospital","doctors","clinic"]
# Correct amenity value to assign when the name indicates a dentist.
excepted_amenity = 'dentist'
# "歯科" means "dentistry" in Japanese; its presence in a name marks a dentist.
dentist_str = "歯科"

def audit_dentist(elem):
    """Reclassify dentists mistagged as hospital/doctors/clinic.

    If an element's amenity is in ori_amenity AND its name contains
    dentist_str ("歯科"), rewrite the amenity tag to excepted_amenity
    ("dentist"). Returns True when a replacement happened, False otherwise.
    """
    if elem.tag in ("node", "way", "relation"):
        found = {}
        for tag in elem.iter("tag"):
            # A malformed tag (missing k/v) is skipped; the original bare
            # `except: return None` aborted the whole element instead.
            try:
                k, v = tag.attrib["k"], tag.attrib["v"]
            except KeyError:
                continue
            if k == "amenity" and v in ori_amenity:
                found[k] = v
            elif k == "name" and dentist_str in v:
                found[k] = v
        # Both conditions met: the name says dentist but the amenity does not.
        if len(found) == 2:
            for tag in elem.iter("tag"):
                if tag.attrib["k"] == "amenity":
                    tag.attrib["v"] = excepted_amenity
                    return True
    return False

# Clean width data.
def audit_est_width(elem):
    """Split est_width / yh:WIDTH tags into numeric *_min and *_max tags.

    Width values look like "3m", "2-4" or "2〜4": letters are stripped and
    "-" is normalized to "〜". The original tag is renamed to <key>_min and
    a copied tag <key>_max is appended to the element.
    """
    if elem.tag in ["node", "way", "relation"]:
        for item in elem.iter("tag"):
            if item.attrib["k"] in ["est_width", "yh:WIDTH"]:
                raw = item.attrib["v"]
                cleaned = re.sub(r"[a-zA-Z]", "", raw).replace("-", "〜")
                est_width_counter[cleaned] += 1

                parts = cleaned.split("〜")
                # Bug fix: a single value such as "3" used to raise
                # IndexError on parts[1]; treat it as min == max.
                lo = parts[0]
                hi = parts[1] if len(parts) > 1 else parts[0]

                # Copy first, then rewrite the original tag as the minimum.
                item_max = copy.deepcopy(item)
                item.attrib["k"] += "_min"
                item.attrib["v"] = float(lo)
                item_max.attrib["k"] += "_max"
                item_max.attrib["v"] = float(hi)
                elem.append(item_max)

# Full-width to half-width digit mapping (plus long dash "ー" -> "-").
_FULL2HALF = {"0": "0", "1": "1", "2": "2", "3": "3", "4": "4",
              "5": "5", "6": "6", "7": "7", "8": "8", "9": "9", "ー": "-"}

def digit_full2half(fullnum):
    """Return the half-width equivalent of one full-width character,
    or the character unchanged when there is no mapping."""
    # dict.get replaces the original bare try/except, which would also
    # have swallowed unrelated errors (e.g. unhashable inputs).
    return _FULL2HALF.get(fullnum, fullnum)

# Normalize full-width/half-width problems in house numbers.
def audit_number_full_half(elem):
    """Normalize addr:housenumber / addr:block_number values in place:
    convert full-width digits to half-width and drop a trailing "号"."""
    if elem.tag in ["node", "way", "relation"]:
        for item in elem.iter("tag"):
            if item.attrib["k"] in ["addr:housenumber", "addr:block_number"]:
                new_number = item.attrib["v"]
                # Leading full-width digit -> map every character to
                # half-width. (The character class below is full-width 0-9.)
                if re.match("[0-9]", new_number):
                    new_number = "".join(map(digit_full2half, new_number))
                # Drop a trailing counter suffix, e.g. "12号" -> "12".
                # (\d+ also fixes the original, which matched one digit only.)
                if re.match(r"^\d+号$", new_number):
                    new_number = new_number.replace("号", "")
                # Bug fix: the original computed new_number but never stored
                # it back, so this cleaning step had no effect at all.
                item.attrib["v"] = new_number
                    

2. 读取数据,并写入Json文件



# 1. Copy the attributes shared by node/way/relation (id etc.) into the dict.
def set_common_info(dict_xml,elem):
    dict_xml["id"] = elem.attrib['id']
    dict_xml["version"] = elem.attrib['version']
    dict_xml["timestamp"] = elem.attrib['timestamp']
    dict_xml["changeset"] = elem.attrib['changeset']
    dict_xml["uid"] = elem.attrib['uid']
    dict_xml["user"] = elem.attrib['user']
    
    # Element kind ("node" / "way" / "relation").
    dict_xml["type"] = elem.tag
    
    # Nodes additionally carry a coordinate pair (lat, lon as strings).
    if dict_xml["type"] == "node":
        pos = (elem.attrib['lat'],elem.attrib['lon'])
        dict_xml["pos"] = pos

# 2. Collect all child <tag> elements into dict_xml["tag"].
def set_tag_info(dict_xml, elem):
    """Store every child <tag> as a k -> v mapping under dict_xml["tag"]."""
    dict_xml["tag"] = {t.attrib["k"]: t.attrib["v"] for t in elem.iter("tag")}

# 3. Collect the way's node references.
def set_nd_info(dict_xml, elem):
    """Store the ref of every child <nd> element as a list of strings."""
    dict_xml["nd"] = [nd.attrib["ref"] for nd in elem.iter("nd")]

# 4. Collect the relation's members.
def set_member_info(dict_xml, elem):
    """Store each child <member> as a [type, ref, role] triple."""
    dict_xml["member"] = [
        [m.attrib["type"], m.attrib["ref"], m.attrib["role"]]
        for m in elem.iter("member")
    ]

        
        
# Read the OSM XML, clean it, and convert it into a list of dicts.
def read_and_process_XML(osm_file):
    """Incrementally parse osm_file, run the in-place audit/clean passes,
    and return a list of dicts (one per node/way/relation)."""
    dict_xml_list = []
    # Bug fix: use "end" events. With events=("start",) an element's
    # children are only attached if the parser's read buffer happened to
    # extend past them, so tags near a buffer boundary were silently
    # missed by the audits and by set_tag_info.
    for event, elem in ET.iterparse(osm_file, events=("end",)):

        # In-place cleaning passes (each checks the element type itself).
        audit_zipcode(elem)
        audit_dentist(elem)
        audit_est_width(elem)
        audit_number_full_half(elem)

        if elem.tag in ["node", "way", "relation"]:
            dict_xml = {}

            # 1. Shared attributes (id, user, timestamp, ...).
            set_common_info(dict_xml, elem)

            # 2. <tag> children.
            set_tag_info(dict_xml, elem)

            # 3. way-only <nd> references.
            if elem.tag == "way":
                set_nd_info(dict_xml, elem)

            # 4. relation-only <member> entries.
            if elem.tag == "relation":
                set_member_info(dict_xml, elem)

            dict_xml_list.append(dict_xml)

            # The dict above already holds everything we need; clearing the
            # element keeps memory bounded on large files.
            elem.clear()

    return dict_xml_list
        
# Write the list of dicts to <osm_file_name>.json, one JSON object per line.
def write_dictlist_json(dict_xml_list, osm_file_name):
    """Dump dict_xml_list as newline-delimited JSON to osm_file_name + ".json".

    One compact object per line saves space; ensure_ascii=False keeps
    Japanese text readable instead of emitting \\uXXXX escapes.
    """
    file_out = "{0}.json".format(osm_file_name)
    print(file_out)
    # Plain open() with encoding= supersedes codecs.open() on Python 3;
    # the with-block guarantees the file is flushed and closed.
    with open(file_out, "w", encoding="utf-8") as fo:
        for el in dict_xml_list:
            fo.write(json.dumps(el, ensure_ascii=False) + "\n")
            
if __name__ == "__main__":
    # Clean the OSM data and dump it to <osm_file_name>.json.
    dict_xml_list = read_and_process_XML(osm_file)
    write_dictlist_json(dict_xml_list, osm_file_name)
kawasaki1.osm.json

3. 利用MongoDB进行数据分析

3.1 用户贡献度

from pymongo import MongoClient
import pprint

# Connect to the local MongoDB server; the analysis uses database "mydb".
client = MongoClient("mongodb://localhost:27017")
db = client.mydb
# Filled below with per-user contribution dicts, later used for plotting.
dict_list = []

# Top contributors by number of documents.
def most_user():
    """Return a cursor of the 8 users with the most edits, descending."""
    pipeline = [
        {"$group": {"_id": "$user", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 8},
    ]
    return db.kawasaki.aggregate(pipeline)

if __name__ == "__main__":
    # NOTE(review): "sum" shadows the builtin of the same name.
    sum = 0
    result = most_user()
    for row in result:
        dict_user = {}
        dict_user["_id"] = row['_id']
        dict_user["count"] = row['count']
        sum += row['count']
        
        dict_list.append(dict_user)
    
    # Append a synthetic "others" entry covering everyone outside the top 8.
    others = {}
    others["_id"] = "others"
    # 488500 is the total document count; subtracting the top-8 users'
    # contributions leaves the documents contributed by everyone else.
    others["count"] = 488500 - sum
    dict_list.append(others)
    
    pprint.pprint(dict_list)
[{'_id': 'futurumspes', 'count': 265385},
 {'_id': 'kawah64', 'count': 37233},
 {'_id': 'Ryo-a', 'count': 20083},
 {'_id': 'Nuko', 'count': 14077},
 {'_id': 'hayashi', 'count': 13189},
 {'_id': 'ribbon', 'count': 12811},
 {'_id': 'kurauchi', 'count': 10178},
 {'_id': 'indyKK', 'count': 8356},
 {'_id': 'others', 'count': 107188}]
def get_labels_values_from_dictlist(dict_list):
    """Split a list of {"_id", "count"} dicts into parallel
    (labels, sizes) lists for plotting."""
    labels = [row["_id"] for row in dict_list]
    sizes = [row["count"] for row in dict_list]
    return labels, sizes
import matplotlib.pyplot as plt
import numpy as np

# Pie chart of per-user contribution counts.
labels,sizes = get_labels_values_from_dictlist(dict_list)
print(labels)
print(sizes)

# startangle: degrees to rotate the first wedge, counter-clockwise.
plt.pie(sizes,labels=labels,autopct='%1.1f%%',
        shadow=True,startangle=90)

plt.show()
['futurumspes', 'kawah64', 'Ryo-a', 'Nuko', 'hayashi', 'ribbon', 'kurauchi', 'indyKK', 'others']
[265385, 37233, 20083, 14077, 13189, 12811, 10178, 8356, 107188]

image

# Contribution also differs per element type, so group by (user, type).
def most_user():
    """Return the 10 largest (user, element type) groups, descending."""
    stages = [
        {"$group": {"_id": {"user": "$user", "type": "$type"},
                    "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 10},
    ]
    return db.kawasaki.aggregate(stages)

if __name__ == "__main__":
    result = most_user()
    # Print each (user, element type) group with its document count.
    for row in result:
        pprint.pprint(row)
{'_id': {'type': 'node', 'user': 'futurumspes'}, 'count': 222374}
{'_id': {'type': 'way', 'user': 'futurumspes'}, 'count': 43011}
{'_id': {'type': 'node', 'user': 'kawah64'}, 'count': 31345}
{'_id': {'type': 'node', 'user': 'Ryo-a'}, 'count': 16568}
{'_id': {'type': 'node', 'user': 'Nuko'}, 'count': 12198}
{'_id': {'type': 'node', 'user': 'ribbon'}, 'count': 11116}
{'_id': {'type': 'node', 'user': 'hayashi'}, 'count': 11070}
{'_id': {'type': 'node', 'user': 'kurauchi'}, 'count': 8353}
{'_id': {'type': 'node', 'user': 'indyKK'}, 'count': 7120}
{'_id': {'type': 'node', 'user': 'nyampire'}, 'count': 6656}

3.2 更新时间分布

from collections import Counter

# Accumulates document counts per extracted time component (year/month/hour).
time_counter = Counter()

def process_time(timestamp, key="year"):
    """Extract one component from an ISO timestamp like "2017-03-01T12:34:56Z".

    key: "year" -> "2017", "month" -> "03", "hour" -> "12",
         "weekday" -> abbreviated day name such as "Wed".
    Returns None for any other key.
    """
    if key == "year":
        return timestamp[:4]
    if key == "month":
        return timestamp[5:7]
    if key == "hour":
        return timestamp[11:13]
    if key == "weekday":
        # Previously an unimplemented stub that silently returned None.
        from datetime import datetime
        return datetime.strptime(timestamp[:10], "%Y-%m-%d").strftime("%a")
    return None
    

# Document counts grouped by raw timestamp, most frequent first.
def most_time():
    """Return a cursor grouping documents by timestamp, sorted by count desc."""
    stages = [
        {"$group": {"_id": "$timestamp", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
    ]
    return db.kawasaki.aggregate(stages)

if __name__ == "__main__":
    result = most_time()
    # Which timestamp component to bucket by: "year", "month" or "hour".
    judge_key = "month"
    # NOTE(review): "sum" shadows the builtin; holds the total document count.
    sum = 0
    for row in result:
        # Collapse each raw timestamp to the chosen component and tally it.
        row["_id"] = process_time(row["_id"],judge_key)
        time_counter[row["_id"]] += row["count"]
        sum += row["count"]
    
    # Rebind to a list of (component, count) pairs, most common first.
    time_counter = time_counter.most_common()
    print(time_counter)
    print(sum)
[('02', 108736), ('03', 107897), ('01', 77523), ('12', 30395), ('05', 30153), ('06', 28304), ('07', 22837), ('04', 19932), ('08', 18515), ('09', 17435), ('11', 13534), ('10', 13239)]
488500
def get_label_values_from_list(time_counter):
    """Unzip a list of (label, count) pairs into two parallel lists."""
    labels = [label for label, _ in time_counter]
    sizes = [count for _, count in time_counter]
    return labels, sizes

# Pie chart of document counts per time bucket.
labels,sizes = get_label_values_from_list(time_counter)

# startangle: degrees to rotate the first wedge, counter-clockwise.
plt.pie(sizes,labels=labels,autopct='%1.1f%%',shadow=True,startangle=90)

plt.show()

image

3.3 餐馆种类

# Cuisine distribution among fast food shops and restaurants.
def most_food():
    """Return the 10 most common cuisines among fast_food/restaurant
    amenities that have a tag.cuisine value."""
    result = db.kawasaki.aggregate([
        # Bug fix: the original $match dict repeated the "tag.amenity" key,
        # so the Python literal silently dropped the {"$exists": 1} clause.
        # $in already implies the field exists, so one clause suffices.
        {"$match": {"tag.amenity": {"$in": ["fast_food", "restaurant"]},
                    "tag.cuisine": {"$exists": True}}},
        {"$group": {"_id": "$tag.cuisine", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 10}
    ])
    return result

food_counter = Counter()
# Merge near-duplicate cuisine labels into one canonical category.
dict_label={"ramen":"noodle","noodle;ramen":"noodle","chicken":"broiled meat","barbecue":"broiled meat"}

if __name__ == "__main__":
    result = most_food()
    for row in result:
        # dict.get replaces the original bare try/except, which would have
        # hidden any unexpected error, not just a missing label mapping.
        # Cuisines without a canonical label keep their own name.
        food_label = dict_label.get(row["_id"], row["_id"])
        food_counter[food_label] += row["count"]

    # Rebind to (label, count) pairs, most common first.
    food_counter = food_counter.most_common()
    print(food_counter)
[('noodle', 61), ('japanese', 50), ('chinese', 40), ('sushi', 27), ('burger', 26), ('broiled meat', 16), ('italian', 11)]
def get_label_values_from_list(food_counter):
    """Unzip a list of (label, count) pairs into two parallel lists."""
    if not food_counter:
        return [], []
    # zip(*pairs) transposes the pair list into (labels, counts) tuples.
    labels, sizes = (list(column) for column in zip(*food_counter))
    return labels, sizes
# Pie chart of cuisine distribution.
labels,sizes = get_label_values_from_list(food_counter)
print(labels)
print(sizes)

# startangle: degrees to rotate the first wedge, counter-clockwise.
plt.pie(sizes,labels=labels,autopct='%1.1f%%',shadow=True,startangle=90)

plt.show()
['noodle', 'japanese', 'chinese', 'sushi', 'burger', 'broiled meat', 'italian']
[61, 50, 40, 27, 26, 16, 11]

image

3.4 地物种类分布

# Top amenity categories.
def most_cat():
    """Return the 10 most frequent tag.amenity values with their counts."""
    pipeline = [
        {"$match": {"tag.amenity": {"$exists": 1}}},
        {"$group": {"_id": "$tag.amenity", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 10},
    ]
    return db.kawasaki.aggregate(pipeline)

food_counter = Counter()

if __name__ == "__main__":
    # NOTE(review): "sum" is assigned but never used in this section.
    sum = 0
    result = most_cat()
    # Print the ten most common amenity values with their counts.
    for row in result:
        print(row)
{'_id': 'parking', 'count': 3635}
{'_id': 'restaurant', 'count': 376}
{'_id': 'drinking_water', 'count': 321}
{'_id': 'school', 'count': 290}
{'_id': 'toilets', 'count': 261}
{'_id': 'kindergarten', 'count': 230}
{'_id': 'place_of_worship', 'count': 207}
{'_id': 'social_facility', 'count': 186}
{'_id': 'fast_food', 'count': 183}
{'_id': 'pub', 'count': 126}

3.5 中国餐馆

# Names of Chinese restaurants.
def most_cat():
    """Return up to 100 distinct names of documents whose tag.cuisine
    is "chinese", with their counts, descending."""
    stages = [
        {"$match": {"tag.cuisine": "chinese"}},
        {"$group": {"_id": "$tag.name", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 100},
    ]
    return db.kawasaki.aggregate(stages)

food_counter = Counter()

if __name__ == "__main__":
    # NOTE(review): "sum" is assigned but never used in this section.
    sum = 0
    result = most_cat()
    # Print each distinct Chinese-restaurant name with its count.
    for row in result:
        print(row)
{'_id': 'バーミヤン', 'count': 3}
{'_id': 'GYOZA', 'count': 1}
{'_id': '如家飯店', 'count': 1}
{'_id': '珍味楼', 'count': 1}
{'_id': '粥菜坊', 'count': 1}
{'_id': '珍々亭 (Chin Chin Tei)', 'count': 1}
{'_id': '満園', 'count': 1}
{'_id': 'バーミヤン 北加瀬店', 'count': 1}
...