
Uda-DataAnalysis-04: Extracting Data from Complex Formats

Posted by lijun

This post summarizes lessons 4 and 5 of the Udacity advanced data analysis course, on extracting data in complex formats. It covers how to extract data from XML and HTML with Python. It does not go very deep, but it is enough for simple tasks; when a specific task comes up, study the details then.

1. Parsing XML: example program (course video 7)

1.1 Code (1): getting the tags under the root node

import xml.etree.ElementTree as ET
import pprint

# parse the XML file into an element tree
tree = ET.parse('exampleResearchArticle.xml')
# getroot returns the single root element of the tree
root = tree.getroot()

# print the tag of every child of the root
print '\nChildren of root:'
for child in root:
    print child.tag

1.2 Code (2): finding elements with find and findall

# get the element at a specific path under the root
# if there were two title elements here, findall would return all of them as a
# list; find only returns the first match
title = root.find('./fm/bibl/title')
title_text = ""
# the title element contains <p> children; concatenate the text of each
for p in title:
    title_text += p.text
print "\nTitle:\n",title_text

# aug contains many au elements, so use findall to get them all
print "\nAuthor email addresses:"
for a in root.findall('./fm/bibl/aug/au'):
    # each au has at most one email child
    email = a.find('email')
    if email is not None:
        # email is a leaf element, so its text can be read directly
        print email.text
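
To make the difference between find and findall concrete, here is a minimal sketch run on a made-up XML snippet (the element names only mimic the course file, so this is an assumption, not the real data):

import xml.etree.ElementTree as ET

# hypothetical snippet with two title elements under bibl
demo = ET.fromstring("""
<fm>
  <bibl>
    <title><p>First part, </p><p>second part</p></title>
    <title><p>Another title</p></title>
  </bibl>
</fm>
""")

# find returns only the first matching element
first_title = demo.find('./bibl/title')
print "".join(p.text for p in first_title)    # First part, second part

# findall returns every match as a list
print len(demo.findall('./bibl/title'))       # 2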

1.3 Code: exercise

# Extract the author information under each au element, organize each author's
# data into a dict, and append every dict to a list of authors.

import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

def get_authors(root):
    authors = []
    # findall with a path relative to the current (root) element
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None
        }

        # read the text of the fnm/snm/email children of this author element
        data["fnm"] = author.find('./fnm').text
        data["snm"] = author.find('./snm').text
        data["email"] = author.find('./email').text

        authors.append(data)
    
    print authors

    return authors
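
A short usage sketch for the two functions above (assuming exampleResearchArticle.xml is in the working directory):

root = get_root(article_file)
authors = get_authors(root)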

1.4 Code: exercise, handling attributes

def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None,
                "insr": []
        }

        
        data["fnm"] = author.find('./fnm').text
        data["snm"] = author.find('./snm').text
        data["email"] = author.find('./email').text
        
        # list comprehension: take every insr element found by findall and read
        # its attributes with attrib; the result is a list of dicts such as
        # [{'iid': 'I3'}, {'iid': 'I4'}]
        insr_list = [insr.attrib for insr in author.findall('insr')]
        print insr_list

        # another list comprehension: pull the 'iid' value out of each dict
        # and assign the resulting list to data['insr']
        data['insr'] = [insr['iid'] for insr in insr_list]
        
        #print data

        authors.append(data)

    return authors

1.5 Sample solution: exercise

def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": None,
                "snm": None,
                "email": None,
                "insr": []
        }
        # indeed this can be done in one step; no need to split it up
        data["fnm"] = author.find('./fnm').text
        data["snm"] = author.find('./snm').text
        data["email"] = author.find('./email').text

        # find all insr children; findall returns a list of elements
        insr = author.findall('./insr')
        # iterate over the list; i is an element and attrib gives its attributes
        for i in insr:
            # attrib is a dict; read the 'iid' value and append it to data['insr']
            data["insr"].append(i.attrib["iid"])
        authors.append(data)

    return authors
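
To see what attrib returns, here is a minimal sketch on a made-up au element (the names are invented, not taken from the course file):

import xml.etree.ElementTree as ET

au = ET.fromstring('<au><fnm>Jane</fnm><snm>Doe</snm>'
                   '<insr iid="I1"/><insr iid="I2"/></au>')

for i in au.findall('./insr'):
    # attrib is a plain dict of the element's attributes
    print i.attrib            # {'iid': 'I1'}, then {'iid': 'I2'}
    print i.attrib['iid']     # I1, then I2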

2. Extracting HTML with Beautiful Soup

2.1 Code

from bs4 import BeautifulSoup

def options(soup,id):
    option_values =[]
    
    # find the element with the given id (find returns the first match)
    carrier_list = soup.find(id=id)
    # within that element, find every option tag
    for option in carrier_list.find_all('option'):
        # read each option's value attribute
        option_values.append(option['value'])
    return option_values

def print_list(label,codes):
    print "\n%s:" % label
    for c in codes:
        print c

def main():
    soup = BeautifulSoup(open('virgin_and_logan_airport.html'), "lxml")

    codes = options(soup, "CarrierList")
    #print_list("Carriers", codes)

    codes = options(soup, "AirportList")
    #print_list("Airports", codes)

main()
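
The structure that options() expects looks roughly like the following; this is a minimal sketch on a made-up fragment rather than the course file:

from bs4 import BeautifulSoup

html = """
<select id="CarrierList">
  <option value="All">All Carriers</option>
  <option value="VX">Virgin America</option>
</select>
"""
soup = BeautifulSoup(html, "lxml")

# find(id=...) returns the single element with that id (the first match)
carrier_list = soup.find(id="CarrierList")
for option in carrier_list.find_all('option'):
    print option['value']    # All, then VX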

2.2 Exercise

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json

html_page = "page_source.html"


def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # find_all returns a ResultSet (a list of Tag objects) of every input tag
        inputs = soup.find_all("input")

        # loop over the input tags
        for input in inputs:
            # input.attrs is a dict of the tag's attributes (name/value pairs);
            # some input tags have no name attribute, so check before indexing
            if "name" in input.attrs:
                # a Tag supports dict-style access, so read the values by key
                if input["name"] == "__EVENTVALIDATION":
                    data['eventvalidation'] = input['value']
                if input["name"] == "__VIEWSTATE":
                    data['viewstate'] = input['value']
    return data

# this function fetches the data directly from the web
def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text

def test():
    data = extract_data(html_page)
    # data holds the parameters the POST request needs; pass them to
    # make_request, which returns the whole result page
    data2 = make_request(data)   # the returned page itself is not checked below
    assert data["eventvalidation"] != ""
    assert data["eventvalidation"].startswith("/wEWjAkCoIj1ng0")
    assert data["viewstate"].startswith("/wEPDwUKLTI")
  
test()

2.3 Sample solution

def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # look up the hidden field directly by id; note that find returns only
        # the first element if an id happened to be duplicated
        ev = soup.find(id="__EVENTVALIDATION")
        data["eventvalidation"] = ev["value"]

        vs = soup.find(id="__VIEWSTATE")
        data["viewstate"] = vs["value"]

    return data
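
On the question in the comment above: find(id=...) returns only the first element with that id, so if duplicate ids were a concern, find_all could be used to confirm the id is unique. A small sketch, reusing html_page from the previous exercise:

from bs4 import BeautifulSoup

with open(html_page, "r") as html:
    soup = BeautifulSoup(html, "lxml")
    # valid HTML should contain at most one element per id
    print len(soup.find_all(id="__EVENTVALIDATION"))    # expected: 1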

# write the returned page to a local html file

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    with open('soup-output.html', "w") as f:
        f.write(r.text)

data = extract_data(html_page)
make_request(data)

Looking at the output file above, the desired data set is not there. Fix: course video 21 ("Scraping solution") explains the reason, which I did not fully work out; roughly, the hidden __VIEWSTATE / __EVENTVALIDATION tokens are tied to a server session, so fetching them and submitting the form should happen within one requests.Session, as in the version below.

def make_request():
    # use one Session so the hidden tokens and the POST share the same cookies
    s = requests.Session()

    r = s.get("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2")
    soup = BeautifulSoup(r.text, "lxml")

    eventvalidation_element = soup.find(id="__EVENTVALIDATION")
    eventvalidation = eventvalidation_element['value']

    viewstate_element = soup.find(id="__VIEWSTATE")
    viewstate = viewstate_element['value']

    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data={'AirportList': "BOS",
                     'CarrierList': "VX",
                     'Submit': 'Submit',
                     "__EVENTTARGET": "",
                     "__EVENTARGUMENT": "",
                     "__EVENTVALIDATION": eventvalidation,
                     "__VIEWSTATE": viewstate
               })

    with open('soup-output2.html', "w") as f:
        f.write(r.text)

make_request()

3. Extracting specific data from HTML

  • Extract carriers
from bs4 import BeautifulSoup
html_page = "options1.html"


def extract_carriers(page):
    data = []

    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        carrierList = soup.find(id="CarrierList")
        for child in carrierList.children:
            # skip the newline text nodes between the option tags (a provisional fix)
            if child == "\n":
                continue

            value = child['value']

            # the aggregate entries start with 'All'
            all_begin_value = value.startswith('All')

            # only keep values that do not start with 'All'
            if not all_begin_value:
                data.append(value)
            
            #print value
            #print all_begin_value

    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    # __VIEWSTATEGENERATOR is another hidden field, assumed to be extracted
    # from the page like the other tokens
    viewstategenerator = data["viewstategenerator"]
    airport = data["airport"]
    carrier = data["carrier"]

    # s is assumed to be a requests.Session() created elsewhere in the solution
    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data = (("__EVENTTARGET", ""),
                       ("__EVENTARGUMENT", ""),
                       ("__VIEWSTATE", viewstate),
                       ("__VIEWSTATEGENERATOR",viewstategenerator),
                       ("__EVENTVALIDATION", eventvalidation),
                       ("CarrierList", carrier),
                       ("AirportList", airport),
                       ("Submit", "Submit")))

    return r.text


def test():
    data = extract_carriers(html_page)
    assert len(data) == 16
    assert "FL" in data
    assert "NK" in data

if __name__ == "__main__":
    test()


  • Extract airports (an alternative sketch follows the code below)

def extract_airports(page):
    data = []

    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        carrierList = soup.find(id="AirportList")
        for child in carrierList.children:
            # skip the newline text nodes between the option tags (a provisional fix)
            if child == "\n":
                continue

            value = child['value']

            # the aggregate entries start with 'All'
            all_begin_value = value.startswith('All')

            # only keep values that do not start with 'All'
            if not all_begin_value:
                data.append(value)
            
            print value
            print all_begin_value

    return data
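
A slightly tidier variant (a sketch of the same idea, not the instructor's code): iterating over find_all('option') yields only element tags, so the newline check becomes unnecessary.

def extract_airports_alt(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        airport_list = soup.find(id="AirportList")
        for option in airport_list.find_all('option'):
            value = option['value']
            # still drop the aggregate 'All...' entries
            if not value.startswith('All'):
                data.append(value)
    return data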


4. Processing all the data

Before finishing this exercise, the previous two exercises need to be combined so that all of the data can be downloaded and saved locally.

4.1 Collect all the information the request needs


from bs4 import BeautifulSoup
import requests

html_page = "options1.html"

# extract the carrier information
def extract_carriers(page):
    data = []

    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        carrierList = soup.find(id="CarrierList")
        for child in carrierList.children:
            # skip the newline text nodes between the option tags (a provisional fix)
            if child == "\n":
                continue

            value = child['value']

            # the aggregate entries start with 'All'
            all_begin_value = value.startswith('All')

            # only keep values that do not start with 'All'
            if not all_begin_value:
                data.append(value)
            
    print "carrier:",data
    return data

# extract the airport information
def extract_airports(page):
    data = []

    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        carrierList = soup.find(id="AirportList")
        for child in carrierList.children:
            # skip the newline text nodes between the option tags (a provisional fix)
            if child == "\n":
                continue

            value = child['value']

            # the aggregate entries start with 'All'
            all_begin_value = value.startswith('All')

            # only keep values that do not start with 'All'
            if not all_begin_value:
                data.append(value)
    
    print "airports:",data
    return data

def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # look up the hidden fields directly by id
        ev = soup.find(id="__EVENTVALIDATION")
        data["eventvalidation"] = ev["value"]

        vs = soup.find(id="__VIEWSTATE")
        data["viewstate"] = vs["value"]

    print "extract_data:",data
    return data

4.2 Make the web requests and write the returned data to local files

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]

    # note: as section 2.3 found, the hidden tokens may only be accepted when
    # the GET and the POST share one requests.Session
    r = requests.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data = (("__EVENTTARGET", ""),
                       ("__EVENTARGUMENT", ""),
                       ("__VIEWSTATE", viewstate),
                       ("__EVENTVALIDATION", eventvalidation),
                       ("CarrierList", carrier),
                       ("AirportList", airport),
                       ("Submit", "Submit")))
    
    # write to the local data/ folder (which must already exist),
    # naming the file after the airport and carrier
    with open('data/{0}-{1}.html'.format(airport, carrier), "w") as f:
        f.write(r.text)
    
    return r.text

def main():
    carriers_data = extract_carriers(html_page)
    airports_data = extract_airports(html_page)
    other_data = extract_data(html_page)
    
    data = {}
    data['eventvalidation'] = other_data['eventvalidation']
    data['viewstate'] = other_data['viewstate']
    
    # loop over every carrier/airport combination and request each page
    for carrier in carriers_data:
        data["carrier"] = carrier
        for airport in airports_data:
            data['airport'] = airport
            make_request(data)

main()
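
One practical note (my own addition, not part of the course code): make_request writes into a data/ folder, which has to exist before main() runs. A small guard could be placed above the main() call:

import os

# create the output folder if it is missing (assumes the working directory
# is writable)
if not os.path.exists('data'):
    os.makedirs('data')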

5. Exercise: splitting a malformed XML file

In the original file, several XML documents were concatenated together; they need to be separated and written out to individual files.


import xml.etree.ElementTree as ET
PATENTS = 'patent5.xml'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def split_file(filename):

    file_counter = 0
    output = None
    with open(filename, 'r') as f:
        for line in f:
            # a line starting with "<?xml" marks the beginning of a new document
            if line.startswith("<?xml"):
                if output is not None:
                    output.close()
                # start a new output file, e.g. patent5.xml-0, patent5.xml-1, ...
                output = open('{0}-{1}'.format(PATENTS, file_counter), "w")
                file_counter = file_counter + 1

            # write the line into the current output file
            output.write(line)
    if output is not None:
        output.close()

# Udacity's test code follows
def test():
    split_file(PATENTS)
    for n in range(4):
        try:
            fname = "{}-{}".format(PATENTS, n)
            f = open(fname, "r")
            if not f.readline().startswith("<?xml"):
                print "You have not split the file {} in the correct boundary!".format(fname)
            f.close()
        except:
            print "Could not find file {}. Check if the filename is correct!".format(fname)


test()