Python从门到精通(五):文件处理-04-xml文件处理-CFANZ编程社区

Python 在处理特定领域的问题时，一般都有专门的库（类似 Java 的第三方开源包）。本节使用的是标准库中的 xml 库（xml.etree.ElementTree），也可以选用第三方的 lxml 库。

一、简单实现

<?xml version="1.0"?>
<stop>
    <id>14791</id>
    <nm>Clark &amp; Balmoral</nm>
    <sri>
        <rt>22</rt>
        <d>North Bound</d>
        <dd>North Bound</dd>
    </sri>
    <cr>22</cr>
    <pre>
        <pt>5 MIN</pt>
        <fd>Howard</fd>
        <v>1378</v>
        <rn>22</rn>
    </pre>
    <pre>
        <pt>15 MIN</pt>
        <fd>Howard</fd>
        <v>1867</v>
        <rn>22</rn>
    </pre>
</stop>

from xml.etree.ElementTree import parse

# Load the sample document from disk.
doc = parse('test.xml')

# Walk the <pre> elements and report three of their sub-elements.
for item in doc.iterfind('pre'):
    pt, fd, v = (item.findtext(tag) for tag in ('pt', 'fd', 'v'))
    print(
        f'the value of pt: {pt}',
        f'the value of fd: {fd}',
        f'the value of v: {v}',
        sep='\n',
    )


print(f'doc content: {doc}')

# find() yields a single element (the first <pre> in the sample output).
e = doc.find('pre')
print(f'e is: {e}')

tag_name, leading_text = e.tag, e.text
print(f'e tag is: {tag_name}')
# The text of <pre> is just the whitespace before its first child.
print(f'e text value: {leading_text}')

# <v> is a child element in the sample, not an attribute, so get('v')
# has nothing to return and yields None.
v_attribute = e.get('v')
print(f"e get attribute v is: {v_attribute}")

the value of pt: 5 MIN
the value of fd: Howard
the value of v: 1378
the value of pt: 15 MIN
the value of fd: Howard
the value of v: 1867
doc content: <xml.etree.ElementTree.ElementTree object at 0x1078e0f70>
e is: <Element 'pre' at 0x107952090>
e tag is: pre
e text value: 
        
e get attribute v is: None

二、XML解析

<?xml version="1.0" encoding="utf-8"?>
<top>
    <author>David Beazley</author>
    <content>
        <html xmlns="http://www.w3.org/1999/xhtml">
            <head>
                <title>Hello World</title>
            </head>
            <body>
                <h1>Hello World!</h1>
            </body>
        </html>
    </content>
</top>

from xml.etree.ElementTree import parse
doc = parse('named.xml')

# Elements that live outside any namespace are found by their plain tag name.
author_text = doc.findtext('author')
print(f"author is: {author_text}")
content_element = doc.find('content')
print(f"content is: {content_element}")

# <html> declares a default namespace, so the unqualified path finds nothing.
unqualified_html = doc.find('content/html')
print(f"content/html is: {unqualified_html}")

# Prefixing the tag with the namespace URI in braces does match.
qualified_html = doc.find('content/{http://www.w3.org/1999/xhtml}html')
print(f"find content: {qualified_html}")

# Only <html> is qualified here; head/title are not, so the result is None.
partly_qualified_title = doc.findtext('content/{http://www.w3.org/1999/xhtml}html/head/title')
print(f"find text: {partly_qualified_title}")

# Every step below <content> carries the namespace, so the title is found.
fully_qualified_title = doc.findtext(
    'content/{http://www.w3.org/1999/xhtml}html/'
    '{http://www.w3.org/1999/xhtml}head/{http://www.w3.org/1999/xhtml}title'
)
print('find more:\n', fully_qualified_title)


class XMLNamespaces:
    """Expand short namespace prefixes inside ElementTree path strings.

    Each keyword argument given to the constructor maps a prefix to a
    namespace URI. Calling the instance with a path replaces every
    ``{prefix}`` placeholder with the URI wrapped in braces.
    """

    def __init__(self, **kwargs):
        self.namespaces = {}
        for prefix in kwargs:
            self.register(prefix, kwargs[prefix])

    def register(self, name, uri):
        """Store *uri*, wrapped in braces, under the prefix *name*."""
        self.namespaces[name] = ''.join(('{', uri, '}'))

    def __call__(self, path):
        """Return *path* with its ``{prefix}`` fields filled in.

        A placeholder whose prefix was never registered raises KeyError.
        """
        expanded = path.format_map(self.namespaces)
        return expanded


# Register the XHTML namespace URI under the short prefix "html".
ns = XMLNamespaces(html='http://www.w3.org/1999/xhtml')

# The helper expands each {html} placeholder before the path reaches find().
html_path = ns('content/{html}html')
print(f"ns find: {doc.find(html_path)}")

title_path = ns('content/{html}html/{html}head/{html}title')
print(f"ns text find: {doc.findtext(title_path)}")


from xml.etree.ElementTree import iterparse

# Besides element-closing events, also ask for namespace start/end events.
requested_events = ('end', 'start-ns', 'end-ns')
event_stream = iterparse('named.xml', requested_events)
for event_name, elem in event_stream:
    print(f'evt is: {event_name}, elem is: {elem}')

# After the loop, elem still holds the payload of the last event delivered.
print(f'elem: {elem}')

author is: David Beazley
content is: <Element 'content' at 0x10326c1d0>
content/html is: None
find content: <Element '{http://www.w3.org/1999/xhtml}html' at 0x10326c270>
find text: None
find more:
 Hello World
ns find: <Element '{http://www.w3.org/1999/xhtml}html' at 0x10326c270>
ns text find: Hello World
evt is: end, elem is: <Element 'author' at 0x10326c5e0>
evt is: start-ns, elem is: ('', 'http://www.w3.org/1999/xhtml')
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}title' at 0x10326c8b0>
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}head' at 0x10326c7c0>
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}h1' at 0x10326c9f0>
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}body' at 0x10326c950>
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}html' at 0x10326c720>
evt is: end-ns, elem is: None
evt is: end, elem is: <Element 'content' at 0x10326c630>
evt is: end, elem is: <Element 'top' at 0x10326c590>
elem: <Element 'top' at 0x10326c590>

三、XML修改

<?xml version="1.0"?>
<stop>
    <id>14791</id>
    <nm>Clark &amp; Balmoral</nm>
    <sri>
        <rt>22</rt>
        <d>North Bound</d>
        <dd>North Bound</dd>
    </sri>
    <cr>22</cr>
    <pre>
        <pt>5 MIN</pt>
        <fd>Howard</fd>
        <v>1378</v>
        <rn>22</rn>
    </pre>
    <pre>
        <pt>15 MIN</pt>
        <fd>Howard</fd>
        <v>1867</v>
        <rn>22</rn>
    </pre>
</stop>

from xml.etree.ElementTree import parse, Element
# Load the document and take its root element so children can be edited.
doc = parse('test.xml')
root = doc.getroot()
print(f'root is: {root}')

# Drop the <sri> and <cr> children from the root.
root.remove(root.find('sri'))
root.remove(root.find('cr'))

# Element.getchildren() was removed in Python 3.9; list(root) yields the
# same list of direct children and works on every Python 3 version.
print(f"root children index: {list(root).index(root.find('nm'))}")

# Create a new <spam> element and place it at child position 2 of the root.
e = Element('spam')
e.text = 'This is a test'
root.insert(2, e)

# write() saves the tree to newpred.xml and returns None, so this line
# prints "doc write: None" once the file has been written.
print(f"doc write: {doc.write('newpred.xml', xml_declaration=True)}")

四、DICT与XML

from xml.etree.ElementTree import Element

def dict_to_xml(tag, d):
    """Build an Element named *tag* with one child per entry of *d*.

    Each key becomes a child tag, and each value, converted with ``str``,
    becomes that child's text. Children appear in the order produced by
    ``d.items()``.
    """
    def make_child(name, value):
        # One leaf element holding the stringified value.
        node = Element(name)
        node.text = str(value)
        return node

    parent = Element(tag)
    parent.extend([make_child(name, value) for name, value in d.items()])
    return parent


# Sample mapping with string, int and float values.
course_dict = {
    'course_name': 'python',
    'total_class': 30,
    'score': 0.3,
}
elem = dict_to_xml('course', course_dict)
print(f'elem is: {elem}')


from xml.etree.ElementTree import tostring

# tostring() serialises the element; the sample output shows a bytes object.
serialized = tostring(elem)
print(f'elem to sting is: {serialized}')


# Attach an attribute to the root element and serialise again.
elem.set('_id', '1234')
serialized = tostring(elem)
print(f'elem to sting is: {serialized}')

# Can only produce string-typed values.
def dict_to_xml_str(tag, d):
    """Return XML text for *d* built by plain string formatting.

    Each key/value pair becomes ``<key>value</key>`` inside a ``<tag>``
    wrapper. Values are interpolated as-is; no XML escaping is applied.
    """
    pieces = [
        f'<{tag}>',
        *(f'<{key}>{val}</{key}>' for key, val in d.items()),
        f'</{tag}>',
    ]
    return ''.join(pieces)


# A value containing markup characters shows how the two builders differ.
d = {'courese_name': '<python>'}

# The string-formatting builder leaves "<python>" unescaped.
raw_markup = dict_to_xml_str('item', d)
print(f"dict to xml str: {raw_markup}")

# The Element-based builder escapes the value when it is serialised.
elem = dict_to_xml('item', d)
element_bytes = tostring(elem)
print(f'elem to sting is: {element_bytes}')

# Replace XML's special characters (the original note counts five of them).
from xml.sax.saxutils import escape, unescape
escaped_text = escape('<python>')
print(f"escape: {escaped_text}")
# NOTE(review): '_' contains no entities, so unescape() returns it unchanged;
# an escaped string such as '&lt;python&gt;' was probably intended - confirm.
unescaped_text = unescape('_')
print(f"unescape: {unescaped_text}")

elem is: <Element 'course' at 0x100953950>
elem to sting is: b'<course><course_name>python</course_name><total_class>30</total_class><score>0.3</score></course>'
elem to sting is: b'<course _id="1234"><course_name>python</course_name><total_class>30</total_class><score>0.3</score></course>'
dict to xml str: <item><courese_name><python></courese_name></item>
elem to sting is: b'<item><courese_name>&lt;python&gt;</courese_name></item>'
escape: &lt;python&gt;
unescape: _