0
点赞
收藏
分享

微信扫一扫

Python从入门到精通(五):文件处理-04-XML文件处理

Python在处理特定领域的问题时,一般都有专门的库(类似Java的第三方开源包)。本节使用的是标准库中的xml库(xml.etree.ElementTree),也可以选用第三方的lxml库。

一、简单实现

<?xml version="1.0"?>
<stop>
<id>14791</id>
<nm>Clark &amp; Balmoral</nm>
<sri>
<rt>22</rt>
<d>North Bound</d>
<dd>North Bound</dd>
</sri>
<cr>22</cr>
<pre>
<pt>5 MIN</pt>
<fd>Howard</fd>
<v>1378</v>
<rn>22</rn>
</pre>
<pre>
<pt>15 MIN</pt>
<fd>Howard</fd>
<v>1867</v>
<rn>22</rn>
</pre>
</stop>
from xml.etree.ElementTree import parse

# Parse test.xml (the <stop> document shown above) into an ElementTree.
doc = parse('test.xml')

# Extract and output tags of interest: one iteration per <pre> child of the root.
for item in doc.iterfind('pre'):
    pt = item.findtext('pt')
    fd = item.findtext('fd')
    v = item.findtext('v')

    print(f'the value of pt: {pt}')
    print(f'the value of fd: {fd}')
    print(f'the value of v: {v}')


print(f'doc content: {doc}')

# find() returns only the first matching <pre> element.
e = doc.find('pre')
print(f'e is: {e}')
print(f'e tag is: {e.tag}')
# <pre> contains only child elements, so its own text is just whitespace.
print(f'e text value: {e.text}')
# <v> is a child element of <pre>, not an attribute, so get('v') yields None.
print(f"e get attribute v is: {e.get('v')}")
the value of pt: 5 MIN
the value of fd: Howard
the value of v: 1378
the value of pt: 15 MIN
the value of fd: Howard
the value of v: 1867
doc content: <xml.etree.ElementTree.ElementTree object at 0x1078e0f70>
e is: <Element 'pre' at 0x107952090>
e tag is: pre
e text value:

e get attribute v is: None

二、XML解析

<?xml version="1.0" encoding="utf-8"?>
<top>
<author>David Beazley</author>
<content>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Hello World</title>
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>
</content>
</top>
from xml.etree.ElementTree import parse

doc = parse('named.xml')

# <author> and <content> carry no namespace, so plain tag names match them.
author = doc.findtext('author')
print(f"author is: {author}")
content = doc.find('content')
print(f"content is: {content}")

# <html> declares a default namespace, so the bare tag name does not match.
plain_html = doc.find('content/html')
print(f"content/html is: {plain_html}")

# Qualifying the tag with {namespace-uri} makes the lookup succeed.
qualified_html = doc.find('content/{http://www.w3.org/1999/xhtml}html')
print(f"find content: {qualified_html}")

# Descendants of <html> inherit the namespace; unqualified steps find nothing.
partial_title = doc.findtext('content/{http://www.w3.org/1999/xhtml}html/head/title')
print(f"find text: {partial_title}")

# Every step below <content> has to be qualified for the path to resolve.
full_title = doc.findtext(
    'content/{http://www.w3.org/1999/xhtml}html/'
    '{http://www.w3.org/1999/xhtml}head/{http://www.w3.org/1999/xhtml}title'
)
print('find more:\n', full_title)


class XMLNamespaces:
    """Expand short namespace prefixes inside ElementTree search paths.

    Each registered prefix maps to its URI wrapped in braces, so a path such
    as ``'content/{html}html'`` can be expanded into the fully qualified
    ``'content/{http://www.w3.org/1999/xhtml}html'`` form.
    """

    def __init__(self, **kwargs: str) -> None:
        """Register every ``prefix=uri`` keyword argument."""
        self.namespaces: dict[str, str] = {}
        for name, uri in kwargs.items():
            self.register(name, uri)

    def register(self, name: str, uri: str) -> None:
        """Map prefix *name* to *uri*, stored in ``{uri}`` form."""
        self.namespaces[name] = '{' + uri + '}'

    def __call__(self, path: str) -> str:
        """Return *path* with each ``{prefix}`` placeholder expanded.

        Raises:
            KeyError: if *path* uses a prefix that was never registered.
        """
        return path.format_map(self.namespaces)


# Bind the "html" prefix to the XHTML namespace URI so the paths below stay short.
ns = XMLNamespaces(html='http://www.w3.org/1999/xhtml')

html_elem = doc.find(ns('content/{html}html'))
print(f"ns find: {html_elem}")

title_text = doc.findtext(ns('content/{html}html/{html}head/{html}title'))
print(f"ns text find: {title_text}")


from xml.etree.ElementTree import iterparse

# Stream named.xml and report element-end events plus namespace scope events.
# For 'start-ns' the payload is a (prefix, uri) tuple and for 'end-ns' it is
# None, as the sample output below shows.
for evt, elem in iterparse('named.xml', ('end', 'start-ns', 'end-ns')):
    print(f'evt is: {evt}, elem is: {elem}')

# After the loop, elem still holds the payload of the final event.
print(f'elem: {elem}')
author is: David Beazley
content is: <Element 'content' at 0x10326c1d0>
content/html is: None
find content: <Element '{http://www.w3.org/1999/xhtml}html' at 0x10326c270>
find text: None
find more:
Hello World
ns find: <Element '{http://www.w3.org/1999/xhtml}html' at 0x10326c270>
ns text find: Hello World
evt is: end, elem is: <Element 'author' at 0x10326c5e0>
evt is: start-ns, elem is: ('', 'http://www.w3.org/1999/xhtml')
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}title' at 0x10326c8b0>
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}head' at 0x10326c7c0>
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}h1' at 0x10326c9f0>
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}body' at 0x10326c950>
evt is: end, elem is: <Element '{http://www.w3.org/1999/xhtml}html' at 0x10326c720>
evt is: end-ns, elem is: None
evt is: end, elem is: <Element 'content' at 0x10326c630>
evt is: end, elem is: <Element 'top' at 0x10326c590>
elem: <Element 'top' at 0x10326c590>

三、XML修改

<?xml version="1.0"?>
<stop>
<id>14791</id>
<nm>Clark &amp; Balmoral</nm>
<sri>
<rt>22</rt>
<d>North Bound</d>
<dd>North Bound</dd>
</sri>
<cr>22</cr>
<pre>
<pt>5 MIN</pt>
<fd>Howard</fd>
<v>1378</v>
<rn>22</rn>
</pre>
<pre>
<pt>15 MIN</pt>
<fd>Howard</fd>
<v>1867</v>
<rn>22</rn>
</pre>
</stop>
from xml.etree.ElementTree import parse, Element

doc = parse('test.xml')
root = doc.getroot()
print(f'root is: {root}')

# Drop the <sri> and <cr> children of the root element.
root.remove(root.find('sri'))
root.remove(root.find('cr'))

# Element.getchildren() was removed in Python 3.9; list(root) gives the same
# list of direct children and works on every Python 3 version.
print(f"root children index: {list(root).index(root.find('nm'))}")

# Build a new <spam> element and insert it at index 2, i.e. right after <nm>.
e = Element('spam')
e.text = 'This is a test'
root.insert(2, e)

# ElementTree.write() returns None, so this prints "doc write: None" once the
# file has been written.
print(f"doc write: {doc.write('newpred.xml', xml_declaration=True)}")

四、DICT与XML

from xml.etree.ElementTree import Element

def dict_to_xml(tag: str, d: dict) -> Element:
    """Build an element named *tag* with one child per key/value pair of *d*.

    Args:
        tag: Tag name of the element that is returned.
        d: Flat mapping; each key becomes a child tag and ``str(value)``
            becomes that child's text.

    Returns:
        The newly created element. Special characters in the values are
        escaped only when the element is serialized (e.g. by ``tostring``).
    """
    element = Element(tag)
    for key, val in d.items():
        child = Element(key)
        # Element text must be a string, so non-string values are converted.
        child.text = str(val)
        element.append(child)
    return element


from xml.etree.ElementTree import tostring

# Convert a flat dict into a <course> element and show its default repr.
course_dict = {'course_name': 'python', 'total_class': 30, 'score':0.3}
elem = dict_to_xml('course', course_dict)
print(f'elem is: {elem}')

# tostring() serializes the element to bytes.
serialized = tostring(elem)
print(f'elem to sting is: {serialized}')

# Attributes can be added after construction; serialize again to see the change.
elem.set('_id','1234')
serialized_with_id = tostring(elem)
print(f'elem to sting is: {serialized_with_id}')

# Hand-written alternative that can only produce a plain string, not an Element.
def dict_to_xml_str(tag, d):
    """Return the XML text for *d* wrapped in a *tag* element.

    Keys and values are interpolated verbatim: nothing is escaped, so a value
    such as ``'<python>'`` produces malformed XML. Use ``dict_to_xml`` when
    the values may contain XML special characters.
    """
    part_list = [f'<{tag}>']
    part_list.extend(f'<{key}>{val}</{key}>' for key, val in d.items())
    part_list.append(f'</{tag}>')
    return ''.join(part_list)


# Compare the two builders on a value that contains XML special characters.
d = {'courese_name': '<python>'}

# The string-based builder leaves '<' and '>' unescaped.
raw_xml = dict_to_xml_str('item',d)
print(f"dict to xml str: {raw_xml}")

# The Element-based builder escapes them when serialized.
elem = dict_to_xml('item',d)
escaped_xml = tostring(elem)
print(f'elem to sting is: {escaped_xml}')

# Manually escape/unescape XML special characters. By default escape() and
# unescape() handle only '&', '<' and '>'; quotes are translated only when
# passed through the optional entities argument.
from xml.sax.saxutils import escape, unescape

escaped_text = escape('<python>')
print(f"escape: {escaped_text}")

unescaped_text = unescape('_')
print(f"unescape: {unescaped_text}")
elem is: <Element 'course' at 0x100953950>
elem to sting is: b'<course><course_name>python</course_name><total_class>30</total_class><score>0.3</score></course>'
elem to sting is: b'<course _id="1234"><course_name>python</course_name><total_class>30</total_class><score>0.3</score></course>'
dict to xml str: <item><courese_name><python></courese_name></item>
elem to sting is: b'<item><courese_name>&lt;python&gt;</courese_name></item>'
escape: &lt;python&gt;
unescape: _
举报

相关推荐

0 条评论