SAXパーサーを使用して、以下のサンプルXMLドキュメント(元のドキュメントは約30 GB)からすべてのproject1ノード(およびその子要素)を削除しようとしています。
変更後の内容を別のファイルに書き出しても、インラインで編集しても構いません。
sample.xml
<ROOT>
<test src="http://dfs.com">Hi</test>
<project1>This is old data<foo></foo></project1>
<bar>
<project1>ty</project1>
<foo></foo>
</bar>
</ROOT>
これが私の試みです。
parser.py
import xml.sax
from xml.sax.handler import ContentHandler
from xml.sax.saxutils import escape, quoteattr
class MyHandler(xml.sax.handler.ContentHandler):
def __init__(self, out_file):
self._charBuffer = []
self._result = []
self._out = open(out_file, 'w')
def _createElement(self, name, attrs):
attributes = attrs.items()
if attributes:
out = ''
for key, value in attributes:
out += ' {}={}'.format(key, value)
return '<{}{}>'.format(name, out)
return '<{}>'.format(name)
def _getCharacterData(self):
data = ''.join(self._charBuffer).strip()
self._charBuffer = []
self._out.write(data.strip()) #remove strip() if whitespace is important
def parse(self, f):
xml.sax.parse(f, self)
def characters(self, data):
self._charBuffer.append(data)
def startElement(self, name, attrs):
if not name == 'project1':
self._result.append({})
self._out.write(self._createElement(name, attrs))
def endElement(self, name):
if not name == 'project1': self._result[-1][name] = self._getCharacterData()
# Parse sample.xml with MyHandler, which writes its output to out.xml.
MyHandler('out.xml').parse("sample.xml")
私はそれを機能させることができません。
project1ノードを除外するには、xml.sax.saxutils.XMLFilterBaseを継承した実装を使うことができます。
XML文字列を自分で組み立てる代わりに、xml.sax.saxutils.XMLGeneratorを使うことができます。
以下はPython 3のコードです。Python 2が必要な場合はsuper()の呼び出しを調整してください。
from xml.sax import make_parser
from xml.sax.saxutils import XMLFilterBase, XMLGenerator
class Project1Filter(XMLFilterBase):
    """Decide which SAX events to forward to the ContentHandler.

    Events are not forwarded while the parser is inside any element whose
    name is listed in the ``tag_names_to_exclude`` parameter. The excluded
    element itself, its text, and everything nested in it are suppressed.

    Only the non-namespace element events (``startElement`` /
    ``endElement``) are filtered; ``startElementNS`` / ``endElementNS`` are
    inherited unchanged from ``XMLFilterBase``.
    """

    def __init__(self, tag_names_to_exclude, parent=None):
        """Create the filter.

        Args:
            tag_names_to_exclude: Container of element names to drop; it is
                used for ``in`` membership tests on every element event.
            parent: The upstream XML reader, passed to ``XMLFilterBase``.
        """
        super().__init__(parent)
        # set of tag names to exclude
        self._tag_names_to_exclude = tag_names_to_exclude
        # _project_1_count keeps track of opened excluded elements
        self._project_1_count = 0

    def _forward_events(self):
        """Return True when we are not inside an excluded element."""
        return self._project_1_count == 0

    def startElement(self, name, attrs):
        """Forward the start tag unless it opens, or lies in, an excluded element."""
        # Count first so the excluded element's own start tag is suppressed.
        if name in self._tag_names_to_exclude:
            self._project_1_count += 1
        if self._forward_events():
            super().startElement(name, attrs)

    def endElement(self, name):
        """Forward the end tag unless it closes, or lies in, an excluded element."""
        # Test before decrementing so the excluded element's own end tag
        # is suppressed as well.
        if self._forward_events():
            super().endElement(name)
        if name in self._tag_names_to_exclude:
            self._project_1_count -= 1

    def characters(self, content):
        """Forward text only outside excluded elements."""
        if self._forward_events():
            super().characters(content)

    def ignorableWhitespace(self, chars):
        """Forward ignorable whitespace only outside excluded elements."""
        if self._forward_events():
            super().ignorableWhitespace(chars)

    def processingInstruction(self, target, data):
        """Forward processing instructions only outside excluded elements."""
        if self._forward_events():
            super().processingInstruction(target, data)

    def skippedEntity(self, name):
        """Forward skipped-entity notifications only outside excluded elements."""
        if self._forward_events():
            super().skippedEntity(name)

    # override other content handler methods on XMLFilterBase as necessary
def main():
    """Copy input.xml to out-small.xml without the excluded elements.

    The document is streamed through ``Project1Filter`` into an
    ``XMLGenerator``, so it is never loaded into memory as a whole.
    """
    tag_names_to_exclude = {'project1', 'project2', 'project3'}
    reader = Project1Filter(tag_names_to_exclude, make_parser())
    # Open the file as UTF-8 and pass the same encoding to XMLGenerator so
    # the encoding named in the XML declaration matches the bytes written.
    with open('out-small.xml', 'w', encoding='utf-8') as f:
        handler = XMLGenerator(f, encoding='utf-8')
        reader.setContentHandler(handler)
        reader.parse('input.xml')


if __name__ == "__main__":
    main()