changeset: 99484:dd67c8c53aea user: Serhiy Storchaka date: Mon Dec 07 02:31:11 2015 +0200 files: Lib/xml/etree/ElementTree.py Misc/NEWS Modules/_elementtree.c Modules/clinic/_elementtree.c.h description: Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster. ElementTree.XMLParser._setevents now accepts any objects with the append method, not just a list. diff -r 2cf16918b632 -r dd67c8c53aea Lib/xml/etree/ElementTree.py --- a/Lib/xml/etree/ElementTree.py Sun Dec 06 23:55:05 2015 +0200 +++ b/Lib/xml/etree/ElementTree.py Mon Dec 07 02:31:11 2015 +0200 @@ -95,6 +95,7 @@ import re import warnings import io +import collections import contextlib from . import ElementPath @@ -1198,16 +1199,37 @@ Returns an iterator providing (event, elem) pairs. """ + # Use the internal, undocumented _parser argument for now; When the + # parser argument of iterparse is removed, this can be killed. + pullparser = XMLPullParser(events=events, _parser=parser) + def iterator(): + try: + while True: + yield from pullparser.read_events() + # load event buffer + data = source.read(16 * 1024) + if not data: + break + pullparser.feed(data) + root = pullparser._close_and_return_root() + yield from pullparser.read_events() + it.root = root + finally: + if close_source: + source.close() + + class IterParseIterator(collections.Iterator): + __next__ = iterator().__next__ + it = IterParseIterator() + it.root = None + del iterator, IterParseIterator + close_source = False if not hasattr(source, "read"): source = open(source, "rb") close_source = True - try: - return _IterParseIterator(source, events, parser, close_source) - except: - if close_source: - source.close() - raise + + return it class XMLPullParser: @@ -1217,9 +1239,7 @@ # upon in user code. It will be removed in a future release. # See http://bugs.python.org/issue17741 for more details. - # _elementtree.c expects a list, not a deque - self._events_queue = [] - self._index = 0 + self._events_queue = collections.deque() self._parser = _parser or XMLParser(target=TreeBuilder()) # wire up the parser for event reporting if events is None: @@ -1257,64 +1277,14 @@ retrieved from the iterator. """ events = self._events_queue - while True: - index = self._index - try: - event = events[self._index] - # Avoid retaining references to past events - events[self._index] = None - except IndexError: - break - index += 1 - # Compact the list in a O(1) amortized fashion - # As noted above, _elementree.c needs a list, not a deque - if index * 2 >= len(events): - events[:index] = [] - self._index = 0 - else: - self._index = index + while events: + event = events.popleft() if isinstance(event, Exception): raise event else: yield event -class _IterParseIterator: - - def __init__(self, source, events, parser, close_source=False): - # Use the internal, undocumented _parser argument for now; When the - # parser argument of iterparse is removed, this can be killed. - self._parser = XMLPullParser(events=events, _parser=parser) - self._file = source - self._close_file = close_source - self.root = self._root = None - - def __next__(self): - try: - while 1: - for event in self._parser.read_events(): - return event - if self._parser._parser is None: - break - # load event buffer - data = self._file.read(16 * 1024) - if data: - self._parser.feed(data) - else: - self._root = self._parser._close_and_return_root() - self.root = self._root - except: - if self._close_file: - self._file.close() - raise - if self._close_file: - self._file.close() - raise StopIteration - - def __iter__(self): - return self - - def XML(text, parser=None): """Parse XML document from string constant. diff -r 2cf16918b632 -r dd67c8c53aea Misc/NEWS --- a/Misc/NEWS Sun Dec 06 23:55:05 2015 +0200 +++ b/Misc/NEWS Mon Dec 07 02:31:11 2015 +0200 @@ -109,6 +109,8 @@ Library ------- +- Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster. + - Issue #25761: Improved detecting errors in broken pickle data. - Issue #25717: Restore the previous behaviour of tolerating most fstat() diff -r 2cf16918b632 -r dd67c8c53aea Modules/_elementtree.c --- a/Modules/_elementtree.c Sun Dec 06 23:55:05 2015 +0200 +++ b/Modules/_elementtree.c Mon Dec 07 02:31:11 2015 +0200 @@ -2289,7 +2289,7 @@ PyObject *element_factory; /* element tracing */ - PyObject *events; /* list of events, or NULL if not collecting */ + PyObject *events_append; /* the append method of the list of events, or NULL */ PyObject *start_event_obj; /* event objects (NULL to ignore) */ PyObject *end_event_obj; PyObject *start_ns_event_obj; @@ -2324,7 +2324,7 @@ } t->index = 0; - t->events = NULL; + t->events_append = NULL; t->start_event_obj = t->end_event_obj = NULL; t->start_ns_event_obj = t->end_ns_event_obj = NULL; } @@ -2374,7 +2374,7 @@ Py_CLEAR(self->start_ns_event_obj); Py_CLEAR(self->end_event_obj); Py_CLEAR(self->start_event_obj); - Py_CLEAR(self->events); + Py_CLEAR(self->events_append); Py_CLEAR(self->stack); Py_CLEAR(self->data); Py_CLEAR(self->last); @@ -2455,13 +2455,14 @@ PyObject *node) { if (action != NULL) { - PyObject *res = PyTuple_Pack(2, action, node); + PyObject *res; + PyObject *event = PyTuple_Pack(2, action, node); + if (event == NULL) + return -1; + res = PyObject_CallFunctionObjArgs(self->events_append, event, NULL); + Py_DECREF(event); if (res == NULL) return -1; - if (PyList_Append(self->events, res) < 0) { - Py_DECREF(res); - return -1; - } Py_DECREF(res); } return 0; @@ -3039,7 +3040,7 @@ if (PyErr_Occurred()) return; - if (!target->events || !target->start_ns_event_obj) + if (!target->events_append || !target->start_ns_event_obj) return; if (!uri) @@ -3062,7 +3063,7 @@ if (PyErr_Occurred()) return; - if (!target->events) + if (!target->events_append) return; treebuilder_append_event(target, target->end_ns_event_obj, Py_None); @@ -3551,7 +3552,7 @@ /*[clinic input] _elementtree.XMLParser._setevents - events_queue: object(subclass_of='&PyList_Type') + events_queue: object events_to_report: object = None / @@ -3561,12 +3562,12 @@ _elementtree_XMLParser__setevents_impl(XMLParserObject *self, PyObject *events_queue, PyObject *events_to_report) -/*[clinic end generated code: output=1440092922b13ed1 input=59db9742910c6174]*/ +/*[clinic end generated code: output=1440092922b13ed1 input=abf90830a1c3b0fc]*/ { /* activate element event reporting */ Py_ssize_t i, seqlen; TreeBuilderObject *target; - PyObject *events_seq; + PyObject *events_append, *events_seq; if (!TreeBuilder_CheckExact(self->target)) { PyErr_SetString( @@ -3579,9 +3580,11 @@ target = (TreeBuilderObject*) self->target; - Py_INCREF(events_queue); - Py_XDECREF(target->events); - target->events = events_queue; + events_append = PyObject_GetAttrString(events_queue, "append"); + if (events_append == NULL) + return NULL; + Py_XDECREF(target->events_append); + target->events_append = events_append; /* clear out existing events */ Py_CLEAR(target->start_event_obj); diff -r 2cf16918b632 -r dd67c8c53aea Modules/clinic/_elementtree.c.h --- a/Modules/clinic/_elementtree.c.h Sun Dec 06 23:55:05 2015 +0200 +++ b/Modules/clinic/_elementtree.c.h Mon Dec 07 02:31:11 2015 +0200 @@ -668,12 +668,13 @@ PyObject *events_queue; PyObject *events_to_report = Py_None; - if (!PyArg_ParseTuple(args, "O!|O:_setevents", - &PyList_Type, &events_queue, &events_to_report)) + if (!PyArg_UnpackTuple(args, "_setevents", + 1, 2, + &events_queue, &events_to_report)) goto exit; return_value = _elementtree_XMLParser__setevents_impl(self, events_queue, events_to_report); exit: return return_value; } -/*[clinic end generated code: output=25b8bf7e7f2151ca input=a9049054013a1b77]*/ +/*[clinic end generated code: output=19d94e2d2726d3aa input=a9049054013a1b77]*/