changeset: 95401:407883c52bf3 user: Serhiy Storchaka date: Thu Apr 02 21:00:13 2015 +0300 files: Doc/library/xml.sax.reader.rst Doc/whatsnew/3.5.rst Lib/test/test_sax.py Lib/xml/sax/expatreader.py Lib/xml/sax/saxutils.py Lib/xml/sax/xmlreader.py Misc/NEWS description: Issue #2175: SAX parsers now support a character stream of InputSource object. diff -r e0292b3ba245 -r 407883c52bf3 Doc/library/xml.sax.reader.rst --- a/Doc/library/xml.sax.reader.rst Thu Apr 02 20:57:20 2015 +0300 +++ b/Doc/library/xml.sax.reader.rst Thu Apr 02 21:00:13 2015 +0300 @@ -100,8 +100,10 @@ system identifier (a string identifying the input source -- typically a file name or an URL), a file-like object, or an :class:`InputSource` object. When :meth:`parse` returns, the input is completely processed, and the parser object - can be discarded or reset. As a limitation, the current implementation only - accepts byte streams; processing of character streams is for further study. + can be discarded or reset. + + .. versionchanged:: 3.5 + Added support of character streams. .. method:: XMLReader.getContentHandler() @@ -288,8 +290,7 @@ .. method:: InputSource.setByteStream(bytefile) - Set the byte stream (a Python file-like object which does not perform - byte-to-character conversion) for this input source. + Set the byte stream (a :term:`binary file`) for this input source. The SAX parser will ignore this if there is also a character stream specified, but it will use a byte stream in preference to opening a URI connection itself. @@ -308,8 +309,7 @@ .. method:: InputSource.setCharacterStream(charfile) - Set the character stream for this input source. (The stream must be a Python 1.6 - Unicode-wrapped file-like that performs conversion to strings.) + Set the character stream (a :term:`text file`) for this input source. If there is a character stream specified, the SAX parser will ignore any byte stream and will not attempt to open a URI connection to the system identifier. diff -r e0292b3ba245 -r 407883c52bf3 Doc/whatsnew/3.5.rst --- a/Doc/whatsnew/3.5.rst Thu Apr 02 20:57:20 2015 +0300 +++ b/Doc/whatsnew/3.5.rst Thu Apr 02 21:00:13 2015 +0300 @@ -499,6 +499,13 @@ * :class:`xmlrpc.client.ServerProxy` is now a :term:`context manager`. (Contributed by Claudiu Popa in :issue:`20627`.) +xml.sax +------- + +* SAX parsers now support a character stream of + :class:`~xml.sax.xmlreader.InputSource` object. + (Contributed by Serhiy Storchaka in :issue:`2175`.) + faulthandler ------------ diff -r e0292b3ba245 -r 407883c52bf3 Lib/test/test_sax.py --- a/Lib/test/test_sax.py Thu Apr 02 20:57:20 2015 +0300 +++ b/Lib/test/test_sax.py Thu Apr 02 21:00:13 2015 +0300 @@ -185,12 +185,24 @@ def make_byte_stream(self): return BytesIO(b"This is a byte stream.") + def make_character_stream(self): + return StringIO("This is a character stream.") + def checkContent(self, stream, content): self.assertIsNotNone(stream) self.assertEqual(stream.read(), content) stream.close() + def test_character_stream(self): + # If the source is an InputSource with a character stream, use it. + src = InputSource(self.file) + src.setCharacterStream(self.make_character_stream()) + prep = prepare_input_source(src) + self.assertIsNone(prep.getByteStream()) + self.checkContent(prep.getCharacterStream(), + "This is a character stream.") + def test_byte_stream(self): # If the source is an InputSource that does not have a character # stream but does have a byte stream, use the byte stream. @@ -225,6 +237,14 @@ self.checkContent(prep.getByteStream(), b"This is a byte stream.") + def test_text_file(self): + # If the source is a text file-like object, use it as a character + # stream. + prep = prepare_input_source(self.make_character_stream()) + self.assertIsNone(prep.getByteStream()) + self.checkContent(prep.getCharacterStream(), + "This is a character stream.") + # ===== XMLGenerator @@ -904,6 +924,19 @@ self.assertEqual(result.getvalue(), xml_test_out) + def test_expat_inpsource_character_stream(self): + parser = create_parser() + result = BytesIO() + xmlgen = XMLGenerator(result) + + parser.setContentHandler(xmlgen) + inpsrc = InputSource() + with open(TEST_XMLFILE, 'rt', encoding='iso-8859-1') as f: + inpsrc.setCharacterStream(f) + parser.parse(inpsrc) + + self.assertEqual(result.getvalue(), xml_test_out) + # ===== IncrementalParser support def test_expat_incremental(self): diff -r e0292b3ba245 -r 407883c52bf3 Lib/xml/sax/expatreader.py --- a/Lib/xml/sax/expatreader.py Thu Apr 02 20:57:20 2015 +0300 +++ b/Lib/xml/sax/expatreader.py Thu Apr 02 21:00:13 2015 +0300 @@ -219,9 +219,14 @@ self._parsing = 0 # break cycle created by expat handlers pointing to our methods self._parser = None - bs = self._source.getByteStream() - if bs is not None: - bs.close() + try: + file = self._source.getCharacterStream() + if file is not None: + file.close() + finally: + file = self._source.getByteStream() + if file is not None: + file.close() def _reset_cont_handler(self): self._parser.ProcessingInstructionHandler = \ diff -r e0292b3ba245 -r 407883c52bf3 Lib/xml/sax/saxutils.py --- a/Lib/xml/sax/saxutils.py Thu Apr 02 20:57:20 2015 +0300 +++ b/Lib/xml/sax/saxutils.py Thu Apr 02 21:00:13 2015 +0300 @@ -345,11 +345,14 @@ elif hasattr(source, "read"): f = source source = xmlreader.InputSource() - source.setByteStream(f) + if isinstance(f.read(0), str): + source.setCharacterStream(f) + else: + source.setByteStream(f) if hasattr(f, "name") and isinstance(f.name, str): source.setSystemId(f.name) - if source.getByteStream() is None: + if source.getCharacterStream() is None and source.getByteStream() is None: sysid = source.getSystemId() basehead = os.path.dirname(os.path.normpath(base)) sysidfilename = os.path.join(basehead, sysid) diff -r e0292b3ba245 -r 407883c52bf3 Lib/xml/sax/xmlreader.py --- a/Lib/xml/sax/xmlreader.py Thu Apr 02 20:57:20 2015 +0300 +++ b/Lib/xml/sax/xmlreader.py Thu Apr 02 21:00:13 2015 +0300 @@ -117,7 +117,9 @@ source = saxutils.prepare_input_source(source) self.prepareParser(source) - file = source.getByteStream() + file = source.getCharacterStream() + if file is None: + file = source.getByteStream() buffer = file.read(self._bufsize) while buffer: self.feed(buffer) diff -r e0292b3ba245 -r 407883c52bf3 Misc/NEWS --- a/Misc/NEWS Thu Apr 02 20:57:20 2015 +0300 +++ b/Misc/NEWS Thu Apr 02 21:00:13 2015 +0300 @@ -16,6 +16,8 @@ Library ------- +- Issue #2175: SAX parsers now support a character stream of InputSource object. + - Issue #16840: Tkinter now supports 64-bit integers added in Tcl 8.4 and arbitrary precision integers added in Tcl 8.5.