Skip to content

Commit 0b1fb09

Browse files
committed
Merge remote-tracking branch 'anti-social/clean_xml'
2 parents 843907f + e46ac6d commit 0b1fb09

File tree

2 files changed

+34
-2
lines changed

2 files changed

+34
-2
lines changed

pysolr.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,33 @@ def safe_urlencode(params, doseq=0):
181181
return urlencode(new_params, doseq)
182182

183183

184+
def is_valid_xml_char_ordinal(i):
    """
    Tell whether the code point ``i`` may appear in an XML document.

    The XML standard defines a valid char as::

        Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

    ``i`` is the integer ordinal of a character (e.g. the result of
    ``ord(c)``). Returns ``True`` when it falls inside the ``Char``
    production above, ``False`` otherwise.
    """
    # Checks are ordered by presumed frequency so common text exits early.
    if 0x20 <= i <= 0xD7FF:
        return True
    # Tab, line feed and carriage return are the only permitted controls.
    if i == 0x9 or i == 0xA or i == 0xD:
        return True
    if 0xE000 <= i <= 0xFFFD:
        return True
    return 0x10000 <= i <= 0x10FFFF
198+
199+
200+
def clean_xml_string(s):
    """
    Strip every character that is not valid in XML from ``s``.

    Each character of ``s`` is checked with
    ``is_valid_xml_char_ordinal(ord(c))``; the characters that pass are
    joined back together, in their original order, and returned.

    The approach comes from::

        http://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python
    """
    allowed_chars = [
        char for char in s
        if is_valid_xml_char_ordinal(ord(char))
    ]
    return ''.join(allowed_chars)
209+
210+
184211
class SolrError(Exception):
    """Exception subclass defined by this module; adds no behavior of its own."""
186213

@@ -476,7 +503,7 @@ def _from_python(self, value):
476503

477504
value = "{0}".format(value)
478505

479-
return value
506+
return clean_xml_string(value)
480507

481508
def _to_python(self, value):
482509
"""

tests/client.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import datetime
55

66
from pysolr import (Solr, Results, SolrError, unescape_html, safe_urlencode,
7-
force_unicode, force_bytes, sanitize, json, ET, IS_PY3)
7+
force_unicode, force_bytes, sanitize, json, ET, IS_PY3,
8+
clean_xml_string)
89

910
try:
1011
import unittest2 as unittest
@@ -51,6 +52,9 @@ def test_force_bytes(self):
5152
# Don't mangle, it's already a bytestring.
5253
self.assertEqual(force_bytes(b'Hello \xe2\x98\x83'), b'Hello \xe2\x98\x83')
5354

55+
def test_clean_xml_string(self):
    # The input mixes NUL, vertical tab, carriage return and \uffff; the
    # expected output keeps only the carriage return.
    dirty = '\x00\x0b\x0d\uffff'
    self.assertEqual(clean_xml_string(dirty), '\x0d')
57+
5458

5559
class ResultsTestCase(unittest.TestCase):
5660
def test_init(self):
@@ -263,6 +267,7 @@ def test__from_python(self):
263267
self.assertEqual(self.solr._from_python(1.2), '1.2')
264268
self.assertEqual(self.solr._from_python(b'hello'), 'hello')
265269
self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
270+
self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')
266271

267272
def test__to_python(self):
268273
self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18))

0 commit comments

Comments
 (0)