Skip to content

Perform adds with field updates using JSON #374

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 54 additions & 36 deletions pysolr.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,8 +916,56 @@ def suggest_terms(self, fields, prefix, handler="terms", **kwargs):
)
return res

def _build_json_doc(self, doc):
cleaned_doc = {k: v for k, v in doc.items() if not self._is_null_value(v)}
def _build_docs(self, docs, boost=None, fieldUpdates=None, commitWithin=None):
    """
    Serialize ``docs`` into the body of a Solr ``add`` request.

    Without ``boost`` the documents are encoded as a JSON array; with
    ``boost`` an XML ``<add>`` message is built instead.

    ``docs`` may be a single document (a ``dict``) or a list of documents.
    The XML path additionally accepts any other iterable of documents.

    ``commitWithin`` is only written into the XML message; the JSON body
    built here does not carry it.

    Returns a ``(solrapi, body, doc_count)`` tuple, where ``solrapi`` is
    ``"JSON"`` or ``"XML"``. The JSON body is UTF-8 encoded bytes; the XML
    body is whatever ``force_unicode`` returns for the serialized message.

    Raises ``ValueError`` on the JSON path when ``docs`` is neither a
    ``dict`` nor a ``list``.
    """
    # A single document is accepted on both paths. Wrap it up front so the
    # XML loop below iterates over documents, not over the dict's keys.
    if isinstance(docs, dict):
        docs = [docs]

    if boost is None:
        # If no boost is needed, use the JSON multi-document API.
        # The JSON API skips the XML conversion; it loads roughly 15 to 20
        # times faster and uses far less CPU.
        if not isinstance(docs, list):
            raise ValueError("wrong message type")

        cleaned_docs = [
            self._build_json_doc(doc, fieldUpdates=fieldUpdates) for doc in docs
        ]
        body = self.encoder.encode(cleaned_docs).encode("utf-8")
        return ("JSON", body, len(docs))

    message = ElementTree.Element("add")

    if commitWithin:
        # ElementTree can only serialize string attribute values, so an
        # integer commitWithin has to be converted before it is set.
        message.set("commitWithin", str(commitWithin))

    for doc in docs:
        message.append(
            self._build_xml_doc(doc, boost=boost, fieldUpdates=fieldUpdates)
        )

    # tostring() returns a bytestring; convert it back to Unicode.
    body = force_unicode(ElementTree.tostring(message, encoding="utf-8"))

    # len() of an Element is its number of children, i.e. the doc count.
    return ("XML", body, len(message))

def _build_json_doc(self, doc, fieldUpdates=None):
    """
    Return the JSON-ready form of a single document.

    ``doc`` is a mapping of field name to value. ``fieldUpdates`` optionally
    maps field names to an update modifier (e.g. ``"inc"``).

    Without any field updates (``None`` or an empty mapping), fields whose
    value is null according to ``self._is_null_value`` are left out.

    With field updates, every field listed in ``fieldUpdates`` is wrapped as
    ``{modifier: value}`` and all other fields are passed through untouched;
    no null filtering is applied in this mode.
    """
    if not fieldUpdates:
        # An empty mapping requests no modifiers, so it is a plain add just
        # like None and gets the same null stripping.
        return {
            name: value
            for name, value in doc.items()
            if not self._is_null_value(value)
        }

    # id must be added without a modifier.
    # If using field updates, all other fields should have a modifier.
    return {
        name: {fieldUpdates[name]: value} if name in fieldUpdates else value
        for name, value in doc.items()
    }

def _build_xml_doc(self, doc, boost=None, fieldUpdates=None):
Expand Down Expand Up @@ -1025,43 +1073,13 @@ def add(
"""
start_time = time.time()
self.log.debug("Starting to build add request...")
solrapi = "XML"
# if no commands (no boost, no atomic updates) needed use json multidocument api
# The JSON API skips the XML conversion and speeds up loading by a factor of 15 to 20.
# CPU usage is drastically lower.
if boost is None and fieldUpdates is None:
solrapi = "JSON"
message = docs
# single doc convert to array of docs
if isinstance(message, dict):
# convert dict to list
message = [message]
# json array of docs
if isinstance(message, list):
# convert to string
cleaned_message = [self._build_json_doc(doc) for doc in message]
m = self.encoder.encode(cleaned_message).encode("utf-8")
else:
raise ValueError("wrong message type")
else:
message = ElementTree.Element("add")

if commitWithin:
message.set("commitWithin", commitWithin)

for doc in docs:
el = self._build_xml_doc(doc, boost=boost, fieldUpdates=fieldUpdates)
message.append(el)

# This returns a bytestring. Ugh.
m = ElementTree.tostring(message, encoding="utf-8")
# Convert back to Unicode please.
m = force_unicode(m)

solrapi, m, len_message = self._build_docs(
docs, boost, fieldUpdates, commitWithin
)
end_time = time.time()
self.log.debug(
"Built add request of %s docs in %0.2f seconds.",
len(message),
len_message,
end_time - start_time,
)
return self._update(
Expand Down
30 changes: 30 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,36 @@ def test_build_json_doc_matches_xml(self):
self.assertNotIn("title", doc_json)
self.assertIsNone(doc_xml.find("*[name='title']"))

def test__build_docs_plain(self):
    """Without a boost, ``_build_docs`` builds a JSON payload."""
    docs = [
        {
            "id": "doc_1",
            "title": "",
            "price": 12.59,
            "popularity": 10,
        }
    ]
    solrapi, m, len_message = self.solr._build_docs(docs)
    self.assertEqual(solrapi, "JSON")
    # One document went in, so one document must be reported.
    self.assertEqual(len_message, 1)
    # The empty title is a null value and must not reach the payload.
    self.assertNotIn(b'"title"', m)

def test__build_docs_boost(self):
    """A boost forces ``_build_docs`` onto the XML path."""
    docs = [
        {
            "id": "doc_1",
            "title": "",
            "price": 12.59,
            "popularity": 10,
        }
    ]
    solrapi, m, len_message = self.solr._build_docs(docs, boost={"title": 10.0})
    self.assertEqual(solrapi, "XML")
    # The <add> element has one child per document.
    self.assertEqual(len_message, 1)

def test__build_docs_field_updates(self):
    """Field updates without a boost stay on the JSON path."""
    docs = [
        {
            "id": "doc_1",
            "popularity": 10,
        }
    ]
    solrapi, m, len_message = self.solr._build_docs(
        docs, fieldUpdates={"popularity": "inc"}
    )
    self.assertEqual(solrapi, "JSON")
    self.assertEqual(len_message, 1)
    # The modifier must be carried into the encoded payload.
    self.assertIn(b'"inc"', m)

def test_add(self):
self.assertEqual(len(self.solr.search("doc")), 3)
self.assertEqual(len(self.solr.search("example")), 2)
Expand Down