|
7 | 7 | from django.db.models.query import QuerySet
|
8 | 8 | from django.db.utils import OperationalError
|
9 | 9 | from eyecite import get_citations
|
10 |
| -from eyecite.models import CitationBase, FullCaseCitation |
| 10 | +from eyecite.models import CitationBase |
11 | 11 | from eyecite.tokenizers import HyperscanTokenizer
|
12 | 12 |
|
13 | 13 | from cl.celery_init import app
|
|
21 | 21 | NO_MATCH_RESOURCE,
|
22 | 22 | do_resolve_citations,
|
23 | 23 | )
|
24 |
| -from cl.citations.models import UnmatchedCitation |
25 | 24 | from cl.citations.parenthetical_utils import (
|
26 | 25 | create_parenthetical_groups,
|
27 | 26 | disconnect_parenthetical_group_signals,
|
|
30 | 29 | from cl.citations.recap_citations import store_recap_citations
|
31 | 30 | from cl.citations.score_parentheticals import parenthetical_score
|
32 | 31 | from cl.citations.types import MatchedResourceType, SupportedCitationType
|
33 |
| -from cl.citations.utils import make_get_citations_kwargs |
| 32 | +from cl.citations.unmatched_citations_utils import handle_unmatched_citations |
| 33 | +from cl.citations.utils import ( |
| 34 | + make_get_citations_kwargs, |
| 35 | +) |
34 | 36 | from cl.search.models import (
|
35 | 37 | Opinion,
|
36 | 38 | OpinionCluster,
|
@@ -208,27 +210,10 @@ def store_opinion_citations_and_update_parentheticals(
|
208 | 210 | opinion.save()
|
209 | 211 | return
|
210 | 212 |
|
211 |
| - # Put apart the unmatched citations |
| 213 | + # Put apart the unmatched citations and ambiguous citations |
212 | 214 | unmatched_citations = citation_resolutions.pop(NO_MATCH_RESOURCE, [])
|
213 |
| - |
214 |
| - # Delete citations with multiple matches |
215 | 215 | ambiguous_matches = citation_resolutions.pop(MULTIPLE_MATCHES_RESOURCE, [])
|
216 | 216 |
|
217 |
| - # Increase the citation count for the cluster of each matched opinion |
218 |
| - # if that cluster has not already been cited by this opinion. First, |
219 |
| - # calculate a list of the IDs of every opinion whose cluster will need |
220 |
| - # updating. |
221 |
| - |
222 |
| - currently_cited_opinions = opinion.opinions_cited.all().values_list( |
223 |
| - "pk", flat=True |
224 |
| - ) |
225 |
| - |
226 |
| - opinion_ids_to_update = { |
227 |
| - o.pk |
228 |
| - for o in citation_resolutions.keys() |
229 |
| - if o.pk not in currently_cited_opinions |
230 |
| - } |
231 |
| - |
232 | 217 | clusters_to_update_par_groups_for = set()
|
233 | 218 | parentheticals: list[Parenthetical] = []
|
234 | 219 |
|
@@ -256,28 +241,33 @@ def store_opinion_citations_and_update_parentheticals(
|
256 | 241 | )
|
257 | 242 | )
|
258 | 243 |
|
259 |
| - # If the opinion has been processed previously, we update it's |
260 |
| - # associated UnmatchedCitations.status. If not, we store them all |
261 |
| - update_unmatched_status = UnmatchedCitation.objects.filter( |
262 |
| - citing_opinion=opinion |
263 |
| - ).exists() |
| 244 | + # Increase the citation count for the cluster of each matched opinion |
| 245 | + # if that cluster has not already been cited by this opinion. First, |
| 246 | + # calculate a list of the IDs of every opinion whose cluster will need |
| 247 | + # updating. |
| 248 | + currently_cited_opinions = OpinionsCited.objects.filter( |
| 249 | + citing_opinion_id=opinion.pk |
| 250 | + ).values_list("cited_opinion_id", flat=True) |
| 251 | + cluster_ids_to_update = { |
| 252 | + o.cluster.pk |
| 253 | + for o in citation_resolutions.keys() |
| 254 | + if o.pk not in currently_cited_opinions |
| 255 | + } |
264 | 256 |
|
265 | 257 | # Finally, commit these changes to the database in a single
|
266 |
| - # transcation block. |
| 258 | + # transaction block. |
267 | 259 | with transaction.atomic():
|
268 | 260 | opinion_clusters_to_update = OpinionCluster.objects.filter(
|
269 |
| - sub_opinions__pk__in=opinion_ids_to_update |
| 261 | + id__in=cluster_ids_to_update |
270 | 262 | )
|
271 | 263 | opinion_clusters_to_update.update(
|
272 | 264 | citation_count=F("citation_count") + 1
|
273 | 265 | )
|
274 |
| - |
275 |
| - if update_unmatched_status: |
276 |
| - update_unmatched_citations_status(citation_resolutions, opinion) |
277 |
| - elif unmatched_citations or ambiguous_matches: |
278 |
| - store_unmatched_citations( |
279 |
| - unmatched_citations, ambiguous_matches, opinion |
280 |
| - ) |
| 266 | + handle_unmatched_citations( |
| 267 | + opinion, |
| 268 | + unmatched_citations + ambiguous_matches, |
| 269 | + citation_resolutions, |
| 270 | + ) |
281 | 271 |
|
282 | 272 | # Nuke existing citations and parentheticals
|
283 | 273 | OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()
|
@@ -307,130 +297,11 @@ def store_opinion_citations_and_update_parentheticals(
|
307 | 297 | opinion.save()
|
308 | 298 |
|
309 | 299 | # Update changes in ES.
|
310 |
| - cluster_ids_to_update = list( |
311 |
| - opinion_clusters_to_update.values_list("id", flat=True) |
312 |
| - ) |
313 | 300 | index_related_cites_fields.apply_async(
|
314 | 301 | args=(
|
315 | 302 | OpinionsCited.__name__,
|
316 | 303 | opinion.pk,
|
317 |
| - cluster_ids_to_update, |
| 304 | + list(cluster_ids_to_update), |
318 | 305 | ),
|
319 | 306 | queue=queue_for_children,
|
320 | 307 | )
|
321 |
| - |
322 |
| - |
323 |
| -def update_unmatched_citations_status( |
324 |
| - citation_resolutions: dict[ |
325 |
| - MatchedResourceType, list[SupportedCitationType] |
326 |
| - ], |
327 |
| - citing_opinion: Opinion, |
328 |
| -) -> None: |
329 |
| - """Check if previously unmatched citations have been resolved and |
330 |
| - updates UnmatchedCitation.status accordingly |
331 |
| -
|
332 |
| - We assume no new UnmatchedCitations will be created after the first run |
333 |
| -
|
334 |
| - :param citation_resolutions: dict whose values are resolved citations |
335 |
| - :param citing_opinion: the opinion |
336 |
| - :return None: |
337 |
| - """ |
338 |
| - resolved_citations = { |
339 |
| - c.matched_text() for v in citation_resolutions.values() for c in v |
340 |
| - } |
341 |
| - |
342 |
| - # try to update the status of FOUND and FAILED_* UnmatchedCitations |
343 |
| - found_citations = UnmatchedCitation.objects.filter( |
344 |
| - citing_opinion=citing_opinion |
345 |
| - ).exclude( |
346 |
| - status__in=[UnmatchedCitation.UNMATCHED, UnmatchedCitation.RESOLVED] |
347 |
| - ) |
348 |
| - for found in found_citations: |
349 |
| - if found.citation_string in resolved_citations: |
350 |
| - found.status = UnmatchedCitation.RESOLVED |
351 |
| - else: |
352 |
| - if found.status in [ |
353 |
| - UnmatchedCitation.FAILED, |
354 |
| - UnmatchedCitation.FAILED_AMBIGUOUS, |
355 |
| - ]: |
356 |
| - continue |
357 |
| - found.status = UnmatchedCitation.FAILED |
358 |
| - found.save() |
359 |
| - |
360 |
| - |
361 |
| -def store_unmatched_citations( |
362 |
| - unmatched_citations: list[CitationBase], |
363 |
| - ambiguous_matches: list[CitationBase], |
364 |
| - opinion: Opinion, |
365 |
| -) -> None: |
366 |
| - """Bulk create UnmatchedCitation instances cited by an opinion |
367 |
| -
|
368 |
| - Only FullCaseCitations provide useful information for resolution |
369 |
| - updates. Other types are discarded |
370 |
| -
|
371 |
| - :param unmatched_citations: citations with 0 matches |
372 |
| - :param ambiguous_matches: citations with more than 1 match |
373 |
| - :param opinion: the citing opinion |
374 |
| - :return None: |
375 |
| - """ |
376 |
| - unmatched_citations_to_store = [] |
377 |
| - seen_citations = set() |
378 |
| - citations_to_this_cluster = [ |
379 |
| - str(c) for c in opinion.cluster.citations.all() |
380 |
| - ] |
381 |
| - |
382 |
| - for index, unmatched_citation in enumerate( |
383 |
| - unmatched_citations + ambiguous_matches, 1 |
384 |
| - ): |
385 |
| - has_multiple_matches = index > len(unmatched_citations) |
386 |
| - |
387 |
| - if not isinstance(unmatched_citation, FullCaseCitation): |
388 |
| - continue |
389 |
| - |
390 |
| - # handle bugs in eyecite that make it return FullCitations with null |
391 |
| - # values in required fields |
392 |
| - groups = unmatched_citation.groups |
393 |
| - if ( |
394 |
| - not groups.get("reporter") |
395 |
| - or not groups.get("volume") |
396 |
| - or not groups.get("page") |
397 |
| - ): |
398 |
| - logger.error( |
399 |
| - "Unexpected null value in FullCaseCitation %s", |
400 |
| - unmatched_citation, |
401 |
| - ) |
402 |
| - continue |
403 |
| - if not groups.get("volume").isdigit(): |
404 |
| - logger.error( |
405 |
| - "Unexpected non-integer volume value in FullCaseCitation %s", |
406 |
| - unmatched_citation, |
407 |
| - ) |
408 |
| - continue |
409 |
| - |
410 |
| - # This would raise a DataError, we have seen cases from bad OCR or |
411 |
| - # citation lookalikes. See #5191 |
412 |
| - if int(groups["volume"]) >= 32_767: |
413 |
| - continue |
414 |
| - |
415 |
| - citation_object = UnmatchedCitation.create_from_eyecite( |
416 |
| - unmatched_citation, opinion, has_multiple_matches |
417 |
| - ) |
418 |
| - |
419 |
| - # use to prevent Integrity error from duplicates |
420 |
| - citation_str = str(citation_object) |
421 |
| - if citation_str in seen_citations: |
422 |
| - continue |
423 |
| - seen_citations.add(citation_str) |
424 |
| - |
425 |
| - # avoid storing self citations as unmatched; the self citation will |
426 |
| - # usually be found at the beginning of the opinion's text |
427 |
| - # Note that both Citation.__str__ and UnmatchedCitation.__str__ use |
428 |
| - # the standardized volume, reporter and page values, so they are |
429 |
| - # comparable |
430 |
| - if citation_str in citations_to_this_cluster: |
431 |
| - continue |
432 |
| - |
433 |
| - unmatched_citations_to_store.append(citation_object) |
434 |
| - |
435 |
| - if unmatched_citations_to_store: |
436 |
| - UnmatchedCitation.objects.bulk_create(unmatched_citations_to_store) |
0 commit comments