Skip to content

Commit 5d15143

Browse files
authored
Merge branch 'main' into 5500-recap-search-alerts-limitations
2 parents c89c60c + c1932ca commit 5d15143

File tree

4 files changed

+243
-163
lines changed

4 files changed

+243
-163
lines changed

cl/citations/match_citations.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
source_text="MULTIPLE_MATCHES", page="999999", volume="999999"
3737
)
3838
)
39+
# to be used when storing unmatched citations
40+
MULTIPLE_MATCHES_FLAG = "is_ambiguous"
3941

4042

4143
def filter_by_matching_antecedent(
@@ -96,10 +98,14 @@ def resolve_fullcase_citation(
9698
# return the first item by ordering key
9799
return clusters[0].ordered_opinions.first()
98100
elif _count >= 2:
101+
# set an attribute to differentiate 0-match and
102+
# more-than-one-match citations
103+
setattr(full_citation, MULTIPLE_MATCHES_FLAG, True)
99104
# if two or more remain return multiple matches
100105
return MULTIPLE_MATCHES_RESOURCE
101106

102107
if len(db_search_results) > 1:
108+
setattr(full_citation, MULTIPLE_MATCHES_FLAG, True)
103109
return MULTIPLE_MATCHES_RESOURCE
104110

105111
# If there is one search result, try to return it

cl/citations/tasks.py

Lines changed: 26 additions & 155 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from django.db.models.query import QuerySet
88
from django.db.utils import OperationalError
99
from eyecite import get_citations
10-
from eyecite.models import CitationBase, FullCaseCitation
10+
from eyecite.models import CitationBase
1111
from eyecite.tokenizers import HyperscanTokenizer
1212

1313
from cl.celery_init import app
@@ -21,7 +21,6 @@
2121
NO_MATCH_RESOURCE,
2222
do_resolve_citations,
2323
)
24-
from cl.citations.models import UnmatchedCitation
2524
from cl.citations.parenthetical_utils import (
2625
create_parenthetical_groups,
2726
disconnect_parenthetical_group_signals,
@@ -30,7 +29,10 @@
3029
from cl.citations.recap_citations import store_recap_citations
3130
from cl.citations.score_parentheticals import parenthetical_score
3231
from cl.citations.types import MatchedResourceType, SupportedCitationType
33-
from cl.citations.utils import make_get_citations_kwargs
32+
from cl.citations.unmatched_citations_utils import handle_unmatched_citations
33+
from cl.citations.utils import (
34+
make_get_citations_kwargs,
35+
)
3436
from cl.search.models import (
3537
Opinion,
3638
OpinionCluster,
@@ -208,27 +210,10 @@ def store_opinion_citations_and_update_parentheticals(
208210
opinion.save()
209211
return
210212

211-
# Put apart the unmatched citations
213+
# Put apart the unmatched citations and ambiguous citations
212214
unmatched_citations = citation_resolutions.pop(NO_MATCH_RESOURCE, [])
213-
214-
# Delete citations with multiple matches
215215
ambiguous_matches = citation_resolutions.pop(MULTIPLE_MATCHES_RESOURCE, [])
216216

217-
# Increase the citation count for the cluster of each matched opinion
218-
# if that cluster has not already been cited by this opinion. First,
219-
# calculate a list of the IDs of every opinion whose cluster will need
220-
# updating.
221-
222-
currently_cited_opinions = opinion.opinions_cited.all().values_list(
223-
"pk", flat=True
224-
)
225-
226-
opinion_ids_to_update = {
227-
o.pk
228-
for o in citation_resolutions.keys()
229-
if o.pk not in currently_cited_opinions
230-
}
231-
232217
clusters_to_update_par_groups_for = set()
233218
parentheticals: list[Parenthetical] = []
234219

@@ -256,28 +241,33 @@ def store_opinion_citations_and_update_parentheticals(
256241
)
257242
)
258243

259-
# If the opinion has been processed previously, we update it's
260-
# associated UnmatchedCitations.status. If not, we store them all
261-
update_unmatched_status = UnmatchedCitation.objects.filter(
262-
citing_opinion=opinion
263-
).exists()
244+
# Increase the citation count for the cluster of each matched opinion
245+
# if that cluster has not already been cited by this opinion. First,
246+
# calculate a list of the IDs of every opinion whose cluster will need
247+
# updating.
248+
currently_cited_opinions = OpinionsCited.objects.filter(
249+
citing_opinion_id=opinion.pk
250+
).values_list("cited_opinion_id", flat=True)
251+
cluster_ids_to_update = {
252+
o.cluster.pk
253+
for o in citation_resolutions.keys()
254+
if o.pk not in currently_cited_opinions
255+
}
264256

265257
# Finally, commit these changes to the database in a single
266-
# transcation block.
258+
# transaction block.
267259
with transaction.atomic():
268260
opinion_clusters_to_update = OpinionCluster.objects.filter(
269-
sub_opinions__pk__in=opinion_ids_to_update
261+
id__in=cluster_ids_to_update
270262
)
271263
opinion_clusters_to_update.update(
272264
citation_count=F("citation_count") + 1
273265
)
274-
275-
if update_unmatched_status:
276-
update_unmatched_citations_status(citation_resolutions, opinion)
277-
elif unmatched_citations or ambiguous_matches:
278-
store_unmatched_citations(
279-
unmatched_citations, ambiguous_matches, opinion
280-
)
266+
handle_unmatched_citations(
267+
opinion,
268+
unmatched_citations + ambiguous_matches,
269+
citation_resolutions,
270+
)
281271

282272
# Nuke existing citations and parentheticals
283273
OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()
@@ -307,130 +297,11 @@ def store_opinion_citations_and_update_parentheticals(
307297
opinion.save()
308298

309299
# Update changes in ES.
310-
cluster_ids_to_update = list(
311-
opinion_clusters_to_update.values_list("id", flat=True)
312-
)
313300
index_related_cites_fields.apply_async(
314301
args=(
315302
OpinionsCited.__name__,
316303
opinion.pk,
317-
cluster_ids_to_update,
304+
list(cluster_ids_to_update),
318305
),
319306
queue=queue_for_children,
320307
)
321-
322-
323-
def update_unmatched_citations_status(
324-
citation_resolutions: dict[
325-
MatchedResourceType, list[SupportedCitationType]
326-
],
327-
citing_opinion: Opinion,
328-
) -> None:
329-
"""Check if previously unmatched citations have been resolved and
330-
updates UnmatchedCitation.status accordingly
331-
332-
We assume no new UnmatchedCitations will be created after the first run
333-
334-
:param citation_resolutions: dict whose values are resolved citations
335-
:param citing_opinion: the opinion
336-
:return None:
337-
"""
338-
resolved_citations = {
339-
c.matched_text() for v in citation_resolutions.values() for c in v
340-
}
341-
342-
# try to update the status of FOUND and FAILED_* UnmatchedCitations
343-
found_citations = UnmatchedCitation.objects.filter(
344-
citing_opinion=citing_opinion
345-
).exclude(
346-
status__in=[UnmatchedCitation.UNMATCHED, UnmatchedCitation.RESOLVED]
347-
)
348-
for found in found_citations:
349-
if found.citation_string in resolved_citations:
350-
found.status = UnmatchedCitation.RESOLVED
351-
else:
352-
if found.status in [
353-
UnmatchedCitation.FAILED,
354-
UnmatchedCitation.FAILED_AMBIGUOUS,
355-
]:
356-
continue
357-
found.status = UnmatchedCitation.FAILED
358-
found.save()
359-
360-
361-
def store_unmatched_citations(
362-
unmatched_citations: list[CitationBase],
363-
ambiguous_matches: list[CitationBase],
364-
opinion: Opinion,
365-
) -> None:
366-
"""Bulk create UnmatchedCitation instances cited by an opinion
367-
368-
Only FullCaseCitations provide useful information for resolution
369-
updates. Other types are discarded
370-
371-
:param unmatched_citations: citations with 0 matches
372-
:param ambiguous_matches: citations with more than 1 match
373-
:param opinion: the citing opinion
374-
:return None:
375-
"""
376-
unmatched_citations_to_store = []
377-
seen_citations = set()
378-
citations_to_this_cluster = [
379-
str(c) for c in opinion.cluster.citations.all()
380-
]
381-
382-
for index, unmatched_citation in enumerate(
383-
unmatched_citations + ambiguous_matches, 1
384-
):
385-
has_multiple_matches = index > len(unmatched_citations)
386-
387-
if not isinstance(unmatched_citation, FullCaseCitation):
388-
continue
389-
390-
# handle bugs in eyecite that make it return FullCitations with null
391-
# values in required fields
392-
groups = unmatched_citation.groups
393-
if (
394-
not groups.get("reporter")
395-
or not groups.get("volume")
396-
or not groups.get("page")
397-
):
398-
logger.error(
399-
"Unexpected null value in FullCaseCitation %s",
400-
unmatched_citation,
401-
)
402-
continue
403-
if not groups.get("volume").isdigit():
404-
logger.error(
405-
"Unexpected non-integer volume value in FullCaseCitation %s",
406-
unmatched_citation,
407-
)
408-
continue
409-
410-
# This would raise a DataError, we have seen cases from bad OCR or
411-
# citation lookalikes. See #5191
412-
if int(groups["volume"]) >= 32_767:
413-
continue
414-
415-
citation_object = UnmatchedCitation.create_from_eyecite(
416-
unmatched_citation, opinion, has_multiple_matches
417-
)
418-
419-
# use to prevent Integrity error from duplicates
420-
citation_str = str(citation_object)
421-
if citation_str in seen_citations:
422-
continue
423-
seen_citations.add(citation_str)
424-
425-
# avoid storing self citations as unmatched; the self citation will
426-
# usually be found at the beginning of the opinion's text
427-
# Note that both Citation.__str__ and UnmatchedCitation.__str__ use
428-
# the standardized volume, reporter and page values, so they are
429-
# comparable
430-
if citation_str in citations_to_this_cluster:
431-
continue
432-
433-
unmatched_citations_to_store.append(citation_object)
434-
435-
if unmatched_citations_to_store:
436-
UnmatchedCitation.objects.bulk_create(unmatched_citations_to_store)

cl/citations/tests.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
get_representative_parenthetical,
4242
)
4343
from cl.citations.match_citations import (
44+
MULTIPLE_MATCHES_FLAG,
4445
MULTIPLE_MATCHES_RESOURCE,
4546
NO_MATCH_RESOURCE,
4647
do_resolve_citations,
@@ -53,10 +54,14 @@
5354
find_citations_and_parentheticals_for_opinion_by_pks,
5455
store_opinion_citations_and_update_parentheticals,
5556
store_recap_citations,
56-
store_unmatched_citations,
57+
)
58+
from cl.citations.unmatched_citations_utils import (
59+
handle_unmatched_citations,
5760
update_unmatched_citations_status,
5861
)
59-
from cl.citations.utils import make_get_citations_kwargs
62+
from cl.citations.utils import (
63+
make_get_citations_kwargs,
64+
)
6065
from cl.lib.test_helpers import CourtTestCase, PeopleTestCase, SearchTestCase
6166
from cl.search.documents import ParentheticalGroupDocument
6267
from cl.search.factories import (
@@ -3030,8 +3035,9 @@ class UnmatchedCitationTest(TransactionTestCase):
30303035
# select one to mark as ambiguous; as happens on the resolution flow
30313036
# to MULTIPLE_MATCHES_RESOURCE citations
30323037
ambiguous_citations = [eyecite_citations.pop(2)]
3038+
setattr(ambiguous_citations[0], MULTIPLE_MATCHES_FLAG, True)
30333039
cluster = None
3034-
opinion = None
3040+
opinion: Opinion
30353041

30363042
@classmethod
30373043
def setUpClass(cls):
@@ -3041,8 +3047,8 @@ def setUpClass(cls):
30413047

30423048
def test_1st_creation(self) -> None:
30433049
"""Can we save unmatched citations?"""
3044-
store_unmatched_citations(
3045-
self.eyecite_citations, self.ambiguous_citations, self.opinion
3050+
handle_unmatched_citations(
3051+
self.opinion, self.eyecite_citations + self.ambiguous_citations, {}
30463052
)
30473053
unmatched_citations = list(
30483054
UnmatchedCitation.objects.filter(citing_opinion=self.opinion).all()
@@ -3096,8 +3102,15 @@ def test_1st_creation(self) -> None:
30963102
found_count == 2,
30973103
f"There should be 2 found UnmatchedCitations, there are {found_count}",
30983104
)
3099-
3100-
update_unmatched_citations_status(citation_resolutions, self.opinion)
3105+
existing_unmatched_citations = list(
3106+
self.opinion.unmatched_citations.all()
3107+
)
3108+
resolved_citations = {
3109+
c.matched_text() for v in citation_resolutions.values() for c in v
3110+
}
3111+
update_unmatched_citations_status(
3112+
resolved_citations, existing_unmatched_citations
3113+
)
31013114
should_resolve.refresh_from_db()
31023115
should_not_resolve.refresh_from_db()
31033116

@@ -3124,7 +3137,7 @@ def test_self_citation(self) -> None:
31243137
tokenizer=HYPERSCAN_TOKENIZER,
31253138
)
31263139
opinion = cluster.sub_opinions.first()
3127-
store_unmatched_citations(eyecite_citations, [], opinion)
3140+
handle_unmatched_citations(opinion, eyecite_citations, {})
31283141
count = UnmatchedCitation.objects.filter(
31293142
citing_opinion=opinion
31303143
).count()

0 commit comments

Comments
 (0)