
Commit 1929f73

chore: Update dependencies from main

Signed-off-by: Christoph Auer <[email protected]>
2 parents 31f0efc + dee40e8

28 files changed: +1728 −547 lines

Lines changed: 276 additions & 0 deletions
@@ -0,0 +1,276 @@
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import openpyxl
import pandas as pd
from openpyxl import load_workbook
from openpyxl.cell.cell import Cell
from openpyxl.styles import Font
from pandas import DataFrame

from docling_eval.aggregations.multi_evalutor import MultiEvaluation
from docling_eval.datamodels.types import ConsolidationFormats, EvaluationModality
from docling_eval.evaluators.base_evaluator import EvaluationRejectionType
from docling_eval.evaluators.bbox_text_evaluator import DatasetBoxesTextEvaluation
from docling_eval.evaluators.layout_evaluator import DatasetLayoutEvaluation
from docling_eval.evaluators.markdown_text_evaluator import DatasetMarkdownEvaluation
from docling_eval.evaluators.readingorder_evaluator import DatasetReadingOrderEvaluation
from docling_eval.evaluators.stats import DatasetStatistics
from docling_eval.evaluators.table_evaluator import DatasetTableEvaluation

_log = logging.getLogger(__name__)


def export_value(val: Union[float, DatasetStatistics]) -> str:
    r"""Format a metric value as a string (mean±std for DatasetStatistics)."""
    if isinstance(val, DatasetStatistics):
        fmt_val = f"{val.mean:.2f}±{val.std:.2f}"
    else:
        fmt_val = f"{val:.2f}"

    return fmt_val


class Consolidator:
    r"""
    Consolidate a MultiEvaluation into a comparison matrix.

    The comparison matrix has 3 dimensions:
    - Benchmarks
    - ConversionProviders
    - Modalities
    """

    def __init__(self, output_path: Path):
        r"""Set the Excel export defaults and create the output directory."""
        self._output_path = output_path
        self._excel_engine = "openpyxl"
        self._sheet_name = "matrix"
        self._excel_filename = "consolidation_matrix.xlsx"

        self._output_path.mkdir(parents=True, exist_ok=True)

    def __call__(
        self,
        multi_evaluation: MultiEvaluation,
        consolidation_format: Optional[
            ConsolidationFormats
        ] = ConsolidationFormats.EXCEL,
    ) -> Tuple[Dict[EvaluationModality, DataFrame], Optional[Path]]:
        r"""Consolidate the MultiEvaluation and export it in the requested format."""
        dfs = self._build_dataframes(multi_evaluation)

        # Export dataframes; produced_fn stays None for unsupported formats
        produced_fn: Optional[Path] = None
        if consolidation_format == ConsolidationFormats.EXCEL:
            produced_fn = self._to_excel(dfs)
            _log.info("Produced excel: %s", str(produced_fn))
        else:
            _log.info("Unsupported consolidation format: %s", consolidation_format)

        return dfs, produced_fn

    def _to_excel(self, dfs: Dict[EvaluationModality, DataFrame]) -> Path:
        r"""Write one metrics subtable per modality into a single Excel sheet."""
        excel_fn = self._output_path / self._excel_filename
        startrow = 0
        header_rows: List[int] = []
        with pd.ExcelWriter(excel_fn, engine=self._excel_engine) as writer:  # type: ignore
            for modality, df in dfs.items():
                if self._sheet_name in writer.book.sheetnames:
                    sheet = writer.book[self._sheet_name]
                    startrow = sheet.max_row + 2

                # Add the modality as a "header" for the metrics subtable
                header_df = DataFrame([modality.name])
                header_rows.append(startrow + 1)
                header_df.to_excel(
                    writer,
                    sheet_name=self._sheet_name,
                    startrow=startrow,
                    index=False,
                    header=False,
                )
                startrow += 1

                # Metrics subtable
                df.to_excel(
                    writer,
                    sheet_name=self._sheet_name,
                    startrow=startrow,
                    index=False,
                )
        # Format the excel
        self._format_excel(excel_fn, header_rows)

        return excel_fn

    def _format_excel(self, excel_fn: Path, header_rows: List[int]):
        r"""Do some proper formatting of the generated excel"""
        workbook = load_workbook(excel_fn)
        sheet = workbook[self._sheet_name]

        # Adjust the cell width
        for col in sheet.columns:
            # Find the maximum length of strings in this column (excluding empty cells)
            max_length = 0
            for cell in col:
                try:
                    if len(str(cell.value)) > max_length:
                        max_length = len(str(cell.value))
                except Exception:
                    pass
            adjusted_width = max_length + 2  # Add some padding to make it look better
            first_cell = col[0]
            assert isinstance(first_cell, Cell)
            sheet.column_dimensions[first_cell.column_letter].width = adjusted_width

        # Iterate through each cell in the worksheet and remove borders
        for row in sheet.iter_rows():
            for cell in row:
                cell.border = openpyxl.styles.Border()  # Remove borders

        # Make bold the subtable headers
        bold_font = Font(bold=True)
        for header_row in header_rows:
            cell = sheet.cell(row=header_row, column=1)
            cell.font = bold_font

        # Save back the excel
        workbook.save(excel_fn)

    def _build_dataframes(
        self,
        multi_evaluation: MultiEvaluation,
    ) -> Dict[EvaluationModality, DataFrame]:
        r"""
        Return a Dict with dataframes per modality
        """
        # Collect all data to build the dataframes
        df_data: Dict[EvaluationModality, List[Dict[str, Union[str, float, int]]]] = {}

        # Collect the dataframe data
        for benchmark, prov_mod_eval in multi_evaluation.evaluations.items():
            for experiment, mod_eval in prov_mod_eval.items():
                for modality, single_evaluation in mod_eval.items():
                    evaluation = single_evaluation.evaluation

                    if modality == EvaluationModality.LAYOUT:
                        metrics = self._layout_metrics(evaluation)
                    elif modality == EvaluationModality.MARKDOWN_TEXT:
                        metrics = self._markdowntext_metrics(evaluation)
                    elif modality == EvaluationModality.TABLE_STRUCTURE:
                        metrics = self._tablestructure_metrics(evaluation)
                    elif modality == EvaluationModality.READING_ORDER:
                        metrics = self._readingorder_metrics(evaluation)
                    elif modality == EvaluationModality.BBOXES_TEXT:
                        metrics = self._bboxestext_metrics(evaluation)
                    else:
                        _log.error(
                            "Evaluation modality unsupported for export: %s", modality
                        )
                        continue

                    # Gather the dataframe data
                    provider = (
                        single_evaluation.prediction_provider_type.value
                        if single_evaluation.prediction_provider_type is not None
                        else "Unknown"
                    )
                    data: Dict[str, Union[str, float]] = {
                        "Benchmark": benchmark.value,
                        "Provider": provider,
                        "Experiment": experiment,
                        "evaluated_samples": evaluation.evaluated_samples,
                    }
                    for rej_type in EvaluationRejectionType:
                        if rej_type not in evaluation.rejected_samples:
                            data[rej_type.value] = 0
                        else:
                            data[rej_type.value] = evaluation.rejected_samples[rej_type]

                    data |= metrics
                    if modality not in df_data:
                        df_data[modality] = []
                    df_data[modality].append(data)

        # Build the dataframes
        dfs: Dict[EvaluationModality, DataFrame] = {}
        for modality, m_data in df_data.items():
            df = DataFrame(m_data)
            df = df.sort_values(by=["Benchmark", "Provider"], ascending=[True, True])
            dfs[modality] = df

        return dfs

    def _layout_metrics(self, evaluation: DatasetLayoutEvaluation) -> Dict[str, str]:
        r"""Get the metrics for the LayoutEvaluation"""
        metrics = {
            "mAP": export_value(evaluation.map_stats),
            "mAP_50": export_value(evaluation.map_50_stats),
            "mAP_75": export_value(evaluation.map_75_stats),
            "weighted_mAP_50": export_value(evaluation.weighted_map_50_stats),
            "weighted_mAP_75": export_value(evaluation.weighted_map_75_stats),
            "weighted_mAP_90": export_value(evaluation.weighted_map_90_stats),
            "weighted_mAP_95": export_value(evaluation.weighted_map_95_stats),
        }
        for class_evaluation in evaluation.evaluations_per_class:
            key = f"class_{class_evaluation.label}"
            metrics[key] = export_value(class_evaluation.value)

        return metrics

    def _markdowntext_metrics(
        self,
        evaluation: DatasetMarkdownEvaluation,
    ) -> Dict[str, str]:
        r"""Get the metrics for the MarkdownEvaluation"""
        metrics = {
            "BLEU": export_value(evaluation.bleu_stats),
            "F1": export_value(evaluation.f1_score_stats),
            "Precision": export_value(evaluation.precision_stats),
            "Recall": export_value(evaluation.recall_stats),
            "Edit_Distance": export_value(evaluation.edit_distance_stats),
            "METEOR": export_value(evaluation.meteor_stats),
        }
        return metrics

    def _tablestructure_metrics(
        self,
        evaluation: DatasetTableEvaluation,
    ) -> Dict[str, str]:
        r"""Get the metrics for the TableEvaluation"""
        metrics = {
            "TEDS": export_value(evaluation.TEDS),
            "TEDS_struct": export_value(evaluation.TEDS_struct),
            "TEDS_simple": export_value(evaluation.TEDS_simple),
            "TEDS_complex": export_value(evaluation.TEDS_complex),
        }
        return metrics

    def _readingorder_metrics(
        self,
        evaluation: DatasetReadingOrderEvaluation,
    ) -> Dict[str, str]:
        r"""Get the metrics for the ReadingOrderEvaluation"""
        metrics = {
            "ARD": export_value(evaluation.ard_stats),
            "Weighted_ARD": export_value(evaluation.w_ard_stats),
        }
        return metrics

    def _bboxestext_metrics(
        self,
        evaluation: DatasetBoxesTextEvaluation,
    ) -> Dict[str, str]:
        r"""Get the metrics for the BoxesTextEvaluation"""
        metrics = {
            "BLEU": export_value(evaluation.bleu_stats),
            "F1": export_value(evaluation.f1_score_stats),
            "Precision": export_value(evaluation.precision_stats),
            "Recall": export_value(evaluation.recall_stats),
            "Edit_Distance": export_value(evaluation.edit_distance_stats),
            "METEOR": export_value(evaluation.meteor_stats),
        }
        return metrics
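
For context, a minimal usage sketch of the new class (not part of the diff): it assumes the Consolidator defined above is in scope and that a MultiEvaluation instance named multi_evaluation has already been produced elsewhere, e.g. by the multi-evaluator; both names and the output path are illustrative assumptions.

# Usage sketch. `multi_evaluation` is assumed to be a MultiEvaluation built
# elsewhere (e.g. by the multi-evaluator); it is not created in this diff.
from pathlib import Path

from docling_eval.datamodels.types import ConsolidationFormats

consolidator = Consolidator(output_path=Path("consolidation_out"))
dfs, excel_path = consolidator(
    multi_evaluation,
    consolidation_format=ConsolidationFormats.EXCEL,
)
# `dfs` maps each EvaluationModality to a DataFrame with one row per
# (Benchmark, Provider, Experiment); `excel_path` points to the written
# consolidation_matrix.xlsx, or is None for unsupported formats.

Each modality ends up as its own bolded subtable in the single "matrix" sheet, so all benchmarks and providers stay comparable in one view.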
