Skip to content

Commit ed23c10

Browse files
rework splitting
1 parent 8fd9a86 commit ed23c10

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import io
66
import json
77
import logging
8+
import math
89
import os
910
from concurrent.futures import ThreadPoolExecutor, Future
1011
from typing import Optional, Tuple, Union, Generator
@@ -33,6 +34,7 @@
3334
PARTITION_FORM_FILES_KEY = "files"
3435
PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
3536
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number"
37+
PARTITION_FORM_NUM_THREADS_KEY = "split_pdf_threads"
3638

3739
DEFAULT_STARTING_PAGE_NUMBER = 1
3840
DEFAULT_NUM_THREADS = 5
@@ -115,14 +117,18 @@ def before_request(
115117
starting_page_number = self._get_starting_page_number(form_data)
116118
call_threads = self._get_split_pdf_call_threads(form_data)
117119

118-
pages = self._get_pdf_pages(file.content)
120+
pdf = PdfReader(io.BytesIO(file.content))
121+
split_size = self._get_optimal_split_size(
122+
num_pages=len(pdf.pages), num_threads=call_threads
123+
)
124+
pages = self._get_pdf_pages(pdf, split_size)
125+
119126
call_api_partial = functools.partial(
120127
self._call_api,
121128
request=request,
122129
form_data=form_data,
123130
filename=file.file_name,
124131
)
125-
call_threads = self._get_split_pdf_call_threads()
126132
self.partition_requests[operation_id] = []
127133
last_page_content = io.BytesIO()
128134
last_page_number = 0
@@ -247,9 +253,18 @@ def _is_pdf(self, file: shared.Files) -> bool:
247253

248254
return True
249255

256+
def _get_optimal_split_size(self, num_pages: int, num_threads: int) -> int:
    """Compute how many pages each split of the PDF should contain.

    Args:
        num_pages: Total number of pages in the document.
        num_threads: Number of threads the pages are distributed across.

    Returns:
        ``MAX_PAGES_PER_THREAD`` when the document has at least
        ``MAX_PAGES_PER_THREAD * num_threads`` pages; otherwise
        ``num_pages / num_threads`` rounded up. In both cases the result
        is raised to ``MIN_PAGES_PER_THREAD`` if it would fall below it.
    """
    # Large documents are capped at the per-thread maximum; smaller ones
    # are shared out evenly, rounding up so that no page is left over.
    if num_pages >= MAX_PAGES_PER_THREAD * num_threads:
        pages_per_split = MAX_PAGES_PER_THREAD
    else:
        pages_per_split = math.ceil(num_pages / num_threads)

    # Enforce the lower bound on the split size.
    return max(pages_per_split, MIN_PAGES_PER_THREAD)
264+
250265
def _get_pdf_pages(
251266
self,
252-
file_content: bytes,
267+
pdf: PdfReader,
253268
split_size: int = 1,
254269
) -> Generator[Tuple[io.BytesIO, int, int], None, None]:
255270
"""Reads given bytes of a pdf file and split it into n file-like objects, each
@@ -266,7 +281,6 @@ def _get_pdf_pages(
266281
their page number and overall pages number of the original document.
267282
"""
268283

269-
pdf = PdfReader(io.BytesIO(file_content))
270284
offset = 0
271285
offset_end = len(pdf.pages)
272286

0 commit comments

Comments
 (0)