5
5
import io
6
6
import json
7
7
import logging
8
+ import math
8
9
import os
9
10
from concurrent .futures import ThreadPoolExecutor , Future
10
11
from typing import Optional , Tuple , Union , Generator
33
34
# Names of form fields used by the PDF-splitting logic.
# NOTE(review): presumably these are looked up in the request's form data
# (see the `form_data` arguments passed to the `_get_*` helpers) — confirm.
PARTITION_FORM_FILES_KEY = "files"
34
35
PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
35
36
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number"
37
+ PARTITION_FORM_NUM_THREADS_KEY = "split_pdf_threads"
36
38
37
39
# Default values; presumably applied when the corresponding form field is
# absent or invalid — TODO confirm against the helpers that read them.
DEFAULT_STARTING_PAGE_NUMBER = 1
38
40
DEFAULT_NUM_THREADS = 5
@@ -115,14 +117,18 @@ def before_request(
115
117
starting_page_number = self ._get_starting_page_number (form_data )
116
118
call_threads = self ._get_split_pdf_call_threads (form_data )
117
119
118
- pages = self ._get_pdf_pages (file .content )
120
+ pdf = PdfReader (io .BytesIO (file .content ))
121
+ split_size = self ._get_optimal_split_size (
122
+ num_pages = len (pdf .pages ), num_threads = call_threads
123
+ )
124
+ pages = self ._get_pdf_pages (pdf , split_size )
125
+
119
126
call_api_partial = functools .partial (
120
127
self ._call_api ,
121
128
request = request ,
122
129
form_data = form_data ,
123
130
filename = file .file_name ,
124
131
)
125
- call_threads = self ._get_split_pdf_call_threads ()
126
132
self .partition_requests [operation_id ] = []
127
133
last_page_content = io .BytesIO ()
128
134
last_page_number = 0
@@ -247,9 +253,18 @@ def _is_pdf(self, file: shared.Files) -> bool:
247
253
248
254
return True
249
255
256
def _get_optimal_split_size(self, num_pages: int, num_threads: int) -> int:
    """Choose how many pages each split of the PDF should contain.

    If the document is small enough that every thread can stay below
    ``MAX_PAGES_PER_THREAD``, the pages are shared out evenly across the
    threads (rounded up). Otherwise each split is capped at
    ``MAX_PAGES_PER_THREAD``. The result is never smaller than
    ``MIN_PAGES_PER_THREAD``.

    Args:
        num_pages: Total number of pages in the document.
        num_threads: Number of threads the splits are distributed over.

    Returns:
        The number of pages to put into each split.
    """
    # A zero thread count makes this condition false, so the division below
    # is only reached with a positive divisor for non-negative page counts.
    fits_under_cap = num_pages < MAX_PAGES_PER_THREAD * num_threads
    pages_per_split = (
        math.ceil(num_pages / num_threads)
        if fits_under_cap
        else MAX_PAGES_PER_THREAD
    )
    return max(pages_per_split, MIN_PAGES_PER_THREAD)
264
+
250
265
def _get_pdf_pages (
251
266
self ,
252
- file_content : bytes ,
267
+ pdf : PdfReader ,
253
268
split_size : int = 1 ,
254
269
) -> Generator [Tuple [io .BytesIO , int , int ], None , None ]:
255
270
"""Reads given bytes of a pdf file and split it into n file-like objects, each
@@ -266,7 +281,6 @@ def _get_pdf_pages(
266
281
their page number and overall pages number of the original document.
267
282
"""
268
283
269
- pdf = PdfReader (io .BytesIO (file_content ))
270
284
offset = 0
271
285
offset_end = len (pdf .pages )
272
286
0 commit comments