Skip to content

Commit 34a0b17

Browse files
committed
Adds a cache for comparing bboxes to speed up processing
1 parent b257182 commit 34a0b17

File tree

1 file changed

+22
-4
lines changed

1 file changed

+22
-4
lines changed

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,23 @@ def in_bbox(bb, bboxes):
100100
return i
101101
return 0
102102

103+
def in_bbox_using_cache(bb, bboxes, cache):
    """Return 1-based number if a bbox contains bb, else return 0.

    Results are stored in ``cache`` so that repeated lookups for the same
    ``bb`` / ``bboxes`` objects skip the containment scan.

    Entries are keyed by object identity. Because ``id()`` values may be
    reused once an object has been garbage-collected, every entry also
    holds strong references to ``bb`` and ``bboxes``: while the entry
    exists, no other object can take over their ids, so a freshly created
    rectangle can never be served the result of a dead one.

    Args:
        bb: the object tested with ``bb in bbox``.
        bboxes: sequence of containers, checked in order.
        cache: dict owned by the caller; it is mutated in place and
            should be treated as opaque.

    Returns:
        The 1-based position of the first item of ``bboxes`` that
        contains ``bb``, or 0 if none does.

    Note:
        A cached result is not invalidated if ``bb`` or ``bboxes`` is
        modified in place after the first lookup.
    """
    cache_key = (id(bb), id(bboxes))
    entry = cache.get(cache_key)
    # The identity checks guard against entries placed under this key by
    # anything other than this function.
    if entry is not None and entry[0] is bb and entry[1] is bboxes:
        return entry[2]

    index = next(
        (i for i, bbox in enumerate(bboxes, start=1) if bb in bbox),
        0,
    )

    # Keep bb and bboxes alive alongside the result so their ids stay unique.
    cache[cache_key] = (bb, bboxes, index)
    return index
119+
103120
def intersects_bboxes(bb, bboxes):
104121
"""Return True if a bbox touches bb, else return False."""
105122
for bbox in bboxes:
@@ -225,7 +242,7 @@ def join_rects_phase2(bboxes):
225242
new_rects.append(r)
226243
return new_rects
227244

228-
def join_rects_phase3(bboxes, path_rects):
245+
def join_rects_phase3(bboxes, path_rects, cache):
229246
prects = bboxes[:]
230247
new_rects = []
231248

@@ -240,7 +257,7 @@ def join_rects_phase3(bboxes, path_rects):
240257
if prect1.x0 > prect0.x1 or prect1.x1 < prect0.x0:
241258
continue
242259
# do not join different backgrounds
243-
if in_bbox(prect0, path_rects) != in_bbox(prect1, path_rects):
260+
if in_bbox_using_cache(prect0, path_rects, cache) != in_bbox_using_cache(prect1, path_rects, cache):
244261
continue
245262
temp = prect0 | prect1
246263
test = set(
@@ -397,6 +414,7 @@ def join_rects_phase3(bboxes, path_rects):
397414
# the final block bboxes on page
398415
nblocks = [bboxes[0]] # pre-fill with first bbox
399416
bboxes = bboxes[1:] # remaining old bboxes
417+
cache = {}
400418

401419
for i, bb in enumerate(bboxes): # iterate old bboxes
402420
check = False # indicates unwanted joins
@@ -410,7 +428,7 @@ def join_rects_phase3(bboxes, path_rects):
410428
continue
411429

412430
# never join across different background colors
413-
if in_bbox(nbb, path_rects) != in_bbox(bb, path_rects):
431+
if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache(bb, path_rects, cache):
414432
continue
415433

416434
temp = bb | nbb # temporary extension of new block
@@ -437,7 +455,7 @@ def join_rects_phase3(bboxes, path_rects):
437455
# several phases of rectangle joining
438456
nblocks = join_rects_phase1(nblocks)
439457
nblocks = join_rects_phase2(nblocks)
440-
nblocks = join_rects_phase3(nblocks, path_rects)
458+
nblocks = join_rects_phase3(nblocks, path_rects, cache)
441459

442460
# return identified text bboxes
443461
return nblocks

0 commit comments

Comments
 (0)