Skip to content

Commit 49d915b

Browse files
authored
Merge pull request #216 from openstate/main
Adds a cache for comparing bboxes to speed up processing
2 parents 816ea79 + 2d25598 commit 49d915b

File tree

1 file changed

+27
-4
lines changed

1 file changed

+27
-4
lines changed

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 27 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,23 @@ def in_bbox(bb, bboxes):
100100
return i
101101
return 0
102102

103+
def in_bbox_using_cache(bb, bboxes, cache):
    """Return the 1-based index of the first bbox containing bb, else 0.

    Results are stored in ``cache`` so that repeated lookups of the same
    rectangle against the same list skip the linear scan.

    Args:
        bb: rectangle-like object to look up. It must be iterable over its
            coordinates so a value-based cache key can be built
            (NOTE(review): assumes pymupdf Rect/IRect semantics - confirm).
        bboxes: sequence of objects supporting ``bb in bbox``.
        cache: dict owned by the caller; it is updated in place.

    Returns:
        The 1-based position of the first element of ``bboxes`` for which
        ``bb in bbox`` is true, or 0 if there is none.
    """
    # Key on the coordinates of bb rather than on id(bb): an id is only
    # unique while the object is alive, so a freshly created rectangle can
    # reuse the id of a discarded one, and a rectangle mutated in place
    # keeps its id. Either case would return a stale cached index.
    # id(bboxes) is kept because the list itself is unhashable; the caller
    # must keep that list alive and unmodified while the cache is in use.
    cache_key = (tuple(bb), id(bboxes))
    cached = cache.get(cache_key)
    if cached is not None:
        return cached

    index = 0
    for i, bbox in enumerate(bboxes, start=1):
        if bb in bbox:
            index = i
            break

    cache[cache_key] = index
    return index
119+
103120
def intersects_bboxes(bb, bboxes):
104121
"""Return True if a bbox touches bb, else return False."""
105122
for bbox in bboxes:
@@ -140,6 +157,9 @@ def clean_nblocks(nblocks):
140157
if bb0 == bb1:
141158
del nblocks[i]
142159

160+
if len(nblocks) == 0:
161+
return nblocks
162+
143163
# 2. repair sequence in special cases:
144164
# consecutive bboxes with almost same bottom value are sorted ascending
145165
# by x-coordinate.
@@ -225,7 +245,7 @@ def join_rects_phase2(bboxes):
225245
new_rects.append(r)
226246
return new_rects
227247

228-
def join_rects_phase3(bboxes, path_rects):
248+
def join_rects_phase3(bboxes, path_rects, cache):
229249
prects = bboxes[:]
230250
new_rects = []
231251

@@ -240,7 +260,7 @@ def join_rects_phase3(bboxes, path_rects):
240260
if prect1.x0 > prect0.x1 or prect1.x1 < prect0.x0:
241261
continue
242262
# do not join different backgrounds
243-
if in_bbox(prect0, path_rects) != in_bbox(prect1, path_rects):
263+
if in_bbox_using_cache(prect0, path_rects, cache) != in_bbox_using_cache(prect1, path_rects, cache):
244264
continue
245265
temp = prect0 | prect1
246266
test = set(
@@ -398,6 +418,7 @@ def join_rects_phase3(bboxes, path_rects):
398418
# the final block bboxes on page
399419
nblocks = [bboxes[0]] # pre-fill with first bbox
400420
bboxes = bboxes[1:] # remaining old bboxes
421+
cache = {}
401422

402423
for i, bb in enumerate(bboxes): # iterate old bboxes
403424
check = False # indicates unwanted joins
@@ -411,7 +432,7 @@ def join_rects_phase3(bboxes, path_rects):
411432
continue
412433

413434
# never join across different background colors
414-
if in_bbox(nbb, path_rects) != in_bbox(bb, path_rects):
435+
if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache(bb, path_rects, cache):
415436
continue
416437

417438
temp = bb | nbb # temporary extension of new block
@@ -434,11 +455,13 @@ def join_rects_phase3(bboxes, path_rects):
434455

435456
# do some elementary cleaning
436457
nblocks = clean_nblocks(nblocks)
458+
if len(nblocks) == 0:
459+
return nblocks
437460

438461
# several phases of rectangle joining
439462
nblocks = join_rects_phase1(nblocks)
440463
nblocks = join_rects_phase2(nblocks)
441-
nblocks = join_rects_phase3(nblocks, path_rects)
464+
nblocks = join_rects_phase3(nblocks, path_rects, cache)
442465

443466
# return identified text bboxes
444467
return nblocks

0 commit comments

Comments
 (0)