@@ -100,6 +100,23 @@ def in_bbox(bb, bboxes):
100100 return i
101101 return 0
102102
def in_bbox_using_cache(bb, bboxes, cache):
    """Return the 1-based number of the first bbox containing bb, else 0.

    Results are stored in ``cache`` for speedup, so repeated queries for
    the same rectangle against the same ``bboxes`` list skip the scan.

    Args:
        bb: the rectangle to look up. It must be iterable over its
            coordinates (``tuple(bb)`` is used to build the cache key) and
            usable on the left side of ``bb in bbox``.
        bboxes: sequence of rectangles that support ``bb in bbox``. The
            list must stay alive and unmodified for as long as ``cache``
            is in use, because it is identified by ``id(bboxes)``.
        cache: dict that is read and updated in place.

    Returns:
        The 1-based position of the first item of ``bboxes`` containing
        ``bb``, or 0 if no item contains it.
    """
    # Key on the rectangle's coordinates rather than id(bb): an id is only
    # unique while the object is alive, so short-lived temporary rectangles
    # could reuse the id of a dead one and pick up its stale result.  The
    # same happens when a rectangle is modified in place.
    cache_key = (tuple(bb), id(bboxes))
    cached = cache.get(cache_key)
    if cached is not None:  # 0 is a valid cached answer, so compare to None
        return cached

    # first containing bbox wins; 0 signals "not contained anywhere"
    index = next(
        (i for i, bbox in enumerate(bboxes, start=1) if bb in bbox),
        0,
    )
    cache[cache_key] = index
    return index
119+
103120 def intersects_bboxes (bb , bboxes ):
104121 """Return True if a bbox touches bb, else return False."""
105122 for bbox in bboxes :
@@ -140,6 +157,9 @@ def clean_nblocks(nblocks):
140157 if bb0 == bb1 :
141158 del nblocks [i ]
142159
160+ if len (nblocks ) == 0 :
161+ return nblocks
162+
143163 # 2. repair sequence in special cases:
144164 # consecutive bboxes with almost same bottom value are sorted ascending
145165 # by x-coordinate.
@@ -225,7 +245,7 @@ def join_rects_phase2(bboxes):
225245 new_rects .append (r )
226246 return new_rects
227247
228- def join_rects_phase3 (bboxes , path_rects ):
248+ def join_rects_phase3 (bboxes , path_rects , cache ):
229249 prects = bboxes [:]
230250 new_rects = []
231251
@@ -240,7 +260,7 @@ def join_rects_phase3(bboxes, path_rects):
240260 if prect1 .x0 > prect0 .x1 or prect1 .x1 < prect0 .x0 :
241261 continue
242262 # do not join different backgrounds
243- if in_bbox (prect0 , path_rects ) != in_bbox (prect1 , path_rects ):
263+ if in_bbox_using_cache (prect0 , path_rects , cache ) != in_bbox_using_cache (prect1 , path_rects , cache ):
244264 continue
245265 temp = prect0 | prect1
246266 test = set (
@@ -398,6 +418,7 @@ def join_rects_phase3(bboxes, path_rects):
398418 # the final block bboxes on page
399419 nblocks = [bboxes [0 ]] # pre-fill with first bbox
400420 bboxes = bboxes [1 :] # remaining old bboxes
421+ cache = {}
401422
402423 for i , bb in enumerate (bboxes ): # iterate old bboxes
403424 check = False # indicates unwanted joins
@@ -411,7 +432,7 @@ def join_rects_phase3(bboxes, path_rects):
411432 continue
412433
413434 # never join across different background colors
414- if in_bbox (nbb , path_rects ) != in_bbox (bb , path_rects ):
435+ if in_bbox_using_cache (nbb , path_rects , cache ) != in_bbox_using_cache (bb , path_rects , cache ):
415436 continue
416437
417438 temp = bb | nbb # temporary extension of new block
@@ -434,11 +455,13 @@ def join_rects_phase3(bboxes, path_rects):
434455
435456 # do some elementary cleaning
436457 nblocks = clean_nblocks (nblocks )
458+ if len (nblocks ) == 0 :
459+ return nblocks
437460
438461 # several phases of rectangle joining
439462 nblocks = join_rects_phase1 (nblocks )
440463 nblocks = join_rects_phase2 (nblocks )
441- nblocks = join_rects_phase3 (nblocks , path_rects )
464+ nblocks = join_rects_phase3 (nblocks , path_rects , cache )
442465
443466 # return identified text bboxes
444467 return nblocks
0 commit comments