@@ -100,6 +100,23 @@ def in_bbox(bb, bboxes):
100100 return i
101101 return 0
102102
def in_bbox_using_cache(bb, bboxes, cache):
    """Return the 1-based number of the first bbox containing bb, else 0.

    Containment is decided by ``bb in bbox`` for each entry of ``bboxes``,
    scanned in order; the position of the first match (counting from 1)
    is returned, or 0 if no entry contains ``bb``.

    Results are stored in ``cache`` for speedup. Entries are keyed by the
    *identity* of ``bb`` and ``bboxes``, so a result is only reused when
    the very same objects are passed again. Mutating ``bb`` or ``bboxes``
    in place after a lookup is not detected.

    Args:
        bb: the object to look for.
        bboxes: sequence of containers tested with ``bb in bbox``.
        cache: a dict owned by the caller; it is read and updated here and
            should be treated as opaque.

    Returns:
        int: 1-based index of the first containing bbox, or 0.
    """
    cache_key = (id(bb), id(bboxes))
    entry = cache.get(cache_key)
    if entry is not None:
        cached_bb, cached_bboxes, cached_index = entry
        # id() values can be reused once an object has been freed, so only
        # trust the entry if it was stored for these exact objects.
        if cached_bb is bb and cached_bboxes is bboxes:
            return cached_index

    index = 0
    for i, bbox in enumerate(bboxes, start=1):
        if bb in bbox:
            index = i
            break

    # Keep references to the keyed objects alongside the result: this pins
    # them in memory so their ids cannot be recycled for other objects
    # while the entry exists, and allows the identity check above.
    cache[cache_key] = (bb, bboxes, index)
    return index
119+
103120 def intersects_bboxes (bb , bboxes ):
104121 """Return True if a bbox touches bb, else return False."""
105122 for bbox in bboxes :
@@ -225,7 +242,7 @@ def join_rects_phase2(bboxes):
225242 new_rects .append (r )
226243 return new_rects
227244
228- def join_rects_phase3 (bboxes , path_rects ):
245+ def join_rects_phase3 (bboxes , path_rects , cache ):
229246 prects = bboxes [:]
230247 new_rects = []
231248
@@ -240,7 +257,7 @@ def join_rects_phase3(bboxes, path_rects):
240257 if prect1 .x0 > prect0 .x1 or prect1 .x1 < prect0 .x0 :
241258 continue
242259 # do not join different backgrounds
243- if in_bbox (prect0 , path_rects ) != in_bbox (prect1 , path_rects ):
260+ if in_bbox_using_cache (prect0 , path_rects , cache ) != in_bbox_using_cache (prect1 , path_rects , cache ):
244261 continue
245262 temp = prect0 | prect1
246263 test = set (
@@ -397,6 +414,7 @@ def join_rects_phase3(bboxes, path_rects):
397414 # the final block bboxes on page
398415 nblocks = [bboxes [0 ]] # pre-fill with first bbox
399416 bboxes = bboxes [1 :] # remaining old bboxes
417+ cache = {}
400418
401419 for i , bb in enumerate (bboxes ): # iterate old bboxes
402420 check = False # indicates unwanted joins
@@ -410,7 +428,7 @@ def join_rects_phase3(bboxes, path_rects):
410428 continue
411429
412430 # never join across different background colors
413- if in_bbox (nbb , path_rects ) != in_bbox (bb , path_rects ):
431+ if in_bbox_using_cache (nbb , path_rects , cache ) != in_bbox_using_cache (bb , path_rects , cache ):
414432 continue
415433
416434 temp = bb | nbb # temporary extension of new block
@@ -437,7 +455,7 @@ def join_rects_phase3(bboxes, path_rects):
437455 # several phases of rectangle joining
438456 nblocks = join_rects_phase1 (nblocks )
439457 nblocks = join_rects_phase2 (nblocks )
440- nblocks = join_rects_phase3 (nblocks , path_rects )
458+ nblocks = join_rects_phase3 (nblocks , path_rects , cache )
441459
442460 # return identified text bboxes
443461 return nblocks
0 commit comments