Skip to content

Commit d4d68b0

Browse files
authored
Merge pull request #234 from pymupdf/v0.0.18
Multiple Fixes
2 parents 0362d28 + 3dd3429 commit d4d68b0

File tree

6 files changed

+265
-184
lines changed

6 files changed

+265
-184
lines changed

pdf4llm/pdf4llm/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
from pymupdf4llm import *
33

44

5-
__version__ = "0.0.9"
6-
version = __version__
7-
version_tuple = tuple(map(int, version.split(".")))
5+
__version__ = pymupdf4llm.__version__
6+
version = pymupdf4llm.version
7+
version_tuple = pymupdf4llm.version_tuple
88

99

1010
def LlamaMarkdownReader(*args, **kwargs):

pdf4llm/setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf4llm"]
16+
requires = ["pymupdf4llm>=0.0.18"]
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.9",
20+
version="0.0.18",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.17"
3+
__version__ = "0.0.18"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -188,9 +188,9 @@ def join_rects_phase1(bboxes):
188188
189189
Joins any rectangles that "touch" each other.
190190
This means that their intersection is valid (but may be empty).
191-
To prefer vertical joins, we will ignore small horizontal gaps.
191+
To prefer vertical joins, we will ignore small gaps.
192192
"""
193-
delta = (0, 0, 0, 2) # allow this gap below
193+
delta = (0, 0, 0, 10) # allow this gap below
194194
prects = bboxes[:]
195195
new_rects = []
196196
while prects:
@@ -199,7 +199,7 @@ def join_rects_phase1(bboxes):
199199
while repeat:
200200
repeat = False
201201
for i in range(len(prects) - 1, 0, -1):
202-
if not ((prect0 + delta) & prects[i]).is_empty:
202+
if ((prect0 + delta) & prects[i]).is_valid:
203203
prect0 |= prects[i]
204204
del prects[i]
205205
repeat = True
@@ -228,11 +228,11 @@ def join_rects_phase2(bboxes):
228228
prects.sort(key=lambda b: (b.x0, b.y0))
229229
new_rects = [prects[0]] # initialize with first item
230230

231-
# walk through the rest, top to bottom, thwn left to right
231+
# walk through the rest, top to bottom, then left to right
232232
for r in prects[1:]:
233233
r0 = new_rects[-1] # previous bbox
234234

235-
# join if we have similar borders and are not to far down
235+
# join if we have similar borders and are not too far down
236236
if (
237237
abs(r.x0 - r0.x0) <= 3
238238
and abs(r.x1 - r0.x1) <= 3
@@ -259,6 +259,7 @@ def join_rects_phase3(bboxes, path_rects, cache):
259259
# do not join across columns
260260
if prect1.x0 > prect0.x1 or prect1.x1 < prect0.x0:
261261
continue
262+
262263
# do not join different backgrounds
263264
if in_bbox_using_cache(prect0, path_rects, cache) != in_bbox_using_cache(prect1, path_rects, cache):
264265
continue
@@ -318,7 +319,7 @@ def join_rects_phase3(bboxes, path_rects, cache):
318319
sort_rects.sort(key=lambda sr: sr[1]) # by computed key
319320
new_rects = [sr[0] for sr in sort_rects] # extract sorted rectangles
320321

321-
# move shaded text rects into a separate list
322+
# move text rects with background color into a separate list
322323
shadow_rects = []
323324
# for i in range(len(new_rects) - 1, 0, -1):
324325
# r = +new_rects[i]

0 commit comments

Comments
 (0)