Skip to content

Commit 7a53eb7

Browse files
committed
Multiple Fixes
1 parent e20b9e7 commit 7a53eb7

File tree

6 files changed

+250
-184
lines changed

6 files changed

+250
-184
lines changed

pdf4llm/pdf4llm/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
from pymupdf4llm import *
33

44

5-
__version__ = "0.0.9"
6-
version = __version__
7-
version_tuple = tuple(map(int, version.split(".")))
5+
__version__ = pymupdf4llm.__version__
6+
version = pymupdf4llm.version
7+
version_tuple = pymupdf4llm.version_tuple
88

99

1010
def LlamaMarkdownReader(*args, **kwargs):

pdf4llm/setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf4llm"]
16+
requires = ["pymupdf4llm>=0.0.18"]
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.9",
20+
version="0.0.18",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.17"
3+
__version__ = "0.0.18"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,9 @@ def join_rects_phase1(bboxes):
168168
169169
Joins any rectangles that "touch" each other.
170170
This means that their intersection is valid (but may be empty).
171-
To prefer vertical joins, we will ignore small horizontal gaps.
171+
To prefer vertical joins, we will ignore small gaps.
172172
"""
173-
delta = (0, 0, 0, 2) # allow this gap below
173+
delta = (0, 0, 0, 10) # allow this gap below
174174
prects = bboxes[:]
175175
new_rects = []
176176
while prects:
@@ -179,7 +179,7 @@ def join_rects_phase1(bboxes):
179179
while repeat:
180180
repeat = False
181181
for i in range(len(prects) - 1, 0, -1):
182-
if not ((prect0 + delta) & prects[i]).is_empty:
182+
if ((prect0 + delta) & prects[i]).is_valid:
183183
prect0 |= prects[i]
184184
del prects[i]
185185
repeat = True
@@ -208,11 +208,11 @@ def join_rects_phase2(bboxes):
208208
prects.sort(key=lambda b: (b.x0, b.y0))
209209
new_rects = [prects[0]] # initialize with first item
210210

211-
# walk through the rest, top to bottom, thwn left to right
211+
# walk through the rest, top to bottom, then left to right
212212
for r in prects[1:]:
213213
r0 = new_rects[-1] # previous bbox
214214

215-
# join if we have similar borders and are not to far down
215+
# join if we have similar borders and are not too far down
216216
if (
217217
abs(r.x0 - r0.x0) <= 3
218218
and abs(r.x1 - r0.x1) <= 3
@@ -239,7 +239,7 @@ def join_rects_phase3(bboxes, path_rects):
239239
# do not join across columns
240240
if prect1.x0 > prect0.x1 or prect1.x1 < prect0.x0:
241241
continue
242-
# do not join different backgrounds
242+
# do not join areas with a different background
243243
if in_bbox(prect0, path_rects) != in_bbox(prect1, path_rects):
244244
continue
245245
temp = prect0 | prect1
@@ -297,7 +297,7 @@ def join_rects_phase3(bboxes, path_rects):
297297
sort_rects.sort(key=lambda sr: sr[1]) # by computed key
298298
new_rects = [sr[0] for sr in sort_rects] # extract sorted rectangles
299299

300-
# move shaded text rects into a separate list
300+
# move text rects with background color into a separate list
301301
shadow_rects = []
302302
# for i in range(len(new_rects) - 1, 0, -1):
303303
# r = +new_rects[i]

0 commit comments

Comments
 (0)