Skip to content

Commit 3dd3429

Browse files
authored
Merge branch 'main' into v0.0.18
2 parents 7a53eb7 + 0362d28 commit 3dd3429

File tree

6 files changed

+104
-9
lines changed

6 files changed

+104
-9
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
_build
22
build
3+
*.egg-info
4+
__pycache__
5+
.pytest_cache

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ To create small **chunks of text** - as opposed to generating one large string f
4242

4343
Also new in version 0.0.2 is the optional **extraction of images** and vector graphics: use the parameter `write_images=True`. This will store PNG images in the document's folder, and the Markdown text will appropriately refer to them. The images are named like `"input.pdf-page_number-index.png"`.
4444

45+
# Documentation and API
46+
47+
[Documentation](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html)
48+
49+
[API](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html#pymupdf4llm-api)
50+
4551
# Document Support
4652

4753
While PDF is by far the most important document format worldwide, it is worthwhile mentioning that all examples and helper scripts work in the same way and **_without change_** for [all supported file types](https://pymupdf.readthedocs.io/en/latest/how-to-open-a-file.html#supported-file-types).

pdf4llm/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ Instead of the filename string as above, one can also provide a PyMuPDF `Documen
5050
```python
5151
import pdf4llm
5252

53-
md_read = LlamaMarkdownReader()
53+
md_read = pdf4llm.LlamaMarkdownReader()
5454
data = md_read.load_data("input.pdf")
5555

5656
# The result 'data' is of type List[LlamaIndexDocument]

pymupdf4llm/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ Instead of the filename string as above, one can also provide a PyMuPDF `Documen
5050
```python
5151
import pymupdf4llm
5252

53-
md_read = LlamaMarkdownReader()
53+
md_read = pymupdf4llm.LlamaMarkdownReader()
5454
data = md_read.load_data("input.pdf")
5555

5656
# The result 'data' is of type List[LlamaIndexDocument]

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,23 @@ def in_bbox(bb, bboxes):
100100
return i
101101
return 0
102102

103+
def in_bbox_using_cache(bb, bboxes, cache):
    """Return the 1-based number of the first bbox containing bb, else 0.

    Results are stored in ``cache`` so that repeated look-ups of the same
    rectangle against the same list of bboxes are not recomputed.

    Args:
        bb: the rectangle to look up; must be iterable (its coordinates
            form part of the cache key) and usable as ``bb in bbox``.
        bboxes: sequence of candidate containers, checked in order.
        cache: mutable mapping owned by the caller; it is read and updated.

    Returns:
        The 1-based position of the first item of ``bboxes`` that contains
        ``bb``, or 0 if no item does.
    """
    # Key on the coordinates of bb rather than on id(bb): an object id can
    # be reused once the object has been freed, and it does not change when
    # a rectangle is modified in place - both would yield stale results.
    # NOTE(review): bboxes is still identified by id(); this assumes the
    # same list object stays alive and unmodified while the cache is in use.
    cache_key = (tuple(bb), id(bboxes))
    cached = cache.get(cache_key)
    if cached is not None:
        return cached

    index = 0
    for i, bbox in enumerate(bboxes, start=1):
        if bb in bbox:
            index = i
            break

    cache[cache_key] = index
    return index
119+
103120
def intersects_bboxes(bb, bboxes):
104121
"""Return True if a bbox touches bb, else return False."""
105122
for bbox in bboxes:
@@ -140,6 +157,9 @@ def clean_nblocks(nblocks):
140157
if bb0 == bb1:
141158
del nblocks[i]
142159

160+
if len(nblocks) == 0:
161+
return nblocks
162+
143163
# 2. repair sequence in special cases:
144164
# consecutive bboxes with almost same bottom value are sorted ascending
145165
# by x-coordinate.
@@ -225,7 +245,7 @@ def join_rects_phase2(bboxes):
225245
new_rects.append(r)
226246
return new_rects
227247

228-
def join_rects_phase3(bboxes, path_rects):
248+
def join_rects_phase3(bboxes, path_rects, cache):
229249
prects = bboxes[:]
230250
new_rects = []
231251

@@ -239,15 +259,17 @@ def join_rects_phase3(bboxes, path_rects):
239259
# do not join across columns
240260
if prect1.x0 > prect0.x1 or prect1.x1 < prect0.x0:
241261
continue
242-
# do not join areas with a different background
243-
if in_bbox(prect0, path_rects) != in_bbox(prect1, path_rects):
262+
263+
# do not join different backgrounds
264+
if in_bbox_using_cache(prect0, path_rects, cache) != in_bbox_using_cache(prect1, path_rects, cache):
244265
continue
245266
temp = prect0 | prect1
246267
test = set(
247268
[tuple(b) for b in prects + new_rects if b.intersects(temp)]
248269
)
249270
if test == set((tuple(prect0), tuple(prect1))):
250271
prect0 |= prect1
272+
prects[0] = prect0
251273
del prects[i]
252274
repeat = True
253275
new_rects.append(prect0)
@@ -397,6 +419,7 @@ def join_rects_phase3(bboxes, path_rects):
397419
# the final block bboxes on page
398420
nblocks = [bboxes[0]] # pre-fill with first bbox
399421
bboxes = bboxes[1:] # remaining old bboxes
422+
cache = {}
400423

401424
for i, bb in enumerate(bboxes): # iterate old bboxes
402425
check = False # indicates unwanted joins
@@ -410,7 +433,7 @@ def join_rects_phase3(bboxes, path_rects):
410433
continue
411434

412435
# never join across different background colors
413-
if in_bbox(nbb, path_rects) != in_bbox(bb, path_rects):
436+
if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache(bb, path_rects, cache):
414437
continue
415438

416439
temp = bb | nbb # temporary extension of new block
@@ -433,11 +456,13 @@ def join_rects_phase3(bboxes, path_rects):
433456

434457
# do some elementary cleaning
435458
nblocks = clean_nblocks(nblocks)
459+
if len(nblocks) == 0:
460+
return nblocks
436461

437462
# several phases of rectangle joining
438463
nblocks = join_rects_phase1(nblocks)
439464
nblocks = join_rects_phase2(nblocks)
440-
nblocks = join_rects_phase3(nblocks, path_rects)
465+
nblocks = join_rects_phase3(nblocks, path_rects, cache)
441466

442467
# return identified text bboxes
443468
return nblocks

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ def to_markdown(
253253
extract_words=False,
254254
show_progress=False,
255255
):
256+
) -> str:
256257
"""Process the document and return the text of the selected pages.
257258
258259
Args:
@@ -274,6 +275,7 @@ def to_markdown(
274275
ignore_code: (bool) suppress code-like formatting (mono-space fonts)
275276
extract_words: (bool) include "words"-like output in page chunks
276277
show_progress: (bool) print progress as each page is processed.
278+
image_extract_algorithm: (str) which algorithm to use "simple" or "simple-drop".
277279
278280
"""
279281
if write_images is False and embed_images is False and force_text is False:
@@ -583,8 +585,8 @@ def write_text(
583585
cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len(
584586
span0["text"]
585587
)
586-
if cwidth == 0:
587-
cwidth = 1
588+
if cwidth == 0.0:
589+
cwidth = span0["size"] * 0.5
588590
text = " " * int(round(dist / cwidth)) + text
589591

590592
out_string += text
@@ -852,6 +854,7 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
852854
del img_info[i] # contained in some larger image
853855
break
854856
parms.images = img_info
857+
855858
parms.img_rects = [i["bbox"] for i in parms.images]
856859

857860
# Locate all tables on page
@@ -1015,6 +1018,64 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
10151018
return document_output
10161019

10171020

1021+
def extract_images_on_page_simple(page, parms, image_size_limit):
    """Return the page's image info items, dropping nested images.

    Every item returned by ``page.get_image_info()`` gets its "bbox"
    replaced by that bbox intersected with ``parms.clip``. The items are
    then sorted descending by bbox area. Walking from the smallest item
    towards the second-largest one, an item is removed if its bbox is
    empty or is contained in the bbox of an item sorted before it.
    The first (largest) item is never examined and is always kept.

    Args:
        page: object providing ``get_image_info()``.
        parms: object providing the clip rectangle as ``parms.clip``.
        image_size_limit: accepted but not used by this function.

    Returns:
        The filtered list of image info items.
    """
    images = page.get_image_info()
    for entry in images:
        # restrict each image rectangle to the clip area
        entry["bbox"] = pymupdf.Rect(entry["bbox"]) & parms.clip

    # largest image areas first
    images.sort(key=lambda entry: abs(entry["bbox"]), reverse=True)

    # visit items from smallest to second-largest; index 0 is not visited
    idx = len(images) - 1
    while idx > 0:
        rect = images[idx]["bbox"]
        # empty, or contained in one of the larger image areas
        if rect.is_empty or any(
            rect in images[k]["bbox"] for k in range(idx)
        ):
            del images[idx]
        idx -= 1

    return images
1044+
1045+
1046+
def filter_small_images(page, parms, image_size_limit):
    """Return the page's image info items that are large enough.

    Each item's bbox is intersected with ``parms.clip``. An item is skipped
    if that intersection is empty, or if the larger of its width relative
    to the page width and its height relative to the page height is below
    ``image_size_limit``. Kept items have their "bbox" replaced by the
    clipped rectangle.

    Args:
        page: object providing ``get_image_info()`` and ``rect``.
        parms: object providing the clip rectangle as ``parms.clip``.
        image_size_limit: minimum relative size for an image to be kept.

    Returns:
        List of the kept image info items, in their original order.
    """
    kept = []
    for entry in page.get_image_info():
        clipped = pymupdf.Rect(entry["bbox"]) & parms.clip
        if clipped.is_empty:
            continue  # nothing of this image lies inside the clip

        # larger of the two page-relative dimensions
        relative_size = max(
            clipped.width / page.rect.width,
            clipped.height / page.rect.height,
        )
        if relative_size < image_size_limit:
            continue  # too small in both directions

        entry["bbox"] = clipped
        kept.append(entry)
    return kept
1058+
1059+
1060+
def extract_images_on_page_simple_drop(page, parms, image_size_limit):
    """Return sufficiently large, non-nested image info items of the page.

    The candidates come from ``filter_small_images``. They are sorted
    descending by bbox area. Walking from the smallest item towards the
    second-largest one, an item is removed if its bbox is empty or is
    contained in the bbox of an item sorted before it. The first (largest)
    item is never examined and is always kept.

    Args:
        page: passed on to ``filter_small_images``.
        parms: passed on to ``filter_small_images``.
        image_size_limit: passed on to ``filter_small_images``.

    Returns:
        The filtered list of image info items.
    """
    images = filter_small_images(page, parms, image_size_limit)

    # largest image areas first
    images.sort(key=lambda entry: abs(entry["bbox"]), reverse=True)

    # visit positions len-1 .. 1, i.e. from small to large
    for pos in reversed(range(1, len(images))):
        rect = images[pos]["bbox"]
        if rect.is_empty:
            del images[pos]
            continue
        # bboxes of all items with a larger area than rect
        larger = (images[k]["bbox"] for k in range(pos))
        if any(rect in big for big in larger):
            del images[pos]  # contained in some larger image

    return images
1077+
1078+
10181079
if __name__ == "__main__":
10191080
import pathlib
10201081
import sys

0 commit comments

Comments
 (0)