Skip to content

Commit 6e4dc9e

Browse files
authored
Merge pull request #208 from HDembinski/fast_image_merge
Faster image filter
2 parents b257182 + 83ea4d1 commit 6e4dc9e

File tree

2 files changed

+70
-22
lines changed

2 files changed

+70
-22
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
_build
22
build
3+
*.egg-info
4+
__pycache__
5+
.pytest_cache

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 67 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
1616
Text will be sorted in Western reading order. Any table will be included in
1717
the text in markdown format as well.
18-
18+
1919
Dependencies
2020
-------------
2121
PyMuPDF v1.24.3 or later
@@ -256,6 +256,7 @@ def to_markdown(
256256
ignore_code=False,
257257
extract_words=False,
258258
show_progress=True,
259+
image_extract_algorithm="simple-drop",
259260
) -> str:
260261
"""Process the document and return the text of the selected pages.
261262
@@ -278,6 +279,7 @@ def to_markdown(
278279
ignore_code: (bool) suppress extra formatting for mono-space fonts
279280
extract_words: (bool) include "words"-like output in page chunks
280281
show_progress: (bool) print progress as each page is processed.
282+
image_extract_algorithm: (str) which image extraction algorithm to use, either "simple" or "simple-drop" (the default).
281283
282284
"""
283285
if write_images is False and embed_images is False and force_text is False:
@@ -784,27 +786,12 @@ def get_page_output(doc, pno, margins, textflags):
784786
# make a TextPage for all later extractions
785787
parms.textpage = page.get_textpage(flags=textflags, clip=parms.clip)
786788

787-
# extract images on page
788-
# ignore images contained in some other one (simplified mechanism)
789-
img_info = page.get_image_info()
790-
for i in range(len(img_info)):
791-
item = img_info[i]
792-
item["bbox"] = pymupdf.Rect(item["bbox"]) & parms.clip
793-
img_info[i] = item
794-
795-
# sort descending by image area size
796-
img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True)
797-
# run from back to front (= small to large)
798-
for i in range(len(img_info) - 1, 0, -1):
799-
r = img_info[i]["bbox"]
800-
if r.is_empty:
801-
del img_info[i]
802-
continue
803-
for j in range(i): # image areas larger than r
804-
if r in img_info[j]["bbox"]:
805-
del img_info[i] # contained in some larger image
806-
break
807-
parms.images = img_info
789+
extract_images_on_page = {
790+
"simple": extract_images_on_page_simple,
791+
"simple-drop": extract_images_on_page_simple_drop,
792+
}[image_extract_algorithm]
793+
794+
parms.images = extract_images_on_page(page, parms, image_size_limit)
808795
parms.img_rects = [i["bbox"] for i in parms.images]
809796

810797
# Locate all tables on page
@@ -957,6 +944,64 @@ def get_page_output(doc, pno, margins, textflags):
957944
return document_output
958945

959946

947+
def extract_images_on_page_simple(page, parms, image_size_limit):
    """Return the page's images, ignoring any contained in a larger one.

    Each entry of ``page.get_image_info()`` has its ``"bbox"`` replaced by
    the intersection of that box with ``parms.clip``. Images whose clipped
    box is empty are dropped, and so is every image whose clipped box lies
    inside the clipped box of a larger image (simplified mechanism).

    Args:
        page: object providing ``get_image_info()``, a list of dicts with
            a ``"bbox"`` entry.
        parms: object whose ``clip`` attribute is intersected with every
            image box.
        image_size_limit: not used by this algorithm; accepted so that all
            extraction functions can be called with the same arguments.

    Returns:
        A list of the retained image dicts, sorted descending by the area
        of their clipped bounding box.
    """
    img_info = []
    for item in page.get_image_info():
        bbox = pymupdf.Rect(item["bbox"]) & parms.clip
        # Drop empty boxes up front. The previous back-to-front loop never
        # looked at index 0, so an image lying completely outside the clip
        # could survive with an empty bbox.
        if bbox.is_empty:
            continue
        item["bbox"] = bbox
        img_info.append(item)

    # sort descending by image area size
    img_info.sort(key=lambda img: abs(img["bbox"]), reverse=True)

    # Walk from large to small. Containment is transitive, so comparing
    # against the images kept so far is equivalent to comparing against
    # every larger image.
    kept = []
    for img in img_info:
        bbox = img["bbox"]
        if any(bbox in outer["bbox"] for outer in kept):
            continue  # contained in some larger image
        kept.append(img)

    return kept
970+
971+
972+
def filter_small_images(page, parms, image_size_limit):
    """Collect the page's images that are large enough to be of interest.

    Every image box from ``page.get_image_info()`` is intersected with
    ``parms.clip``. An image is skipped when the clipped box is empty, or
    when both its width and its height, taken as fractions of the page
    width and height, are below ``image_size_limit``.

    Args:
        page: object providing ``get_image_info()`` and ``rect``.
        parms: object whose ``clip`` attribute is intersected with every
            image box.
        image_size_limit: lower bound for the larger of the two relative
            dimensions (width / page width, height / page height).

    Returns:
        A new list with the retained image dicts, in their original order,
        each with ``"bbox"`` replaced by the clipped rectangle.
    """
    page_rect = page.rect
    kept = []
    for info in page.get_image_info():
        clipped = pymupdf.Rect(info["bbox"]) & parms.clip
        if clipped.is_empty:
            continue

        rel_width = clipped.width / page_rect.width
        rel_height = clipped.height / page_rect.height
        if max(rel_width, rel_height) < image_size_limit:
            continue  # too small in both directions

        info["bbox"] = clipped
        kept.append(info)
    return kept
984+
985+
986+
def extract_images_on_page_simple_drop(page, parms, image_size_limit):
    """Return the page's images after dropping small and nested ones.

    The images are first reduced by ``filter_small_images``, which clips
    every bounding box and discards empty or too small images. Of the
    remaining images, any whose box lies inside the box of a larger image
    is ignored as well.

    Args:
        page: passed through to ``filter_small_images``.
        parms: passed through to ``filter_small_images``.
        image_size_limit: passed through to ``filter_small_images``.

    Returns:
        A list of the retained image dicts, sorted descending by the area
        of their clipped bounding box.
    """
    img_info = filter_small_images(page, parms, image_size_limit)

    # sort descending by image area size
    img_info.sort(key=lambda img: abs(img["bbox"]), reverse=True)

    # No emptiness check is needed here: filter_small_images has already
    # removed every image with an empty clipped box.
    # Walk from large to small. Containment is transitive, so comparing
    # against the images kept so far is equivalent to comparing against
    # every larger image, with fewer comparisons.
    kept = []
    for img in img_info:
        bbox = img["bbox"]
        if any(bbox in outer["bbox"] for outer in kept):
            continue  # contained in some larger image
        kept.append(img)

    return kept
1003+
1004+
9601005
if __name__ == "__main__":
9611006
import pathlib
9621007
import sys

0 commit comments

Comments
 (0)