Skip to content

Commit 0609def

Browse files
authored
Merge pull request #63 from pymupdf/version-0.0.6
Changes for v0.0.7
2 parents 35f82cc + 3b3c13b commit 0609def

File tree

3 files changed

+42
-31
lines changed

3 files changed

+42
-31
lines changed

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.6"
3+
__version__ = "0.0.7"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 40 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -187,19 +187,15 @@ def to_markdown(
187187
if len(margins) == 2:
188188
margins = (0, margins[0], 0, margins[1])
189189
if len(margins) != 4:
190-
raise ValueError(
191-
"margins must be a float or a sequence of 2 or 4 floats"
192-
)
190+
raise ValueError("margins must be a float or a sequence of 2 or 4 floats")
193191
elif not all([hasattr(m, "__float__") for m in margins]):
194192
raise ValueError("margin values must be floats")
195193

196194
# If "hdr_info" is not an object having method "get_header_id", scan the
197195
# document and use font sizes as header level indicators.
198196
if callable(hdr_info):
199197
get_header_id = hdr_info
200-
elif hasattr(hdr_info, "get_header_id") and callable(
201-
hdr_info.get_header_id
202-
):
198+
elif hasattr(hdr_info, "get_header_id") and callable(hdr_info.get_header_id):
203199
get_header_id = hdr_info.get_header_id
204200
else:
205201
hdr_info = IdentifyHeaders(doc)
@@ -382,9 +378,7 @@ def write_text(
382378
if ltext:
383379
text = f"{hdr_string}{prefix}{ltext}{suffix} "
384380
else:
385-
text = (
386-
f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
387-
)
381+
text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
388382

389383
if text.startswith(bullet):
390384
text = "- " + text[1:]
@@ -397,9 +391,7 @@ def write_text(
397391
code = False
398392

399393
return (
400-
out_string.replace(" \n", "\n")
401-
.replace(" ", " ")
402-
.replace("\n\n\n", "\n\n")
394+
out_string.replace(" \n", "\n").replace(" ", " ").replace("\n\n\n", "\n\n")
403395
)
404396

405397
def is_in_rects(rect, rect_list):
@@ -486,7 +478,9 @@ def get_page_output(doc, pno, margins, textflags):
486478
if GRAPHICS_LIMIT is not None:
487479
test_paths = page.get_cdrawings()
488480
if (excess := len(test_paths)) > GRAPHICS_LIMIT:
489-
md_string = f"\n**Ignoring page {page.number} with {excess} vector graphics.**"
481+
md_string = (
482+
f"\n**Ignoring page {page.number} with {excess} vector graphics.**"
483+
)
490484
md_string += "\n\n-----\n\n"
491485
return md_string, [], [], []
492486
left, top, right, bottom = margins
@@ -497,9 +491,7 @@ def get_page_output(doc, pno, margins, textflags):
497491
# make a TextPage for all later extractions
498492
textpage = page.get_textpage(flags=textflags, clip=clip)
499493

500-
img_info = [
501-
img for img in page.get_image_info() if img["bbox"] in clip
502-
]
494+
img_info = [img for img in page.get_image_info() if img["bbox"] in clip]
503495
images = img_info[:]
504496
tables = []
505497
graphics = []
@@ -533,24 +525,42 @@ def get_page_output(doc, pno, margins, textflags):
533525
and p["rect"].height < page_clip.height
534526
]
535527

536-
# Determine vector graphics outside any tables, ignoring any
537-
# fill-only (type "f") paths.
538-
vg_clusters = []
528+
# We also ignore vector graphics that only represent "background
529+
# sugar".
530+
vg_clusters = [] # worthwhile vector graphics go here
531+
532+
# walk through all vector graphics not belonging to a table
539533
for bbox in page.cluster_drawings(drawings=paths):
534+
subbox = bbox + (3, 3, -3, -3) # sub rect without any border
535+
box_area = abs(bbox)
540536
include = False
541-
for p in [p for p in paths if p["rect"] in bbox]:
542-
if p["type"] != "f":
537+
for p in paths:
538+
mp = (p["rect"].tl + p["rect"].br) / 2 # center point of rect
539+
540+
# fill-only paths or being part of the border will not
541+
# make this a worthwhile vector graphic
542+
if mp not in subbox or p["type"] == "f":
543+
continue
544+
545+
# this checks if all items are part of the bbox border
546+
near_border = set()
547+
for itm in p["items"]: # walk through path items
548+
if itm[0] == "re": # a full-sized rectangle
549+
if abs(item[1]) / box_area < 1e-3:
550+
near_border.add(True) # is part of the border
551+
elif itm[0] in ("c", "l"): # curves and lines
552+
for temp in itm[1:]:
553+
# if their points are on the border
554+
near_border.add(temp not in subbox)
555+
# if any stroked path has a point inside bbox (i.e. not on its
556+
# border), then this vector graphic is treated as significant
557+
if not near_border == {True}:
543558
include = True
544559
break
545-
if [item[0] for item in p["items"] if item[0] == "c"]:
546-
include = True
547-
break
548-
if include is True:
560+
if include is True: # this box is a significant vector graphic
549561
vg_clusters.append(bbox)
550562

551-
actual_paths = [
552-
p for p in paths if is_in_rects(p["rect"], vg_clusters)
553-
]
563+
actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters)]
554564

555565
vg_clusters0 = [
556566
r
@@ -594,6 +604,7 @@ def get_page_output(doc, pno, margins, textflags):
594604
links=links,
595605
)
596606

607+
md_string = md_string.replace(" ,", ",").replace("-\n", "")
597608
# write any remaining tables and images
598609
md_string += output_tables(tabs, None, tab_rects)
599610
md_string += output_images(None, tab_rects, None)
@@ -609,7 +620,7 @@ def get_page_output(doc, pno, margins, textflags):
609620

610621
# read the Table of Contents
611622
toc = doc.get_toc()
612-
textflags = fitz.TEXT_DEHYPHENATE | fitz.TEXT_MEDIABOX_CLIP
623+
textflags = fitz.TEXT_MEDIABOX_CLIP
613624
for pno in pages:
614625
page_output, images, tables, graphics = get_page_output(
615626
doc, pno, margins, textflags

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pymupdf4llm",
20-
version="0.0.6",
20+
version="0.0.7",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

0 commit comments

Comments
 (0)