Skip to content

Commit 48bc28d

Browse files
committed
Update pymupdf_rag.py
1 parent eb377bc commit 48bc28d

File tree

1 file changed

+22
-24
lines changed

1 file changed

+22
-24
lines changed

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828

2929
import os
3030
import string
31-
import typing
3231

3332
try:
3433
import pymupdf as fitz # available with v1.24.3
@@ -50,8 +49,8 @@ class IdentifyHeaders:
5049

5150
def __init__(
5251
self,
53-
doc: fitz.Document | str,
54-
pages: list | range | None = None,
52+
doc: str,
53+
pages: list = None,
5554
body_limit: float = 12,
5655
):
5756
"""Read all text and make a dictionary of fontsizes.
@@ -113,7 +112,7 @@ def __init__(
113112
for i, size in enumerate(sizes):
114113
self.header_id[size] = "#" * (i + 1) + " "
115114

116-
def get_header_id(self, span: dict, **kwargs) -> str:
115+
def get_header_id(self, span: dict, page=None) -> str:
117116
"""Return appropriate markdown header prefix.
118117
119118
Given a text span from a "dict"/"rawdict" extraction, determine the
@@ -125,14 +124,14 @@ def get_header_id(self, span: dict, **kwargs) -> str:
125124

126125

127126
def to_markdown(
128-
doc: fitz.Document | str,
127+
doc: str,
129128
*,
130-
pages: list | range | None = None,
131-
hdr_info: typing.Any = None,
129+
pages: list = None,
130+
hdr_info=None,
132131
write_images: bool = False,
133132
page_chunks: bool = False,
134-
margins: float | typing.Iterable = (0, 50, 0, 50),
135-
) -> str | list[dict]:
133+
margins: typing.Iterable = (0, 50, 0, 50),
134+
) -> str:
136135
"""Process the document and return the text of its selected pages."""
137136

138137
if isinstance(doc, str):
@@ -188,9 +187,9 @@ def write_text(
188187
textpage: fitz.TextPage,
189188
clip: fitz.Rect,
190189
tabs=None,
191-
tab_rects: dict | None = None,
192-
img_rects: dict | None = None,
193-
links: list | None = None,
190+
tab_rects: dict = None,
191+
img_rects: dict = None,
192+
links: list = None,
194193
) -> string:
195194
"""Output the text found inside the given clip.
196195
@@ -289,15 +288,15 @@ def write_text(
289288
prev_lrect = lrect
290289

291290
# if line is a header, this will return multiple "#" characters
292-
hdr_string = get_header_id(span0)
291+
hdr_string = get_header_id(span0, page=page)
293292

294293
# intercept if header text has been broken in multiple lines
295294
if hdr_string and hdr_string == prev_hdr_string:
296295
out_string = out_string[:-1] + " " + text + "\n"
297296
continue
298297

299298
prev_hdr_string = hdr_string
300-
if hdr_string.startswith("#"): # if a header output and skip the rest
299+
if hdr_string.startswith("#"): # if this is a header line, output it and skip the rest
301300
out_string += hdr_string + text + "\n"
302301
continue
303302

@@ -421,29 +420,28 @@ def get_page_output(doc, pno, margins, textflags):
421420
doc: fitz.Document
422421
pno: 0-based page number
423422
textflags: text extraction flag bits
424-
images: store image information here
425-
tables: store table information here
426-
graphics: store graphics information here
427423
428424
Returns:
429-
Markdown string of page content.
425+
Markdown string of page content and image, table and vector
426+
graphics information.
430427
"""
431428
page = doc[pno]
432429
md_string = ""
433-
430+
left, top, right, bottom = margins
431+
clip = page.rect + (left, top, -right, -bottom)
434432
# extract all links on page
435433
links = [l for l in page.get_links() if l["kind"] == 2]
436434

437435
# make a TextPage for all later extractions
438-
textpage = page.get_textpage(flags=textflags)
436+
textpage = page.get_textpage(flags=textflags, clip=clip)
439437

440-
img_info = page.get_image_info()
438+
img_info = [img for img in page.get_image_info() if img["bbox"] in clip]
441439
images = img_info[:]
442440
tables = []
443441
graphics = []
444442

445443
# Locate all tables on page
446-
tabs = page.find_tables(strategy="lines_strict")
444+
tabs = page.find_tables(clip=clip, strategy="lines_strict")
447445

448446
# Make a list of table boundary boxes.
449447
# Must include the header bbox (may exist outside tab.bbox)
@@ -485,7 +483,7 @@ def get_page_output(doc, pno, margins, textflags):
485483
vg_clusters.append(bbox)
486484

487485
actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters)]
488-
print(f"before: {len(vg_clusters)=}")
486+
489487
vg_clusters0 = [
490488
r
491489
for r in vg_clusters
@@ -496,8 +494,8 @@ def get_page_output(doc, pno, margins, textflags):
496494
vg_clusters0 += [fitz.Rect(i["bbox"]) for i in img_info]
497495

498496
vg_clusters = dict((i, r) for i, r in enumerate(vg_clusters0))
497+
499498
# Determine text column bboxes on page, avoiding tables and graphics
500-
print(f"{len(tab_rects0)=}, {len(vg_clusters0)=}")
501499
text_rects = column_boxes(
502500
page,
503501
paths=actual_paths,

0 commit comments

Comments
 (0)