Skip to content

Commit 9bfc056

Browse files
authored
Merge pull request #28 from pymupdf/update-image-inclusion
Update image inclusion
2 parents 49d5f32 + 48bc28d commit 9bfc056

File tree

1 file changed

+155
-60
lines changed

1 file changed

+155
-60
lines changed

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 155 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,19 @@
4040
if fitz.pymupdf_version_tuple < (1, 24, 2):
4141
raise NotImplementedError("PyMuPDF version 1.24.2 or later is needed.")
4242

43-
bullet = ("* ", chr(0xF0B7), chr(0xB7), chr(8226), chr(9679))
43+
bullet = ("- ", "* ", chr(0xF0A7), chr(0xF0B7), chr(0xB7), chr(8226), chr(9679))
4444
GRAPHICS_TEXT = "\n![%s](%s)\n"
4545

4646

4747
class IdentifyHeaders:
4848
"""Compute data for identifying header text."""
4949

50-
def __init__(self, doc, pages: list = None, body_limit: float = None):
50+
def __init__(
51+
self,
52+
doc: str,
53+
pages: list = None,
54+
body_limit: float = 12,
55+
):
5156
"""Read all text and make a dictionary of fontsizes.
5257
5358
Args:
@@ -85,53 +90,74 @@ def __init__(self, doc, pages: list = None, body_limit: float = None):
8590
self.header_id = {}
8691

8792
# If not provided, choose the most frequent font size as body text.
88-
# If no text at all on all pages, just use 12
89-
if body_limit is None:
90-
temp = sorted(
91-
[(k, v) for k, v in fontsizes.items()],
92-
key=lambda i: i[1],
93-
reverse=True,
94-
)
95-
if temp:
96-
body_limit = temp[0][0]
97-
else:
98-
body_limit = 12
93+
# If no text at all on all pages, just use 12.
94+
# In any case, font sizes not exceeding body_limit are never header candidates.
95+
temp = sorted(
96+
[(k, v) for k, v in fontsizes.items()],
97+
key=lambda i: i[1],
98+
reverse=True,
99+
)
100+
if temp:
101+
b_limit = max(body_limit, temp[0][0])
102+
else:
103+
b_limit = body_limit
99104

100-
sizes = sorted([f for f in fontsizes.keys() if f > body_limit], reverse=True)
105+
# identify up to 6 font sizes as header candidates
106+
sizes = sorted(
107+
[f for f in fontsizes.keys() if f > b_limit],
108+
reverse=True,
109+
)[:6]
101110

102111
# make the header tag dictionary
103112
for i, size in enumerate(sizes):
104113
self.header_id[size] = "#" * (i + 1) + " "
105114

106-
def get_header_id(self, span):
115+
def get_header_id(self, span: dict, page=None) -> str:
107116
"""Return appropriate markdown header prefix.
108117
109-
Given a text span from a "dict"/"radict" extraction, determine the
110-
markdown header prefix string of 0 to many concatenated '#' characters.
118+
Given a text span from a "dict"/"rawdict" extraction, determine the
119+
markdown header prefix string of 0 to n concatenated '#' characters.
111120
"""
112121
fontsize = round(span["size"]) # compute fontsize
113122
hdr_id = self.header_id.get(fontsize, "")
114123
return hdr_id
115124

116125

117126
def to_markdown(
118-
doc: fitz.Document | str,
127+
doc: str,
119128
*,
120-
pages: list | range | None = None,
121-
hdr_info: IdentifyHeaders | None = None,
129+
pages: list = None,
130+
hdr_info=None,
122131
write_images: bool = False,
123132
page_chunks: bool = False,
124-
) -> str | list[dict]:
133+
margins: typing.Iterable = (0, 50, 0, 50),
134+
) -> str:
125135
"""Process the document and return the text of its selected pages."""
126136

127137
if isinstance(doc, str):
128138
doc = fitz.open(doc)
129139

130-
if not pages: # use all pages if argument not given
131-
pages = range(doc.page_count)
132-
133-
if not isinstance(hdr_info, IdentifyHeaders):
140+
if pages is None: # use all pages if no selection given
141+
pages = list(range(doc.page_count))
142+
143+
if hasattr(margins, "__float__"):
144+
margins = [margins] * 4
145+
if len(margins) == 2:
146+
margins = (0, margins[0], 0, margins[1])
147+
if len(margins) != 4:
148+
raise ValueError("margins must have length 2 or 4 or be a number.")
149+
elif not all([hasattr(m, "__float__") for m in margins]):
150+
raise ValueError("margin values must be numbers")
151+
152+
# If "hdr_info" is not an object having method "get_header_id", scan the
153+
# document and use font sizes as header level indicators.
154+
if callable(hdr_info):
155+
get_header_id = hdr_info
156+
elif hasattr(hdr_info, "get_header_id") and callable(hdr_info.get_header_id):
157+
get_header_id = hdr_info.get_header_id
158+
else:
134159
hdr_info = IdentifyHeaders(doc)
160+
get_header_id = hdr_info.get_header_id
135161

136162
def resolve_links(links, span):
137163
"""Accept a span and return a markdown link string."""
@@ -146,27 +172,24 @@ def resolve_links(links, span):
146172
return text
147173

148174
def save_image(page, rect, i):
149-
"""Optionally render the rect part of a page.
150-
151-
In any case return the image filename.
152-
"""
175+
"""Optionally render the rect part of a page."""
153176
filename = page.parent.name.replace("\\", "/")
154177
image_path = f"{filename}-{page.number}-{i}.png"
155178
if write_images is True:
156179
pix = page.get_pixmap(clip=rect)
157180
pix.save(image_path)
158181
del pix
159-
return os.path.basename(image_path)
182+
return os.path.basename(image_path)
183+
return ""
160184

161185
def write_text(
162186
page: fitz.Page,
163187
textpage: fitz.TextPage,
164188
clip: fitz.Rect,
165189
tabs=None,
166-
tab_rects: dict | None = None,
167-
img_rects: dict | None = None,
168-
links: list | None = None,
169-
hdr_info=None,
190+
tab_rects: dict = None,
191+
img_rects: dict = None,
192+
links: list = None,
170193
) -> string:
171194
"""Output the text found inside the given clip.
172195
@@ -227,7 +250,8 @@ def write_text(
227250
key=lambda j: (j[1].y1, j[1].x0),
228251
):
229252
pathname = save_image(page, img_rect, i)
230-
out_string += GRAPHICS_TEXT % (pathname, pathname)
253+
if pathname:
254+
out_string += GRAPHICS_TEXT % (pathname, pathname)
231255
del img_rects[i]
232256

233257
text = " ".join([s["text"] for s in spans])
@@ -247,11 +271,11 @@ def write_text(
247271
out_string += indent + text + "\n"
248272
continue # done with this line
249273

250-
bno = spans[0]["block"] # block number of line
274+
span0 = spans[0]
275+
bno = span0["block"] # block number of line
251276
if bno != prev_bno:
252277
out_string += "\n"
253278
prev_bno = bno
254-
span0 = spans[0]
255279

256280
if ( # check if we need another line break
257281
prev_lrect
@@ -264,19 +288,24 @@ def write_text(
264288
prev_lrect = lrect
265289

266290
# if line is a header, this will return multiple "#" characters
267-
hdr_string = hdr_info.get_header_id(spans[0])
291+
hdr_string = get_header_id(span0, page=page)
268292

269293
# intercept if header text has been broken in multiple lines
270294
if hdr_string and hdr_string == prev_hdr_string:
271295
out_string = out_string[:-1] + " " + text + "\n"
272296
continue
297+
273298
prev_hdr_string = hdr_string
299+
if hdr_string.startswith("#"): # if a header line skip the rest
300+
out_string += hdr_string + text + "\n"
301+
continue
302+
303+
# this line is not all-mono, so switch off "code" mode
304+
if code: # still in code output mode?
305+
out_string += "```\n" # switch of code mode
306+
code = False
274307

275308
for i, s in enumerate(spans): # iterate spans of the line
276-
# this line is not all-mono, so switch off "code" mode
277-
if code: # still in code output mode?
278-
out_string += "```\n" # switch of code mode
279-
code = False
280309
# decode font properties
281310
mono = s["flags"] & 8
282311
bold = s["flags"] & 16
@@ -312,6 +341,7 @@ def write_text(
312341
if code:
313342
out_string += "```\n" # switch of code mode
314343
code = False
344+
315345
return (
316346
out_string.replace(" \n", "\n").replace(" ", " ").replace("\n\n\n", "\n\n")
317347
)
@@ -361,7 +391,8 @@ def output_images(page, text_rect, img_rects):
361391
key=lambda j: (j[1].y1, j[1].x0),
362392
):
363393
pathname = save_image(page, img_rect, i)
364-
this_md += GRAPHICS_TEXT % (pathname, pathname)
394+
if pathname:
395+
this_md += GRAPHICS_TEXT % (pathname, pathname)
365396
del img_rects[i] # do not touch this image twice
366397

367398
else: # output all remaining table
@@ -370,7 +401,8 @@ def output_images(page, text_rect, img_rects):
370401
key=lambda j: (j[1].y1, j[1].x0),
371402
):
372403
pathname = save_image(page, img_rect, i)
373-
this_md += GRAPHICS_TEXT % (pathname, pathname)
404+
if pathname:
405+
this_md += GRAPHICS_TEXT % (pathname, pathname)
374406
del img_rects[i] # do not touch this image twice
375407
return this_md
376408

@@ -381,48 +413,93 @@ def get_metadata(doc, pno):
381413
meta["page"] = pno + 1
382414
return meta
383415

384-
def get_page_output(doc, pno, textflags):
385-
"""Process one page."""
416+
def get_page_output(doc, pno, margins, textflags):
417+
"""Process one page.
418+
419+
Args:
420+
doc: fitz.Document
421+
pno: 0-based page number
422+
textflags: text extraction flag bits
423+
424+
Returns:
425+
Markdown string of page content and image, table and vector
426+
graphics information.
427+
"""
386428
page = doc[pno]
387429
md_string = ""
388-
430+
left, top, right, bottom = margins
431+
clip = page.rect + (left, top, -right, -bottom)
389432
# extract all links on page
390433
links = [l for l in page.get_links() if l["kind"] == 2]
391434

392435
# make a TextPage for all later extractions
393-
textpage = page.get_textpage(flags=textflags)
436+
textpage = page.get_textpage(flags=textflags, clip=clip)
437+
438+
img_info = [img for img in page.get_image_info() if img["bbox"] in clip]
439+
images = img_info[:]
440+
tables = []
441+
graphics = []
394442

395443
# Locate all tables on page
396-
tabs = page.find_tables()
444+
tabs = page.find_tables(clip=clip, strategy="lines_strict")
397445

398446
# Make a list of table boundary boxes.
399447
# Must include the header bbox (may exist outside tab.bbox)
400448
tab_rects = {}
401449
for i, t in enumerate(tabs):
402450
tab_rects[i] = fitz.Rect(t.bbox) | fitz.Rect(t.header.bbox)
451+
tab_dict = {
452+
"bbox": tuple(tab_rects[i]),
453+
"rows": t.row_count,
454+
"columns": t.col_count,
455+
}
456+
tables.append(tab_dict)
403457
tab_rects0 = list(tab_rects.values())
404458

405459
# Select paths that are not contained in any table
406460
page_clip = page.rect + (36, 36, -36, -36) # ignore full page graphics
407461
paths = [
408462
p
409463
for p in page.get_drawings()
410-
if not intersects_rects(p["rect"], tab_rects0) and p["rect"] in page_clip
464+
if not intersects_rects(p["rect"], tab_rects0)
465+
and p["rect"] in page_clip
466+
and p["rect"].width < page_clip.width
467+
and p["rect"].height < page_clip.height
411468
]
412469

413-
# determine vector graphics outside any tables
414-
vg_clusters = page.cluster_drawings(drawings=paths)
470+
# Determine vector graphics outside any tables, filtering out any
471+
# which contain no stroked paths
472+
vg_clusters = []
473+
for bbox in page.cluster_drawings(drawings=paths):
474+
include = False
475+
for p in [p for p in paths if p["rect"] in bbox]:
476+
if p["type"] != "f":
477+
include = True
478+
break
479+
if [item[0] for item in p["items"] if item[0] == "c"]:
480+
include = True
481+
break
482+
if include is True:
483+
vg_clusters.append(bbox)
484+
485+
actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters)]
486+
415487
vg_clusters0 = [
416488
r
417489
for r in vg_clusters
418490
if not intersects_rects(r, tab_rects0) and r.height > 20
419-
] + [fitz.Rect(i["bbox"]) for i in page.get_image_info()]
491+
]
492+
493+
if write_images is True:
494+
vg_clusters0 += [fitz.Rect(i["bbox"]) for i in img_info]
420495

421496
vg_clusters = dict((i, r) for i, r in enumerate(vg_clusters0))
497+
422498
# Determine text column bboxes on page, avoiding tables and graphics
423499
text_rects = column_boxes(
424500
page,
425-
paths=paths,
501+
paths=actual_paths,
502+
no_image_text=write_images,
426503
textpage=textpage,
427504
avoid=tab_rects0 + vg_clusters0,
428505
)
@@ -444,28 +521,46 @@ def get_page_output(doc, pno, textflags):
444521
tab_rects=tab_rects,
445522
img_rects=vg_clusters,
446523
links=links,
447-
hdr_info=hdr_info,
448524
)
449525

450-
# write remaining tables.
526+
# write any remaining tables and images
451527
md_string += output_tables(tabs, None, tab_rects)
452528
md_string += output_images(None, tab_rects, None)
453529
md_string += "\n-----\n\n"
454-
return md_string
530+
while md_string.startswith("\n"):
531+
md_string = md_string[1:]
532+
return md_string, images, tables, graphics
455533

456534
if page_chunks is False:
457535
document_output = ""
458536
else:
459537
document_output = []
460538

539+
# read the Table of Contents
540+
toc = doc.get_toc()
461541
textflags = fitz.TEXT_DEHYPHENATE | fitz.TEXT_MEDIABOX_CLIP
462-
for pno in list(pages):
463-
page_output = get_page_output(doc, pno, textflags)
542+
for pno in pages:
543+
544+
page_output, images, tables, graphics = get_page_output(
545+
doc, pno, margins, textflags
546+
)
464547
if page_chunks is False:
465548
document_output += page_output
466549
else:
550+
# build subset of TOC for this page
551+
page_tocs = [t for t in toc if t[-1] == pno + 1]
552+
467553
metadata = get_metadata(doc, pno)
468-
document_output.append({"metadata": metadata, "text": page_output})
554+
document_output.append(
555+
{
556+
"metadata": metadata,
557+
"toc_items": page_tocs,
558+
"tables": tables,
559+
"images": images,
560+
"graphics": graphics,
561+
"text": page_output,
562+
}
563+
)
469564

470565
return document_output
471566

0 commit comments

Comments
 (0)