Skip to content

Commit a65ea4a

Browse files
committed
Multiple updates & code cleaning
1 parent d5bf626 commit a65ea4a

File tree

7 files changed

+81
-43
lines changed

7 files changed

+81
-43
lines changed

pdf4llm/pdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from pymupdf4llm import *
33

44

5-
__version__ = "0.0.8"
5+
__version__ = "0.0.9"
66
version = __version__
77
version_tuple = tuple(map(int, version.split(".")))
88

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.8",
20+
version="0.0.9",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import to_markdown, IdentifyHeaders
22

3-
__version__ = "0.0.2"
3+
__version__ = "0.0.3"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,24 @@
1-
import fitz
1+
"""
2+
This script accepts a PDF document filename and converts it to a text file.
3+
4+
5+
Dependencies
6+
-------------
7+
PyMuPDF v1.24.2 or later
8+
9+
Copyright and License
10+
----------------------
11+
Copyright 2024 Artifex Software, Inc.
12+
License GNU Affero GPL 3.0
13+
"""
14+
15+
import string
216
import sys
3-
import pathlib, string
17+
18+
try:
19+
import pymupdf as fitz # available with v1.24.3
20+
except ImportError:
21+
import fitz
422

523
WHITE = set(string.whitespace)
624

@@ -192,6 +210,8 @@ def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr
192210

193211

194212
if __name__ == "__main__":
213+
import pathlib
214+
195215
filename = sys.argv[1]
196216
doc = fitz.open(filename)
197217
text = ""

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,15 @@
4949
for rect in bboxes:
5050
print(page.get_text(clip=rect, sort=True))
5151
----------------------------------------------------------------------------------
52+
53+
Dependencies
54+
-------------
55+
PyMuPDF v1.24.2 or later
56+
57+
Copyright and License
58+
----------------------
59+
Copyright 2024 Artifex Software, Inc.
60+
License GNU Affero GPL 3.0
5261
"""
5362

5463
import string

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,6 @@
1616
Text will be sorted in Western reading order. Any table will be included in
1717
the text in markdown format as well.
1818
19-
Use in some other script
20-
-------------------------
21-
import fitz
22-
from to_markdown import to_markdown
23-
24-
doc = fitz.open("input.pdf")
25-
page_list = [ list of 0-based page numbers ]
26-
md_text = to_markdown(doc, pages=page_list)
27-
2819
Dependencies
2920
-------------
3021
PyMuPDF v1.24.2 or later
@@ -43,10 +34,7 @@
4334
except ImportError:
4435
import fitz
4536

46-
from pymupdf4llm.helpers.get_text_lines import (
47-
get_raw_lines,
48-
is_white,
49-
)
37+
from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
5038
from pymupdf4llm.helpers.multi_column import column_boxes
5139

5240
if fitz.pymupdf_version_tuple < (1, 24, 2):
@@ -127,13 +115,13 @@ def get_header_id(self, span):
127115

128116

129117
def to_markdown(
130-
doc: fitz.Document,
118+
doc: fitz.Document | str,
131119
*,
132-
pages: list = None,
133-
hdr_info: IdentifyHeaders = None,
134-
write_images=False,
135-
page_chunks=False,
136-
) -> str:
120+
pages: list | range | None = None,
121+
hdr_info: IdentifyHeaders | None = None,
122+
write_images: bool = False,
123+
page_chunks: bool = False,
124+
) -> str | list[dict]:
137125
"""Process the document and return the text of its selected pages."""
138126

139127
if isinstance(doc, str):
@@ -146,7 +134,7 @@ def to_markdown(
146134
hdr_info = IdentifyHeaders(doc)
147135

148136
def resolve_links(links, span):
149-
"""Accept a span bbox and return a markdown link string."""
137+
"""Accept a span and return a markdown link string."""
150138
bbox = fitz.Rect(span["bbox"]) # span bbox
151139
# a link should overlap at least 70% of the span
152140
bbox_area = 0.7 * abs(bbox)
@@ -158,6 +146,10 @@ def resolve_links(links, span):
158146
return text
159147

160148
def save_image(page, rect, i):
149+
"""Optionally render the rect part of a page.
150+
151+
In any case return the image filename.
152+
"""
161153
filename = page.parent.name.replace("\\", "/")
162154
image_path = f"{filename}-{page.number}-{i}.png"
163155
if write_images is True:
@@ -167,13 +159,13 @@ def save_image(page, rect, i):
167159
return os.path.basename(image_path)
168160

169161
def write_text(
170-
page,
162+
page: fitz.Page,
171163
textpage: fitz.TextPage,
172164
clip: fitz.Rect,
173165
tabs=None,
174-
tab_rects: dict = None,
175-
img_rects: dict = None,
176-
links: list = None,
166+
tab_rects: dict | None = None,
167+
img_rects: dict | None = None,
168+
links: list | None = None,
177169
hdr_info=None,
178170
) -> string:
179171
"""Output the text found inside the given clip.
@@ -184,12 +176,18 @@ def write_text(
184176
inline code, bold, italic and bold-italic styling.
185177
There is also some effort to support lists (ordered / unordered) in
186178
that typical characters are replaced by respective markdown characters.
179+
180+
'tab_rects'/'img_rects' are dictionaries of table, respectively image
181+
or vector graphic rectangles.
182+
General Markdown text generation skips these areas. Tables are written
183+
via their own 'to_markdown' method. Images and vector graphics are
184+
optionally saved as files and pointed to by respective markdown text.
187185
"""
188186
if clip is None:
189187
clip = textpage.rect
190188
out_string = ""
191189

192-
# This is a list of tuples (linerect, [spanlist])
190+
# This is a list of tuples (linerect, spanlist)
193191
nlines = get_raw_lines(textpage, clip=clip, tolerance=3)
194192

195193
tab_rects0 = list(tab_rects.values())
@@ -260,12 +258,15 @@ def write_text(
260258
and lrect.y1 - prev_lrect.y1 > lrect.height * 1.5
261259
or span0["text"].startswith("[")
262260
or span0["text"].startswith(bullet)
263-
or span0["flags"] & 1
261+
or span0["flags"] & 1 # superscript?
264262
):
265263
out_string += "\n"
266264
prev_lrect = lrect
267265

266+
# if line is a header, this will return multiple "#" characters
268267
hdr_string = hdr_info.get_header_id(spans[0])
268+
269+
# intercept if header text has been broken in multiple lines
269270
if hdr_string and hdr_string == prev_hdr_string:
270271
out_string = out_string[:-1] + " " + text + "\n"
271272
continue
@@ -295,6 +296,7 @@ def write_text(
295296
prefix += "_"
296297
suffix = "_" + suffix
297298

299+
# convert intersecting link into markdown syntax
298300
ltext = resolve_links(links, s)
299301
if ltext:
300302
text = f"{hdr_string}{prefix}{ltext}{suffix} "
@@ -329,27 +331,27 @@ def intersects_rects(rect, rect_list):
329331
return 0
330332

331333
def output_tables(tabs, text_rect, tab_rects):
332-
"""Output and remove tables above text rectangle."""
334+
"""Output tables above a text rectangle."""
333335
this_md = "" # markdown string for table content
334336
if text_rect is not None: # select tables above the text block
335337
for i, trect in sorted(
336338
[j for j in tab_rects.items() if j[1].y1 <= text_rect.y0],
337339
key=lambda j: (j[1].y1, j[1].x0),
338340
):
339341
this_md += tabs[i].to_markdown(clean=False)
340-
del tab_rects[i]
342+
del tab_rects[i] # do not touch this table twice
341343

342344
else: # output all remaining table
343345
for i, trect in sorted(
344346
tab_rects.items(),
345347
key=lambda j: (j[1].y1, j[1].x0),
346348
):
347349
this_md += tabs[i].to_markdown(clean=False)
348-
del tab_rects[i]
350+
del tab_rects[i] # do not touch this table twice
349351
return this_md
350352

351353
def output_images(page, text_rect, img_rects):
352-
"""Output and remove images and graphics above text rectangle."""
354+
"""Output images and graphics above text rectangle."""
353355
if img_rects is None:
354356
return ""
355357
this_md = "" # markdown string
@@ -360,7 +362,7 @@ def output_images(page, text_rect, img_rects):
360362
):
361363
pathname = save_image(page, img_rect, i)
362364
this_md += GRAPHICS_TEXT % (pathname, pathname)
363-
del img_rects[i]
365+
del img_rects[i] # do not touch this image twice
364366

365367
else: # output all remaining table
366368
for i, img_rect in sorted(
@@ -369,7 +371,7 @@ def output_images(page, text_rect, img_rects):
369371
):
370372
pathname = save_image(page, img_rect, i)
371373
this_md += GRAPHICS_TEXT % (pathname, pathname)
372-
del img_rects[i]
374+
del img_rects[i] # do not touch this image twice
373375
return this_md
374376

375377
def get_metadata(doc, pno):
@@ -380,22 +382,28 @@ def get_metadata(doc, pno):
380382
return meta
381383

382384
def get_page_output(doc, pno, textflags):
385+
"""Process one page."""
383386
page = doc[pno]
384387
md_string = ""
388+
389+
# extract all links on page
385390
links = [l for l in page.get_links() if l["kind"] == 2]
391+
392+
# make a TextPage for all later extractions
386393
textpage = page.get_textpage(flags=textflags)
387-
# First locate all tables on page
394+
395+
# Locate all tables on page
388396
tabs = page.find_tables()
389397

390-
# Second, make a list of table boundary boxes.
391-
# Must include the header bbox (may be outside tab.bbox)
398+
# Make a list of table boundary boxes.
399+
# Must include the header bbox (may exist outside tab.bbox)
392400
tab_rects = {}
393401
for i, t in enumerate(tabs):
394402
tab_rects[i] = fitz.Rect(t.bbox) | fitz.Rect(t.header.bbox)
395403
tab_rects0 = list(tab_rects.values())
396404

397405
# Select paths that are not contained in any table
398-
page_clip = page.rect + (36, 36, -36, -36)
406+
page_clip = page.rect + (36, 36, -36, -36) # ignore full page graphics
399407
paths = [
400408
p
401409
for p in page.get_drawings()
@@ -409,6 +417,7 @@ def get_page_output(doc, pno, textflags):
409417
for r in vg_clusters
410418
if not intersects_rects(r, tab_rects0) and r.height > 20
411419
] + [fitz.Rect(i["bbox"]) for i in page.get_image_info()]
420+
412421
vg_clusters = dict((i, r) for i, r in enumerate(vg_clusters0))
413422
# Determine text column bboxes on page, avoiding tables and graphics
414423
text_rects = column_boxes(
@@ -422,7 +431,7 @@ def get_page_output(doc, pno, textflags):
422431
the text rectangles.
423432
"""
424433
for text_rect in text_rects:
425-
# outpt tables above this block of text
434+
# output tables above this block of text
426435
md_string += output_tables(tabs, text_rect, tab_rects)
427436
md_string += output_images(page, text_rect, vg_clusters)
428437

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pymupdf4llm",
20-
version="0.0.2",
20+
version="0.0.3",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

0 commit comments

Comments
 (0)