Skip to content

Commit 3ad7edf

Browse files
committed
Ignore Graphics only
If a limit for processing vector graphics is specified (GRAPHICS_LIMIT) and a page exceeds it, only that page's vector graphics are now ignored - no longer the complete page. Several other changes improve how text properties are rendered in markdown.
1 parent d4d68b0 commit 3ad7edf

File tree

5 files changed

+115
-85
lines changed

5 files changed

+115
-85
lines changed

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.18",
20+
version="0.0.19",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.18"
3+
__version__ = "0.0.19"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,15 @@ def sanitize_spans(line):
7474
s0 = line[i - 1]
7575
s1 = line[i]
7676
# "delta" depends on the font size. Spans will be joined if
77-
# no more than 10% of the font size separates them.
77+
# no more than 10% of the font size separates them and important
78+
# attributes are the same.
7879
delta = s1["size"] * 0.1
79-
if s0["bbox"].x1 + delta < s1["bbox"].x0:
80-
continue # all good: no joining neded
81-
80+
if s0["bbox"].x1 + delta < s1["bbox"].x0 or (
81+
s0["flags"],
82+
s0["char_flags"],
83+
s0["size"],
84+
) != (s1["flags"], s1["char_flags"], s1["size"]):
85+
continue # no joining
8286
# We need to join bbox and text of two consecutive spans
8387
# On occasion, spans may also be duplicated.
8488
if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]:
@@ -108,6 +112,8 @@ def sanitize_spans(line):
108112
continue
109113
if is_white(s["text"]): # ignore white text
110114
continue
115+
if s["alpha"] == 0: # ignore invisible text
116+
continue
111117
if s["flags"] & 1 == 1: # if a superscript, modify bbox
112118
# with that of the preceding or following span
113119
i = 1 if sno == 0 else sno - 1
@@ -132,10 +138,7 @@ def sanitize_spans(line):
132138
sbbox = s["bbox"] # this bbox
133139
sbbox0 = line[-1]["bbox"] # previous bbox
134140
# if any of top or bottom coordinates are close enough, join...
135-
if (
136-
abs(sbbox.y1 - sbbox0.y1) <= y_delta
137-
or abs(sbbox.y0 - sbbox0.y0) <= y_delta
138-
):
141+
if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta:
139142
line.append(s) # append to this line
140143
lrect |= sbbox # extend line rectangle
141144
continue
@@ -156,9 +159,7 @@ def sanitize_spans(line):
156159
return nlines
157160

158161

159-
def get_text_lines(
160-
page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False
161-
):
162+
def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False):
162163
"""Extract text by line keeping natural reading sequence.
163164
164165
Notes:

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 100 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,12 @@
3939
import string
4040
from binascii import b2a_base64
4141
import pymupdf
42+
from pymupdf import mupdf
4243
from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
4344
from pymupdf4llm.helpers.multi_column import column_boxes
4445
from pymupdf4llm.helpers.progress import ProgressBar
4546
from dataclasses import dataclass
47+
from collections import defaultdict
4648

4749
pymupdf.TOOLS.unset_quad_corrections(True)
4850
# Characters recognized as bullets when starting a line.
@@ -88,8 +90,9 @@ def __init__(
8890
"""Read all text and make a dictionary of fontsizes.
8991
9092
Args:
91-
pages: optional list of pages to consider
92-
body_limit: consider text with larger font size as some header
93+
doc: PDF document or filename
94+
pages: consider these page numbers only
95+
body_limit: treat text with larger font size as a header
9396
"""
9497
if isinstance(doc, pymupdf.Document):
9598
mydoc = doc
@@ -99,7 +102,7 @@ def __init__(
99102
if pages is None: # use all pages if omitted
100103
pages = range(mydoc.page_count)
101104

102-
fontsizes = {}
105+
fontsizes = defaultdict(int)
103106
for pno in pages:
104107
page = mydoc.load_page(pno)
105108
blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT)["blocks"]
@@ -111,8 +114,7 @@ def __init__(
111114
if not is_white(s["text"])
112115
]:
113116
fontsz = round(span["size"])
114-
count = fontsizes.get(fontsz, 0) + len(span["text"].strip())
115-
fontsizes[fontsz] = count
117+
fontsizes[fontsz] += len(span["text"].strip())
116118

117119
if mydoc != doc:
118120
# if opened here, close it now
@@ -242,7 +244,7 @@ def to_markdown(
242244
filename=None,
243245
force_text=True,
244246
page_chunks=False,
245-
margins=(0, 0, 0, 0),
247+
margins=0,
246248
dpi=150,
247249
page_width=612,
248250
page_height=None,
@@ -252,30 +254,30 @@ def to_markdown(
252254
ignore_code=False,
253255
extract_words=False,
254256
show_progress=False,
255-
):
257+
use_glyphs=False,
256258
) -> str:
257259
"""Process the document and return the text of the selected pages.
258260
259261
Args:
260262
doc: pymupdf.Document or string.
261263
pages: list of page numbers to consider (0-based).
262-
hdr_info: callable or object having a method named 'get_hdr_info'.
263-
write_images: (bool) whether to save images / drawing as files.
264-
embed_images: (bool) embed images as base64 encoded strings
265-
image_path: (str) folder into which images should be stored.
266-
image_format: (str) desired image format. Choose a supported one.
267-
force_text: (bool) output text despite of background.
264+
hdr_info: callable or object having method 'get_hdr_info'.
265+
write_images: (bool) save images / graphics as files.
266+
embed_images: (bool) embed images in markdown text (base64 encoded)
267+
image_path: (str) store images in this folder.
268+
image_format: (str) use this image format. Choose a supported one.
269+
force_text: (bool) output text despite an image background.
268270
page_chunks: (bool) whether to segment output by page.
269-
margins: do not consider content overlapping margin areas.
271+
margins: omit content overlapping margin areas.
270272
dpi: (int) desired resolution for generated images.
271273
page_width: (float) assumption if page layout is variable.
272274
page_height: (float) assumption if page layout is variable.
273275
table_strategy: choose table detection strategy
274-
graphics_limit: (int) ignore page with too many vector graphics.
276+
graphics_limit: (int) if a page's vector graphics count exceeds this, ignore all vector graphics on that page (text is still processed).
275277
ignore_code: (bool) suppress code-like formatting (mono-space fonts)
276278
extract_words: (bool) include "words"-like output in page chunks
277279
show_progress: (bool) print progress as each page is processed.
278-
image_extract_algorithm: (str) which algorithm to use "simple" or "simple-drop".
280+
use_glyphs: (bool) replace invalid Unicode (0xFFFD) by the glyph number.
279281
280282
"""
281283
if write_images is False and embed_images is False and force_text is False:
@@ -339,6 +341,14 @@ def to_markdown(
339341
hdr_info = IdentifyHeaders(doc)
340342
get_header_id = hdr_info.get_header_id
341343

344+
def max_header_id(spans, page):
345+
hdr_ids = sorted(
346+
[l for l in set([len(get_header_id(s, page=page)) for s in spans]) if l > 0]
347+
)
348+
if not hdr_ids:
349+
return ""
350+
return "#" * (hdr_ids[0] - 1) + " "
351+
342352
def resolve_links(links, span):
343353
"""Accept a span and return a markdown link string.
344354
@@ -422,7 +432,11 @@ def write_text(
422432
]
423433

424434
parms.line_rects.extend(
425-
[l[0] for l in nlines if not intersects_rects(l[0], parms.tab_rects.values())]
435+
[
436+
l[0]
437+
for l in nlines
438+
if not intersects_rects(l[0], parms.tab_rects.values())
439+
]
426440
) # store line rectangles
427441

428442
prev_lrect = None # previous line rectangle
@@ -492,8 +506,17 @@ def write_text(
492506
parms.deleted_images.append(i)
493507

494508
parms.line_rects.append(lrect)
509+
510+
# make text string for the full line
495511
text = " ".join([s["text"] for s in spans])
496512

513+
# if line is a header, this will return multiple "#" characters,
514+
# otherwise an empty string
515+
hdr_string = max_header_id(spans, page=parms.page) # a header?
516+
517+
# full line strikeout?
518+
all_strikeout = all([s["char_flags"] & 1 for s in spans])
519+
497520
# full line mono-spaced?
498521
if not IGNORE_CODE:
499522
all_mono = all([s["flags"] & 8 for s in spans])
@@ -512,6 +535,12 @@ def write_text(
512535
out_string += indent + text + "\n"
513536
continue # done with this line
514537

538+
if hdr_string: # if a header line skip the rest
539+
if all_strikeout:
540+
text = "~~" + text + "~~"
541+
out_string += hdr_string + text + "\n"
542+
continue
543+
515544
span0 = spans[0]
516545
bno = span0["block"] # block number of line
517546
if bno != prev_bno:
@@ -528,9 +557,6 @@ def write_text(
528557
out_string += "\n"
529558
prev_lrect = lrect
530559

531-
# if line is a header, this will return multiple "#" characters
532-
hdr_string = get_header_id(span0, page=parms.page)
533-
534560
# intercept if header text has been broken in multiple lines
535561
if hdr_string and hdr_string == prev_hdr_string:
536562
while out_string.endswith("\n"):
@@ -539,9 +565,6 @@ def write_text(
539565
continue
540566

541567
prev_hdr_string = hdr_string
542-
if hdr_string.startswith("#"): # if a header line skip the rest
543-
out_string += hdr_string + text + "\n"
544-
continue
545568

546569
# this line is not all-mono, so switch off "code" mode
547570
if code: # in code output mode?
@@ -551,45 +574,47 @@ def write_text(
551574
for i, s in enumerate(spans): # iterate spans of the line
552575
# decode font properties
553576
mono = s["flags"] & 8 and IGNORE_CODE is False
554-
bold = s["flags"] & 16
577+
bold = s["flags"] & 16 or s["char_flags"] & 8
555578
italic = s["flags"] & 2
579+
strikeout = s["char_flags"] & 1
556580

557581
if mono:
558582
# this is text in some monospaced font
559583
out_string += f"`{s['text'].strip()}` "
560-
else: # not a mono text
561-
prefix = ""
562-
suffix = ""
563-
if hdr_string == "":
564-
if bold:
565-
prefix = "**"
566-
suffix += "**"
567-
if italic:
568-
prefix += "_"
569-
suffix = "_" + suffix
570-
571-
# convert intersecting link to markdown syntax
572-
ltext = resolve_links(parms.links, s)
573-
if ltext:
574-
text = f"{hdr_string}{prefix}{ltext}{suffix} "
575-
else:
576-
text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
577-
if text.startswith(bullet):
578-
text = text[1:]
579-
if len(text) > 1 and text[1] == " ":
580-
t = "-"
581-
else:
582-
t = "- "
583-
text = t + text[1:]
584-
dist = span0["bbox"][0] - clip.x0
585-
cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len(
586-
span0["text"]
587-
)
588-
if cwidth == 0.0:
589-
cwidth = span0["size"] * 0.5
590-
text = " " * int(round(dist / cwidth)) + text
584+
continue
591585

592-
out_string += text
586+
prefix = ""
587+
suffix = ""
588+
if bold:
589+
prefix = "**" + prefix
590+
suffix += "**"
591+
if italic:
592+
prefix = "*" + prefix
593+
suffix += "*"
594+
if strikeout:
595+
prefix = "~~" + prefix
596+
suffix += "~~"
597+
598+
# convert intersecting link to markdown syntax
599+
ltext = resolve_links(parms.links, s)
600+
if ltext:
601+
text = f"{hdr_string}{prefix}{ltext}{suffix} "
602+
else:
603+
text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
604+
if text.startswith(bullet):
605+
text = text[1:]
606+
if len(text) > 1 and text[1] == " ":
607+
t = "-"
608+
else:
609+
t = "- "
610+
text = t + text[1:]
611+
dist = span0["bbox"][0] - clip.x0
612+
cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len(span0["text"])
613+
if cwidth == 0.0:
614+
cwidth = span0["size"] * 0.5
615+
text = " " * int(round(dist / cwidth)) + text
616+
617+
out_string += text
593618
if not code:
594619
out_string += "\n"
595620
out_string += "\n"
@@ -807,17 +832,10 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
807832
parms.graphics = []
808833
parms.words = []
809834
parms.line_rects = []
835+
810836
# determine background color
811837
parms.bg_color = get_bg_color(page)
812-
# catch too-many-graphics situation
813-
if GRAPHICS_LIMIT is not None:
814-
test_paths = page.get_cdrawings() # fastest access to graphics
815-
if (excess := len(test_paths)) > GRAPHICS_LIMIT:
816-
parms.md_string = (
817-
f"\n**Ignoring page {page.number} with {excess}+ vector graphics.**"
818-
)
819-
parms.md_string += "\n\n-----\n\n"
820-
return parms
838+
821839
left, top, right, bottom = margins
822840
parms.clip = page.rect + (left, top, -right, -bottom)
823841

@@ -887,6 +905,10 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
887905
and not (p["type"] == "f" and p["fill"] == parms.bg_color)
888906
]
889907

908+
# catch too-many-graphics situation
909+
if GRAPHICS_LIMIT and len(paths) > GRAPHICS_LIMIT:
910+
paths = []
911+
890912
# We also ignore vector graphics that only represent
891913
# "text emphasizing sugar".
892914
vg_clusters0 = [] # worthwhile vector graphics go here
@@ -988,7 +1010,17 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
9881010
# read the Table of Contents
9891011
toc = doc.get_toc()
9901012

991-
textflags = pymupdf.TEXT_MEDIABOX_CLIP | pymupdf.TEXT_ACCURATE_BBOXES
1013+
# Text extraction flags:
1014+
# omit invisible text, collect styles, use accurate bounding boxes
1015+
textflags = (
1016+
0
1017+
| mupdf.FZ_STEXT_CLIP
1018+
| mupdf.FZ_STEXT_ACCURATE_BBOXES
1019+
| 32768 # mupdf.FZ_STEXT_COLLECT_STYLES
1020+
)
1021+
# optionally replace 0xFFFD by glyph number
1022+
if use_glyphs:
1023+
textflags |= mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE
9921024

9931025
if show_progress:
9941026
print(f"Processing {FILENAME}...")
@@ -1082,9 +1114,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
10821114
import time
10831115

10841116
try:
1085-
filename = (
1086-
"C:/Users/haral/OneDrive/Desktop/pymupdf4llm/issues/0225/e000050.full.pdf"
1087-
)
1117+
filename = "markdown.pdf"
10881118
except IndexError:
10891119
print(f"Usage:\npython {os.path.basename(__file__)} input.pdf")
10901120
sys.exit()
@@ -1117,11 +1147,10 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
11171147
pages=pages,
11181148
write_images=True,
11191149
force_text=False,
1120-
image_path=r"C:\Users\haral\OneDrive\Desktop\pymupdf4llm\rag\pymupdf4llm\pymupdf4llm\helpers",
11211150
)
11221151
FILENAME = doc.name
11231152
# output to a text file with extension ".md"
1124-
outname = FILENAME.replace(".pdf", ".md")
1153+
outname = FILENAME + ".md"
11251154
pathlib.Path(outname).write_bytes(md_string.encode())
11261155
t1 = time.perf_counter() # stop timer
11271156
print(f"Markdown creation time for {FILENAME=} {round(t1-t0,2)} sec.")

0 commit comments

Comments
 (0)