Ignore Graphics only

JorjMcKie · JorjMcKie · commit 3ad7edf8accf · 2025-03-26T12:37:21.000-04:00
If a limit for processing vector graphics is specified (GRAPHICS_LIMIT) and a page exceeds it, we now ignore only that page's vector graphics - no longer the complete page.

Multiple other changes improve text property rendering in markdown.
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
@@ -17,7 +17,7 @@
 
 setuptools.setup(
     name="pdf4llm",
-    version="0.0.18",
+    version="0.0.19",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.18"
+__version__ = "0.0.19"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -74,11 +74,15 @@ def sanitize_spans(line):
             s0 = line[i - 1]
             s1 = line[i]
             # "delta" depends on the font size. Spans  will be joined if
-            # no more than 10% of the font size separates them.
+            # no more than 10% of the font size separates them and important
+            # attributes are the same.
             delta = s1["size"] * 0.1
-            if s0["bbox"].x1 + delta < s1["bbox"].x0:
-                continue  # all good: no joining neded
-
+            if s0["bbox"].x1 + delta < s1["bbox"].x0 or (
+                s0["flags"],
+                s0["char_flags"],
+                s0["size"],
+            ) != (s1["flags"], s1["char_flags"], s1["size"]):
+                continue  # no joining
             # We need to join bbox and text of two consecutive spans
             # On occasion, spans may also be duplicated.
             if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]:
@@ -108,6 +112,8 @@ def sanitize_spans(line):
                     continue
                 if is_white(s["text"]):  # ignore white text
                     continue
+                if s["alpha"] == 0:  # ignore invisible text
+                    continue
                 if s["flags"] & 1 == 1:  # if a superscript, modify bbox
                     # with that of the preceding or following span
                     i = 1 if sno == 0 else sno - 1
@@ -132,10 +138,7 @@ def sanitize_spans(line):
         sbbox = s["bbox"]  # this bbox
         sbbox0 = line[-1]["bbox"]  # previous bbox
         # if any of top or bottom coordinates are close enough, join...
-        if (
-            abs(sbbox.y1 - sbbox0.y1) <= y_delta
-            or abs(sbbox.y0 - sbbox0.y0) <= y_delta
-        ):
+        if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta:
             line.append(s)  # append to this line
             lrect |= sbbox  # extend line rectangle
             continue
@@ -156,9 +159,7 @@ def sanitize_spans(line):
     return nlines
 
 
-def get_text_lines(
-    page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False
-):
+def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False):
     """Extract text by line keeping natural reading sequence.
 
     Notes:
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -39,10 +39,12 @@
 import string
 from binascii import b2a_base64
 import pymupdf
+from pymupdf import mupdf
 from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
 from pymupdf4llm.helpers.multi_column import column_boxes
 from pymupdf4llm.helpers.progress import ProgressBar
 from dataclasses import dataclass
+from collections import defaultdict
 
 pymupdf.TOOLS.unset_quad_corrections(True)
 # Characters recognized as bullets when starting a line.
@@ -88,8 +90,9 @@ def __init__(
         """Read all text and make a dictionary of fontsizes.
 
         Args:
-            pages: optional list of pages to consider
-            body_limit: consider text with larger font size as some header
+            doc: PDF document or filename
+            pages: consider these page numbers only
+            body_limit: treat text with larger font size as a header
         """
         if isinstance(doc, pymupdf.Document):
             mydoc = doc
@@ -99,7 +102,7 @@ def __init__(
         if pages is None:  # use all pages if omitted
             pages = range(mydoc.page_count)
 
-        fontsizes = {}
+        fontsizes = defaultdict(int)
         for pno in pages:
             page = mydoc.load_page(pno)
             blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT)["blocks"]
@@ -111,8 +114,7 @@ def __init__(
                 if not is_white(s["text"])
             ]:
                 fontsz = round(span["size"])
-                count = fontsizes.get(fontsz, 0) + len(span["text"].strip())
-                fontsizes[fontsz] = count
+                fontsizes[fontsz] += len(span["text"].strip())
 
         if mydoc != doc:
             # if opened here, close it now
@@ -242,7 +244,7 @@ def to_markdown(
     filename=None,
     force_text=True,
     page_chunks=False,
-    margins=(0, 0, 0, 0),
+    margins=0,
     dpi=150,
     page_width=612,
     page_height=None,
@@ -252,30 +254,30 @@ def to_markdown(
     ignore_code=False,
     extract_words=False,
     show_progress=False,
-):
+    use_glyphs=False,
 ) -> str:
     """Process the document and return the text of the selected pages.
 
     Args:
         doc: pymupdf.Document or string.
         pages: list of page numbers to consider (0-based).
-        hdr_info: callable or object having a method named 'get_hdr_info'.
-        write_images: (bool) whether to save images / drawing as files.
-        embed_images: (bool) embed images as base64 encoded strings
-        image_path: (str) folder into which images should be stored.
-        image_format: (str) desired image format. Choose a supported one.
-        force_text: (bool) output text despite of background.
+        hdr_info: callable or object having method 'get_hdr_info'.
+        write_images: (bool) save images / graphics as files.
+        embed_images: (bool) embed images in markdown text (base64 encoded)
+        image_path: (str) store images in this folder.
+        image_format: (str) use this image format. Choose a supported one.
+        force_text: (bool) output text despite an image background.
         page_chunks: (bool) whether to segment output by page.
-        margins: do not consider content overlapping margin areas.
+        margins: omit content overlapping margin areas.
         dpi: (int) desired resolution for generated images.
         page_width: (float) assumption if page layout is variable.
         page_height: (float) assumption if page layout is variable.
         table_strategy: choose table detection strategy
-        graphics_limit: (int) ignore page with too many vector graphics.
+        graphics_limit: (int) if vector graphics count exceeds this, ignore all of them.
         ignore_code: (bool) suppress code-like formatting (mono-space fonts)
         extract_words: (bool) include "words"-like output in page chunks
         show_progress: (bool) print progress as each page is processed.
-        image_extract_algorithm: (str) which algorithm to use "simple" or "simple-drop".
+        use_glyphs: (bool) replace invalid Unicode (0xFFFD) by the glyph number.
 
     """
     if write_images is False and embed_images is False and force_text is False:
@@ -339,6 +341,14 @@ def to_markdown(
         hdr_info = IdentifyHeaders(doc)
         get_header_id = hdr_info.get_header_id
 
+    def max_header_id(spans, page):
+        hdr_ids = sorted(
+            [l for l in set([len(get_header_id(s, page=page)) for s in spans]) if l > 0]
+        )
+        if not hdr_ids:
+            return ""
+        return "#" * (hdr_ids[0] - 1) + " "
+
     def resolve_links(links, span):
         """Accept a span and return a markdown link string.
 
@@ -422,7 +432,11 @@ def write_text(
         ]
 
         parms.line_rects.extend(
-            [l[0] for l in nlines if not intersects_rects(l[0], parms.tab_rects.values())]
+            [
+                l[0]
+                for l in nlines
+                if not intersects_rects(l[0], parms.tab_rects.values())
+            ]
         )  # store line rectangles
 
         prev_lrect = None  # previous line rectangle
@@ -492,8 +506,17 @@ def write_text(
                         parms.deleted_images.append(i)
 
             parms.line_rects.append(lrect)
+
+            # make text string for the full line
             text = " ".join([s["text"] for s in spans])
 
+            # if line is a header, this will return multiple "#" characters,
+            # otherwise an empty string
+            hdr_string = max_header_id(spans, page=parms.page)  # a header?
+
+            # full line strikeout?
+            all_strikeout = all([s["char_flags"] & 1 for s in spans])
+
             # full line mono-spaced?
             if not IGNORE_CODE:
                 all_mono = all([s["flags"] & 8 for s in spans])
@@ -512,6 +535,12 @@ def write_text(
                 out_string += indent + text + "\n"
                 continue  # done with this line
 
+            if hdr_string:  # if a header line skip the rest
+                if all_strikeout:
+                    text = "~~" + text + "~~"
+                out_string += hdr_string + text + "\n"
+                continue
+
             span0 = spans[0]
             bno = span0["block"]  # block number of line
             if bno != prev_bno:
@@ -528,9 +557,6 @@ def write_text(
                 out_string += "\n"
             prev_lrect = lrect
 
-            # if line is a header, this will return multiple "#" characters
-            hdr_string = get_header_id(span0, page=parms.page)
-
             # intercept if header text has been broken in multiple lines
             if hdr_string and hdr_string == prev_hdr_string:
                 while out_string.endswith("\n"):
@@ -539,9 +565,6 @@ def write_text(
                 continue
 
             prev_hdr_string = hdr_string
-            if hdr_string.startswith("#"):  # if a header line skip the rest
-                out_string += hdr_string + text + "\n"
-                continue
 
             # this line is not all-mono, so switch off "code" mode
             if code:  # in code output mode?
@@ -551,45 +574,47 @@ def write_text(
             for i, s in enumerate(spans):  # iterate spans of the line
                 # decode font properties
                 mono = s["flags"] & 8 and IGNORE_CODE is False
-                bold = s["flags"] & 16
+                bold = s["flags"] & 16 or s["char_flags"] & 8
                 italic = s["flags"] & 2
+                strikeout = s["char_flags"] & 1
 
                 if mono:
                     # this is text in some monospaced font
                     out_string += f"`{s['text'].strip()}` "
-                else:  # not a mono text
-                    prefix = ""
-                    suffix = ""
-                    if hdr_string == "":
-                        if bold:
-                            prefix = "**"
-                            suffix += "**"
-                        if italic:
-                            prefix += "_"
-                            suffix = "_" + suffix
-
-                    # convert intersecting link to markdown syntax
-                    ltext = resolve_links(parms.links, s)
-                    if ltext:
-                        text = f"{hdr_string}{prefix}{ltext}{suffix} "
-                    else:
-                        text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
-                    if text.startswith(bullet):
-                        text = text[1:]
-                        if len(text) > 1 and text[1] == " ":
-                            t = "-"
-                        else:
-                            t = "- "
-                        text = t + text[1:]
-                        dist = span0["bbox"][0] - clip.x0
-                        cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len(
-                            span0["text"]
-                        )
-                        if cwidth == 0.0:
-                            cwidth = span0["size"] * 0.5
-                        text = " " * int(round(dist / cwidth)) + text
+                    continue
 
-                    out_string += text
+                prefix = ""
+                suffix = ""
+                if bold:
+                    prefix = "**" + prefix
+                    suffix += "**"
+                if italic:
+                    prefix = "*" + prefix
+                    suffix += "*"
+                if strikeout:
+                    prefix = "~~" + prefix
+                    suffix += "~~"
+
+                # convert intersecting link to markdown syntax
+                ltext = resolve_links(parms.links, s)
+                if ltext:
+                    text = f"{hdr_string}{prefix}{ltext}{suffix} "
+                else:
+                    text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
+                if text.startswith(bullet):
+                    text = text[1:]
+                    if len(text) > 1 and text[1] == " ":
+                        t = "-"
+                    else:
+                        t = "- "
+                    text = t + text[1:]
+                    dist = span0["bbox"][0] - clip.x0
+                    cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len(span0["text"])
+                    if cwidth == 0.0:
+                        cwidth = span0["size"] * 0.5
+                    text = " " * int(round(dist / cwidth)) + text
+
+                out_string += text
             if not code:
                 out_string += "\n"
         out_string += "\n"
@@ -807,17 +832,10 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
         parms.graphics = []
         parms.words = []
         parms.line_rects = []
+
         # determine background color
         parms.bg_color = get_bg_color(page)
-        # catch too-many-graphics situation
-        if GRAPHICS_LIMIT is not None:
-            test_paths = page.get_cdrawings()  # fastest access to graphics
-            if (excess := len(test_paths)) > GRAPHICS_LIMIT:
-                parms.md_string = (
-                    f"\n**Ignoring page {page.number} with {excess}+ vector graphics.**"
-                )
-                parms.md_string += "\n\n-----\n\n"
-                return parms
+
         left, top, right, bottom = margins
         parms.clip = page.rect + (left, top, -right, -bottom)
 
@@ -887,6 +905,10 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
             and not (p["type"] == "f" and p["fill"] == parms.bg_color)
         ]
 
+        # catch too-many-graphics situation
+        if GRAPHICS_LIMIT and len(paths) > GRAPHICS_LIMIT:
+            paths = []
+
         # We also ignore vector graphics that only represent
         # "text emphasizing sugar".
         vg_clusters0 = []  # worthwhile vector graphics go here
@@ -988,7 +1010,17 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
     # read the Table of Contents
     toc = doc.get_toc()
 
-    textflags = pymupdf.TEXT_MEDIABOX_CLIP | pymupdf.TEXT_ACCURATE_BBOXES
+    # Text extraction flags:
+    # omit invisible text, collect styles, use accurate bounding boxes
+    textflags = (
+        0
+        | mupdf.FZ_STEXT_CLIP
+        | mupdf.FZ_STEXT_ACCURATE_BBOXES
+        | 32768  # mupdf.FZ_STEXT_COLLECT_STYLES
+    )
+    # optionally replace 0xFFFD by glyph number
+    if use_glyphs:
+        textflags |= mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE
 
     if show_progress:
         print(f"Processing {FILENAME}...")
@@ -1082,9 +1114,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
     import time
 
     try:
-        filename = (
-            "C:/Users/haral/OneDrive/Desktop/pymupdf4llm/issues/0225/e000050.full.pdf"
-        )
+        filename = "markdown.pdf"
     except IndexError:
         print(f"Usage:\npython {os.path.basename(__file__)} input.pdf")
         sys.exit()
@@ -1117,11 +1147,10 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
         pages=pages,
         write_images=True,
         force_text=False,
-        image_path=r"C:\Users\haral\OneDrive\Desktop\pymupdf4llm\rag\pymupdf4llm\pymupdf4llm\helpers",
     )
     FILENAME = doc.name
     # output to a text file with extension ".md"
-    outname = FILENAME.replace(".pdf", ".md")
+    outname = FILENAME + ".md"
     pathlib.Path(outname).write_bytes(md_string.encode())
     t1 = time.perf_counter()  # stop timer
     print(f"Markdown creation time for {FILENAME=} {round(t1-t0,2)} sec.")
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py