Changes for v0.0.9

JorjMcKie · JorjMcKie · commit 5ba23e3a500d · 2024-07-11T11:44:12.000-04:00
See changes.rst
diff --git a/docs/src/changes.rst b/docs/src/changes.rst
@@ -4,6 +4,22 @@
 Change Log
 ===========================================================================
 
+Changes in version 0.0.9
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `71 <https://github.com/pymupdf/RAG/issues/71>`_ "Unexpected results in pymupdf4llm but pymupdf works"
+* `68 <https://github.com/pymupdf/RAG/issues/68>`_ "Issue with text extraction near footer of page"
+
+
+Improvements:
+~~~~~~~~~~~~~~
+* Improved identification of scattered text span particles. This should address most issues with out-of-sequence situations.
+* We now correctly process rotated pages (see issue #68).
+
+
 Changes in version 0.0.8
 --------------------------
 
@@ -24,7 +40,7 @@ Fixes:
 Improvements:
 ~~~~~~~~~~~~~~~~
 
-* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant when they are simple background for text (quite often the case for code snippets).
+* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant: We now detect when "strokes" only exist in the neighborhood of the graphics boundary box border itself. This is quite often the case for code snippets.
 
 
 Changes in version 0.0.6
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.8"
+__version__ = "0.0.9"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -28,33 +28,63 @@ def is_white(text):
 
 
 def get_raw_lines(textpage, clip=None, tolerance=3):
-    """Extract the text spans from a TextPage in a natural reading sequence.
+    """Extract the text spans from a TextPage in natural reading sequence.
 
     All spans roughly on the same line are joined to generate an improved line.
     This copes with MuPDF's algorithm that generates new lines also for spans
-    whose horizontal distance is larger than some hreshold.
+    whose horizontal distance is larger than some threshold.
 
     Result is a sorted list of line objects that consist of the recomputed line
-    rectangle and a sorted list of spans in that line.
+    boundary box and the sorted list of spans in that line.
 
-    This result can then be easily converted e.g. to plain or markdown text.
+    This result can then easily be converted e.g. to plain or markdown text.
 
     Args:
         textpage: (mandatory) TextPage object
-        clip: (Rect) specifies a sub-rectangle of the textpage rect (which also
-              may be based on some part of the original page).
+        clip: (Rect) specifies a sub-rectangle of the textpage rect (which in
+              turn may be based on a sub-rectangle of the full page).
         tolerance: (float) put spans on the same line if their top or bottom
-              coordinate differ by no mor than this value.
+              coordinate differ by no more than this value.
 
     Returns:
-        A sorted list of items (rect, [spans]), each representing a line. The
-        spans are sorted left to right, Span dictionaries have been changed
-        in that "bbox" is a Rect object and "line" is an integer representing
-        the line number of the span. This allows to detect where MuPDF has
-        generated line breaks to indicate large inter-span distances.
+        A sorted list of items (rect, [spans]), each representing one line. The
+        spans are sorted left to right. Span dictionaries have been changed:
+        - "bbox" has been converted to a Rect object
+        - "line" (new) the line number in TextPage.extractDICT
+        - "block" (new) the block number in TextPage.extractDICT
+        This allows detecting where MuPDF has generated line breaks to indicate
+        large inter-span distances.
     """
     y_delta = tolerance  # allowable vertical coordinate deviation
-    if clip == None:  # use TextPage if not provided
+
+    def sanitize_spans(line):
+        """Sort and join the spans in a re-synthesized line.
+
+        The PDF may contain "broken" text with words cut into pieces.
+        This function joins spans representing the particles and sorts them
+        left to right.
+
+        Arg:
+            A list of spans - as derived from TextPage.extractDICT()
+        Returns:
+            A list of sorted, and potentially cleaned-up spans
+        """
+        line.sort(key=lambda s: s["bbox"].x0)  # sort left to right
+        for i in range(len(line) - 1, 0, -1):  # iterate back to front
+            s0 = line[i - 1]
+            s1 = line[i]
+            # "delta" depends on the font size. Spans will be joined if
+            # no more than 10% of the font size separates them.
+            delta = s1["size"] * 0.1
+            if s0["bbox"].x1 + delta < s1["bbox"].x0:
+                continue  # all good: no joining needed
+            s0["bbox"] |= s1["bbox"]  # join boundary boxes
+            s0["text"] += s1["text"]  # join the text
+            del line[i]  # delete the joined-in span
+            line[i - 1] = s0  # update the span
+        return line
+
+    if clip is None:  # use TextPage if not provided
         clip = textpage.rect
     # extract text blocks - if bbox is not empty
     blocks = [
@@ -63,40 +93,38 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
         if b["type"] == 0 and not fitz.Rect(b["bbox"]).is_empty
     ]
     spans = []  # all spans in TextPage here
-    for bno, b in enumerate(blocks):
-        for lno, line in enumerate(b["lines"]):
-            lbbox = fitz.Rect(line["bbox"])
-            for sno, s in enumerate(line["spans"]):
-                sbbox = fitz.Rect(s["bbox"])  # turn to a Rect
+    for bno, b in enumerate(blocks):  # the numbered blocks
+        for lno, line in enumerate(b["lines"]):  # the numbered lines
+            for sno, s in enumerate(line["spans"]):  # the numbered spans
+                sbbox = fitz.Rect(s["bbox"])  # span bbox as a Rect
                 mpoint = (sbbox.tl + sbbox.br) / 2  # middle point
                 if mpoint not in clip:
                     continue
                 if is_white(s["text"]):  # ignore white text
                     continue
-                if s["flags"] & 1 == 1:  # if a superscript, modify
+                if s["flags"] & 1 == 1:  # if a superscript, modify bbox
+                    # with that of the preceding or following span
                     i = 1 if sno == 0 else sno - 1
                     neighbor = line["spans"][i]
                     sbbox.y1 = neighbor["bbox"][3]
                     s["text"] = f"[{s['text']}]"
                 s["bbox"] = sbbox  # update with the Rect version
-                # include line identifier to facilitate separator insertion
+                # include line/block numbers to facilitate separator insertion
                 s["line"] = lno
                 s["block"] = bno
                 spans.append(s)
 
-    if not spans:  # we may have no text at all
+    if not spans:  # no text at all
         return []
 
-    spans.sort(
-        key=lambda s: s["bbox"].y1
-    )  # sort spans by assending bottom coord
+    spans.sort(key=lambda s: s["bbox"].y1)  # sort spans by bottom coord
     nlines = []  # final result
-    line = [spans[0]]  # collects spans with fitting vertical coordinate
+    line = [spans[0]]  # collects spans with fitting vertical coordinates
     lrect = spans[0]["bbox"]  # rectangle joined from span rectangles
 
-    for s in spans[1:]:
-        sbbox = s["bbox"]
-        sbbox0 = line[-1]["bbox"]
+    for s in spans[1:]:  # walk through the spans
+        sbbox = s["bbox"]  # this bbox
+        sbbox0 = line[-1]["bbox"]  # previous bbox
         # if any of top or bottom coordinates are close enough, join...
         if (
             abs(sbbox.y1 - sbbox0.y1) <= y_delta
@@ -107,7 +135,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
             continue
 
         # end of current line, sort its spans from left to right
-        line.sort(key=lambda s: s["bbox"].x0)
+        line = sanitize_spans(line)
 
         # append line rect and its spans to final output
         nlines.append([lrect, line])
@@ -116,7 +144,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
         lrect = sbbox  # initialize its rectangle
 
     # need to append last line in the same way
-    line.sort(key=lambda s: s["bbox"].x0)
+    line = sanitize_spans(line)
     nlines.append([lrect, line])
 
     return nlines
@@ -143,6 +171,7 @@ def get_text_lines(
     Returns:
         String of plain text in reading sequence.
     """
+    textflags = fitz.TEXT_MEDIABOX_CLIP
     page.remove_rotation()
     prect = page.rect if not clip else fitz.Rect(clip)  # area to consider
 
@@ -151,7 +180,7 @@ def get_text_lines(
     # make a TextPage if required
     if textpage is None:
         if ocr is False:
-            tp = page.get_textpage(clip=prect, flags=fitz.TEXTFLAGS_TEXT)
+            tp = page.get_textpage(clip=prect, flags=textflags)
         else:
             tp = page.get_textpage_ocr(dpi=300, full=True)
     else:
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -187,16 +187,22 @@ def to_markdown(
     if len(margins) == 2:
         margins = (0, margins[0], 0, margins[1])
     if len(margins) != 4:
-        raise ValueError("margins must be a float or a sequence of 2 or 4 floats")
+        raise ValueError(
+            "margins must be one, two or four floats"
+        )
     elif not all([hasattr(m, "__float__") for m in margins]):
         raise ValueError("margin values must be floats")
 
     # If "hdr_info" is not an object having method "get_header_id", scan the
     # document and use font sizes as header level indicators.
     if callable(hdr_info):
         get_header_id = hdr_info
-    elif hasattr(hdr_info, "get_header_id") and callable(hdr_info.get_header_id):
+    elif hasattr(hdr_info, "get_header_id") and callable(
+        hdr_info.get_header_id
+    ):
         get_header_id = hdr_info.get_header_id
+    elif hdr_info is False:
+        get_header_id = lambda s, page=None: ""
     else:
         hdr_info = IdentifyHeaders(doc)
         get_header_id = hdr_info.get_header_id
@@ -378,7 +384,9 @@ def write_text(
                     if ltext:
                         text = f"{hdr_string}{prefix}{ltext}{suffix} "
                     else:
-                        text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
+                        text = (
+                            f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
+                        )
 
                     if text.startswith(bullet):
                         text = "-  " + text[1:]
@@ -391,7 +399,9 @@ def write_text(
             code = False
 
         return (
-            out_string.replace(" \n", "\n").replace("  ", " ").replace("\n\n\n", "\n\n")
+            out_string.replace(" \n", "\n")
+            .replace("  ", " ")
+            .replace("\n\n\n", "\n\n")
         )
 
     def is_in_rects(rect, rect_list):
@@ -474,6 +484,7 @@ def get_page_output(doc, pno, margins, textflags):
             graphics information.
         """
         page = doc[pno]
+        page.remove_rotation()  # make sure we work on rotation=0
         md_string = ""
         if GRAPHICS_LIMIT is not None:
             test_paths = page.get_cdrawings()
@@ -491,7 +502,9 @@ def get_page_output(doc, pno, margins, textflags):
         # make a TextPage for all later extractions
         textpage = page.get_textpage(flags=textflags, clip=clip)
 
-        img_info = [img for img in page.get_image_info() if img["bbox"] in clip]
+        img_info = [
+            img for img in page.get_image_info() if img["bbox"] in clip
+        ]
         images = img_info[:]
         tables = []
         graphics = []
@@ -560,7 +573,9 @@ def get_page_output(doc, pno, margins, textflags):
             if include is True:  # this box is a significant vector graphic
                 vg_clusters.append(bbox)
 
-        actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters)]
+        actual_paths = [
+            p for p in paths if is_in_rects(p["rect"], vg_clusters)
+        ]
 
         vg_clusters0 = [
             r
@@ -620,7 +635,7 @@ def get_page_output(doc, pno, margins, textflags):
 
     # read the Table of Contents
     toc = doc.get_toc()
-    textflags = fitz.TEXT_MEDIABOX_CLIP
+    textflags = fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE
     for pno in pages:
         page_output, images, tables, graphics = get_page_output(
             doc, pno, margins, textflags
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
@@ -17,7 +17,7 @@
 
 setuptools.setup(
     name="pymupdf4llm",
-    version="0.0.8",
+    version="0.0.9",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",