Merge branch 'main' into v0.0.18

JorjMcKie · web-flow · commit 3dd3429c852a · 2025-03-24T06:26:37.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,5 @@
 _build
 build
+*.egg-info
+__pycache__
+.pytest_cache
diff --git a/README.md b/README.md
@@ -42,6 +42,12 @@ To create small **chunks of text** - as opposed to generating one large string f
 
 Also new in version 0.0.2 is the optional **extraction of images** and vector graphics: use of parameter `write_images=True`. The will store PNG images in the document's folder, and the Markdown text will appropriately refer to them. The images are named like `"input.pdf-page_number-index.png"`.
 
+# Documentation and API
+
+[Documentation](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html)
+
+[API](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html#pymupdf4llm-api)
+
 # Document Support
 
 While PDF is by far the most important document format worldwide, it is worthwhile mentioning that all examples and helper scripts work in the same way and **_without change_** for [all supported file types](https://pymupdf.readthedocs.io/en/latest/how-to-open-a-file.html#supported-file-types).
diff --git a/pdf4llm/README.md b/pdf4llm/README.md
@@ -50,7 +50,7 @@ Instead of the filename string as above, one can also provide a PyMuPDF `Documen
     ```python
     import pdf4llm
     
-    md_read = LlamaMarkdownReader()
+    md_read = pdf4llm.LlamaMarkdownReader()
     data = md_read.load_data("input.pdf")
 
     # The result 'data' is of type List[LlamaIndexDocument]
diff --git a/pymupdf4llm/README.md b/pymupdf4llm/README.md
@@ -50,7 +50,7 @@ Instead of the filename string as above, one can also provide a PyMuPDF `Documen
     ```python
     import pymupdf4llm
     
-    md_read = LlamaMarkdownReader()
+    md_read = pymupdf4llm.LlamaMarkdownReader()
     data = md_read.load_data("input.pdf")
 
     # The result 'data' is of type List[LlamaIndexDocument]
diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
@@ -100,6 +100,23 @@ def in_bbox(bb, bboxes):
                 return i
         return 0
 
+    def in_bbox_using_cache(bb, bboxes, cache):
+        """Return 1-based number if a bbox contains bb, else return 0.
+        Results are stored in 'cache' (a dict owned by the caller)."""
+        # Key on the coordinates of bb, not on id(bb): object ids can be
+        # reused after garbage collection, which would yield stale hits.
+        cache_key = (tuple(bb), id(bboxes))
+        cached = cache.get(cache_key)
+        if cached is not None:
+            return cached
+        index = 0
+        for i, bbox in enumerate(bboxes, start=1):
+            if bb in bbox:
+                index = i
+                break
+        cache[cache_key] = index
+        return index
+
     def intersects_bboxes(bb, bboxes):
         """Return True if a bbox touches bb, else return False."""
         for bbox in bboxes:
@@ -140,6 +157,9 @@ def clean_nblocks(nblocks):
             if bb0 == bb1:
                 del nblocks[i]
 
+        if len(nblocks) == 0:
+            return nblocks
+
         # 2. repair sequence in special cases:
         # consecutive bboxes with almost same bottom value are sorted ascending
         # by x-coordinate.
@@ -225,7 +245,7 @@ def join_rects_phase2(bboxes):
             new_rects.append(r)
         return new_rects
 
-    def join_rects_phase3(bboxes, path_rects):
+    def join_rects_phase3(bboxes, path_rects, cache):
         prects = bboxes[:]
         new_rects = []
 
@@ -239,15 +259,17 @@ def join_rects_phase3(bboxes, path_rects):
                     # do not join across columns
                     if prect1.x0 > prect0.x1 or prect1.x1 < prect0.x0:
                         continue
-                    # do not join areas with a different background
-                    if in_bbox(prect0, path_rects) != in_bbox(prect1, path_rects):
+
+                    # do not join different backgrounds
+                    if in_bbox_using_cache(prect0, path_rects, cache) != in_bbox_using_cache(prect1, path_rects, cache):
                         continue
                     temp = prect0 | prect1
                     test = set(
                         [tuple(b) for b in prects + new_rects if b.intersects(temp)]
                     )
                     if test == set((tuple(prect0), tuple(prect1))):
                         prect0 |= prect1
+                        prects[0] = prect0
                         del prects[i]
                         repeat = True
             new_rects.append(prect0)
@@ -397,6 +419,7 @@ def join_rects_phase3(bboxes, path_rects):
     # the final block bboxes on page
     nblocks = [bboxes[0]]  # pre-fill with first bbox
     bboxes = bboxes[1:]  # remaining old bboxes
+    cache = {}
 
     for i, bb in enumerate(bboxes):  # iterate old bboxes
         check = False  # indicates unwanted joins
@@ -410,7 +433,7 @@ def join_rects_phase3(bboxes, path_rects):
                 continue
 
             # never join across different background colors
-            if in_bbox(nbb, path_rects) != in_bbox(bb, path_rects):
+            if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache(bb, path_rects, cache):
                 continue
 
             temp = bb | nbb  # temporary extension of new block
@@ -433,11 +456,13 @@ def join_rects_phase3(bboxes, path_rects):
 
     # do some elementary cleaning
     nblocks = clean_nblocks(nblocks)
+    if len(nblocks) == 0:
+        return nblocks
 
     # several phases of rectangle joining
     nblocks = join_rects_phase1(nblocks)
     nblocks = join_rects_phase2(nblocks)
-    nblocks = join_rects_phase3(nblocks, path_rects)
+    nblocks = join_rects_phase3(nblocks, path_rects, cache)
 
     # return identified text bboxes
     return nblocks
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -253,6 +253,7 @@ def to_markdown(
     extract_words=False,
     show_progress=False,
 ):
+) -> str:
     """Process the document and return the text of the selected pages.
 
     Args:
@@ -274,6 +275,7 @@ def to_markdown(
         ignore_code: (bool) suppress code-like formatting (mono-space fonts)
         extract_words: (bool) include "words"-like output in page chunks
         show_progress: (bool) print progress as each page is processed.
+        image_extract_algorithm: (str) image extraction algorithm, either "simple" or "simple-drop".
 
     """
     if write_images is False and embed_images is False and force_text is False:
@@ -583,8 +585,8 @@ def write_text(
                         cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len(
                             span0["text"]
                         )
-                        if cwidth == 0:
-                            cwidth = 1
+                        if cwidth == 0.0:
+                            cwidth = span0["size"] * 0.5
                         text = " " * int(round(dist / cwidth)) + text
 
                     out_string += text
@@ -852,6 +854,7 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
                     del img_info[i]  # contained in some larger image
                     break
         parms.images = img_info
+
         parms.img_rects = [i["bbox"] for i in parms.images]
 
         # Locate all tables on page
@@ -1015,6 +1018,64 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
     return document_output
 
 
+def extract_images_on_page_simple(page, parms, image_size_limit):
+    """Return image infos of the page without empty or contained images.
+
+    Args:
+        page: page object; only its get_image_info() method is used.
+        parms: object whose 'clip' rectangle is intersected with each bbox.
+        image_size_limit: accepted but not used by this function.
+
+    Returns:
+        List of image info dicts sorted by descending bbox area.
+    """
+    img_info = []
+    for item in page.get_image_info():
+        item["bbox"] = pymupdf.Rect(item["bbox"]) & parms.clip
+        # drop empty bboxes here: the loop below never examines index 0
+        if not item["bbox"].is_empty:
+            img_info.append(item)
+    # sort descending by image area size, then run from small to large
+    img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True)
+    for i in range(len(img_info) - 1, 0, -1):
+        if any(img_info[i]["bbox"] in img_info[j]["bbox"] for j in range(i)):
+            del img_info[i]  # contained in some larger image
+    return img_info
+
+
+def filter_small_images(page, parms, image_size_limit):
+    """Return the page's image infos, clipped to parms.clip, skipping those
+    that are empty or whose relative width and height are both too small."""
+    kept = []
+    for info in page.get_image_info():
+        box = pymupdf.Rect(info["bbox"]) & parms.clip
+        if not box.is_empty:
+            rel = max(box.width / page.rect.width, box.height / page.rect.height)
+            if not rel < image_size_limit:
+                info["bbox"] = box
+                kept.append(info)
+    return kept
+
+
+def extract_images_on_page_simple_drop(page, parms, image_size_limit):
+    """Return the page's images without small, empty or contained ones.
+
+    Candidates come from filter_small_images(); a candidate is then dropped
+    when its bbox lies inside the bbox of an image sorted before it.
+    """
+    candidates = filter_small_images(page, parms, image_size_limit)
+    # largest area first
+    candidates.sort(key=lambda info: abs(info["bbox"]), reverse=True)
+    # walk from the smallest image towards the largest; index 0 is skipped
+    for pos in reversed(range(1, len(candidates))):
+        box = candidates[pos]["bbox"]
+        if box.is_empty or any(
+            box in candidates[k]["bbox"] for k in range(pos)
+        ):
+            del candidates[pos]
+    return candidates
+
+
 if __name__ == "__main__":
     import pathlib
     import sys