pymupdf · anup00900 · Nov 26, 2025 · Nov 26, 2025
diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
@@ -918,11 +918,13 @@ def parse_document(
             # tables are present on page:
             if not (page_full_ocred or page_text_ocred):
                 # we need the by-character extraction if no OCR
+                # Include both text blocks (type==0) and image blocks (type==1)
                 table_blocks = [
-                    b for b in textpage.extractRAWDICT()["blocks"] if b["type"] == 0
+                    b for b in textpage.extractRAWDICT()["blocks"] if b["type"] in (0, 1)
                 ]
             else:
-                table_blocks = fulltext
+                # Also include images from blocks for OCR case
+                table_blocks = fulltext + [b for b in blocks if b["type"] == 1]
         else:
             table_blocks = None
 
@@ -1001,13 +1003,17 @@ def parse_document(
                         table_blocks,
                         layoutbox,
                         ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred),
+                        page=page,
+                        document=document,
                     )
 
                     layoutbox.table["markdown"] = utils.table_to_markdown(
                         table_blocks,
                         layoutbox,
                         ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred),
                         markdown=True,
+                        page=page,
+                        document=document,
                     )
 
                 except Exception as e:

diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -560,7 +560,9 @@ def write_text(
                     )
                 ]
                 for i, _ in tab_candidates:
-                    out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n"
+                    table_md = parms.tabs[i].to_markdown(clean=False)
+                    table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
+                    out_string += "\n" + table_md + "\n"
                     if EXTRACT_WORDS:
                         # for "words" extraction, add table cells as line rects
                         cells = sorted(
@@ -759,6 +761,95 @@ def intersects_rects(rect, rect_list):
                 return i
         return 0
 
+    def add_images_to_table_markdown(page, table, table_md, parms):
+        """Add references to images located in table cells to the table markdown.
+
+        Args:
+            page: page owning the table; used to list and render its images.
+            table: table object from which 'table_md' was generated.
+            table_md: markdown of the table (header, separator, data rows).
+            parms: page parameter object; only 'parms.filename' is read here.
+
+        Returns:
+            'table_md' unchanged if image output is disabled or no image was
+            attached, else the markdown with image references added to cells.
+        """
+        if not (write_images or embed_images):
+            return table_md
+
+        image_list = page.get_image_info()
+        if not image_list:
+            return table_md
+
+        # No strip() here: trailing newlines must survive the re-join below.
+        md_lines = table_md.split("\n")
+
+        # Lines 0 and 1 are header and separator; the first data line maps to
+        # rows[0] for an external header, else rows[1] (rows[0] is the header).
+        header = getattr(table, "header", None)
+        row_offset = 0 if getattr(header, "external", False) else 1
+
+        used_images = set()  # indices of images already assigned to a cell
+        changed = False
+
+        for row_idx in range(2, len(md_lines)):
+            line = md_lines[row_idx]
+            if not line.strip() or not line.startswith("|"):
+                continue
+
+            table_row_idx = row_idx - 2 + row_offset
+            if table_row_idx >= table.row_count:
+                continue
+            row_cells = table.rows[table_row_idx].cells
+            cells = [c.strip() for c in line.split("|")[1:-1]]
+            if len(cells) != len(row_cells):
+                # Cell text containing "|" makes the split unreliable: do not
+                # risk attaching an image to the wrong column.
+                continue
+
+            row_changed = False
+            for col_idx, cell_bbox in enumerate(row_cells):
+                if cell_bbox is None:
+                    continue
+                cell_rect = pymupdf.Rect(cell_bbox)
+                for img_idx, img_info in enumerate(image_list):
+                    if img_idx in used_images:
+                        continue
+                    img_bbox = pymupdf.Rect(img_info["bbox"])
+                    intersection = cell_rect & img_bbox
+                    if intersection.is_empty:
+                        continue
+                    # The image belongs to the cell holding > 50% of its area.
+                    if abs(intersection) / abs(img_bbox) <= 0.5:
+                        continue
+                    try:
+                        pix = page.get_pixmap(clip=img_bbox, dpi=DPI)
+                        if write_images:
+                            filename = os.path.basename(parms.filename).replace(" ", "-")
+                            img_filename = os.path.join(
+                                IMG_PATH, f"{filename}-{page.number}-table-{img_idx}.{IMG_EXTENSION}"
+                            )
+                            pix.save(img_filename)
+                            img_ref = f"![image]({img_filename.replace(chr(92), '/')})"
+                        else:  # embed_images is set (see guard at the top)
+                            # Presumably binascii.b2a_base64: its output ends with
+                            # a newline, which would break the table row.
+                            data = b2a_base64(pix.tobytes(IMG_EXTENSION)).decode().strip()
+                            img_ref = f"![image](data:image/{IMG_EXTENSION};base64,{data})"
+                    except Exception:
+                        # Best effort: an image that cannot be rendered or
+                        # stored is left out; the table text itself is kept.
+                        continue
+                    cells[col_idx] = f"{cells[col_idx]}<br>{img_ref}" if cells[col_idx] else img_ref
+                    used_images.add(img_idx)
+                    row_changed = True
+
+            if row_changed:  # rewrite only lines that received an image
+                md_lines[row_idx] = "|" + "|".join(cells) + "|"
+                changed = True
+
+        return "\n".join(md_lines) if changed else table_md
+
     def output_tables(parms, text_rect):
         """Output tables above given text rectangle."""
         this_md = ""  # markdown string for table(s) content
@@ -769,7 +860,9 @@ def output_tables(parms, text_rect):
             ):
                 if i in parms.written_tables:
                     continue
-                this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
+                table_md = parms.tabs[i].to_markdown(clean=False)
+                table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
+                this_md += table_md + "\n"
                 if EXTRACT_WORDS:
                     # for "words" extraction, add table cells as line rects
                     cells = sorted(
@@ -790,7 +883,9 @@ def output_tables(parms, text_rect):
             for i, trect in parms.tab_rects.items():
                 if i in parms.written_tables:
                     continue
-                this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
+                table_md = parms.tabs[i].to_markdown(clean=False)
+                table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
+                this_md += table_md + "\n"
                 if EXTRACT_WORDS:
                     # for "words" extraction, add table cells as line rects
                     cells = sorted(

diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py
@@ -750,24 +750,27 @@ def complete_table_structure(page):
     return all_lines, all_boxes
 
 
-def extract_cells(table_blocks, cell, markdown=False, ocrpage=False):
-    """Extract text from a rect-like 'cell' as plain or MD styled text.
+def extract_cells(table_blocks, cell, markdown=False, ocrpage=False, page=None, document=None, cell_image_counter=None):
+    """Extract text and images from a rect-like 'cell' as plain or MD styled text.
 
     This function should ultimately be used to extract text from a table cell.
     Markdown output will only work correctly if extraction flag bit
     TEXT_COLLECT_STYLES is set.
 
     Args:
-        table_blocks: A list of PyMuPDF TextPage text blocks (type = 0). Must
+        table_blocks: A list of PyMuPDF TextPage text blocks (type = 0 or 1). Must
             have been created with TEXT_COLLECT_STYLE for correct markdown.
             Format is either "dict" or "rawdict" depending on ocrpage.
         cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
         markdown: If True, return text formatted for Markdown.
         ocrpage: If True, text is written with GlyphLessFont. In this case,
             table_blocks is in format "dict".
+        page: Optional Page object for image extraction from cells.
+        document: Optional ParsedDocument object for image write/embed settings.
+        cell_image_counter: Optional list with one element [counter] to track image numbers.
 
     Returns:
-        A string with the text extracted from the cell.
+        A string with the text and images extracted from the cell.
     """
 
     def outside_cell(bbox, cell):
@@ -779,10 +782,31 @@ def outside_cell(bbox, cell):
             or bbox[3] <= cell[1]
         )
 
+    def bbox_overlap(bbox, cell):
+        """Calculate overlap ratio between bbox and cell."""
+        cell_rect = pymupdf.Rect(cell)
+        bbox_rect = pymupdf.Rect(bbox)
+        intersection = cell_rect & bbox_rect
+        if intersection.is_empty:
+            return 0.0
+        return abs(intersection) / abs(bbox_rect)
+
     text = ""
+    images_in_cell = []
+
     for block in table_blocks:
         if outside_cell(block["bbox"], cell):
             continue
+
+        # Check if this is an image block (type == 1)
+        if block.get("type") == 1:
+            # Image block found within cell
+            overlap = bbox_overlap(block["bbox"], cell)
+            if overlap > 0.5:  # More than 50% of image is in this cell
+                images_in_cell.append(block)
+            continue
+
+        # Process text blocks (type == 0)
         for line in block["lines"]:
             if outside_cell(line["bbox"], cell):
                 continue
@@ -848,17 +872,59 @@ def outside_cell(bbox, cell):
         .replace("$\n", "$ ")
         .replace(" $ \n", "$ ")
     )
+
+    # Handle images found in this cell
+    if markdown and images_in_cell and page is not None and document is not None:
+        for img_block in images_in_cell:
+            img_bbox = pymupdf.Rect(img_block["bbox"])
+
+            # Extract and save the image if write_images or embed_images is enabled
+            if document.write_images or document.embed_images:
+                try:
+                    pix = page.get_pixmap(clip=img_bbox, dpi=document.image_dpi)
+
+                    if text:
+                        text += "<br>"
+
+                    if document.write_images:
+                        # Generate unique filename for this cell image
+                        if cell_image_counter is None:
+                            cell_image_counter = [0]
+                        cell_image_counter[0] += 1
+                        img_filename = f"{document.filename}-{page.number+1:04d}-table-cell-{cell_image_counter[0]:03d}.{document.image_format}"
+                        img_filename = img_filename.replace(" ", "_")
+                        img_path = os.path.join(document.image_path, img_filename)
+                        pix.save(img_path)
+                        # Add markdown image reference
+                        text += f"![image]({img_path.replace(chr(92), '/')})"
+
+                    elif document.embed_images:
+                        # Embed as base64
+                        import base64
+                        img_data = base64.b64encode(pix.tobytes(document.image_format)).decode()
+                        data_uri = f"data:image/{document.image_format};base64,{img_data}"
+                        text += f"![image]({data_uri})"
+
+                except Exception as e:
+                    # If image extraction fails, add a placeholder
+                    if text:
+                        text += "<br>"
+                    text += f"[Image extraction failed: {str(e)}]"
+
     return text.strip()
 
 
-def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
+def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False, page=None, document=None):
     output = ""
     table = table_item.table
     row_count = table["row_count"]
     col_count = table["col_count"]
     cell_boxes = table["cells"]
     # make empty cell text list
     cells = [[None for i in range(col_count)] for j in range(row_count)]
+
+    # Counter for images in table cells
+    cell_image_counter = [0]
 
     # fill None cells with extracted text
     # for rows, copy content from left to right
@@ -877,7 +943,8 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
         for j, cell in enumerate(row):
             if cell is not None:
                 cells[i][j] = extract_cells(
-                    table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage
+                    table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage,
+                    page=page, document=document, cell_image_counter=cell_image_counter
                 )
     for i, name in enumerate(cells[0]):
         if name is None:
@@ -908,13 +975,16 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
     return output + "\n"
 
 
-def table_extract(table_blocks, table_item, ocrpage=False):
+def table_extract(table_blocks, table_item, ocrpage=False, page=None, document=None):
     table = table_item.table
     row_count = table["row_count"]
     col_count = table["col_count"]
     cell_boxes = table["cells"]
     # make empty cell text list
     cells = [[None for i in range(col_count)] for j in range(row_count)]
+
+    # Counter for images in table cells
+    cell_image_counter = [0]
 
     for i, row in enumerate(cell_boxes):
         for j, cell in enumerate(row):
@@ -924,6 +994,9 @@ def table_extract(table_blocks, table_item, ocrpage=False):
                     cell_boxes[i][j],
                     markdown=False,
                     ocrpage=ocrpage,
+                    page=page,
+                    document=document,
+                    cell_image_counter=cell_image_counter,
                 )
 
     return cells