diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
index 9b872d12..d8e9a444 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
@@ -918,11 +918,13 @@ def parse_document(
# tables are present on page:
if not (page_full_ocred or page_text_ocred):
# we need the by-character extraction if no OCR
+ # Include both text blocks (type==0) and image blocks (type==1)
table_blocks = [
- b for b in textpage.extractRAWDICT()["blocks"] if b["type"] == 0
+ b for b in textpage.extractRAWDICT()["blocks"] if b["type"] in (0, 1)
]
else:
- table_blocks = fulltext
+ # Also include images from blocks for OCR case
+ table_blocks = fulltext + [b for b in blocks if b["type"] == 1]
else:
table_blocks = None
@@ -1001,6 +1003,8 @@ def parse_document(
table_blocks,
layoutbox,
ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred),
+ page=page,
+ document=document,
)
layoutbox.table["markdown"] = utils.table_to_markdown(
@@ -1008,6 +1012,8 @@ def parse_document(
layoutbox,
ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred),
markdown=True,
+ page=page,
+ document=document,
)
except Exception as e:
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index 6b860800..7ba0bb29 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -560,7 +560,9 @@ def write_text(
)
]
for i, _ in tab_candidates:
- out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n"
+ table_md = parms.tabs[i].to_markdown(clean=False)
+ table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
+ out_string += "\n" + table_md + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
@@ -759,6 +761,95 @@ def intersects_rects(rect, rect_list):
return i
return 0
+    def add_images_to_table_markdown(page, table, table_md, parms):
+        """Insert references to images located in table cells into table markdown.
+
+        Each page image whose bbox lies by more than 50% inside a data cell
+        is rendered via a clipped pixmap, then either saved to IMG_PATH
+        (write_images) or embedded as a base64 data URI (embed_images), and
+        its markdown reference is appended to that cell's text.
+
+        Args:
+            page: the page containing the table.
+            table: table object providing 'rows', 'row_count' and 'header'.
+            table_md: markdown of the table as returned by to_markdown().
+            parms: parameter object; only 'filename' is read here.
+
+        Returns:
+            'table_md' unchanged if no image was inserted, else the updated
+            markdown terminated by a single line break.
+        """
+        if not (write_images or embed_images):
+            return table_md
+
+        # image bboxes are loop invariants: convert them only once
+        img_rects = [pymupdf.Rect(img["bbox"]) for img in page.get_image_info()]
+        if not img_rects:
+            return table_md
+
+        md_lines = table_md.strip().split("\n")
+        if len(md_lines) < 3:  # need header + separator + one data row
+            return table_md
+
+        # to_markdown() always emits a header line, but the header is only
+        # contained in table.rows if it is not external to the table.
+        header = getattr(table, "header", None)
+        row_offset = 0 if getattr(header, "external", False) else 1
+
+        used_images = set()  # each image goes into one cell only
+
+        for md_idx in range(2, len(md_lines)):  # skip header and separator
+            line = md_lines[md_idx]
+            if not line.strip() or not line.startswith("|"):
+                continue
+            table_row_idx = md_idx - 2 + row_offset
+            if table_row_idx >= table.row_count:
+                continue
+
+            # drop the empty strings before first / after last pipe
+            cells = [c.strip() for c in line.split("|")[1:-1]]
+            row_changed = False
+
+            for col_idx, cell_bbox in enumerate(table.rows[table_row_idx].cells):
+                if col_idx >= len(cells) or cell_bbox is None:
+                    continue
+                cell_rect = pymupdf.Rect(cell_bbox)
+
+                for img_idx, img_rect in enumerate(img_rects):
+                    if img_idx in used_images:
+                        continue
+                    overlap = cell_rect & img_rect
+                    # the image belongs here if >50% of its area is inside
+                    if overlap.is_empty or abs(overlap) / abs(img_rect) <= 0.5:
+                        continue
+                    try:
+                        pix = page.get_pixmap(clip=img_rect, dpi=DPI)
+                        if write_images:
+                            filename = os.path.basename(parms.filename).replace(" ", "-")
+                            img_filename = os.path.join(
+                                IMG_PATH,
+                                f"{filename}-{page.number}-table-{img_idx}.{IMG_EXTENSION}",
+                            )
+                            pix.save(img_filename)
+                            img_ref = f"![]({img_filename.replace(chr(92), '/')})"
+                        else:  # embed_images; a trailing newline would break the table row
+                            data = b2a_base64(pix.tobytes(IMG_EXTENSION), newline=False).decode()
+                            img_ref = f"![](data:image/{IMG_EXTENSION};base64,{data})"
+                    except Exception:
+                        continue  # deliberate best effort: keep the cell text
+
+                    # separate image from existing cell text by a line break tag
+                    cells[col_idx] += ("<br>" if cells[col_idx] else "") + img_ref
+                    used_images.add(img_idx)
+                    row_changed = True
+
+            if row_changed:  # leave rows without images byte-identical
+                md_lines[md_idx] = "|" + "|".join(cells) + "|"
+
+        if not used_images:
+            return table_md
+        return "\n".join(md_lines) + "\n"
+
def output_tables(parms, text_rect):
"""Output tables above given text rectangle."""
this_md = "" # markdown string for table(s) content
@@ -769,7 +860,9 @@ def output_tables(parms, text_rect):
):
if i in parms.written_tables:
continue
- this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
+ table_md = parms.tabs[i].to_markdown(clean=False)
+ table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
+ this_md += table_md + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
@@ -790,7 +883,9 @@ def output_tables(parms, text_rect):
for i, trect in parms.tab_rects.items():
if i in parms.written_tables:
continue
- this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
+ table_md = parms.tabs[i].to_markdown(clean=False)
+ table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
+ this_md += table_md + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py
index 03f9cdf8..1d247523 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/utils.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py
@@ -750,24 +750,27 @@ def complete_table_structure(page):
return all_lines, all_boxes
-def extract_cells(table_blocks, cell, markdown=False, ocrpage=False):
- """Extract text from a rect-like 'cell' as plain or MD styled text.
+def extract_cells(table_blocks, cell, markdown=False, ocrpage=False, page=None, document=None, cell_image_counter=None):
+ """Extract text and images from a rect-like 'cell' as plain or MD styled text.
This function should ultimately be used to extract text from a table cell.
Markdown output will only work correctly if extraction flag bit
TEXT_COLLECT_STYLES is set.
Args:
- table_blocks: A list of PyMuPDF TextPage text blocks (type = 0). Must
+ table_blocks: A list of PyMuPDF TextPage text blocks (type = 0 or 1). Must
have been created with TEXT_COLLECT_STYLE for correct markdown.
Format is either "dict" or "rawdict" depending on ocrpage.
cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
markdown: If True, return text formatted for Markdown.
ocrpage: If True, text is written with GlyphLessFont. In this case,
table_blocks is in format "dict".
+ page: Optional Page object for image extraction from cells.
+ document: Optional ParsedDocument object for image write/embed settings.
+ cell_image_counter: Optional list with one element [counter] to track image numbers.
Returns:
- A string with the text extracted from the cell.
+ A string with the text and images extracted from the cell.
"""
def outside_cell(bbox, cell):
@@ -779,10 +782,31 @@ def outside_cell(bbox, cell):
or bbox[3] <= cell[1]
)
+ def bbox_overlap(bbox, cell):
+ """Calculate overlap ratio between bbox and cell."""
+ cell_rect = pymupdf.Rect(cell)
+ bbox_rect = pymupdf.Rect(bbox)
+ intersection = cell_rect & bbox_rect
+ if intersection.is_empty:
+ return 0.0
+ return abs(intersection) / abs(bbox_rect)
+
text = ""
+ images_in_cell = []
+
for block in table_blocks:
if outside_cell(block["bbox"], cell):
continue
+
+ # Check if this is an image block (type == 1)
+ if block.get("type") == 1:
+ # Image block found within cell
+ overlap = bbox_overlap(block["bbox"], cell)
+ if overlap > 0.5: # More than 50% of image is in this cell
+ images_in_cell.append(block)
+ continue
+
+ # Process text blocks (type == 0)
for line in block["lines"]:
if outside_cell(line["bbox"], cell):
continue
@@ -848,10 +872,49 @@ def outside_cell(bbox, cell):
.replace("$\n", "$ ")
.replace(" $ \n", "$ ")
)
+
+ # Handle images found in this cell
+ if markdown and images_in_cell and page is not None and document is not None:
+ for img_block in images_in_cell:
+ img_bbox = pymupdf.Rect(img_block["bbox"])
+
+ # Extract and save the image if write_images or embed_images is enabled
+ if document.write_images or document.embed_images:
+ try:
+ pix = page.get_pixmap(clip=img_bbox, dpi=document.image_dpi)
+
+ if text:
+ text += "
"
+
+ if document.write_images:
+ # Generate unique filename for this cell image
+ if cell_image_counter is None:
+ cell_image_counter = [0]
+ cell_image_counter[0] += 1
+ img_filename = f"{document.filename}-{page.number+1:04d}-table-cell-{cell_image_counter[0]:03d}.{document.image_format}"
+ img_filename = img_filename.replace(" ", "_")
+ img_path = os.path.join(document.image_path, img_filename)
+ pix.save(img_path)
+ # Add markdown image reference
+ text += f", '/')})"
+
+ elif document.embed_images:
+ # Embed as base64
+ import base64
+ img_data = base64.b64encode(pix.tobytes(document.image_format)).decode()
+ data_uri = f"data:image/{document.image_format};base64,{img_data}"
+ text += f""
+
+ except Exception as e:
+ # If image extraction fails, add a placeholder
+ if text:
+ text += "
"
+ text += f"[Image extraction failed: {str(e)}]"
+
return text.strip()
-def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
+def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False, page=None, document=None):
output = ""
table = table_item.table
row_count = table["row_count"]
@@ -859,6 +922,9 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
cell_boxes = table["cells"]
# make empty cell text list
cells = [[None for i in range(col_count)] for j in range(row_count)]
+
+ # Counter for images in table cells
+ cell_image_counter = [0]
# fill None cells with extracted text
# for rows, copy content from left to right
@@ -877,7 +943,8 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
for j, cell in enumerate(row):
if cell is not None:
cells[i][j] = extract_cells(
- table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage
+ table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage,
+ page=page, document=document, cell_image_counter=cell_image_counter
)
for i, name in enumerate(cells[0]):
if name is None:
@@ -908,13 +975,16 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
return output + "\n"
-def table_extract(table_blocks, table_item, ocrpage=False):
+def table_extract(table_blocks, table_item, ocrpage=False, page=None, document=None):
table = table_item.table
row_count = table["row_count"]
col_count = table["col_count"]
cell_boxes = table["cells"]
# make empty cell text list
cells = [[None for i in range(col_count)] for j in range(row_count)]
+
+ # Counter for images in table cells
+ cell_image_counter = [0]
for i, row in enumerate(cell_boxes):
for j, cell in enumerate(row):
@@ -924,6 +994,9 @@ def table_extract(table_blocks, table_item, ocrpage=False):
cell_boxes[i][j],
markdown=False,
ocrpage=ocrpage,
+ page=page,
+ document=document,
+ cell_image_counter=cell_image_counter,
)
return cells