Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions pymupdf4llm/pymupdf4llm/helpers/document_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,11 +918,13 @@ def parse_document(
# tables are present on page:
if not (page_full_ocred or page_text_ocred):
# we need the by-character extraction if no OCR
# Include both text blocks (type==0) and image blocks (type==1)
table_blocks = [
b for b in textpage.extractRAWDICT()["blocks"] if b["type"] == 0
b for b in textpage.extractRAWDICT()["blocks"] if b["type"] in (0, 1)
]
else:
table_blocks = fulltext
# Also include images from blocks for OCR case
table_blocks = fulltext + [b for b in blocks if b["type"] == 1]
else:
table_blocks = None

Expand Down Expand Up @@ -1001,13 +1003,17 @@ def parse_document(
table_blocks,
layoutbox,
ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred),
page=page,
document=document,
)

layoutbox.table["markdown"] = utils.table_to_markdown(
table_blocks,
layoutbox,
ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred),
markdown=True,
page=page,
document=document,
)

except Exception as e:
Expand Down
101 changes: 98 additions & 3 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,9 @@ def write_text(
)
]
for i, _ in tab_candidates:
out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n"
table_md = parms.tabs[i].to_markdown(clean=False)
table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
out_string += "\n" + table_md + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
Expand Down Expand Up @@ -759,6 +761,95 @@ def intersects_rects(rect, rect_list):
return i
return 0

def add_images_to_table_markdown(page, table, table_md, parms):
    """Add images found in table cells to the table's markdown output.

    Every image on the page whose bbox lies more than 50% inside a data cell
    is rendered via ``page.get_pixmap`` and referenced from that cell, either
    as a saved file (``write_images``) or as a base64 data URI
    (``embed_images``). The markdown header row is never modified.

    Args:
        page: The page the table was found on; must provide
            ``get_image_info()``, ``get_pixmap()`` and ``number``.
        table: The table object; must provide ``row_count`` and ``rows``
            (each row exposing ``cells`` as a list of bboxes or None).
        table_md: Markdown text of the table as produced by
            ``table.to_markdown()``.
        parms: Parameter object; only ``parms.filename`` is read here.

    Returns:
        ``table_md`` unchanged if image output is disabled, the page has no
        images, the markdown has no data rows, or no image matched a cell.
        Otherwise the stripped markdown with image references inserted,
        terminated by a single newline.
    """
    if not (write_images or embed_images):
        return table_md

    image_list = page.get_image_info()
    if not image_list:
        return table_md

    md_lines = table_md.strip().split("\n")
    if len(md_lines) < 3:  # need header + separator + at least one data row
        return table_md

    # Markdown line 2 is the first data row. It corresponds to table.rows[1]
    # when the header is the table's own first row, but to table.rows[0] when
    # the header was detected outside the table (header.external), because
    # to_markdown() then emits every table row as a data row.
    header = getattr(table, "header", None)
    first_data_row = 0 if getattr(header, "external", False) else 1

    # Loop invariants: build image rectangles and the file name stem once.
    image_rects = [pymupdf.Rect(info["bbox"]) for info in image_list]
    name_stem = None
    if write_images:
        name_stem = os.path.basename(parms.filename).replace(" ", "-")

    used_images = set()  # indices of images already assigned to a cell
    any_change = False

    for row_idx in range(2, len(md_lines)):
        line = md_lines[row_idx]
        if not line.strip() or not line.startswith("|"):
            continue

        table_row_idx = row_idx - 2 + first_data_row
        if table_row_idx >= table.row_count:
            continue

        # Drop the empty strings before the first and after the last pipe.
        cells = [c.strip() for c in line.split("|")[1:-1]]
        row_changed = False

        for col_idx, cell_bbox in enumerate(table.rows[table_row_idx].cells):
            if col_idx >= len(cells) or cell_bbox is None:
                continue

            cell_rect = pymupdf.Rect(cell_bbox)

            for img_idx, img_bbox in enumerate(image_rects):
                if img_idx in used_images:
                    continue

                intersection = cell_rect & img_bbox
                if intersection.is_empty:
                    continue

                # The image belongs to this cell only if more than half of
                # its area is inside the cell.
                if abs(intersection) / abs(img_bbox) <= 0.5:
                    continue

                try:
                    pix = page.get_pixmap(clip=img_bbox, dpi=DPI)
                    if write_images:
                        img_filename = os.path.join(
                            IMG_PATH,
                            f"{name_stem}-{page.number}-table-{img_idx}.{IMG_EXTENSION}",
                        )
                        pix.save(img_filename)
                        img_ref = f"![image]({img_filename.replace(chr(92), '/')})"
                    else:
                        # embed_images is set (guaranteed by the guard at the top)
                        data = b2a_base64(pix.tobytes(IMG_EXTENSION)).decode()
                        data_uri = f"data:image/{IMG_EXTENSION};base64," + data
                        img_ref = f"![image]({data_uri})"
                except Exception:
                    # Deliberately best-effort: an image that cannot be
                    # rendered or saved must not break table output.
                    continue

                if cells[col_idx]:
                    cells[col_idx] += "<br>" + img_ref
                else:
                    cells[col_idx] = img_ref

                used_images.add(img_idx)
                row_changed = True

        # Only rewrite rows that received an image, so that rows without
        # images keep their original text exactly.
        if row_changed:
            md_lines[row_idx] = "|" + "|".join(cells) + "|"
            any_change = True

    if not any_change:
        return table_md

    return "\n".join(md_lines) + "\n"

def output_tables(parms, text_rect):
"""Output tables above given text rectangle."""
this_md = "" # markdown string for table(s) content
Expand All @@ -769,7 +860,9 @@ def output_tables(parms, text_rect):
):
if i in parms.written_tables:
continue
this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
table_md = parms.tabs[i].to_markdown(clean=False)
table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
this_md += table_md + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
Expand All @@ -790,7 +883,9 @@ def output_tables(parms, text_rect):
for i, trect in parms.tab_rects.items():
if i in parms.written_tables:
continue
this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
table_md = parms.tabs[i].to_markdown(clean=False)
table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms)
this_md += table_md + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
Expand Down
87 changes: 80 additions & 7 deletions pymupdf4llm/pymupdf4llm/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,24 +750,27 @@ def complete_table_structure(page):
return all_lines, all_boxes


def extract_cells(table_blocks, cell, markdown=False, ocrpage=False):
"""Extract text from a rect-like 'cell' as plain or MD styled text.
def extract_cells(table_blocks, cell, markdown=False, ocrpage=False, page=None, document=None, cell_image_counter=None):
"""Extract text and images from a rect-like 'cell' as plain or MD styled text.

This function should ultimately be used to extract text from a table cell.
Markdown output will only work correctly if extraction flag bit
TEXT_COLLECT_STYLES is set.

Args:
table_blocks: A list of PyMuPDF TextPage text blocks (type = 0). Must
table_blocks: A list of PyMuPDF TextPage blocks: text blocks (type = 0)
and, optionally, image blocks (type = 1). Must have been created with
TEXT_COLLECT_STYLES for correct markdown.
Format is either "dict" or "rawdict" depending on ocrpage.
cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
markdown: If True, return text formatted for Markdown.
ocrpage: If True, text is written with GlyphLessFont. In this case,
table_blocks is in format "dict".
page: Optional Page object for image extraction from cells.
document: Optional ParsedDocument object for image write/embed settings.
cell_image_counter: Optional list with one element [counter] to track image numbers.

Returns:
A string with the text extracted from the cell.
A string with the text and images extracted from the cell.
"""

def outside_cell(bbox, cell):
Expand All @@ -779,10 +782,31 @@ def outside_cell(bbox, cell):
or bbox[3] <= cell[1]
)

def bbox_overlap(bbox, cell):
    """Return the fraction of the area of 'bbox' that lies inside 'cell'.

    Both arguments are rect-like. The result is 0.0 when the two
    rectangles have an empty intersection.
    """
    inside = pymupdf.Rect(cell)
    whole = pymupdf.Rect(bbox)
    common = inside & whole
    return 0.0 if common.is_empty else abs(common) / abs(whole)

text = ""
images_in_cell = []

for block in table_blocks:
if outside_cell(block["bbox"], cell):
continue

# Check if this is an image block (type == 1)
if block.get("type") == 1:
# Image block found within cell
overlap = bbox_overlap(block["bbox"], cell)
if overlap > 0.5: # More than 50% of image is in this cell
images_in_cell.append(block)
continue

# Process text blocks (type == 0)
for line in block["lines"]:
if outside_cell(line["bbox"], cell):
continue
Expand Down Expand Up @@ -848,17 +872,59 @@ def outside_cell(bbox, cell):
.replace("$\n", "$ ")
.replace(" $ \n", "$ ")
)

# Handle images found in this cell
if markdown and images_in_cell and page is not None and document is not None:
for img_block in images_in_cell:
img_bbox = pymupdf.Rect(img_block["bbox"])

# Extract and save the image if write_images or embed_images is enabled
if document.write_images or document.embed_images:
try:
pix = page.get_pixmap(clip=img_bbox, dpi=document.image_dpi)

if text:
text += "<br>"

if document.write_images:
# Generate unique filename for this cell image
if cell_image_counter is None:
cell_image_counter = [0]
cell_image_counter[0] += 1
img_filename = f"{document.filename}-{page.number+1:04d}-table-cell-{cell_image_counter[0]:03d}.{document.image_format}"
img_filename = img_filename.replace(" ", "_")
img_path = os.path.join(document.image_path, img_filename)
pix.save(img_path)
# Add markdown image reference
text += f"![image]({img_path.replace(chr(92), '/')})"

elif document.embed_images:
# Embed as base64
import base64
img_data = base64.b64encode(pix.tobytes(document.image_format)).decode()
data_uri = f"data:image/{document.image_format};base64,{img_data}"
text += f"![image]({data_uri})"

except Exception as e:
# If image extraction fails, add a placeholder
if text:
text += "<br>"
text += f"[Image extraction failed: {str(e)}]"

return text.strip()


def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False, page=None, document=None):
output = ""
table = table_item.table
row_count = table["row_count"]
col_count = table["col_count"]
cell_boxes = table["cells"]
# make empty cell text list
cells = [[None for i in range(col_count)] for j in range(row_count)]

# Counter for images in table cells
cell_image_counter = [0]

# fill None cells with extracted text
# for rows, copy content from left to right
Expand All @@ -877,7 +943,8 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
for j, cell in enumerate(row):
if cell is not None:
cells[i][j] = extract_cells(
table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage
table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage,
page=page, document=document, cell_image_counter=cell_image_counter
)
for i, name in enumerate(cells[0]):
if name is None:
Expand Down Expand Up @@ -908,13 +975,16 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False):
return output + "\n"


def table_extract(table_blocks, table_item, ocrpage=False):
def table_extract(table_blocks, table_item, ocrpage=False, page=None, document=None):
table = table_item.table
row_count = table["row_count"]
col_count = table["col_count"]
cell_boxes = table["cells"]
# make empty cell text list
cells = [[None for i in range(col_count)] for j in range(row_count)]

# Counter for images in table cells
cell_image_counter = [0]

for i, row in enumerate(cell_boxes):
for j, cell in enumerate(row):
Expand All @@ -924,6 +994,9 @@ def table_extract(table_blocks, table_item, ocrpage=False):
cell_boxes[i][j],
markdown=False,
ocrpage=ocrpage,
page=page,
document=document,
cell_image_counter=cell_image_counter,
)

return cells