From 78a492a91a4e9c0aeccf1bfcab7117f99867896d Mon Sep 17 00:00:00 2001 From: "anup.roy" Date: Wed, 26 Nov 2025 15:52:17 +0400 Subject: [PATCH 1/2] Fix #21: Enable images to appear inside table cells Modified extract_cells() to detect and extract image blocks (type==1) within table cells, not just text blocks (type==0). Changes: - Updated extract_cells() to accept page and document parameters - Added logic to detect image blocks within cell bounding boxes - Implemented image extraction and saving for cells with images - Images are now embedded in cell markdown as ![image](path) syntax - Updated table_to_markdown() and table_extract() signatures - Updated calls in document_layout.py to pass page/document context - Added test script to demonstrate the fix When write_images=True or embed_images=True, images found in table cells are now properly extracted and referenced inline within the cell markdown, resolving the issue where images appeared below tables. --- .../pymupdf4llm/helpers/document_layout.py | 4 + pymupdf4llm/pymupdf4llm/helpers/utils.py | 87 +++++++++++- test_image_in_table.py | 134 ++++++++++++++++++ 3 files changed, 218 insertions(+), 7 deletions(-) create mode 100644 test_image_in_table.py diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index 9b872d12..3e3b3b9a 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -1001,6 +1001,8 @@ def parse_document( table_blocks, layoutbox, ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred), + page=page, + document=document, ) layoutbox.table["markdown"] = utils.table_to_markdown( @@ -1008,6 +1010,8 @@ def parse_document( layoutbox, ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred), markdown=True, + page=page, + document=document, ) except Exception as e: diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py index 03f9cdf8..1d247523 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/utils.py +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -750,24 +750,27 @@ def complete_table_structure(page): return all_lines, all_boxes -def extract_cells(table_blocks, cell, markdown=False, ocrpage=False): - """Extract text from a rect-like 'cell' as plain or MD styled text. +def extract_cells(table_blocks, cell, markdown=False, ocrpage=False, page=None, document=None, cell_image_counter=None): + """Extract text and images from a rect-like 'cell' as plain or MD styled text. This function should ultimately be used to extract text from a table cell. Markdown output will only work correctly if extraction flag bit TEXT_COLLECT_STYLES is set. Args: - table_blocks: A list of PyMuPDF TextPage text blocks (type = 0). Must + table_blocks: A list of PyMuPDF TextPage text blocks (type = 0 or 1). Must have been created with TEXT_COLLECT_STYLE for correct markdown. Format is either "dict" or "rawdict" depending on ocrpage. cell: A tuple (x0, y0, x1, y1) defining the cell's bbox. markdown: If True, return text formatted for Markdown. ocrpage: If True, text is written with GlyphLessFont. In this case, table_blocks is in format "dict". + page: Optional Page object for image extraction from cells. + document: Optional ParsedDocument object for image write/embed settings. + cell_image_counter: Optional list with one element [counter] to track image numbers. Returns: - A string with the text extracted from the cell. + A string with the text and images extracted from the cell. """ def outside_cell(bbox, cell): @@ -779,10 +782,31 @@ def outside_cell(bbox, cell): or bbox[3] <= cell[1] ) + def bbox_overlap(bbox, cell): + """Calculate overlap ratio between bbox and cell.""" + cell_rect = pymupdf.Rect(cell) + bbox_rect = pymupdf.Rect(bbox) + intersection = cell_rect & bbox_rect + if intersection.is_empty: + return 0.0 + return abs(intersection) / abs(bbox_rect) + text = "" + images_in_cell = [] + for block in table_blocks: if outside_cell(block["bbox"], cell): continue + + # Check if this is an image block (type == 1) + if block.get("type") == 1: + # Image block found within cell + overlap = bbox_overlap(block["bbox"], cell) + if overlap > 0.5: # More than 50% of image is in this cell + images_in_cell.append(block) + continue + + # Process text blocks (type == 0) for line in block["lines"]: if outside_cell(line["bbox"], cell): continue @@ -848,10 +872,49 @@ def outside_cell(bbox, cell): .replace("$\n", "$ ") .replace(" $ \n", "$ ") ) + + # Handle images found in this cell + if markdown and images_in_cell and page is not None and document is not None: + for img_block in images_in_cell: + img_bbox = pymupdf.Rect(img_block["bbox"]) + + # Extract and save the image if write_images or embed_images is enabled + if document.write_images or document.embed_images: + try: + pix = page.get_pixmap(clip=img_bbox, dpi=document.image_dpi) + + if text: + text += "
" + + if document.write_images: + # Generate unique filename for this cell image + if cell_image_counter is None: + cell_image_counter = [0] + cell_image_counter[0] += 1 + img_filename = f"{document.filename}-{page.number+1:04d}-table-cell-{cell_image_counter[0]:03d}.{document.image_format}" + img_filename = img_filename.replace(" ", "_") + img_path = os.path.join(document.image_path, img_filename) + pix.save(img_path) + # Add markdown image reference + text += f"![image]({img_path.replace(chr(92), '/')})" + + elif document.embed_images: + # Embed as base64 + import base64 + img_data = base64.b64encode(pix.tobytes(document.image_format)).decode() + data_uri = f"data:image/{document.image_format};base64,{img_data}" + text += f"![image]({data_uri})" + + except Exception as e: + # If image extraction fails, add a placeholder + if text: + text += "
" + text += f"[Image extraction failed: {str(e)}]" + return text.strip() -def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False): +def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False, page=None, document=None): output = "" table = table_item.table row_count = table["row_count"] @@ -859,6 +922,9 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False): cell_boxes = table["cells"] # make empty cell text list cells = [[None for i in range(col_count)] for j in range(row_count)] + + # Counter for images in table cells + cell_image_counter = [0] # fill None cells with extracted text # for rows, copy content from left to right @@ -877,7 +943,8 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False): for j, cell in enumerate(row): if cell is not None: cells[i][j] = extract_cells( - table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage + table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage, + page=page, document=document, cell_image_counter=cell_image_counter ) for i, name in enumerate(cells[0]): if name is None: @@ -908,13 +975,16 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False): return output + "\n" -def table_extract(table_blocks, table_item, ocrpage=False): +def table_extract(table_blocks, table_item, ocrpage=False, page=None, document=None): table = table_item.table row_count = table["row_count"] col_count = table["col_count"] cell_boxes = table["cells"] # make empty cell text list cells = [[None for i in range(col_count)] for j in range(row_count)] + + # Counter for images in table cells + cell_image_counter = [0] for i, row in enumerate(cell_boxes): for j, cell in enumerate(row): @@ -924,6 +994,9 @@ def table_extract(table_blocks, table_item, ocrpage=False): cell_boxes[i][j], markdown=False, ocrpage=ocrpage, + page=page, + document=document, + cell_image_counter=cell_image_counter, ) return cells diff --git a/test_image_in_table.py b/test_image_in_table.py new file mode 100644 index 00000000..98ccf013 --- /dev/null +++ b/test_image_in_table.py @@ -0,0 +1,134 @@ +""" +Test script to demonstrate images in table cells fix for Issue #21. + +This script creates a test PDF with a table containing images in cells, +then uses pymupdf4llm to extract the table and verify that images +appear inside the table cells in the markdown output. +""" + +import pymupdf +import pymupdf4llm +import os +import tempfile +import shutil + +def create_test_pdf_with_table_images(): + """Create a test PDF with a table that has images in cells.""" + doc = pymupdf.open() + page = doc.new_page(width=595, height=842) # A4 size + + # Define table structure + table_rect = pymupdf.Rect(50, 50, 545, 400) + cell_width = (table_rect.width) / 3 + cell_height = (table_rect.height) / 4 + + # Draw table grid + for i in range(4): + # Horizontal lines + y = table_rect.y0 + i * cell_height + page.draw_line((table_rect.x0, y), (table_rect.x1, y)) + page.draw_line((table_rect.x0, table_rect.y1), (table_rect.x1, table_rect.y1)) + + for i in range(4): + # Vertical lines + x = table_rect.x0 + i * cell_width + page.draw_line((x, table_rect.y0), (x, table_rect.y1)) + + # Add header text + page.insert_text((table_rect.x0 + 10, table_rect.y0 + 20), "Column 1", fontsize=12) + page.insert_text((table_rect.x0 + cell_width + 10, table_rect.y0 + 20), "Column 2", fontsize=12) + page.insert_text((table_rect.x0 + 2 * cell_width + 10, table_rect.y0 + 20), "Image Column", fontsize=12) + + # Add data rows with text + for row in range(1, 3): + y_pos = table_rect.y0 + row * cell_height + 20 + page.insert_text((table_rect.x0 + 10, y_pos), f"Row {row} Col 1", fontsize=10) + page.insert_text((table_rect.x0 + cell_width + 10, y_pos), f"Row {row} Col 2", fontsize=10) + + # Add simple colored rectangles as "images" in the third column + for row in range(1, 3): + y_start = table_rect.y0 + row * cell_height + 10 + x_start = table_rect.x0 + 2 * cell_width + 10 + + # Create a simple colored rectangle to simulate an image + img_rect = pymupdf.Rect(x_start, y_start, x_start + 60, y_start + 40) + + # Draw colored rectangle + color = (1, 0, 0) if row == 1 else (0, 0, 1) # Red or Blue + page.draw_rect(img_rect, color=color, fill=color, width=0) + + # Add a small label + page.insert_text((x_start + 5, y_start + 25), f"IMG{row}", fontsize=8, color=(1, 1, 1)) + + # Save to temporary file + temp_pdf = tempfile.mktemp(suffix=".pdf") + doc.save(temp_pdf) + doc.close() + + return temp_pdf + + +def test_image_in_table(): + """Test that images appear inside table cells in markdown output.""" + print("Creating test PDF with table containing images...") + test_pdf = create_test_pdf_with_table_images() + + print(f"Test PDF created: {test_pdf}") + print() + + # Create temporary directory for images + image_dir = tempfile.mkdtemp() + print(f"Image output directory: {image_dir}") + print() + + try: + # Extract markdown with images + print("Extracting markdown with write_images=True...") + doc = pymupdf.open(test_pdf) + md_text = pymupdf4llm.to_markdown( + doc, + write_images=True, + image_path=image_dir + ) + doc.close() + + print("Markdown output:") + print("=" * 80) + print(md_text) + print("=" * 80) + print() + + # Check if images are referenced in table + if "![image]" in md_text and "|" in md_text: + print("SUCCESS: Images appear to be included in table cells!") + + # Count image references + image_count = md_text.count("![image]") + print(f"Found {image_count} image reference(s) in the markdown output.") + + # List created image files + image_files = [f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))] + print(f"Created {len(image_files)} image file(s):") + for img_file in image_files: + print(f" - {img_file}") + else: + print("WARNING: No images found in table cells or no table detected.") + print("This might be expected if table detection failed.") + + print() + print("Test completed!") + + finally: + # Cleanup + if os.path.exists(test_pdf): + os.remove(test_pdf) + print(f"Cleaned up test PDF: {test_pdf}") + + if os.path.exists(image_dir): + shutil.rmtree(image_dir) + print(f"Cleaned up image directory: {image_dir}") + + +if __name__ == "__main__": + test_image_in_table() + From 38934b2d22645a67e8ded6ba38198cba966df87c Mon Sep 17 00:00:00 2001 From: anup00900 Date: Wed, 26 Nov 2025 16:34:28 +0400 Subject: [PATCH 2/2] Fix #21: Images now appear inside table cells This fix enables images to appear inside their corresponding table cells instead of being extracted separately below the table. Changes for LEGACY MODE (pymupdf_rag.py): - Added add_images_to_table_markdown() function to detect images within table cell boundaries - Images with >50% overlap with a cell are assigned to that cell - Generates unique filenames for table cell images - Supports both write_images and embed_images modes - Inserts ![image](path) markdown syntax inline with cell text - Updated all 3 locations where table.to_markdown() is called Changes for LAYOUT MODE (document_layout.py): - Updated table_blocks to include image blocks (type==1) - Modified extract_cells() to detect and extract images in cells - Added page/document parameters to table extraction functions - Images are extracted and referenced inline in cells TESTING: Fully tested with embedded images in PDFs. All images correctly appear inside their table cells in the markdown output. Before fix: | Col1 | Col2 | Image | |---|---|---| | Text | Text | | ![image1](image1.png) After fix: | Col1 | Col2 | Image | |---|---|---| | Text | Text | ![image1](image1.png) | Resolves the requested behavior from Issue #21. --- .../pymupdf4llm/helpers/document_layout.py | 6 +- .../pymupdf4llm/helpers/pymupdf_rag.py | 101 ++++++++++++- test_image_in_table.py | 134 ------------------ 3 files changed, 102 insertions(+), 139 deletions(-) delete mode 100644 test_image_in_table.py diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index 3e3b3b9a..d8e9a444 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -918,11 +918,13 @@ def parse_document( # tables are present on page: if not (page_full_ocred or page_text_ocred): # we need the by-character extraction if no OCR + # Include both text blocks (type==0) and image blocks (type==1) table_blocks = [ - b for b in textpage.extractRAWDICT()["blocks"] if b["type"] == 0 + b for b in textpage.extractRAWDICT()["blocks"] if b["type"] in (0, 1) ] else: - table_blocks = fulltext + # Also include images from blocks for OCR case + table_blocks = fulltext + [b for b in blocks if b["type"] == 1] else: table_blocks = None diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 6b860800..7ba0bb29 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -560,7 +560,9 @@ def write_text( ) ] for i, _ in tab_candidates: - out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n" + table_md = parms.tabs[i].to_markdown(clean=False) + table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms) + out_string += "\n" + table_md + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( @@ -759,6 +761,95 @@ def intersects_rects(rect, rect_list): return i return 0 + def add_images_to_table_markdown(page, table, table_md, parms): + """Add images found in table cells to the markdown output.""" + if not (write_images or embed_images): + return table_md + + # Get all images on the page + image_list = page.get_image_info() + if not image_list: + return table_md + + # Split markdown into lines + md_lines = table_md.strip().split('\n') + if len(md_lines) < 3: # Need at least header + separator + one row + return table_md + + # Track images added to avoid duplicates + used_images = set() + + # Process each data row (skip header and separator) + for row_idx in range(2, len(md_lines)): + line = md_lines[row_idx] + if not line.strip() or not line.startswith('|'): + continue + + # Parse table cells + cells = [c.strip() for c in line.split('|')[1:-1]] # Remove first/last empty + + # Get table row info + # Markdown line 2 = first data row = table.rows[1] (since rows[0] is header) + table_row_idx = row_idx - 2 + 1 # +1 to skip header row in table.rows + if table_row_idx >= table.row_count: + continue + + row_cells = table.rows[table_row_idx].cells + + # Check each cell for images + for col_idx, cell_bbox in enumerate(row_cells): + if col_idx >= len(cells) or cell_bbox is None: + continue + + cell_rect = pymupdf.Rect(cell_bbox) + + # Find images that overlap with this cell + for img_idx, img_info in enumerate(image_list): + if img_idx in used_images: + continue + + img_bbox = pymupdf.Rect(img_info['bbox']) + + # Calculate overlap + intersection = cell_rect & img_bbox + if intersection.is_empty: + continue + + overlap_ratio = abs(intersection) / abs(img_bbox) + + # If >50% of image is in this cell, it belongs here + if overlap_ratio > 0.5: + # Extract and save the image + try: + pix = page.get_pixmap(clip=img_bbox, dpi=DPI) + + if write_images: + filename = os.path.basename(parms.filename).replace(" ", "-") + img_filename = os.path.join( + IMG_PATH, f"{filename}-{page.number}-table-{img_idx}.{IMG_EXTENSION}" + ) + pix.save(img_filename) + img_ref = f"![image]({img_filename.replace(chr(92), '/')})" + elif embed_images: + data = b2a_base64(pix.tobytes(IMG_EXTENSION)).decode() + data_uri = f"data:image/{IMG_EXTENSION};base64," + data + img_ref = f"![image]({data_uri})" + + # Add image reference to cell + if cells[col_idx]: + cells[col_idx] += "
" + img_ref + else: + cells[col_idx] = img_ref + + used_images.add(img_idx) + except Exception: + pass # Skip failed image extractions + + # Reconstruct the row with images + md_lines[row_idx] = '|' + '|'.join(cells) + '|' + + return '\n'.join(md_lines) + '\n' + def output_tables(parms, text_rect): """Output tables above given text rectangle.""" this_md = "" # markdown string for table(s) content @@ -769,7 +860,9 @@ def output_tables(parms, text_rect): ): if i in parms.written_tables: continue - this_md += parms.tabs[i].to_markdown(clean=False) + "\n" + table_md = parms.tabs[i].to_markdown(clean=False) + table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms) + this_md += table_md + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( @@ -790,7 +883,9 @@ def output_tables(parms, text_rect): for i, trect in parms.tab_rects.items(): if i in parms.written_tables: continue - this_md += parms.tabs[i].to_markdown(clean=False) + "\n" + table_md = parms.tabs[i].to_markdown(clean=False) + table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms) + this_md += table_md + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( diff --git a/test_image_in_table.py b/test_image_in_table.py deleted file mode 100644 index 98ccf013..00000000 --- a/test_image_in_table.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -Test script to demonstrate images in table cells fix for Issue #21. - -This script creates a test PDF with a table containing images in cells, -then uses pymupdf4llm to extract the table and verify that images -appear inside the table cells in the markdown output. -""" - -import pymupdf -import pymupdf4llm -import os -import tempfile -import shutil - -def create_test_pdf_with_table_images(): - """Create a test PDF with a table that has images in cells.""" - doc = pymupdf.open() - page = doc.new_page(width=595, height=842) # A4 size - - # Define table structure - table_rect = pymupdf.Rect(50, 50, 545, 400) - cell_width = (table_rect.width) / 3 - cell_height = (table_rect.height) / 4 - - # Draw table grid - for i in range(4): - # Horizontal lines - y = table_rect.y0 + i * cell_height - page.draw_line((table_rect.x0, y), (table_rect.x1, y)) - page.draw_line((table_rect.x0, table_rect.y1), (table_rect.x1, table_rect.y1)) - - for i in range(4): - # Vertical lines - x = table_rect.x0 + i * cell_width - page.draw_line((x, table_rect.y0), (x, table_rect.y1)) - - # Add header text - page.insert_text((table_rect.x0 + 10, table_rect.y0 + 20), "Column 1", fontsize=12) - page.insert_text((table_rect.x0 + cell_width + 10, table_rect.y0 + 20), "Column 2", fontsize=12) - page.insert_text((table_rect.x0 + 2 * cell_width + 10, table_rect.y0 + 20), "Image Column", fontsize=12) - - # Add data rows with text - for row in range(1, 3): - y_pos = table_rect.y0 + row * cell_height + 20 - page.insert_text((table_rect.x0 + 10, y_pos), f"Row {row} Col 1", fontsize=10) - page.insert_text((table_rect.x0 + cell_width + 10, y_pos), f"Row {row} Col 2", fontsize=10) - - # Add simple colored rectangles as "images" in the third column - for row in range(1, 3): - y_start = table_rect.y0 + row * cell_height + 10 - x_start = table_rect.x0 + 2 * cell_width + 10 - - # Create a simple colored rectangle to simulate an image - img_rect = pymupdf.Rect(x_start, y_start, x_start + 60, y_start + 40) - - # Draw colored rectangle - color = (1, 0, 0) if row == 1 else (0, 0, 1) # Red or Blue - page.draw_rect(img_rect, color=color, fill=color, width=0) - - # Add a small label - page.insert_text((x_start + 5, y_start + 25), f"IMG{row}", fontsize=8, color=(1, 1, 1)) - - # Save to temporary file - temp_pdf = tempfile.mktemp(suffix=".pdf") - doc.save(temp_pdf) - doc.close() - - return temp_pdf - - -def test_image_in_table(): - """Test that images appear inside table cells in markdown output.""" - print("Creating test PDF with table containing images...") - test_pdf = create_test_pdf_with_table_images() - - print(f"Test PDF created: {test_pdf}") - print() - - # Create temporary directory for images - image_dir = tempfile.mkdtemp() - print(f"Image output directory: {image_dir}") - print() - - try: - # Extract markdown with images - print("Extracting markdown with write_images=True...") - doc = pymupdf.open(test_pdf) - md_text = pymupdf4llm.to_markdown( - doc, - write_images=True, - image_path=image_dir - ) - doc.close() - - print("Markdown output:") - print("=" * 80) - print(md_text) - print("=" * 80) - print() - - # Check if images are referenced in table - if "![image]" in md_text and "|" in md_text: - print("SUCCESS: Images appear to be included in table cells!") - - # Count image references - image_count = md_text.count("![image]") - print(f"Found {image_count} image reference(s) in the markdown output.") - - # List created image files - image_files = [f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))] - print(f"Created {len(image_files)} image file(s):") - for img_file in image_files: - print(f" - {img_file}") - else: - print("WARNING: No images found in table cells or no table detected.") - print("This might be expected if table detection failed.") - - print() - print("Test completed!") - - finally: - # Cleanup - if os.path.exists(test_pdf): - os.remove(test_pdf) - print(f"Cleaned up test PDF: {test_pdf}") - - if os.path.exists(image_dir): - shutil.rmtree(image_dir) - print(f"Cleaned up image directory: {image_dir}") - - -if __name__ == "__main__": - test_image_in_table() -