diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index 9b872d12..d8e9a444 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -918,11 +918,13 @@ def parse_document( # tables are present on page: if not (page_full_ocred or page_text_ocred): # we need the by-character extraction if no OCR + # Include both text blocks (type==0) and image blocks (type==1) table_blocks = [ - b for b in textpage.extractRAWDICT()["blocks"] if b["type"] == 0 + b for b in textpage.extractRAWDICT()["blocks"] if b["type"] in (0, 1) ] else: - table_blocks = fulltext + # Also include images from blocks for OCR case + table_blocks = fulltext + [b for b in blocks if b["type"] == 1] else: table_blocks = None @@ -1001,6 +1003,8 @@ def parse_document( table_blocks, layoutbox, ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred), + page=page, + document=document, ) layoutbox.table["markdown"] = utils.table_to_markdown( @@ -1008,6 +1012,8 @@ def parse_document( layoutbox, ocrpage=(pagelayout.full_ocred or pagelayout.text_ocred), markdown=True, + page=page, + document=document, ) except Exception as e: diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 6b860800..7ba0bb29 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -560,7 +560,9 @@ def write_text( ) ] for i, _ in tab_candidates: - out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n" + table_md = parms.tabs[i].to_markdown(clean=False) + table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms) + out_string += "\n" + table_md + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( @@ -759,6 +761,95 @@ def intersects_rects(rect, rect_list): return i return 0 + def add_images_to_table_markdown(page, table, table_md, parms): + """Add images found in table cells to the markdown output.""" + if not (write_images or embed_images): + return table_md + + # Get all images on the page + image_list = page.get_image_info() + if not image_list: + return table_md + + # Split markdown into lines + md_lines = table_md.strip().split('\n') + if len(md_lines) < 3: # Need at least header + separator + one row + return table_md + + # Track images added to avoid duplicates + used_images = set() + + # Process each data row (skip header and separator) + for row_idx in range(2, len(md_lines)): + line = md_lines[row_idx] + if not line.strip() or not line.startswith('|'): + continue + + # Parse table cells + cells = [c.strip() for c in line.split('|')[1:-1]] # Remove first/last empty + + # Get table row info + # Markdown line 2 = first data row = table.rows[1] (since rows[0] is header) + table_row_idx = row_idx - 2 + 1 # +1 to skip header row in table.rows + if table_row_idx >= table.row_count: + continue + + row_cells = table.rows[table_row_idx].cells + + # Check each cell for images + for col_idx, cell_bbox in enumerate(row_cells): + if col_idx >= len(cells) or cell_bbox is None: + continue + + cell_rect = pymupdf.Rect(cell_bbox) + + # Find images that overlap with this cell + for img_idx, img_info in enumerate(image_list): + if img_idx in used_images: + continue + + img_bbox = pymupdf.Rect(img_info['bbox']) + + # Calculate overlap + intersection = cell_rect & img_bbox + if intersection.is_empty: + continue + + overlap_ratio = abs(intersection) / abs(img_bbox) + + # If >50% of image is in this cell, it belongs here + if overlap_ratio > 0.5: + # Extract and save the image + try: + pix = page.get_pixmap(clip=img_bbox, dpi=DPI) + + if write_images: + filename = os.path.basename(parms.filename).replace(" ", "-") + img_filename = os.path.join( + IMG_PATH, f"{filename}-{page.number}-table-{img_idx}.{IMG_EXTENSION}" + ) + pix.save(img_filename) + img_ref = f"![image]({img_filename.replace(chr(92), '/')})" + elif embed_images: + data = b2a_base64(pix.tobytes(IMG_EXTENSION)).decode() + data_uri = f"data:image/{IMG_EXTENSION};base64," + data + img_ref = f"![image]({data_uri})" + + # Add image reference to cell + if cells[col_idx]: + cells[col_idx] += "
" + img_ref + else: + cells[col_idx] = img_ref + + used_images.add(img_idx) + except Exception: + pass # Skip failed image extractions + + # Reconstruct the row with images + md_lines[row_idx] = '|' + '|'.join(cells) + '|' + + return '\n'.join(md_lines) + '\n' + def output_tables(parms, text_rect): """Output tables above given text rectangle.""" this_md = "" # markdown string for table(s) content @@ -769,7 +860,9 @@ def output_tables(parms, text_rect): ): if i in parms.written_tables: continue - this_md += parms.tabs[i].to_markdown(clean=False) + "\n" + table_md = parms.tabs[i].to_markdown(clean=False) + table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms) + this_md += table_md + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( @@ -790,7 +883,9 @@ def output_tables(parms, text_rect): for i, trect in parms.tab_rects.items(): if i in parms.written_tables: continue - this_md += parms.tabs[i].to_markdown(clean=False) + "\n" + table_md = parms.tabs[i].to_markdown(clean=False) + table_md = add_images_to_table_markdown(parms.page, parms.tabs[i], table_md, parms) + this_md += table_md + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py index 03f9cdf8..1d247523 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/utils.py +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -750,24 +750,27 @@ def complete_table_structure(page): return all_lines, all_boxes -def extract_cells(table_blocks, cell, markdown=False, ocrpage=False): - """Extract text from a rect-like 'cell' as plain or MD styled text. +def extract_cells(table_blocks, cell, markdown=False, ocrpage=False, page=None, document=None, cell_image_counter=None): + """Extract text and images from a rect-like 'cell' as plain or MD styled text. This function should ultimately be used to extract text from a table cell. Markdown output will only work correctly if extraction flag bit TEXT_COLLECT_STYLES is set. Args: - table_blocks: A list of PyMuPDF TextPage text blocks (type = 0). Must + table_blocks: A list of PyMuPDF TextPage text blocks (type = 0 or 1). Must have been created with TEXT_COLLECT_STYLE for correct markdown. Format is either "dict" or "rawdict" depending on ocrpage. cell: A tuple (x0, y0, x1, y1) defining the cell's bbox. markdown: If True, return text formatted for Markdown. ocrpage: If True, text is written with GlyphLessFont. In this case, table_blocks is in format "dict". + page: Optional Page object for image extraction from cells. + document: Optional ParsedDocument object for image write/embed settings. + cell_image_counter: Optional list with one element [counter] to track image numbers. Returns: - A string with the text extracted from the cell. + A string with the text and images extracted from the cell. """ def outside_cell(bbox, cell): @@ -779,10 +782,31 @@ def outside_cell(bbox, cell): or bbox[3] <= cell[1] ) + def bbox_overlap(bbox, cell): + """Calculate overlap ratio between bbox and cell.""" + cell_rect = pymupdf.Rect(cell) + bbox_rect = pymupdf.Rect(bbox) + intersection = cell_rect & bbox_rect + if intersection.is_empty: + return 0.0 + return abs(intersection) / abs(bbox_rect) + text = "" + images_in_cell = [] + for block in table_blocks: if outside_cell(block["bbox"], cell): continue + + # Check if this is an image block (type == 1) + if block.get("type") == 1: + # Image block found within cell + overlap = bbox_overlap(block["bbox"], cell) + if overlap > 0.5: # More than 50% of image is in this cell + images_in_cell.append(block) + continue + + # Process text blocks (type == 0) for line in block["lines"]: if outside_cell(line["bbox"], cell): continue @@ -848,10 +872,49 @@ def outside_cell(bbox, cell): .replace("$\n", "$ ") .replace(" $ \n", "$ ") ) + + # Handle images found in this cell + if markdown and images_in_cell and page is not None and document is not None: + for img_block in images_in_cell: + img_bbox = pymupdf.Rect(img_block["bbox"]) + + # Extract and save the image if write_images or embed_images is enabled + if document.write_images or document.embed_images: + try: + pix = page.get_pixmap(clip=img_bbox, dpi=document.image_dpi) + + if text: + text += "
" + + if document.write_images: + # Generate unique filename for this cell image + if cell_image_counter is None: + cell_image_counter = [0] + cell_image_counter[0] += 1 + img_filename = f"{document.filename}-{page.number+1:04d}-table-cell-{cell_image_counter[0]:03d}.{document.image_format}" + img_filename = img_filename.replace(" ", "_") + img_path = os.path.join(document.image_path, img_filename) + pix.save(img_path) + # Add markdown image reference + text += f"![image]({img_path.replace(chr(92), '/')})" + + elif document.embed_images: + # Embed as base64 + import base64 + img_data = base64.b64encode(pix.tobytes(document.image_format)).decode() + data_uri = f"data:image/{document.image_format};base64,{img_data}" + text += f"![image]({data_uri})" + + except Exception as e: + # If image extraction fails, add a placeholder + if text: + text += "
" + text += f"[Image extraction failed: {str(e)}]" + return text.strip() -def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False): +def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False, page=None, document=None): output = "" table = table_item.table row_count = table["row_count"] @@ -859,6 +922,9 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False): cell_boxes = table["cells"] # make empty cell text list cells = [[None for i in range(col_count)] for j in range(row_count)] + + # Counter for images in table cells + cell_image_counter = [0] # fill None cells with extracted text # for rows, copy content from left to right @@ -877,7 +943,8 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False): for j, cell in enumerate(row): if cell is not None: cells[i][j] = extract_cells( - table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage + table_blocks, cell_boxes[i][j], markdown=markdown, ocrpage=ocrpage, + page=page, document=document, cell_image_counter=cell_image_counter ) for i, name in enumerate(cells[0]): if name is None: @@ -908,13 +975,16 @@ def table_to_markdown(table_blocks, table_item, markdown=True, ocrpage=False): return output + "\n" -def table_extract(table_blocks, table_item, ocrpage=False): +def table_extract(table_blocks, table_item, ocrpage=False, page=None, document=None): table = table_item.table row_count = table["row_count"] col_count = table["col_count"] cell_boxes = table["cells"] # make empty cell text list cells = [[None for i in range(col_count)] for j in range(row_count)] + + # Counter for images in table cells + cell_image_counter = [0] for i, row in enumerate(cell_boxes): for j, cell in enumerate(row): @@ -924,6 +994,9 @@ def table_extract(table_blocks, table_item, ocrpage=False): cell_boxes[i][j], markdown=False, ocrpage=ocrpage, + page=page, + document=document, + cell_image_counter=cell_image_counter, ) return cells