4343except ImportError :
4444 import fitz
4545
46- from get_text_lines import get_raw_lines , is_white
47- from multi_column import column_boxes
46+ from pymupdf4llm .helpers .get_text_lines import (
47+ get_raw_lines ,
48+ is_white ,
49+ )
50+ from pymupdf4llm .helpers .multi_column import column_boxes
4851
4952if fitz .pymupdf_version_tuple < (1 , 24 , 2 ):
5053 raise NotImplementedError ("PyMuPDF version 1.24.2 or later is needed." )
@@ -164,6 +167,7 @@ def save_image(page, rect, i):
164167 return os .path .basename (image_path )
165168
166169 def write_text (
170+ page ,
167171 textpage : fitz .TextPage ,
168172 clip : fitz .Rect ,
169173 tabs = None ,
@@ -197,7 +201,6 @@ def write_text(
197201 prev_hdr_string = None
198202
199203 for lrect , spans in nlines :
200-
201204 # there may be tables or images inside the text block: skip them
202205 if intersects_rects (lrect , tab_rects0 ) or intersects_rects (
203206 lrect , img_rects0
@@ -345,8 +348,10 @@ def output_tables(tabs, text_rect, tab_rects):
345348 del tab_rects [i ]
346349 return this_md
347350
348- def output_images (text_rect , img_rects ):
351+ def output_images (page , text_rect , img_rects ):
349352 """Output and remove images and graphics above text rectangle."""
353+ if img_rects is None :
354+ return ""
350355 this_md = "" # markdown string
351356 if text_rect is not None : # select tables above the text block
352357 for i , img_rect in sorted (
@@ -419,10 +424,11 @@ def get_page_output(doc, pno, textflags):
419424 for text_rect in text_rects :
420425 # output tables above this block of text
421426 md_string += output_tables (tabs , text_rect , tab_rects )
422- md_string += output_images (text_rect , vg_clusters )
427+ md_string += output_images (page , text_rect , vg_clusters )
423428
424429 # output text inside this rectangle
425430 md_string += write_text (
431+ page ,
426432 textpage ,
427433 text_rect ,
428434 tabs = tabs ,
@@ -434,7 +440,7 @@ def get_page_output(doc, pno, textflags):
434440
435441 # write remaining tables.
436442 md_string += output_tables (tabs , None , tab_rects )
437- md_string += output_images (None , tab_rects )
443+ md_string += output_images (None , tab_rects , None )
438444 md_string += "\n -----\n \n "
439445 return md_string
440446
@@ -456,9 +462,9 @@ def get_page_output(doc, pno, textflags):
456462
457463
458464if __name__ == "__main__" :
465+ import pathlib
459466 import sys
460467 import time
461- import pathlib
462468
463469 try :
464470 filename = sys .argv [1 ]
0 commit comments