Skip to content

Commit d573f44

Browse files
authored
Merge pull request #12 from YanSte/fix/import_fix_0.0.2
[Hotfix] Fixed import and method used for version 0.0.2 (Import issue)
2 parents 3a34b40 + 9c2b9b8 commit d573f44

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,11 @@
4343
except ImportError:
4444
import fitz
4545

46-
from get_text_lines import get_raw_lines, is_white
47-
from multi_column import column_boxes
46+
from pymupdf4llm.helpers.get_text_lines import (
47+
get_raw_lines,
48+
is_white,
49+
)
50+
from pymupdf4llm.helpers.multi_column import column_boxes
4851

4952
if fitz.pymupdf_version_tuple < (1, 24, 2):
5053
raise NotImplementedError("PyMuPDF version 1.24.2 or later is needed.")
@@ -164,6 +167,7 @@ def save_image(page, rect, i):
164167
return os.path.basename(image_path)
165168

166169
def write_text(
170+
page,
167171
textpage: fitz.TextPage,
168172
clip: fitz.Rect,
169173
tabs=None,
@@ -197,7 +201,6 @@ def write_text(
197201
prev_hdr_string = None
198202

199203
for lrect, spans in nlines:
200-
201204
# there may be tables or images inside the text block: skip them
202205
if intersects_rects(lrect, tab_rects0) or intersects_rects(
203206
lrect, img_rects0
@@ -345,8 +348,10 @@ def output_tables(tabs, text_rect, tab_rects):
345348
del tab_rects[i]
346349
return this_md
347350

348-
def output_images(text_rect, img_rects):
351+
def output_images(page, text_rect, img_rects):
349352
"""Output and remove images and graphics above text rectangle."""
353+
if img_rects is None:
354+
return ""
350355
this_md = "" # markdown string
351356
if text_rect is not None: # select images above the text block
352357
for i, img_rect in sorted(
@@ -419,10 +424,11 @@ def get_page_output(doc, pno, textflags):
419424
for text_rect in text_rects:
420425
# output tables above this block of text
421426
md_string += output_tables(tabs, text_rect, tab_rects)
422-
md_string += output_images(text_rect, vg_clusters)
427+
md_string += output_images(page, text_rect, vg_clusters)
423428

424429
# output text inside this rectangle
425430
md_string += write_text(
431+
page,
426432
textpage,
427433
text_rect,
428434
tabs=tabs,
@@ -434,7 +440,7 @@ def get_page_output(doc, pno, textflags):
434440

435441
# write remaining tables.
436442
md_string += output_tables(tabs, None, tab_rects)
437-
md_string += output_images(None, tab_rects)
443+
md_string += output_images(None, tab_rects, None)
438444
md_string += "\n-----\n\n"
439445
return md_string
440446

@@ -456,9 +462,9 @@ def get_page_output(doc, pno, textflags):
456462

457463

458464
if __name__ == "__main__":
465+
import pathlib
459466
import sys
460467
import time
461-
import pathlib
462468

463469
try:
464470
filename = sys.argv[1]

0 commit comments

Comments
 (0)