Skip to content

Commit e0dfa9f

Browse files
authored
Merge pull request #347 from pymupdf/0.2.7
Version 0.2.7
2 parents cc29b2d + bd20aee commit e0dfa9f

File tree

8 files changed

+112
-39
lines changed

8 files changed

+112
-39
lines changed

CHANGES.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
11
# Change Log
22

3+
## Changes in version 0.2.7
4+
5+
### Fixes:
6+
7+
* [323](https://github.com/pymupdf/pymupdf4llm/issues/323) - `page_chunks=True` parameter was ignored in PyMuPDF-Layout mode
8+
9+
### Other Changes:
10+
11+
* Methods `to_markdown()` and `to_text()` now both support page chunk output via the parameter `page_chunks=True`.
12+
13+
------
14+
315
## Changes in version 0.2.6
416

517
### Fixes:

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9-
version = "0.2.6" # must always equal the pymupdf4llm version
9+
version = "0.2.7" # must always equal the pymupdf4llm version
1010

1111
classifiers = [
1212
"Development Status :: 5 - Production/Stable",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def to_text(
147147
ocr_dpi=400,
148148
use_ocr=True,
149149
table_format="grid",
150+
page_chunks=False,
150151
# unsupported options for pymupdf layout:
151152
**kwargs,
152153
):
@@ -166,6 +167,7 @@ def to_text(
166167
ignore_code=ignore_code,
167168
show_progress=show_progress,
168169
table_format=table_format,
170+
page_chunks=page_chunks,
169171
)
170172

171173

pymupdf4llm/pymupdf4llm/helpers/document_layout.py

Lines changed: 87 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
import io
33
import json
44
import os
5+
import math
56
from dataclasses import dataclass
7+
from collections import defaultdict
68
from pathlib import Path
79
from typing import Dict, List, Optional, Union
810

@@ -30,7 +32,6 @@
3032

3133
INFO_MESSAGES = io.StringIO()
3234
GRAPHICS_TEXT = "\n![](%s)\n"
33-
CHECK_OCR_TEXT = {"ignore-text"}
3435
OCR_FONTNAME = "GlyphLessFont" # if encountered do not use "code" style
3536
FLAGS = (
3637
0
@@ -43,6 +44,40 @@
4344
BULLETS = tuple(utils.BULLETS)
4445

4546

47+
def make_page_chunk(doc, page, text) -> Dict:
    """Create a page chunk dictionary for output.

    Args:
        doc: the ParsedDocument object. Its ``toc``, ``metadata``,
            ``filename`` and ``page_count`` attributes are read.
        page: the PageLayout object. Its ``page_number`` and ``boxes``
            attributes are read.
        text: the page text string

    Returns:
        dict: page chunk dictionary with the keys "metadata", "toc_items",
        "page_boxes" and "text". The object is a ``defaultdict`` whose
        factory returns None, so looking up any other key yields None
        instead of raising KeyError.
    """
    chunk = defaultdict(lambda: None)

    # A document may have no table of contents or metadata; treat None
    # like an empty container instead of failing with a TypeError.
    toc = doc.toc or []
    metadata = doc.metadata or {}

    # TOC entries are matched on their last item.
    # NOTE(review): presumably the page number of the entry, using the same
    # numbering base as page.page_number - confirm against the code that
    # fills doc.toc.
    page_tocs = [t for t in toc if t[-1] == page.page_number]

    # Document-level metadata, extended by (and overridden with) the
    # per-page identification keys.
    chunk["metadata"] = metadata | {
        "file_path": doc.filename,
        "page_count": doc.page_count,
        "page_number": page.page_number,
    }

    chunk["toc_items"] = page_tocs

    # Box rectangles are rounded outwards to integers (floor the top-left,
    # ceil the bottom-right), so the integer box always covers the original.
    chunk["page_boxes"] = [
        (
            math.floor(b.x0),
            math.floor(b.y0),
            math.ceil(b.x1),
            math.ceil(b.y1),
            b.boxclass,
        )
        for b in page.boxes
    ]
    chunk["text"] = text
    return chunk
79+
80+
4681
def omit_if_pua_char(text):
4782
"""Check if character is in the Private Use Area (PUA) of Unicode."""
4883
if len(text) != 1: # only single characters are checked
@@ -594,7 +629,7 @@ class ParsedDocument:
594629
image_dpi: int = 150 # image resolution
595630
image_format: str = "png" # 'png' or 'jpg'
596631
image_path: str = "" # path to save images
597-
use_ocr: bool = True # whether to invoke OCR if beneficial
632+
use_ocr: bool = True # if beneficial invoke OCR
598633

599634
def to_markdown(
600635
self,
@@ -605,18 +640,24 @@ def to_markdown(
605640
ignore_code: bool = False,
606641
show_progress: bool = False,
607642
page_separators: bool = False,
643+
page_chunks: bool = False,
608644
**kwargs,
609-
) -> str:
645+
) -> Union[str, List[Dict]]:
610646
"""
611647
Serialize ParsedDocument to markdown text.
612648
"""
613-
output = ""
649+
if page_chunks:
650+
document_output = []
651+
else:
652+
document_output = ""
653+
614654
if show_progress and len(self.pages) > 5:
615655
print(f"Generating markdown text...")
616656
this_iterator = ProgressBar(self.pages)
617657
else:
618658
this_iterator = self.pages
619659
for page in this_iterator:
660+
md_string = ""
620661
# Make a mapping: box number -> list item hierarchy level
621662
list_item_levels = create_list_item_levels(page.boxes)
622663

@@ -633,25 +674,25 @@ def to_markdown(
633674
# pictures and formulas: either write image file or embed
634675
if btype in ("picture", "formula", "table-fallback"):
635676
if isinstance(box.image, str):
636-
output += GRAPHICS_TEXT % box.image + "\n\n"
677+
md_string += GRAPHICS_TEXT % box.image + "\n\n"
637678
elif isinstance(box.image, bytes):
638679
# make a base64 encoded string of the image
639680
data = base64.b64encode(box.image).decode()
640681
data = f"data:image/{self.image_format};base64," + data
641-
output += GRAPHICS_TEXT % data + "\n\n"
682+
md_string += GRAPHICS_TEXT % data + "\n\n"
642683
else:
643-
output += f"**==> picture [{clip.width} x {clip.height}] intentionally omitted <==**\n\n"
684+
md_string += f"**==> picture [{clip.width} x {clip.height}] intentionally omitted <==**\n\n"
644685

645686
# output text in image if requested
646687
if box.textlines:
647688
if btype == "picture":
648-
output += picture_text_to_md(
689+
md_string += picture_text_to_md(
649690
box.textlines,
650691
ignore_code=ignore_code or page.full_ocred,
651692
clip=clip,
652693
)
653694
elif btype == "table-fallback":
654-
output += fallback_text_to_md(
695+
md_string += fallback_text_to_md(
655696
box.textlines,
656697
ignore_code=ignore_code or page.full_ocred,
657698
clip=clip,
@@ -662,30 +703,35 @@ def to_markdown(
662703
if page.full_ocred:
663704
# remove code style if page was OCR'd
664705
table_text = table_text.replace("`", "")
665-
output += table_text + "\n\n"
706+
md_string += table_text + "\n\n"
666707
continue
667708
if not hasattr(box, "textlines"):
668709
print(f"Warning: box {btype} has no textlines")
669710
continue
670711
if btype == "title":
671-
output += title_to_md(box.textlines)
712+
md_string += title_to_md(box.textlines)
672713
elif btype == "section-header":
673-
output += section_hdr_to_md(box.textlines)
714+
md_string += section_hdr_to_md(box.textlines)
674715
elif btype == "list-item":
675-
output += list_item_to_md(box.textlines, list_item_levels[i])
716+
md_string += list_item_to_md(box.textlines, list_item_levels[i])
676717
elif btype == "footnote":
677-
output += footnote_to_md(box.textlines)
718+
md_string += footnote_to_md(box.textlines)
678719
elif not header and btype == "page-header":
679720
continue
680721
elif not footer and btype == "page-footer":
681722
continue
682723
else: # treat as normal MD text
683-
output += text_to_md(
724+
md_string += text_to_md(
684725
box.textlines, ignore_code=ignore_code or page.full_ocred
685726
)
686727
if page_separators:
687-
output += f"--- end of {page.page_number=} ---\n\n"
688-
return output
728+
md_string += f"--- end of {page.page_number=} ---\n\n"
729+
if not page_chunks:
730+
document_output += md_string
731+
else:
732+
chunk = make_page_chunk(self, page, md_string)
733+
document_output.append(chunk)
734+
return document_output
689735

690736
def to_json(self, show_progress=False) -> str:
691737
# Serialize to JSON
@@ -717,22 +763,29 @@ def to_text(
717763
footer: bool = True,
718764
ignore_code: bool = False,
719765
show_progress: bool = False,
766+
page_chunks: bool = False,
720767
table_format: str = "grid",
721-
) -> str:
768+
**kwargs,
769+
) -> Union[str, List[Dict]]:
722770
"""
723771
Serialize ParsedDocument to plain text. Optionally omit page headers or footers.
724772
"""
725773
if table_format not in tabulate.tabulate_formats:
726774
print(f"Warning: invalid table format '{table_format}', using 'grid'.")
727775
table_format = "grid"
728-
# Flatten all text boxes into plain text
729-
output = ""
776+
777+
if page_chunks:
778+
document_output = []
779+
else:
780+
document_output = ""
781+
730782
if show_progress and len(self.pages) > 5:
731783
print(f"Generating plain text ..")
732784
this_iterator = ProgressBar(self.pages)
733785
else:
734786
this_iterator = self.pages
735787
for page in this_iterator:
788+
text_string = ""
736789
list_item_levels = create_list_item_levels(page.boxes)
737790
for i, box in enumerate(page.boxes):
738791
clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1)
@@ -742,38 +795,43 @@ def to_text(
742795
if btype == "page-footer" and footer is False:
743796
continue
744797
if btype in ("picture", "formula", "table-fallback"):
745-
output += f"==> picture [{clip.width} x {clip.height}] <==\n\n"
798+
text_string += f"==> picture [{clip.width} x {clip.height}] <==\n\n"
746799
if box.textlines:
747800
if btype == "picture":
748-
output += picture_text_to_text(
801+
text_string += picture_text_to_text(
749802
box.textlines,
750803
ignore_code=ignore_code or page.full_ocred,
751804
clip=clip,
752805
)
753806
elif btype == "table-fallback":
754-
output += fallback_text_to_text(
807+
text_string += fallback_text_to_text(
755808
box.textlines,
756809
ignore_code=ignore_code or page.full_ocred,
757810
clip=clip,
758811
)
759812
continue
760813
if btype == "table":
761-
output += (
814+
text_string += (
762815
tabulate.tabulate(box.table["extract"], tablefmt=table_format)
763816
+ "\n\n"
764817
)
765818
continue
766819
if btype == "list-item":
767-
output += list_item_to_text(box.textlines, list_item_levels[i])
820+
text_string += list_item_to_text(box.textlines, list_item_levels[i])
768821
continue
769822
if btype == "footnote":
770-
output += footnote_to_text(box.textlines)
823+
text_string += footnote_to_text(box.textlines)
771824
continue
772-
output += text_to_text(
825+
text_string += text_to_text(
773826
box.textlines, ignore_code=ignore_code or page.full_ocred
774827
)
775-
continue
776-
return output
828+
829+
if not page_chunks:
830+
document_output += text_string
831+
else:
832+
chunk = make_page_chunk(self, page, text_string)
833+
document_output.append(chunk)
834+
return document_output
777835

778836

779837
def parse_document(

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,11 @@
1515
import sys
1616

1717
import pymupdf
18-
from pymupdf4llm.helpers.utils import WHITE_CHARS
18+
from pymupdf4llm.helpers.utils import is_white
1919

2020
TYPE3_FONT_NAME = "Unnamed-T3"
2121

2222

23-
def is_white(text):
24-
return WHITE_CHARS.issuperset(text)
25-
26-
2723
def get_raw_lines(
2824
textpage=None,
2925
blocks=None,

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,14 @@
4343

4444
import pymupdf
4545
from pymupdf import mupdf
46-
from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
46+
from pymupdf4llm.helpers.get_text_lines import get_raw_lines
4747
from pymupdf4llm.helpers.multi_column import column_boxes
48-
from pymupdf4llm.helpers.utils import BULLETS, REPLACEMENT_CHARACTER, startswith_bullet
48+
from pymupdf4llm.helpers.utils import (
49+
BULLETS,
50+
REPLACEMENT_CHARACTER,
51+
startswith_bullet,
52+
is_white,
53+
)
4954

5055
try:
5156
from tqdm import tqdm as ProgressBar
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Generated file - do not edit.
22
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
3-
VERSION = '0.2.6'
3+
VERSION = '0.2.7'

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
"Topic :: Utilities",
1212
]
1313

14-
version = "0.2.6"
14+
version = "0.2.7"
1515
pymupdf_version = "1.26.6"
1616
pymupdf_version_tuple = tuple(int(x) for x in pymupdf_version.split("."))
1717
requires = [f"pymupdf>={pymupdf_version}", "tabulate"]

0 commit comments

Comments
 (0)