Skip to content

Commit 5ba23e3

Browse files
committed
Changes for v0.0.9
See changes.rst
1 parent 56eba1f commit 5ba23e3

File tree

5 files changed

+101
-41
lines changed

5 files changed

+101
-41
lines changed

docs/src/changes.rst

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,22 @@
44
Change Log
55
===========================================================================
66

7+
Changes in version 0.0.9
8+
--------------------------
9+
10+
Fixes:
11+
~~~~~~~
12+
13+
* `71 <https://github.com/pymupdf/RAG/issues/71>`_ "Unexpected results in pymupdf4llm but pymupdf works"
14+
* `68 <https://github.com/pymupdf/RAG/issues/68>`_ "Issue with text extraction near footer of page"
15+
16+
17+
Improvements:
18+
~~~~~~~~~~~~~~
19+
* Improved identification of scattered text span particles. This should address most issues with out-of-sequence situations.
20+
* We now correctly process rotated pages (see issue #68).
21+
22+
723
Changes in version 0.0.8
824
--------------------------
925

@@ -24,7 +40,7 @@ Fixes:
2440
Improvements:
2541
~~~~~~~~~~~~~~~~
2642

27-
* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant when they are simple background for text (quite often the case for code snippets).
43+
* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant: We now detect when "strokes" only exist in the neighborhood of the graphics boundary box border itself. This is quite often the case for code snippets.
2844

2945

3046
Changes in version 0.0.6

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.8"
3+
__version__ = "0.0.9"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 60 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,33 +28,63 @@ def is_white(text):
2828

2929

3030
def get_raw_lines(textpage, clip=None, tolerance=3):
31-
"""Extract the text spans from a TextPage in a natural reading sequence.
31+
"""Extract the text spans from a TextPage in natural reading sequence.
3232
3333
All spans roughly on the same line are joined to generate an improved line.
3434
This copes with MuPDF's algorithm that generates new lines also for spans
35-
whose horizontal distance is larger than some hreshold.
35+
whose horizontal distance is larger than some threshold.
3636
3737
Result is a sorted list of line objects that consist of the recomputed line
38-
rectangle and a sorted list of spans in that line.
38+
boundary box and the sorted list of spans in that line.
3939
40-
This result can then be easily converted e.g. to plain or markdown text.
40+
This result can then easily be converted e.g. to plain or markdown text.
4141
4242
Args:
4343
textpage: (mandatory) TextPage object
44-
clip: (Rect) specifies a sub-rectangle of the textpage rect (which also
45-
may be based on some part of the original page).
44+
clip: (Rect) specifies a sub-rectangle of the textpage rect (which in
45+
turn may be based on a sub-rectangle of the full page).
4646
tolerance: (float) put spans on the same line if their top or bottom
47-
coordinate differ by no mor than this value.
47+
coordinates differ by no more than this value.
4848
4949
Returns:
50-
A sorted list of items (rect, [spans]), each representing a line. The
51-
spans are sorted left to right, Span dictionaries have been changed
52-
in that "bbox" is a Rect object and "line" is an integer representing
53-
the line number of the span. This allows to detect where MuPDF has
54-
generated line breaks to indicate large inter-span distances.
50+
A sorted list of items (rect, [spans]), each representing one line. The
51+
spans are sorted left to right. Span dictionaries have been changed:
52+
- "bbox" has been converted to a Rect object
53+
- "line" (new) the line number in TextPage.extractDICT
54+
- "block" (new) the block number in TextPage.extractDICT
55+
This allows detecting where MuPDF has generated line breaks to indicate
56+
large inter-span distances.
5557
"""
5658
y_delta = tolerance # allowable vertical coordinate deviation
57-
if clip == None: # use TextPage if not provided
59+
60+
def sanitize_spans(line):
61+
"""Sort and join the spans in a re-synthesized line.
62+
63+
The PDF may contain "broken" text with words cut into pieces.
64+
This function joins spans representing the particles and sorts them
65+
left to right.
66+
67+
Arg:
68+
A list of spans - as derived from TextPage.extractDICT()
69+
Returns:
70+
A list of sorted, and potentially cleaned-up spans
71+
"""
72+
line.sort(key=lambda s: s["bbox"].x0) # sort left to right
73+
for i in range(len(line) - 1, 0, -1): # iterate back to front
74+
s0 = line[i - 1]
75+
s1 = line[i]
76+
# "delta" depends on the font size. Spans will be joined if
77+
# no more than 10% of the font size separates them.
78+
delta = s1["size"] * 0.1
79+
if s0["bbox"].x1 + delta < s1["bbox"].x0:
80+
continue # all good: no joining needed
81+
s0["bbox"] |= s1["bbox"] # join boundary boxes
82+
s0["text"] += s1["text"] # join the text
83+
del line[i] # delete the joined-in span
84+
line[i - 1] = s0 # update the span
85+
return line
86+
87+
if clip is None: # use TextPage if not provided
5888
clip = textpage.rect
5989
# extract text blocks - if bbox is not empty
6090
blocks = [
@@ -63,40 +93,38 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
6393
if b["type"] == 0 and not fitz.Rect(b["bbox"]).is_empty
6494
]
6595
spans = [] # all spans in TextPage here
66-
for bno, b in enumerate(blocks):
67-
for lno, line in enumerate(b["lines"]):
68-
lbbox = fitz.Rect(line["bbox"])
69-
for sno, s in enumerate(line["spans"]):
70-
sbbox = fitz.Rect(s["bbox"]) # turn to a Rect
96+
for bno, b in enumerate(blocks): # the numbered blocks
97+
for lno, line in enumerate(b["lines"]): # the numbered lines
98+
for sno, s in enumerate(line["spans"]): # the numbered spans
99+
sbbox = fitz.Rect(s["bbox"]) # span bbox as a Rect
71100
mpoint = (sbbox.tl + sbbox.br) / 2 # middle point
72101
if mpoint not in clip:
73102
continue
74103
if is_white(s["text"]): # ignore white text
75104
continue
76-
if s["flags"] & 1 == 1: # if a superscript, modify
105+
if s["flags"] & 1 == 1: # if a superscript, modify bbox
106+
# with that of the preceding or following span
77107
i = 1 if sno == 0 else sno - 1
78108
neighbor = line["spans"][i]
79109
sbbox.y1 = neighbor["bbox"][3]
80110
s["text"] = f"[{s['text']}]"
81111
s["bbox"] = sbbox # update with the Rect version
82-
# include line identifier to facilitate separator insertion
112+
# include line/block numbers to facilitate separator insertion
83113
s["line"] = lno
84114
s["block"] = bno
85115
spans.append(s)
86116

87-
if not spans: # we may have no text at all
117+
if not spans: # no text at all
88118
return []
89119

90-
spans.sort(
91-
key=lambda s: s["bbox"].y1
92-
) # sort spans by assending bottom coord
120+
spans.sort(key=lambda s: s["bbox"].y1) # sort spans by bottom coord
93121
nlines = [] # final result
94-
line = [spans[0]] # collects spans with fitting vertical coordinate
122+
line = [spans[0]] # collects spans with fitting vertical coordinates
95123
lrect = spans[0]["bbox"] # rectangle joined from span rectangles
96124

97-
for s in spans[1:]:
98-
sbbox = s["bbox"]
99-
sbbox0 = line[-1]["bbox"]
125+
for s in spans[1:]: # walk through the spans
126+
sbbox = s["bbox"] # this bbox
127+
sbbox0 = line[-1]["bbox"] # previous bbox
100128
# if any of top or bottom coordinates are close enough, join...
101129
if (
102130
abs(sbbox.y1 - sbbox0.y1) <= y_delta
@@ -107,7 +135,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
107135
continue
108136

109137
# end of current line, sort its spans from left to right
110-
line.sort(key=lambda s: s["bbox"].x0)
138+
line = sanitize_spans(line)
111139

112140
# append line rect and its spans to final output
113141
nlines.append([lrect, line])
@@ -116,7 +144,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
116144
lrect = sbbox # initialize its rectangle
117145

118146
# need to append last line in the same way
119-
line.sort(key=lambda s: s["bbox"].x0)
147+
line = sanitize_spans(line)
120148
nlines.append([lrect, line])
121149

122150
return nlines
@@ -143,6 +171,7 @@ def get_text_lines(
143171
Returns:
144172
String of plain text in reading sequence.
145173
"""
174+
textflags = fitz.TEXT_MEDIABOX_CLIP
146175
page.remove_rotation()
147176
prect = page.rect if not clip else fitz.Rect(clip) # area to consider
148177

@@ -151,7 +180,7 @@ def get_text_lines(
151180
# make a TextPage if required
152181
if textpage is None:
153182
if ocr is False:
154-
tp = page.get_textpage(clip=prect, flags=fitz.TEXTFLAGS_TEXT)
183+
tp = page.get_textpage(clip=prect, flags=textflags)
155184
else:
156185
tp = page.get_textpage_ocr(dpi=300, full=True)
157186
else:

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -187,16 +187,22 @@ def to_markdown(
187187
if len(margins) == 2:
188188
margins = (0, margins[0], 0, margins[1])
189189
if len(margins) != 4:
190-
raise ValueError("margins must be a float or a sequence of 2 or 4 floats")
190+
raise ValueError(
191+
"margins must be one, two or four floats"
192+
)
191193
elif not all([hasattr(m, "__float__") for m in margins]):
192194
raise ValueError("margin values must be floats")
193195

194196
# If "hdr_info" is not an object having method "get_header_id", scan the
195197
# document and use font sizes as header level indicators.
196198
if callable(hdr_info):
197199
get_header_id = hdr_info
198-
elif hasattr(hdr_info, "get_header_id") and callable(hdr_info.get_header_id):
200+
elif hasattr(hdr_info, "get_header_id") and callable(
201+
hdr_info.get_header_id
202+
):
199203
get_header_id = hdr_info.get_header_id
204+
elif hdr_info is False:
205+
get_header_id = lambda s, page=None: ""
200206
else:
201207
hdr_info = IdentifyHeaders(doc)
202208
get_header_id = hdr_info.get_header_id
@@ -378,7 +384,9 @@ def write_text(
378384
if ltext:
379385
text = f"{hdr_string}{prefix}{ltext}{suffix} "
380386
else:
381-
text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
387+
text = (
388+
f"{hdr_string}{prefix}{s['text'].strip()}{suffix} "
389+
)
382390

383391
if text.startswith(bullet):
384392
text = "- " + text[1:]
@@ -391,7 +399,9 @@ def write_text(
391399
code = False
392400

393401
return (
394-
out_string.replace(" \n", "\n").replace(" ", " ").replace("\n\n\n", "\n\n")
402+
out_string.replace(" \n", "\n")
403+
.replace(" ", " ")
404+
.replace("\n\n\n", "\n\n")
395405
)
396406

397407
def is_in_rects(rect, rect_list):
@@ -474,6 +484,7 @@ def get_page_output(doc, pno, margins, textflags):
474484
graphics information.
475485
"""
476486
page = doc[pno]
487+
page.remove_rotation() # make sure we work on rotation=0
477488
md_string = ""
478489
if GRAPHICS_LIMIT is not None:
479490
test_paths = page.get_cdrawings()
@@ -491,7 +502,9 @@ def get_page_output(doc, pno, margins, textflags):
491502
# make a TextPage for all later extractions
492503
textpage = page.get_textpage(flags=textflags, clip=clip)
493504

494-
img_info = [img for img in page.get_image_info() if img["bbox"] in clip]
505+
img_info = [
506+
img for img in page.get_image_info() if img["bbox"] in clip
507+
]
495508
images = img_info[:]
496509
tables = []
497510
graphics = []
@@ -560,7 +573,9 @@ def get_page_output(doc, pno, margins, textflags):
560573
if include is True: # this box is a significant vector graphic
561574
vg_clusters.append(bbox)
562575

563-
actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters)]
576+
actual_paths = [
577+
p for p in paths if is_in_rects(p["rect"], vg_clusters)
578+
]
564579

565580
vg_clusters0 = [
566581
r
@@ -620,7 +635,7 @@ def get_page_output(doc, pno, margins, textflags):
620635

621636
# read the Table of Contents
622637
toc = doc.get_toc()
623-
textflags = fitz.TEXT_MEDIABOX_CLIP
638+
textflags = fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE
624639
for pno in pages:
625640
page_output, images, tables, graphics = get_page_output(
626641
doc, pno, margins, textflags

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pymupdf4llm",
20-
version="0.0.8",
20+
version="0.0.9",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

0 commit comments

Comments
 (0)