@@ -28,33 +28,63 @@ def is_white(text):
2828
2929
3030def get_raw_lines (textpage , clip = None , tolerance = 3 ):
31- """Extract the text spans from a TextPage in a natural reading sequence.
31+ """Extract the text spans from a TextPage in natural reading sequence.
3232
3333 All spans roughly on the same line are joined to generate an improved line.
3434 This copes with MuPDF's algorithm that generates new lines also for spans
35- whose horizontal distance is larger than some hreshold .
35+ whose horizontal distance is larger than some threshold .
3636
3737 Result is a sorted list of line objects that consist of the recomputed line
38- rectangle and a sorted list of spans in that line.
38+ boundary box and the sorted list of spans in that line.
3939
40- This result can then be easily converted e.g. to plain or markdown text.
40+ This result can then easily be converted e.g. to plain or markdown text.
4141
4242 Args:
4343 textpage: (mandatory) TextPage object
44- clip: (Rect) specifies a sub-rectangle of the textpage rect (which also
45- may be based on some part of the original page).
44+ clip: (Rect) specifies a sub-rectangle of the textpage rect (which in
45+ turn may be based on a sub-rectangle of the full page).
4646 tolerance: (float) put spans on the same line if their top or bottom
47- coordinate differ by no mor than this value.
47+ coordinate differ by no more than this value.
4848
4949 Returns:
50- A sorted list of items (rect, [spans]), each representing a line. The
51- spans are sorted left to right, Span dictionaries have been changed
52- in that "bbox" is a Rect object and "line" is an integer representing
53- the line number of the span. This allows to detect where MuPDF has
54- generated line breaks to indicate large inter-span distances.
50+ A sorted list of items (rect, [spans]), each representing one line. The
51+ spans are sorted left to right, Span dictionaries have been changed:
52+ - "bbox" has been converted to a Rect object
53+ - "line" (new) the line number in TextPage.extractDICT
54+ - "block" (new) the block number in TextPage.extractDICT
55+ This makes it possible to detect where MuPDF has generated line breaks to indicate
56+ large inter-span distances.
5557 """
5658 y_delta = tolerance # allowable vertical coordinate deviation
57- if clip == None : # use TextPage if not provided
59+
60+ def sanitize_spans (line ):
61+ """Sort and join the spans in a re-synthesized line.
62+
63+ The PDF may contain "broken" text with words cut into pieces.
64+ This function joins spans representing the particles and sorts them
65+ left to right.
66+
67+ Arg:
68+ A list of spans - as derived from TextPage.extractDICT()
69+ Returns:
70+ A list of sorted, and potentially cleaned-up spans
71+ """
72+ line .sort (key = lambda s : s ["bbox" ].x0 ) # sort left to right
73+ for i in range (len (line ) - 1 , 0 , - 1 ): # iterate back to front
74+ s0 = line [i - 1 ]
75+ s1 = line [i ]
76+ # "delta" depends on the font size. Spans will be joined if
77+ # no more than 10% of the font size separates them.
78+ delta = s1 ["size" ] * 0.1
79+ if s0 ["bbox" ].x1 + delta < s1 ["bbox" ].x0 :
80+ continue # all good: no joining needed
81+ s0 ["bbox" ] |= s1 ["bbox" ] # join boundary boxes
82+ s0 ["text" ] += s1 ["text" ] # join the text
83+ del line [i ] # delete the joined-in span
84+ line [i - 1 ] = s0 # update the span
85+ return line
86+
87+ if clip is None : # use TextPage if not provided
5888 clip = textpage .rect
5989 # extract text blocks - if bbox is not empty
6090 blocks = [
@@ -63,40 +93,38 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
6393 if b ["type" ] == 0 and not fitz .Rect (b ["bbox" ]).is_empty
6494 ]
6595 spans = [] # all spans in TextPage here
66- for bno , b in enumerate (blocks ):
67- for lno , line in enumerate (b ["lines" ]):
68- lbbox = fitz .Rect (line ["bbox" ])
69- for sno , s in enumerate (line ["spans" ]):
70- sbbox = fitz .Rect (s ["bbox" ]) # turn to a Rect
96+ for bno , b in enumerate (blocks ): # the numbered blocks
97+ for lno , line in enumerate (b ["lines" ]): # the numbered lines
98+ for sno , s in enumerate (line ["spans" ]): # the numbered spans
99+ sbbox = fitz .Rect (s ["bbox" ]) # span bbox as a Rect
71100 mpoint = (sbbox .tl + sbbox .br ) / 2 # middle point
72101 if mpoint not in clip :
73102 continue
74103 if is_white (s ["text" ]): # ignore white text
75104 continue
76- if s ["flags" ] & 1 == 1 : # if a superscript, modify
105+ if s ["flags" ] & 1 == 1 : # if a superscript, modify bbox
106+ # with that of the preceding or following span
77107 i = 1 if sno == 0 else sno - 1
78108 neighbor = line ["spans" ][i ]
79109 sbbox .y1 = neighbor ["bbox" ][3 ]
80110 s ["text" ] = f"[{ s ['text' ]} ]"
81111 s ["bbox" ] = sbbox # update with the Rect version
82- # include line identifier to facilitate separator insertion
112+ # include line/block numbers to facilitate separator insertion
83113 s ["line" ] = lno
84114 s ["block" ] = bno
85115 spans .append (s )
86116
87- if not spans : # we may have no text at all
117+ if not spans : # no text at all
88118 return []
89119
90- spans .sort (
91- key = lambda s : s ["bbox" ].y1
92- ) # sort spans by assending bottom coord
120+ spans .sort (key = lambda s : s ["bbox" ].y1 ) # sort spans by bottom coord
93121 nlines = [] # final result
94- line = [spans [0 ]] # collects spans with fitting vertical coordinate
122+ line = [spans [0 ]] # collects spans with fitting vertical coordinates
95123 lrect = spans [0 ]["bbox" ] # rectangle joined from span rectangles
96124
97- for s in spans [1 :]:
98- sbbox = s ["bbox" ]
99- sbbox0 = line [- 1 ]["bbox" ]
125+ for s in spans [1 :]: # walk through the spans
126+ sbbox = s ["bbox" ] # this bbox
127+ sbbox0 = line [- 1 ]["bbox" ] # previous bbox
100128 # if any of top or bottom coordinates are close enough, join...
101129 if (
102130 abs (sbbox .y1 - sbbox0 .y1 ) <= y_delta
@@ -107,7 +135,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
107135 continue
108136
109137 # end of current line, sort its spans from left to right
110- line . sort ( key = lambda s : s [ "bbox" ]. x0 )
138+ line = sanitize_spans ( line )
111139
112140 # append line rect and its spans to final output
113141 nlines .append ([lrect , line ])
@@ -116,7 +144,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
116144 lrect = sbbox # initialize its rectangle
117145
118146 # need to append last line in the same way
119- line . sort ( key = lambda s : s [ "bbox" ]. x0 )
147+ line = sanitize_spans ( line )
120148 nlines .append ([lrect , line ])
121149
122150 return nlines
@@ -143,6 +171,7 @@ def get_text_lines(
143171 Returns:
144172 String of plain text in reading sequence.
145173 """
174+ textflags = fitz .TEXT_MEDIABOX_CLIP
146175 page .remove_rotation ()
147176 prect = page .rect if not clip else fitz .Rect (clip ) # area to consider
148177
@@ -151,7 +180,7 @@ def get_text_lines(
151180 # make a TextPage if required
152181 if textpage is None :
153182 if ocr is False :
154- tp = page .get_textpage (clip = prect , flags = fitz . TEXTFLAGS_TEXT )
183+ tp = page .get_textpage (clip = prect , flags = textflags )
155184 else :
156185 tp = page .get_textpage_ocr (dpi = 300 , full = True )
157186 else :
0 commit comments