1616Text will be sorted in Western reading order. Any table will be included in
1717the text in markdown format as well.
1818
19- Use in some other script
20- -------------------------
21- import fitz
22- from to_markdown import to_markdown
23-
24- doc = fitz.open("input.pdf")
25- page_list = [ list of 0-based page numbers ]
26- md_text = to_markdown(doc, pages=page_list)
27-
2819Dependencies
2920-------------
3021PyMuPDF v1.24.2 or later
4334except ImportError :
4435 import fitz
4536
46- from pymupdf4llm .helpers .get_text_lines import (
47- get_raw_lines ,
48- is_white ,
49- )
37+ from pymupdf4llm .helpers .get_text_lines import get_raw_lines , is_white
5038from pymupdf4llm .helpers .multi_column import column_boxes
5139
5240if fitz .pymupdf_version_tuple < (1 , 24 , 2 ):
@@ -127,13 +115,13 @@ def get_header_id(self, span):
127115
128116
129117def to_markdown (
130- doc : fitz .Document ,
118+ doc : fitz .Document | str ,
131119 * ,
132- pages : list = None ,
133- hdr_info : IdentifyHeaders = None ,
134- write_images = False ,
135- page_chunks = False ,
136- ) -> str :
120+ pages : list | range | None = None ,
121+ hdr_info : IdentifyHeaders | None = None ,
122+ write_images : bool = False ,
123+ page_chunks : bool = False ,
124+ ) -> str | list [ dict ] :
137125 """Process the document and return the text of its selected pages."""
138126
139127 if isinstance (doc , str ):
@@ -146,7 +134,7 @@ def to_markdown(
146134 hdr_info = IdentifyHeaders (doc )
147135
148136 def resolve_links (links , span ):
149- """Accept a span bbox and return a markdown link string."""
137+ """Accept a span and return a markdown link string."""
150138 bbox = fitz .Rect (span ["bbox" ]) # span bbox
151139 # a link should overlap at least 70% of the span
152140 bbox_area = 0.7 * abs (bbox )
@@ -158,6 +146,10 @@ def resolve_links(links, span):
158146 return text
159147
160148 def save_image (page , rect , i ):
149+ """Optionally render the rect part of a page.
150+
151+ In any case return the image filename.
152+ """
161153 filename = page .parent .name .replace ("\\ " , "/" )
162154 image_path = f"{ filename } -{ page .number } -{ i } .png"
163155 if write_images is True :
@@ -167,13 +159,13 @@ def save_image(page, rect, i):
167159 return os .path .basename (image_path )
168160
169161 def write_text (
170- page ,
162+ page : fitz . Page ,
171163 textpage : fitz .TextPage ,
172164 clip : fitz .Rect ,
173165 tabs = None ,
174- tab_rects : dict = None ,
175- img_rects : dict = None ,
176- links : list = None ,
166+ tab_rects : dict | None = None ,
167+ img_rects : dict | None = None ,
168+ links : list | None = None ,
177169 hdr_info = None ,
178170 ) -> string :
179171 """Output the text found inside the given clip.
@@ -184,12 +176,18 @@ def write_text(
184176 inline code, bold, italic and bold-italic styling.
185177 There is also some effort for list support (ordered / unordered) in
186178 that typical characters are replaced by respective markdown characters.
179+
180+ 'tab_rects'/'img_rects' are dictionaries of table, respectively image
181+ or vector graphic rectangles.
182+ General Markdown text generation skips these areas. Tables are written
183+ via their own 'to_markdown' method. Images and vector graphics are
184+ optionally saved as files and pointed to by respective markdown text.
187185 """
188186 if clip is None :
189187 clip = textpage .rect
190188 out_string = ""
191189
192- # This is a list of tuples (linerect, [ spanlist] )
190+ # This is a list of tuples (linerect, spanlist)
193191 nlines = get_raw_lines (textpage , clip = clip , tolerance = 3 )
194192
195193 tab_rects0 = list (tab_rects .values ())
@@ -260,12 +258,15 @@ def write_text(
260258 and lrect .y1 - prev_lrect .y1 > lrect .height * 1.5
261259 or span0 ["text" ].startswith ("[" )
262260 or span0 ["text" ].startswith (bullet )
263- or span0 ["flags" ] & 1
261+ or span0 ["flags" ] & 1 # superscript?
264262 ):
265263 out_string += "\n "
266264 prev_lrect = lrect
267265
266+ # if line is a header, this will return multiple "#" characters
268267 hdr_string = hdr_info .get_header_id (spans [0 ])
268+
269+ # intercept if header text has been broken in multiple lines
269270 if hdr_string and hdr_string == prev_hdr_string :
270271 out_string = out_string [:- 1 ] + " " + text + "\n "
271272 continue
@@ -295,6 +296,7 @@ def write_text(
295296 prefix += "_"
296297 suffix = "_" + suffix
297298
299+ # convert intersecting link into markdown syntax
298300 ltext = resolve_links (links , s )
299301 if ltext :
300302 text = f"{ hdr_string } { prefix } { ltext } { suffix } "
@@ -329,27 +331,27 @@ def intersects_rects(rect, rect_list):
329331 return 0
330332
331333 def output_tables (tabs , text_rect , tab_rects ):
332- """Output and remove tables above text rectangle."""
334+ """Output tables above a text rectangle."""
333335 this_md = "" # markdown string for table content
334336 if text_rect is not None : # select tables above the text block
335337 for i , trect in sorted (
336338 [j for j in tab_rects .items () if j [1 ].y1 <= text_rect .y0 ],
337339 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
338340 ):
339341 this_md += tabs [i ].to_markdown (clean = False )
340- del tab_rects [i ]
342+ del tab_rects [i ] # do not touch this table twice
341343
342344 else : # output all remaining table
343345 for i , trect in sorted (
344346 tab_rects .items (),
345347 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
346348 ):
347349 this_md += tabs [i ].to_markdown (clean = False )
348- del tab_rects [i ]
350+ del tab_rects [i ] # do not touch this table twice
349351 return this_md
350352
351353 def output_images (page , text_rect , img_rects ):
352- """Output and remove images and graphics above text rectangle."""
354+ """Output images and graphics above text rectangle."""
353355 if img_rects is None :
354356 return ""
355357 this_md = "" # markdown string
@@ -360,7 +362,7 @@ def output_images(page, text_rect, img_rects):
360362 ):
361363 pathname = save_image (page , img_rect , i )
362364 this_md += GRAPHICS_TEXT % (pathname , pathname )
363- del img_rects [i ]
365+ del img_rects [i ] # do not touch this image twice
364366
365367 else : # output all remaining table
366368 for i , img_rect in sorted (
@@ -369,7 +371,7 @@ def output_images(page, text_rect, img_rects):
369371 ):
370372 pathname = save_image (page , img_rect , i )
371373 this_md += GRAPHICS_TEXT % (pathname , pathname )
372- del img_rects [i ]
374+ del img_rects [i ] # do not touch this image twice
373375 return this_md
374376
375377 def get_metadata (doc , pno ):
@@ -380,22 +382,28 @@ def get_metadata(doc, pno):
380382 return meta
381383
382384 def get_page_output (doc , pno , textflags ):
385+ """Process one page."""
383386 page = doc [pno ]
384387 md_string = ""
388+
389+ # extract all links on page
385390 links = [l for l in page .get_links () if l ["kind" ] == 2 ]
391+
392+ # make a TextPage for all later extractions
386393 textpage = page .get_textpage (flags = textflags )
387- # First locate all tables on page
394+
395+ # Locate all tables on page
388396 tabs = page .find_tables ()
389397
390- # Second, make a list of table boundary boxes.
391- # Must include the header bbox (may be outside tab.bbox)
398+ # Make a list of table boundary boxes.
399+ # Must include the header bbox (may exist outside tab.bbox)
392400 tab_rects = {}
393401 for i , t in enumerate (tabs ):
394402 tab_rects [i ] = fitz .Rect (t .bbox ) | fitz .Rect (t .header .bbox )
395403 tab_rects0 = list (tab_rects .values ())
396404
397405 # Select paths that are not contained in any table
398- page_clip = page .rect + (36 , 36 , - 36 , - 36 )
406+ page_clip = page .rect + (36 , 36 , - 36 , - 36 ) # ignore full page graphics
399407 paths = [
400408 p
401409 for p in page .get_drawings ()
@@ -409,6 +417,7 @@ def get_page_output(doc, pno, textflags):
409417 for r in vg_clusters
410418 if not intersects_rects (r , tab_rects0 ) and r .height > 20
411419 ] + [fitz .Rect (i ["bbox" ]) for i in page .get_image_info ()]
420+
412421 vg_clusters = dict ((i , r ) for i , r in enumerate (vg_clusters0 ))
413422 # Determine text column bboxes on page, avoiding tables and graphics
414423 text_rects = column_boxes (
@@ -422,7 +431,7 @@ def get_page_output(doc, pno, textflags):
422431 the text rectangles.
423432 """
424433 for text_rect in text_rects :
425- # outpt tables above this block of text
434+ # output tables above this block of text
426435 md_string += output_tables (tabs , text_rect , tab_rects )
427436 md_string += output_images (page , text_rect , vg_clusters )
428437
0 commit comments