4040if fitz .pymupdf_version_tuple < (1 , 24 , 2 ):
4141 raise NotImplementedError ("PyMuPDF version 1.24.2 or later is needed." )
4242
43- bullet = ("* " , chr (0xF0B7 ), chr (0xB7 ), chr (8226 ), chr (9679 ))
43+ bullet = ("- " , " * ", chr ( 0xF0A7 ) , chr (0xF0B7 ), chr (0xB7 ), chr (8226 ), chr (9679 ))
4444GRAPHICS_TEXT = "\n \n "
4545
4646
4747class IdentifyHeaders :
4848 """Compute data for identifying header text."""
4949
50- def __init__ (self , doc , pages : list = None , body_limit : float = None ):
50+ def __init__ (
51+ self ,
52+ doc : str ,
53+ pages : list = None ,
54+ body_limit : float = 12 ,
55+ ):
5156 """Read all text and make a dictionary of fontsizes.
5257
5358 Args:
@@ -85,53 +90,74 @@ def __init__(self, doc, pages: list = None, body_limit: float = None):
8590 self .header_id = {}
8691
8792 # If not provided, choose the most frequent font size as body text.
88- # If no text at all on all pages, just use 12
89- if body_limit is None :
90- temp = sorted (
91- [(k , v ) for k , v in fontsizes .items ()],
92- key = lambda i : i [1 ],
93- reverse = True ,
94- )
95- if temp :
96- body_limit = temp [0 ][0 ]
97- else :
98- body_limit = 12
93+ # If no text at all on all pages, just use "body_limit" (default 12).
94+ # In any case, font sizes not exceeding the resulting limit are never header candidates.
95+ temp = sorted (
96+ [(k , v ) for k , v in fontsizes .items ()],
97+ key = lambda i : i [1 ],
98+ reverse = True ,
99+ )
100+ if temp :
101+ b_limit = max ( body_limit , temp [0 ][0 ])
102+ else :
103+ b_limit = body_limit
99104
100- sizes = sorted ([f for f in fontsizes .keys () if f > body_limit ], reverse = True )
105+ # identify up to 6 font sizes as header candidates
106+ sizes = sorted (
107+ [f for f in fontsizes .keys () if f > b_limit ],
108+ reverse = True ,
109+ )[:6 ]
101110
102111 # make the header tag dictionary
103112 for i , size in enumerate (sizes ):
104113 self .header_id [size ] = "#" * (i + 1 ) + " "
105114
106- def get_header_id (self , span ) :
115+ def get_header_id (self , span : dict , page = None ) -> str :
107116 """Return appropriate markdown header prefix.
108117
109- Given a text span from a "dict"/"radict " extraction, determine the
110- markdown header prefix string of 0 to many concatenated '#' characters.
118+ Given a text span from a "dict"/"rawdict " extraction, determine the
119+ markdown header prefix string of 0 to n concatenated '#' characters.
111120 """
112121 fontsize = round (span ["size" ]) # compute fontsize
113122 hdr_id = self .header_id .get (fontsize , "" )
114123 return hdr_id
115124
116125
117126def to_markdown (
118- doc : fitz . Document | str ,
127+ doc : str ,
119128 * ,
120- pages : list | range | None = None ,
121- hdr_info : IdentifyHeaders | None = None ,
129+ pages : list = None ,
130+ hdr_info = None ,
122131 write_images : bool = False ,
123132 page_chunks : bool = False ,
124- ) -> str | list [dict ]:
133+ margins : typing .Iterable = (0 , 50 , 0 , 50 ),
134+ ) -> str :
125135 """Process the document and return the text of its selected pages."""
126136
127137 if isinstance (doc , str ):
128138 doc = fitz .open (doc )
129139
130- if not pages : # use all pages if argument not given
131- pages = range (doc .page_count )
132-
133- if not isinstance (hdr_info , IdentifyHeaders ):
140+ if pages is None : # use all pages if no selection given
141+ pages = list (range (doc .page_count ))
142+
143+ if hasattr (margins , "__float__" ):
144+ margins = [margins ] * 4
145+ if len (margins ) == 2 :
146+ margins = (0 , margins [0 ], 0 , margins [1 ])
147+ if len (margins ) != 4 :
148+ raise ValueError ("margins must have length 2 or 4 or be a number." )
149+ elif not all ([hasattr (m , "__float__" ) for m in margins ]):
150+ raise ValueError ("margin values must be numbers" )
151+
152+ # If "hdr_info" is neither a callable nor an object having a callable method
153+ # "get_header_id", scan the document and use font sizes as header level indicators.
154+ if callable (hdr_info ):
155+ get_header_id = hdr_info
156+ elif hasattr (hdr_info , "get_header_id" ) and callable (hdr_info .get_header_id ):
157+ get_header_id = hdr_info .get_header_id
158+ else :
134159 hdr_info = IdentifyHeaders (doc )
160+ get_header_id = hdr_info .get_header_id
135161
136162 def resolve_links (links , span ):
137163 """Accept a span and return a markdown link string."""
@@ -146,27 +172,24 @@ def resolve_links(links, span):
146172 return text
147173
148174 def save_image (page , rect , i ):
149- """Optionally render the rect part of a page.
150-
151- In any case return the image filename.
152- """
175+ """Optionally render the rect part of a page."""
153176 filename = page .parent .name .replace ("\\ " , "/" )
154177 image_path = f"{ filename } -{ page .number } -{ i } .png"
155178 if write_images is True :
156179 pix = page .get_pixmap (clip = rect )
157180 pix .save (image_path )
158181 del pix
159- return os .path .basename (image_path )
182+ return os .path .basename (image_path )
183+ return ""
160184
161185 def write_text (
162186 page : fitz .Page ,
163187 textpage : fitz .TextPage ,
164188 clip : fitz .Rect ,
165189 tabs = None ,
166- tab_rects : dict | None = None ,
167- img_rects : dict | None = None ,
168- links : list | None = None ,
169- hdr_info = None ,
190+ tab_rects : dict = None ,
191+ img_rects : dict = None ,
192+ links : list = None ,
170193 ) -> string :
171194 """Output the text found inside the given clip.
172195
@@ -227,7 +250,8 @@ def write_text(
227250 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
228251 ):
229252 pathname = save_image (page , img_rect , i )
230- out_string += GRAPHICS_TEXT % (pathname , pathname )
253+ if pathname :
254+ out_string += GRAPHICS_TEXT % (pathname , pathname )
231255 del img_rects [i ]
232256
233257 text = " " .join ([s ["text" ] for s in spans ])
@@ -247,11 +271,11 @@ def write_text(
247271 out_string += indent + text + "\n "
248272 continue # done with this line
249273
250- bno = spans [0 ]["block" ] # block number of line
274+ span0 = spans [0 ]
275+ bno = span0 ["block" ] # block number of line
251276 if bno != prev_bno :
252277 out_string += "\n "
253278 prev_bno = bno
254- span0 = spans [0 ]
255279
256280 if ( # check if we need another line break
257281 prev_lrect
@@ -264,19 +288,24 @@ def write_text(
264288 prev_lrect = lrect
265289
266290 # if line is a header, this will return multiple "#" characters
267- hdr_string = hdr_info . get_header_id (spans [ 0 ] )
291+ hdr_string = get_header_id (span0 , page = page )
268292
269293 # intercept if header text has been broken in multiple lines
270294 if hdr_string and hdr_string == prev_hdr_string :
271295 out_string = out_string [:- 1 ] + " " + text + "\n "
272296 continue
297+
273298 prev_hdr_string = hdr_string
299+ if hdr_string .startswith ("#" ): # if a header line skip the rest
300+ out_string += hdr_string + text + "\n "
301+ continue
302+
303+ # this line is not all-mono, so switch off "code" mode
304+ if code : # still in code output mode?
305+ out_string += "```\n " # switch of code mode
306+ code = False
274307
275308 for i , s in enumerate (spans ): # iterate spans of the line
276- # this line is not all-mono, so switch off "code" mode
277- if code : # still in code output mode?
278- out_string += "```\n " # switch of code mode
279- code = False
280309 # decode font properties
281310 mono = s ["flags" ] & 8
282311 bold = s ["flags" ] & 16
@@ -312,6 +341,7 @@ def write_text(
312341 if code :
313342 out_string += "```\n " # switch of code mode
314343 code = False
344+
315345 return (
316346 out_string .replace (" \n " , "\n " ).replace (" " , " " ).replace ("\n \n \n " , "\n \n " )
317347 )
@@ -361,7 +391,8 @@ def output_images(page, text_rect, img_rects):
361391 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
362392 ):
363393 pathname = save_image (page , img_rect , i )
364- this_md += GRAPHICS_TEXT % (pathname , pathname )
394+ if pathname :
395+ this_md += GRAPHICS_TEXT % (pathname , pathname )
365396 del img_rects [i ] # do not touch this image twice
366397
367398 else : # output all remaining table
@@ -370,7 +401,8 @@ def output_images(page, text_rect, img_rects):
370401 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
371402 ):
372403 pathname = save_image (page , img_rect , i )
373- this_md += GRAPHICS_TEXT % (pathname , pathname )
404+ if pathname :
405+ this_md += GRAPHICS_TEXT % (pathname , pathname )
374406 del img_rects [i ] # do not touch this image twice
375407 return this_md
376408
@@ -381,48 +413,93 @@ def get_metadata(doc, pno):
381413 meta ["page" ] = pno + 1
382414 return meta
383415
384- def get_page_output (doc , pno , textflags ):
385- """Process one page."""
416+ def get_page_output (doc , pno , margins , textflags ):
417+ """Process one page.
418+
419+ Args:
420+ doc: fitz.Document
421+ pno: 0-based page number
422+ textflags: text extraction flag bits
423+
424+ Returns:
425+ Markdown string of page content and image, table and vector
426+ graphics information.
427+ """
386428 page = doc [pno ]
387429 md_string = ""
388-
430+ left , top , right , bottom = margins
431+ clip = page .rect + (left , top , - right , - bottom )
389432 # extract all links on page
390433 links = [l for l in page .get_links () if l ["kind" ] == 2 ]
391434
392435 # make a TextPage for all later extractions
393- textpage = page .get_textpage (flags = textflags )
436+ textpage = page .get_textpage (flags = textflags , clip = clip )
437+
438+ img_info = [img for img in page .get_image_info () if img ["bbox" ] in clip ]
439+ images = img_info [:]
440+ tables = []
441+ graphics = []
394442
395443 # Locate all tables on page
396- tabs = page .find_tables ()
444+ tabs = page .find_tables (clip = clip , strategy = "lines_strict" )
397445
398446 # Make a list of table boundary boxes.
399447 # Must include the header bbox (may exist outside tab.bbox)
400448 tab_rects = {}
401449 for i , t in enumerate (tabs ):
402450 tab_rects [i ] = fitz .Rect (t .bbox ) | fitz .Rect (t .header .bbox )
451+ tab_dict = {
452+ "bbox" : tuple (tab_rects [i ]),
453+ "rows" : t .row_count ,
454+ "columns" : t .col_count ,
455+ }
456+ tables .append (tab_dict )
403457 tab_rects0 = list (tab_rects .values ())
404458
405459 # Select paths that are not contained in any table
406460 page_clip = page .rect + (36 , 36 , - 36 , - 36 ) # ignore full page graphics
407461 paths = [
408462 p
409463 for p in page .get_drawings ()
410- if not intersects_rects (p ["rect" ], tab_rects0 ) and p ["rect" ] in page_clip
464+ if not intersects_rects (p ["rect" ], tab_rects0 )
465+ and p ["rect" ] in page_clip
466+ and p ["rect" ].width < page_clip .width
467+ and p ["rect" ].height < page_clip .height
411468 ]
412469
413- # determine vector graphics outside any tables
414- vg_clusters = page .cluster_drawings (drawings = paths )
470+ # Determine vector graphics outside any tables, filtering out any
471+ # clusters whose paths are all of type "f" and contain no curve ("c") items
472+ vg_clusters = []
473+ for bbox in page .cluster_drawings (drawings = paths ):
474+ include = False
475+ for p in [p for p in paths if p ["rect" ] in bbox ]:
476+ if p ["type" ] != "f" :
477+ include = True
478+ break
479+ if [item [0 ] for item in p ["items" ] if item [0 ] == "c" ]:
480+ include = True
481+ break
482+ if include is True :
483+ vg_clusters .append (bbox )
484+
485+ actual_paths = [p for p in paths if is_in_rects (p ["rect" ], vg_clusters )]
486+
415487 vg_clusters0 = [
416488 r
417489 for r in vg_clusters
418490 if not intersects_rects (r , tab_rects0 ) and r .height > 20
419- ] + [fitz .Rect (i ["bbox" ]) for i in page .get_image_info ()]
491+ ]
492+
493+ if write_images is True :
494+ vg_clusters0 += [fitz .Rect (i ["bbox" ]) for i in img_info ]
420495
421496 vg_clusters = dict ((i , r ) for i , r in enumerate (vg_clusters0 ))
497+
422498 # Determine text column bboxes on page, avoiding tables and graphics
423499 text_rects = column_boxes (
424500 page ,
425- paths = paths ,
501+ paths = actual_paths ,
502+ no_image_text = write_images ,
426503 textpage = textpage ,
427504 avoid = tab_rects0 + vg_clusters0 ,
428505 )
@@ -444,28 +521,46 @@ def get_page_output(doc, pno, textflags):
444521 tab_rects = tab_rects ,
445522 img_rects = vg_clusters ,
446523 links = links ,
447- hdr_info = hdr_info ,
448524 )
449525
450- # write remaining tables.
526+ # write any remaining tables and images
451527 md_string += output_tables (tabs , None , tab_rects )
452528 md_string += output_images (None , tab_rects , None )
453529 md_string += "\n -----\n \n "
454- return md_string
530+ while md_string .startswith ("\n " ):
531+ md_string = md_string [1 :]
532+ return md_string , images , tables , graphics
455533
456534 if page_chunks is False :
457535 document_output = ""
458536 else :
459537 document_output = []
460538
539+ # read the Table of Contents
540+ toc = doc .get_toc ()
461541 textflags = fitz .TEXT_DEHYPHENATE | fitz .TEXT_MEDIABOX_CLIP
462- for pno in list (pages ):
463- page_output = get_page_output (doc , pno , textflags )
542+ for pno in pages :
543+
544+ page_output , images , tables , graphics = get_page_output (
545+ doc , pno , margins , textflags
546+ )
464547 if page_chunks is False :
465548 document_output += page_output
466549 else :
550+ # build subset of TOC for this page
551+ page_tocs = [t for t in toc if t [- 1 ] == pno + 1 ]
552+
467553 metadata = get_metadata (doc , pno )
468- document_output .append ({"metadata" : metadata , "text" : page_output })
554+ document_output .append (
555+ {
556+ "metadata" : metadata ,
557+ "toc_items" : page_tocs ,
558+ "tables" : tables ,
559+ "images" : images ,
560+ "graphics" : graphics ,
561+ "text" : page_output ,
562+ }
563+ )
469564
470565 return document_output
471566
0 commit comments