@@ -187,19 +187,15 @@ def to_markdown(
187187 if len (margins ) == 2 :
188188 margins = (0 , margins [0 ], 0 , margins [1 ])
189189 if len (margins ) != 4 :
190- raise ValueError (
191- "margins must be a float or a sequence of 2 or 4 floats"
192- )
190+ raise ValueError ("margins must be a float or a sequence of 2 or 4 floats" )
193191 elif not all ([hasattr (m , "__float__" ) for m in margins ]):
194192 raise ValueError ("margin values must be floats" )
195193
196194 # If "hdr_info" is not an object having method "get_header_id", scan the
197195 # document and use font sizes as header level indicators.
198196 if callable (hdr_info ):
199197 get_header_id = hdr_info
200- elif hasattr (hdr_info , "get_header_id" ) and callable (
201- hdr_info .get_header_id
202- ):
198+ elif hasattr (hdr_info , "get_header_id" ) and callable (hdr_info .get_header_id ):
203199 get_header_id = hdr_info .get_header_id
204200 else :
205201 hdr_info = IdentifyHeaders (doc )
@@ -382,9 +378,7 @@ def write_text(
382378 if ltext :
383379 text = f"{ hdr_string } { prefix } { ltext } { suffix } "
384380 else :
385- text = (
386- f"{ hdr_string } { prefix } { s ['text' ].strip ()} { suffix } "
387- )
381+ text = f"{ hdr_string } { prefix } { s ['text' ].strip ()} { suffix } "
388382
389383 if text .startswith (bullet ):
390384 text = "- " + text [1 :]
@@ -397,9 +391,7 @@ def write_text(
397391 code = False
398392
399393 return (
400- out_string .replace (" \n " , "\n " )
401- .replace (" " , " " )
402- .replace ("\n \n \n " , "\n \n " )
394+ out_string .replace (" \n " , "\n " ).replace (" " , " " ).replace ("\n \n \n " , "\n \n " )
403395 )
404396
405397 def is_in_rects (rect , rect_list ):
@@ -486,7 +478,9 @@ def get_page_output(doc, pno, margins, textflags):
486478 if GRAPHICS_LIMIT is not None :
487479 test_paths = page .get_cdrawings ()
488480 if (excess := len (test_paths )) > GRAPHICS_LIMIT :
489- md_string = f"\n **Ignoring page { page .number } with { excess } vector graphics.**"
481+ md_string = (
482+ f"\n **Ignoring page { page .number } with { excess } vector graphics.**"
483+ )
490484 md_string += "\n \n -----\n \n "
491485 return md_string , [], [], []
492486 left , top , right , bottom = margins
@@ -497,9 +491,7 @@ def get_page_output(doc, pno, margins, textflags):
497491 # make a TextPage for all later extractions
498492 textpage = page .get_textpage (flags = textflags , clip = clip )
499493
500- img_info = [
501- img for img in page .get_image_info () if img ["bbox" ] in clip
502- ]
494+ img_info = [img for img in page .get_image_info () if img ["bbox" ] in clip ]
503495 images = img_info [:]
504496 tables = []
505497 graphics = []
@@ -533,24 +525,42 @@ def get_page_output(doc, pno, margins, textflags):
533525 and p ["rect" ].height < page_clip .height
534526 ]
535527
536- # Determine vector graphics outside any tables, ignoring any
537- # fill-only (type "f") paths.
538- vg_clusters = []
528+ # We also ignore vector graphics that only represent "background
529+ # sugar".
530+ vg_clusters = [] # worthwhile vector graphics go here
531+
532+ # walk through all vector graphics not belonging to a table
539533 for bbox in page .cluster_drawings (drawings = paths ):
534+ subbox = bbox + (3 , 3 , - 3 , - 3 ) # sub rect without any border
535+ box_area = abs (bbox )
540536 include = False
541- for p in [p for p in paths if p ["rect" ] in bbox ]:
542- if p ["type" ] != "f" :
537+ for p in paths :
538+ mp = (p ["rect" ].tl + p ["rect" ].br ) / 2 # center point of rect
539+
540+ # fill-only paths or being part of the border will not
541+ # make this a worthwhile vector graphic
542+ if mp not in subbox or p ["type" ] == "f" :
543+ continue
544+
545+ # this checks if all items are part of the bbox border
546+ near_border = set ()
547+ for itm in p ["items" ]: # walk through path items; itm[0] is the item kind
548+ if itm [0 ] == "re" : # a rectangle item
549+ if abs (itm [1 ]) / box_area < 1e-3 : # negligible vs. the cluster box (assumes abs() yields the area)
550+ near_border .add (True ) # is part of the border
551+ elif itm [0 ] in ("c" , "l" ): # curves and lines
552+ for temp in itm [1 :]:
553+ # True if the point lies outside subbox, i.e. on the border strip
554+ near_border .add (temp not in subbox )
555+ # if any stroked path has a point inside bbox (i.e. not on its
556+ # border), then this vector graphic is treated as significant
557+ if not near_border == {True }:
543558 include = True
544559 break
545- if [item [0 ] for item in p ["items" ] if item [0 ] == "c" ]:
546- include = True
547- break
548- if include is True :
560+ if include is True : # this box is a significant vector graphic
549561 vg_clusters .append (bbox )
550562
551- actual_paths = [
552- p for p in paths if is_in_rects (p ["rect" ], vg_clusters )
553- ]
563+ actual_paths = [p for p in paths if is_in_rects (p ["rect" ], vg_clusters )]
554564
555565 vg_clusters0 = [
556566 r
@@ -594,6 +604,7 @@ def get_page_output(doc, pno, margins, textflags):
594604 links = links ,
595605 )
596606
607+ md_string = md_string .replace (" ," , "," ).replace ("-\n " , "" )
597608 # write any remaining tables and images
598609 md_string += output_tables (tabs , None , tab_rects )
599610 md_string += output_images (None , tab_rects , None )
@@ -609,7 +620,7 @@ def get_page_output(doc, pno, margins, textflags):
609620
610621 # read the Table of Contents
611622 toc = doc .get_toc ()
612- textflags = fitz .TEXT_DEHYPHENATE | fitz . TEXT_MEDIABOX_CLIP
623+ textflags = fitz .TEXT_MEDIABOX_CLIP
613624 for pno in pages :
614625 page_output , images , tables , graphics = get_page_output (
615626 doc , pno , margins , textflags
0 commit comments