1515
1616Text will be sorted in Western reading order. Any table will be included in
1717the text in markdown format as well.
18-
18+
1919Dependencies
2020-------------
2121PyMuPDF v1.24.3 or later
@@ -256,6 +256,7 @@ def to_markdown(
256256 ignore_code = False ,
257257 extract_words = False ,
258258 show_progress = True ,
259+ image_extract_algorithm = "simple-drop" ,
259260) -> str :
260261 """Process the document and return the text of the selected pages.
261262
@@ -278,6 +279,7 @@ def to_markdown(
278279 ignore_code: (bool) suppress extra formatting for mono-space fonts
279280 extract_words: (bool) include "words"-like output in page chunks
280281 show_progress: (bool) print progress as each page is processed.
282+ image_extract_algorithm: (str) image extraction algorithm to use, either "simple" or "simple-drop".
281283
282284 """
283285 if write_images is False and embed_images is False and force_text is False :
@@ -784,27 +786,12 @@ def get_page_output(doc, pno, margins, textflags):
784786 # make a TextPage for all later extractions
785787 parms .textpage = page .get_textpage (flags = textflags , clip = parms .clip )
786788
787- # extract images on page
788- # ignore images contained in some other one (simplified mechanism)
789- img_info = page .get_image_info ()
790- for i in range (len (img_info )):
791- item = img_info [i ]
792- item ["bbox" ] = pymupdf .Rect (item ["bbox" ]) & parms .clip
793- img_info [i ] = item
794-
795- # sort descending by image area size
796- img_info .sort (key = lambda i : abs (i ["bbox" ]), reverse = True )
797- # run from back to front (= small to large)
798- for i in range (len (img_info ) - 1 , 0 , - 1 ):
799- r = img_info [i ]["bbox" ]
800- if r .is_empty :
801- del img_info [i ]
802- continue
803- for j in range (i ): # image areas larger than r
804- if r in img_info [j ]["bbox" ]:
805- del img_info [i ] # contained in some larger image
806- break
807- parms .images = img_info
789+ extract_images_on_page = {
790+ "simple" : extract_images_on_page_simple ,
791+ "simple-drop" : extract_images_on_page_simple_drop ,
792+ }[image_extract_algorithm ]
793+
794+ parms .images = extract_images_on_page (page , parms , image_size_limit )
808795 parms .img_rects = [i ["bbox" ] for i in parms .images ]
809796
810797 # Locate all tables on page
@@ -957,6 +944,64 @@ def get_page_output(doc, pno, margins, textflags):
957944 return document_output
958945
959946
def extract_images_on_page_simple(page, parms, image_size_limit):
    """Return the images on the page, clipped and de-duplicated by containment.

    Every entry of ``page.get_image_info()`` gets its "bbox" replaced by the
    intersection of that bbox with ``parms.clip``.  Entries whose clipped
    bbox is empty are dropped, and so is every entry whose bbox is contained
    in the bbox of an entry with an equal or larger area (simplified
    mechanism).

    Args:
        page: object providing ``get_image_info()``.
        parms: object providing the clip rectangle as ``parms.clip``.
        image_size_limit: not used by this algorithm; accepted so that all
            image extraction functions share the same signature.

    Returns:
        List of image info dicts, sorted descending by bbox area.
    """
    clip = parms.clip

    # Clip every image bbox and discard empty results right away.  The
    # previous back-to-front pruning loop never looked at the first entry,
    # so an empty bbox could survive if it happened to be sorted first
    # (e.g. a single image lying completely outside the clip).
    candidates = []
    for item in page.get_image_info():
        item["bbox"] = pymupdf.Rect(item["bbox"]) & clip
        if not item["bbox"].is_empty:
            candidates.append(item)

    # sort descending by image area size
    candidates.sort(key=lambda item: abs(item["bbox"]), reverse=True)

    # Keep an image only if none of the entries ranked before it (i.e. with
    # an area at least as large) contains its bbox.
    kept = []
    for pos, item in enumerate(candidates):
        bbox = item["bbox"]
        if any(bbox in larger["bbox"] for larger in candidates[:pos]):
            continue  # contained in some larger image
        kept.append(item)

    return kept
970+
971+
def filter_small_images(page, parms, image_size_limit):
    """Return the page's image infos that are neither empty nor too small.

    Each bbox from ``page.get_image_info()`` is intersected with
    ``parms.clip``.  An image is skipped if the clipped bbox is empty, or if
    the larger of its width and height fractions (relative to the width and
    height of ``page.rect``) is below ``image_size_limit``.  Surviving
    entries have their "bbox" replaced by the clipped rectangle.

    Args:
        page: object providing ``get_image_info()`` and ``rect``.
        parms: object providing the clip rectangle as ``parms.clip``.
        image_size_limit: minimum required fraction of the page width or
            page height.

    Returns:
        List of the retained image info dicts, in their original order.
    """
    survivors = []
    for info in page.get_image_info():
        clipped = pymupdf.Rect(info["bbox"]) & parms.clip
        if clipped.is_empty:
            continue  # nothing of this image lies inside the clip

        page_rect = page.rect
        width_share = clipped.width / page_rect.width
        height_share = clipped.height / page_rect.height
        if max(width_share, height_share) < image_size_limit:
            continue  # too small in both dimensions

        info["bbox"] = clipped
        survivors.append(info)
    return survivors
984+
985+
def extract_images_on_page_simple_drop(page, parms, image_size_limit):
    """Return the page's images after size filtering and containment pruning.

    The candidates come from ``filter_small_images``.  They are sorted
    descending by bbox area; the first entry is always kept, and every later
    entry is dropped if its bbox is empty or is contained in the bbox of any
    entry ranked before it.

    Args:
        page: passed through to ``filter_small_images``.
        parms: passed through to ``filter_small_images``.
        image_size_limit: passed through to ``filter_small_images``.

    Returns:
        List of the retained image info dicts, sorted descending by bbox
        area.
    """
    ranked = filter_small_images(page, parms, image_size_limit)

    # largest image first
    ranked.sort(key=lambda entry: abs(entry["bbox"]), reverse=True)

    # The top-ranked entry is never tested; all others must pass both checks.
    kept = ranked[:1]
    for pos in range(1, len(ranked)):
        box = ranked[pos]["bbox"]
        if box.is_empty:
            continue
        # compare against every entry with an equal or larger area
        if any(box in ranked[k]["bbox"] for k in range(pos)):
            continue  # contained in some larger image
        kept.append(ranked[pos])

    return kept
1003+
1004+
9601005if __name__ == "__main__" :
9611006 import pathlib
9621007 import sys
0 commit comments