3939import string
4040from binascii import b2a_base64
4141import pymupdf
42+ from pymupdf import mupdf
4243from pymupdf4llm .helpers .get_text_lines import get_raw_lines , is_white
4344from pymupdf4llm .helpers .multi_column import column_boxes
4445from pymupdf4llm .helpers .progress import ProgressBar
4546from dataclasses import dataclass
47+ from collections import defaultdict
4648
4749pymupdf .TOOLS .unset_quad_corrections (True )
4850# Characters recognized as bullets when starting a line.
@@ -88,8 +90,9 @@ def __init__(
8890 """Read all text and make a dictionary of fontsizes.
8991
9092 Args:
91- pages: optional list of pages to consider
92- body_limit: consider text with larger font size as some header
93+ doc: PDF document or filename
94+ pages: consider these page numbers only
95+ body_limit: treat text with larger font size as a header
9396 """
9497 if isinstance (doc , pymupdf .Document ):
9598 mydoc = doc
@@ -99,7 +102,7 @@ def __init__(
99102 if pages is None : # use all pages if omitted
100103 pages = range (mydoc .page_count )
101104
102- fontsizes = {}
105+ fontsizes = defaultdict ( int )
103106 for pno in pages :
104107 page = mydoc .load_page (pno )
105108 blocks = page .get_text ("dict" , flags = pymupdf .TEXTFLAGS_TEXT )["blocks" ]
@@ -111,8 +114,7 @@ def __init__(
111114 if not is_white (s ["text" ])
112115 ]:
113116 fontsz = round (span ["size" ])
114- count = fontsizes .get (fontsz , 0 ) + len (span ["text" ].strip ())
115- fontsizes [fontsz ] = count
117+ fontsizes [fontsz ] += len (span ["text" ].strip ())
116118
117119 if mydoc != doc :
118120 # if opened here, close it now
@@ -242,7 +244,7 @@ def to_markdown(
242244 filename = None ,
243245 force_text = True ,
244246 page_chunks = False ,
245- margins = ( 0 , 0 , 0 , 0 ) ,
247+ margins = 0 ,
246248 dpi = 150 ,
247249 page_width = 612 ,
248250 page_height = None ,
@@ -252,30 +254,30 @@ def to_markdown(
252254 ignore_code = False ,
253255 extract_words = False ,
254256 show_progress = False ,
255- ):
257+ use_glyphs = False ,
256258) -> str :
257259 """Process the document and return the text of the selected pages.
258260
259261 Args:
260262 doc: pymupdf.Document or string.
261263 pages: list of page numbers to consider (0-based).
262- hdr_info: callable or object having a method named 'get_hdr_info'.
263- write_images: (bool) whether to save images / drawing as files.
264- embed_images: (bool) embed images as base64 encoded strings
265- image_path: (str) folder into which images should be stored .
266- image_format: (str) desired image format. Choose a supported one.
267- force_text: (bool) output text despite of background.
264+ hdr_info: callable or object having method 'get_hdr_info'.
265+ write_images: (bool) save images / graphics as files.
266+ embed_images: (bool) embed images in markdown text ( base64 encoded)
267+ image_path: (str) store images in this folder .
268+ image_format: (str) use this image format. Choose a supported one.
269+ force_text: (bool) output text despite image background.
268270 page_chunks: (bool) whether to segment output by page.
269- margins: do not consider content overlapping margin areas.
271+ margins: omit content overlapping margin areas.
270272 dpi: (int) desired resolution for generated images.
271273 page_width: (float) assumption if page layout is variable.
272274 page_height: (float) assumption if page layout is variable.
273275 table_strategy: choose table detection strategy
274- graphics_limit: (int) ignore page with too many vector graphics .
276+ graphics_limit: (int) if vector graphics count exceeds this, ignore all .
275277 ignore_code: (bool) suppress code-like formatting (mono-space fonts)
276278 extract_words: (bool) include "words"-like output in page chunks
277279 show_progress: (bool) print progress as each page is processed.
278- image_extract_algorithm : (str) which algorithm to use "simple" or "simple-drop" .
280+ use_glyphs: (bool) replace invalid Unicode (0xFFFD) by the glyph number.
279281
280282 """
281283 if write_images is False and embed_images is False and force_text is False :
@@ -339,6 +341,14 @@ def to_markdown(
339341 hdr_info = IdentifyHeaders (doc )
340342 get_header_id = hdr_info .get_header_id
341343
def max_header_id(spans, page):
    """Return the markdown header prefix for a line, or "" if none applies.

    Every span is passed to ``get_header_id``. Among the non-empty
    results the shortest string decides: the returned prefix consists of
    (that length - 1) "#" characters followed by a single space.

    Args:
        spans: the spans making up one text line.
        page: the page object, forwarded to ``get_header_id``.
    """
    lengths = {len(get_header_id(span, page=page)) for span in spans}
    lengths.discard(0)  # an empty result means "this span is no header"
    if not lengths:
        return ""
    shortest = min(lengths)
    return "#" * (shortest - 1) + " "
351+
342352 def resolve_links (links , span ):
343353 """Accept a span and return a markdown link string.
344354
@@ -422,7 +432,11 @@ def write_text(
422432 ]
423433
424434 parms .line_rects .extend (
425- [l [0 ] for l in nlines if not intersects_rects (l [0 ], parms .tab_rects .values ())]
435+ [
436+ l [0 ]
437+ for l in nlines
438+ if not intersects_rects (l [0 ], parms .tab_rects .values ())
439+ ]
426440 ) # store line rectangles
427441
428442 prev_lrect = None # previous line rectangle
@@ -492,8 +506,17 @@ def write_text(
492506 parms .deleted_images .append (i )
493507
494508 parms .line_rects .append (lrect )
509+
510+ # make text string for the full line
495511 text = " " .join ([s ["text" ] for s in spans ])
496512
513+ # if line is a header, this will return multiple "#" characters,
514+ # otherwise an empty string
515+ hdr_string = max_header_id (spans , page = parms .page ) # a header?
516+
517+ # full line strikeout?
518+ all_strikeout = all ([s ["char_flags" ] & 1 for s in spans ])
519+
497520 # full line mono-spaced?
498521 if not IGNORE_CODE :
499522 all_mono = all ([s ["flags" ] & 8 for s in spans ])
@@ -512,6 +535,12 @@ def write_text(
512535 out_string += indent + text + "\n "
513536 continue # done with this line
514537
538+ if hdr_string : # if a header line skip the rest
539+ if all_strikeout :
540+ text = "~~" + text + "~~"
541+ out_string += hdr_string + text + "\n "
542+ continue
543+
515544 span0 = spans [0 ]
516545 bno = span0 ["block" ] # block number of line
517546 if bno != prev_bno :
@@ -528,9 +557,6 @@ def write_text(
528557 out_string += "\n "
529558 prev_lrect = lrect
530559
531- # if line is a header, this will return multiple "#" characters
532- hdr_string = get_header_id (span0 , page = parms .page )
533-
534560 # intercept if header text has been broken in multiple lines
535561 if hdr_string and hdr_string == prev_hdr_string :
536562 while out_string .endswith ("\n " ):
@@ -539,9 +565,6 @@ def write_text(
539565 continue
540566
541567 prev_hdr_string = hdr_string
542- if hdr_string .startswith ("#" ): # if a header line skip the rest
543- out_string += hdr_string + text + "\n "
544- continue
545568
546569 # this line is not all-mono, so switch off "code" mode
547570 if code : # in code output mode?
@@ -551,45 +574,47 @@ def write_text(
551574 for i , s in enumerate (spans ): # iterate spans of the line
552575 # decode font properties
553576 mono = s ["flags" ] & 8 and IGNORE_CODE is False
554- bold = s ["flags" ] & 16
577+ bold = s ["flags" ] & 16 or s [ "char_flags" ] & 8
555578 italic = s ["flags" ] & 2
579+ strikeout = s ["char_flags" ] & 1
556580
557581 if mono :
558582 # this is text in some monospaced font
559583 out_string += f"`{ s ['text' ].strip ()} ` "
560- else : # not a mono text
561- prefix = ""
562- suffix = ""
563- if hdr_string == "" :
564- if bold :
565- prefix = "**"
566- suffix += "**"
567- if italic :
568- prefix += "_"
569- suffix = "_" + suffix
570-
571- # convert intersecting link to markdown syntax
572- ltext = resolve_links (parms .links , s )
573- if ltext :
574- text = f"{ hdr_string } { prefix } { ltext } { suffix } "
575- else :
576- text = f"{ hdr_string } { prefix } { s ['text' ].strip ()} { suffix } "
577- if text .startswith (bullet ):
578- text = text [1 :]
579- if len (text ) > 1 and text [1 ] == " " :
580- t = "-"
581- else :
582- t = "- "
583- text = t + text [1 :]
584- dist = span0 ["bbox" ][0 ] - clip .x0
585- cwidth = (span0 ["bbox" ][2 ] - span0 ["bbox" ][0 ]) / len (
586- span0 ["text" ]
587- )
588- if cwidth == 0.0 :
589- cwidth = span0 ["size" ] * 0.5
590- text = " " * int (round (dist / cwidth )) + text
584+ continue
591585
592- out_string += text
586+ prefix = ""
587+ suffix = ""
588+ if bold :
589+ prefix = "**" + prefix
590+ suffix += "**"
591+ if italic :
592+ prefix = "*" + prefix
593+ suffix += "*"
594+ if strikeout :
595+ prefix = "~~" + prefix
596+ suffix += "~~"
597+
598+ # convert intersecting link to markdown syntax
599+ ltext = resolve_links (parms .links , s )
600+ if ltext :
601+ text = f"{ hdr_string } { prefix } { ltext } { suffix } "
602+ else :
603+ text = f"{ hdr_string } { prefix } { s ['text' ].strip ()} { suffix } "
604+ if text .startswith (bullet ):
605+ text = text [1 :]
606+ if len (text ) > 1 and text [1 ] == " " :
607+ t = "-"
608+ else :
609+ t = "- "
610+ text = t + text [1 :]
611+ dist = span0 ["bbox" ][0 ] - clip .x0
612+ cwidth = (span0 ["bbox" ][2 ] - span0 ["bbox" ][0 ]) / len (span0 ["text" ])
613+ if cwidth == 0.0 :
614+ cwidth = span0 ["size" ] * 0.5
615+ text = " " * int (round (dist / cwidth )) + text
616+
617+ out_string += text
593618 if not code :
594619 out_string += "\n "
595620 out_string += "\n "
@@ -807,17 +832,10 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
807832 parms .graphics = []
808833 parms .words = []
809834 parms .line_rects = []
835+
810836 # determine background color
811837 parms .bg_color = get_bg_color (page )
812- # catch too-many-graphics situation
813- if GRAPHICS_LIMIT is not None :
814- test_paths = page .get_cdrawings () # fastest access to graphics
815- if (excess := len (test_paths )) > GRAPHICS_LIMIT :
816- parms .md_string = (
817- f"\n **Ignoring page { page .number } with { excess } + vector graphics.**"
818- )
819- parms .md_string += "\n \n -----\n \n "
820- return parms
838+
821839 left , top , right , bottom = margins
822840 parms .clip = page .rect + (left , top , - right , - bottom )
823841
@@ -887,6 +905,10 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
887905 and not (p ["type" ] == "f" and p ["fill" ] == parms .bg_color )
888906 ]
889907
908+ # catch too-many-graphics situation
909+ if GRAPHICS_LIMIT and len (paths ) > GRAPHICS_LIMIT :
910+ paths = []
911+
890912 # We also ignore vector graphics that only represent
891913 # "text emphasizing sugar".
892914 vg_clusters0 = [] # worthwhile vector graphics go here
@@ -988,7 +1010,17 @@ def get_page_output(doc, pno, margins, textflags, FILENAME):
9881010 # read the Table of Contents
9891011 toc = doc .get_toc ()
9901012
991- textflags = pymupdf .TEXT_MEDIABOX_CLIP | pymupdf .TEXT_ACCURATE_BBOXES
1013+ # Text extraction flags:
1014+ # omit invisible text, collect styles, use accurate bounding boxes
1015+ textflags = (
1016+ 0
1017+ | mupdf .FZ_STEXT_CLIP
1018+ | mupdf .FZ_STEXT_ACCURATE_BBOXES
1019+ | 32768 # mupdf.FZ_STEXT_COLLECT_STYLES
1020+ )
1021+ # optionally replace 0xFFFD by glyph number
1022+ if use_glyphs :
1023+ textflags |= mupdf .FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE
9921024
9931025 if show_progress :
9941026 print (f"Processing { FILENAME } ..." )
@@ -1082,9 +1114,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
10821114 import time
10831115
10841116 try :
1085- filename = (
1086- "C:/Users/haral/OneDrive/Desktop/pymupdf4llm/issues/0225/e000050.full.pdf"
1087- )
1117+ filename = "markdown.pdf"
10881118 except IndexError :
10891119 print (f"Usage:\n python { os .path .basename (__file__ )} input.pdf" )
10901120 sys .exit ()
@@ -1117,11 +1147,10 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
11171147 pages = pages ,
11181148 write_images = True ,
11191149 force_text = False ,
1120- image_path = r"C:\Users\haral\OneDrive\Desktop\pymupdf4llm\rag\pymupdf4llm\pymupdf4llm\helpers" ,
11211150 )
11221151 FILENAME = doc .name
11231152 # output to a text file with extension ".md"
1124- outname = FILENAME . replace ( ".pdf" , ".md" )
1153+ outname = FILENAME + ".md"
11251154 pathlib .Path (outname ).write_bytes (md_string .encode ())
11261155 t1 = time .perf_counter () # stop timer
11271156 print (f"Markdown creation time for { FILENAME = } { round (t1 - t0 ,2 )} sec." )
0 commit comments