2828
2929import os
3030import string
31- import typing
3231
3332try :
3433 import pymupdf as fitz # available with v1.24.3
@@ -50,8 +49,8 @@ class IdentifyHeaders:
5049
5150 def __init__ (
5251 self ,
53- doc : fitz . Document | str ,
54- pages : list | range | None = None ,
52+ doc : str ,
53+ pages : list = None ,
5554 body_limit : float = 12 ,
5655 ):
5756 """Read all text and make a dictionary of fontsizes.
@@ -113,7 +112,7 @@ def __init__(
113112 for i , size in enumerate (sizes ):
114113 self .header_id [size ] = "#" * (i + 1 ) + " "
115114
116- def get_header_id (self , span : dict , ** kwargs ) -> str :
115+ def get_header_id (self , span : dict , page = None ) -> str :
117116 """Return appropriate markdown header prefix.
118117
119118 Given a text span from a "dict"/"rawdict" extraction, determine the
@@ -125,14 +124,14 @@ def get_header_id(self, span: dict, **kwargs) -> str:
125124
126125
127126def to_markdown (
128- doc : fitz . Document | str ,
127+ doc : str ,
129128 * ,
130- pages : list | range | None = None ,
131- hdr_info : typing . Any = None ,
129+ pages : list = None ,
130+ hdr_info = None ,
132131 write_images : bool = False ,
133132 page_chunks : bool = False ,
134- margins : float | typing .Iterable = (0 , 50 , 0 , 50 ),
135- ) -> str | list [ dict ] :
133+ margins : typing .Iterable = (0 , 50 , 0 , 50 ),
134+ ) -> str :
136135 """Process the document and return the text of its selected pages."""
137136
138137 if isinstance (doc , str ):
@@ -188,9 +187,9 @@ def write_text(
188187 textpage : fitz .TextPage ,
189188 clip : fitz .Rect ,
190189 tabs = None ,
191- tab_rects : dict | None = None ,
192- img_rects : dict | None = None ,
193- links : list | None = None ,
190+ tab_rects : dict = None ,
191+ img_rects : dict = None ,
192+ links : list = None ,
194193 ) -> string :
195194 """Output the text found inside the given clip.
196195
@@ -289,15 +288,15 @@ def write_text(
289288 prev_lrect = lrect
290289
291290 # if line is a header, this will return multiple "#" characters
292- hdr_string = get_header_id (span0 )
291+ hdr_string = get_header_id (span0 , page = page )
293292
294293 # intercept if header text has been broken in multiple lines
295294 if hdr_string and hdr_string == prev_hdr_string :
296295 out_string = out_string [:- 1 ] + " " + text + "\n "
297296 continue
298297
299298 prev_hdr_string = hdr_string
300- if hdr_string .startswith ("#" ): # if a header output and skip the rest
299+ if hdr_string .startswith ("#" ): # if a header line skip the rest
301300 out_string += hdr_string + text + "\n "
302301 continue
303302
@@ -421,29 +420,28 @@ def get_page_output(doc, pno, margins, textflags):
421420 doc: fitz.Document
422421 pno: 0-based page number
423422 textflags: text extraction flag bits
424- images: store image information here
425- tables: store table information here
426- graphics: store graphics information here
427423
428424 Returns:
429- Markdown string of page content.
425+ Markdown string of page content and image, table and vector
426+ graphics information.
430427 """
431428 page = doc [pno ]
432429 md_string = ""
433-
430+ left , top , right , bottom = margins
431+ clip = page .rect + (left , top , - right , - bottom )
434432 # extract all links on page
435433 links = [l for l in page .get_links () if l ["kind" ] == 2 ]
436434
437435 # make a TextPage for all later extractions
438- textpage = page .get_textpage (flags = textflags )
436+ textpage = page .get_textpage (flags = textflags , clip = clip )
439437
440- img_info = page .get_image_info ()
438+ img_info = [ img for img in page .get_image_info () if img [ "bbox" ] in clip ]
441439 images = img_info [:]
442440 tables = []
443441 graphics = []
444442
445443 # Locate all tables on page
446- tabs = page .find_tables (strategy = "lines_strict" )
444+ tabs = page .find_tables (clip = clip , strategy = "lines_strict" )
447445
448446 # Make a list of table boundary boxes.
449447 # Must include the header bbox (may exist outside tab.bbox)
@@ -485,7 +483,7 @@ def get_page_output(doc, pno, margins, textflags):
485483 vg_clusters .append (bbox )
486484
487485 actual_paths = [p for p in paths if is_in_rects (p ["rect" ], vg_clusters )]
488- print ( f"before: { len ( vg_clusters ) = } " )
486+
489487 vg_clusters0 = [
490488 r
491489 for r in vg_clusters
@@ -496,8 +494,8 @@ def get_page_output(doc, pno, margins, textflags):
496494 vg_clusters0 += [fitz .Rect (i ["bbox" ]) for i in img_info ]
497495
498496 vg_clusters = dict ((i , r ) for i , r in enumerate (vg_clusters0 ))
497+
499498 # Determine text column bboxes on page, avoiding tables and graphics
500- print (f"{ len (tab_rects0 )= } , { len (vg_clusters0 )= } " )
501499 text_rects = column_boxes (
502500 page ,
503501 paths = actual_paths ,
0 commit comments