2020class PDFMardownReader (BaseReader ):
2121 """Read PDF files page by page as Markdown using the PyMuPDF library."""
2222
23- use_meta : bool = True
23+ use_doc_meta : bool = True
2424 meta_filter : Optional [Callable [[Dict [str , Any ]], Dict [str , Any ]]] = None
2525
2626 def __init__ (
2727 self ,
28- use_meta : bool = True ,
28+ use_doc_meta : bool = True ,
2929 meta_filter : Optional [Callable [[Dict [str , Any ]], Dict [str , Any ]]] = None ,
3030 ):
31- self .use_meta = use_meta
31+ self .use_doc_meta = use_doc_meta
3232 self .meta_filter = meta_filter
3333
3434 def load_data (
@@ -108,13 +108,16 @@ def _process_doc_page(
108108 page_number : int ,
109109 ):
110110 """Processes a single page of a PDF document."""
111- if self .use_meta :
112- extra_info = self ._process_meta (doc , file_path , page_number , extra_info )
111+ if self .use_doc_meta :
112+ extra_info = self ._process_doc_meta (doc , file_path , page_number , extra_info )
113+
114+ if self .meta_filter :
115+ extra_info = self .meta_filter (extra_info )
113116
114117 text = pymupdf4llm .to_markdown (doc , [page_number ])
115118 return LlamaIndexDocument (text = text , extra_info = extra_info )
116119
117- def _process_meta (
120+ def _process_doc_meta (
118121 self ,
119122 doc : FitzDocument ,
120123 file_path : Union [Path , str ],
@@ -127,7 +130,4 @@ def _process_meta(
127130 extra_info ["total_pages" ] = len (doc )
128131 extra_info ["file_path" ] = str (file_path )
129132
130- if self .meta_filter :
131- extra_info = self .meta_filter (extra_info )
132-
133133 return extra_info
0 commit comments