Skip to content

Commit 4578d2d

Browse files
committed
[PDFMardownReader] Improved use of doc meta: rename use_meta to use_doc_meta and apply meta_filter even when doc meta is disabled
1 parent 49be1f8 commit 4578d2d

File tree

1 file changed

+9
-9
lines changed

1 file changed

+9
-9
lines changed

pymupdf4llm/pymupdf4llm/llama_index/pdf_markdown_reader.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@
2020
class PDFMardownReader(BaseReader):
2121
"""Read PDF files using PyMuPDF library."""
2222

23-
use_meta: bool = True
23+
use_doc_meta: bool = True
2424
meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None
2525

2626
def __init__(
2727
self,
28-
use_meta: bool = True,
28+
use_doc_meta: bool = True,
2929
meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
3030
):
31-
self.use_meta = use_meta
31+
self.use_doc_meta = use_doc_meta
3232
self.meta_filter = meta_filter
3333

3434
def load_data(
@@ -108,13 +108,16 @@ def _process_doc_page(
108108
page_number: int,
109109
):
110110
"""Processes a single page of a PDF document."""
111-
if self.use_meta:
112-
extra_info = self._process_meta(doc, file_path, page_number, extra_info)
111+
if self.use_doc_meta:
112+
extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info)
113+
114+
if self.meta_filter:
115+
extra_info = self.meta_filter(extra_info)
113116

114117
text = pymupdf4llm.to_markdown(doc, [page_number])
115118
return LlamaIndexDocument(text=text, extra_info=extra_info)
116119

117-
def _process_meta(
120+
def _process_doc_meta(
118121
self,
119122
doc: FitzDocument,
120123
file_path: Union[Path, str],
@@ -127,7 +130,4 @@ def _process_meta(
127130
extra_info["total_pages"] = len(doc)
128131
extra_info["file_path"] = str(file_path)
129132

130-
if self.meta_filter:
131-
extra_info = self.meta_filter(extra_info)
132-
133133
return extra_info

0 commit comments

Comments
 (0)