1- import asyncio
21from pathlib import Path
32from typing import Any , Callable , Dict , List , Optional , Union
43
2322class PDFMarkdownReader (BaseReader ):
2423 """Read PDF files using PyMuPDF library."""
2524
26- use_doc_meta : bool = True
2725 meta_filter : Optional [Callable [[Dict [str , Any ]], Dict [str , Any ]]] = None
2826
2927 def __init__ (
3028 self ,
31- use_doc_meta : bool = True ,
3229 meta_filter : Optional [Callable [[Dict [str , Any ]], Dict [str , Any ]]] = None ,
3330 ):
34- self .use_doc_meta = use_doc_meta
3531 self .meta_filter = meta_filter
3632
3733 def load_data (
@@ -73,45 +69,6 @@ def load_data(
7369 )
7470 return docs
7571
76- async def aload_data (
77- self ,
78- file_path : Union [Path , str ],
79- extra_info : Optional [Dict ] = None ,
80- ** load_kwargs : Any ,
81- ) -> List [LlamaIndexDocument ]:
82- """Asynchronously loads list of documents from PDF file and also accepts extra information in dict format.
83-
84- Args:
85- file_path (Union[Path, str]): The path to the PDF file.
86- extra_info (Optional[Dict], optional): A dictionary containing extra information. Defaults to None.
87- **load_kwargs (Any): Additional keyword arguments to be passed to the load method.
88-
89- Returns:
90- List[LlamaIndexDocument]: A list of LlamaIndexDocument objects.
91- """
92- if not isinstance (file_path , str ) and not isinstance (file_path , Path ):
93- raise TypeError ("file_path must be a string or Path." )
94-
95- if not extra_info :
96- extra_info = {}
97-
98- if extra_info and not isinstance (extra_info , dict ):
99- raise TypeError ("extra_info must be a dictionary." )
100-
101- # extract text header information
102- hdr_info = IdentifyHeaders (file_path )
103-
104- doc : FitzDocument = fitz .open (file_path )
105- tasks = []
106-
107- for page in doc :
108- tasks .append (
109- self ._process_doc_page (
110- doc , extra_info , file_path , page .number , hdr_info
111- )
112- )
113- return await asyncio .gather (* tasks )
114-
11572 # Helpers
11673 # ---
11774
@@ -124,8 +81,7 @@ def _process_doc_page(
12481 hdr_info : IdentifyHeaders ,
12582 ):
12683 """Processes a single page of a PDF document."""
127- if self .use_doc_meta :
128- extra_info = self ._process_doc_meta (doc , file_path , page_number , extra_info )
84+ extra_info = self ._process_doc_meta (doc , file_path , page_number , extra_info )
12985
13086 if self .meta_filter :
13187 extra_info = self .meta_filter (extra_info )
0 commit comments