Skip to content

Commit 5601ebf

Browse files
authored
Merge pull request #20 from YanSte/main
[Llama] Fixed Async load
2 parents: a65ea4a + 30bf200 · commit 5601ebf

File tree

2 files changed

+1
-69
lines changed

2 files changed

+1
-69
lines changed

pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py

Lines changed: 1 addition & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
import asyncio
21
from pathlib import Path
32
from typing import Any, Callable, Dict, List, Optional, Union
43

@@ -23,15 +22,12 @@
2322
class PDFMarkdownReader(BaseReader):
2423
"""Read PDF files using PyMuPDF library."""
2524

26-
use_doc_meta: bool = True
2725
meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None
2826

2927
def __init__(
3028
self,
31-
use_doc_meta: bool = True,
3229
meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
3330
):
34-
self.use_doc_meta = use_doc_meta
3531
self.meta_filter = meta_filter
3632

3733
def load_data(
@@ -73,45 +69,6 @@ def load_data(
7369
)
7470
return docs
7571

76-
async def aload_data(
77-
self,
78-
file_path: Union[Path, str],
79-
extra_info: Optional[Dict] = None,
80-
**load_kwargs: Any,
81-
) -> List[LlamaIndexDocument]:
82-
"""Asynchronously loads list of documents from PDF file and also accepts extra information in dict format.
83-
84-
Args:
85-
file_path (Union[Path, str]): The path to the PDF file.
86-
extra_info (Optional[Dict], optional): A dictionary containing extra information. Defaults to None.
87-
**load_kwargs (Any): Additional keyword arguments to be passed to the load method.
88-
89-
Returns:
90-
List[LlamaIndexDocument]: A list of LlamaIndexDocument objects.
91-
"""
92-
if not isinstance(file_path, str) and not isinstance(file_path, Path):
93-
raise TypeError("file_path must be a string or Path.")
94-
95-
if not extra_info:
96-
extra_info = {}
97-
98-
if extra_info and not isinstance(extra_info, dict):
99-
raise TypeError("extra_info must be a dictionary.")
100-
101-
# extract text header information
102-
hdr_info = IdentifyHeaders(file_path)
103-
104-
doc: FitzDocument = fitz.open(file_path)
105-
tasks = []
106-
107-
for page in doc:
108-
tasks.append(
109-
self._process_doc_page(
110-
doc, extra_info, file_path, page.number, hdr_info
111-
)
112-
)
113-
return await asyncio.gather(*tasks)
114-
11572
# Helpers
11673
# ---
11774

@@ -124,8 +81,7 @@ def _process_doc_page(
12481
hdr_info: IdentifyHeaders,
12582
):
12683
"""Processes a single page of a PDF document."""
127-
if self.use_doc_meta:
128-
extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info)
84+
extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info)
12985

13086
if self.meta_filter:
13187
extra_info = self.meta_filter(extra_info)

tests/pymupdf4llm/llama_index/test_pdf_markdown_reader.py

Lines changed: 0 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -73,30 +73,6 @@ def test_load_data_with_invalid_extra_info():
7373
pdf_reader.load_data(path, extra_info)
7474

7575

76-
@pytest.mark.asyncio
77-
async def test_aload_data():
78-
# Arrange
79-
# ---
80-
pdf_reader = PDFMarkdownReader(use_meta=True)
81-
extra_info = {"test_key": "test_value"}
82-
path = _get_test_file_path(PDF, __file__)
83-
84-
# Act
85-
# ---
86-
documents = await pdf_reader.aload_data(path, extra_info)
87-
88-
# Assert
89-
# ---
90-
expected_key = "test_key"
91-
expected_value = "test_value"
92-
93-
assert isinstance(documents, list)
94-
for doc in documents:
95-
assert expected_key in doc.metadata
96-
assert expected_value in doc.metadata.values()
97-
assert isinstance(doc, LlamaIndexDocument)
98-
99-
10076
@pytest.mark.asyncio
10177
async def test_aload_data_with_invalid_file_path():
10278
# Arrange

0 commit comments

Comments
 (0)