Merge pull request #20 from YanSte/main

JorjMcKie · web-flow · commit 5601ebf4d405 · 2024-05-23T06:02:39.000-04:00
[Llama] Fixed async load: removed the custom aload_data method and the use_doc_meta option
diff --git a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py
@@ -1,4 +1,3 @@
-import asyncio
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
 
@@ -23,15 +22,12 @@
 class PDFMarkdownReader(BaseReader):
     """Read PDF files using PyMuPDF library."""
 
-    use_doc_meta: bool = True
     meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None
 
     def __init__(
         self,
-        use_doc_meta: bool = True,
         meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
     ):
-        self.use_doc_meta = use_doc_meta
         self.meta_filter = meta_filter
 
     def load_data(
@@ -73,45 +69,6 @@ def load_data(
             )
         return docs
 
-    async def aload_data(
-        self,
-        file_path: Union[Path, str],
-        extra_info: Optional[Dict] = None,
-        **load_kwargs: Any,
-    ) -> List[LlamaIndexDocument]:
-        """Asynchronously loads list of documents from PDF file and also accepts extra information in dict format.
-
-        Args:
-            file_path (Union[Path, str]): The path to the PDF file.
-            extra_info (Optional[Dict], optional): A dictionary containing extra information. Defaults to None.
-            **load_kwargs (Any): Additional keyword arguments to be passed to the load method.
-
-        Returns:
-            List[LlamaIndexDocument]: A list of LlamaIndexDocument objects.
-        """
-        if not isinstance(file_path, str) and not isinstance(file_path, Path):
-            raise TypeError("file_path must be a string or Path.")
-
-        if not extra_info:
-            extra_info = {}
-
-        if extra_info and not isinstance(extra_info, dict):
-            raise TypeError("extra_info must be a dictionary.")
-
-        # extract text header information
-        hdr_info = IdentifyHeaders(file_path)
-
-        doc: FitzDocument = fitz.open(file_path)
-        tasks = []
-
-        for page in doc:
-            tasks.append(
-                self._process_doc_page(
-                    doc, extra_info, file_path, page.number, hdr_info
-                )
-            )
-        return await asyncio.gather(*tasks)
-
     # Helpers
     # ---
 
@@ -124,8 +81,7 @@ def _process_doc_page(
         hdr_info: IdentifyHeaders,
     ):
         """Processes a single page of a PDF document."""
-        if self.use_doc_meta:
-            extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info)
+        extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info)
 
         if self.meta_filter:
             extra_info = self.meta_filter(extra_info)
diff --git a/tests/pymupdf4llm/llama_index/test_pdf_markdown_reader.py b/tests/pymupdf4llm/llama_index/test_pdf_markdown_reader.py
@@ -73,30 +73,6 @@ def test_load_data_with_invalid_extra_info():
         pdf_reader.load_data(path, extra_info)
 
 
-@pytest.mark.asyncio
-async def test_aload_data():
-    # Arrange
-    # ---
-    pdf_reader = PDFMarkdownReader(use_meta=True)
-    extra_info = {"test_key": "test_value"}
-    path = _get_test_file_path(PDF, __file__)
-
-    # Act
-    # ---
-    documents = await pdf_reader.aload_data(path, extra_info)
-
-    # Assert
-    # ---
-    expected_key = "test_key"
-    expected_value = "test_value"
-
-    assert isinstance(documents, list)
-    for doc in documents:
-        assert expected_key in doc.metadata
-        assert expected_value in doc.metadata.values()
-        assert isinstance(doc, LlamaIndexDocument)
-
-
 @pytest.mark.asyncio
 async def test_aload_data_with_invalid_file_path():
     # Arrange