[PDFMardownReader] Added LlamaIndex Reader

YanSte · YanSte · commit 90fa4fb806d1 · 2024-05-10T11:12:37.000+02:00
diff --git a/pymupdf4llm/pymupdf4llm/llama_index/pdf_markdown_reader.py b/pymupdf4llm/pymupdf4llm/llama_index/pdf_markdown_reader.py
@@ -0,0 +1,143 @@
+import asyncio
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Union
+
+# PyMuPDF ('fitz') and LlamaIndex are both required by this reader.
+try:
+    import fitz
+    from fitz import Document as FitzDocument
+    from llama_index.core.readers.base import BaseReader
+    from llama_index.core.schema import Document as LlamaIndexDocument
+except ImportError as err:
+    # Exception type kept as-is; chaining keeps the missing module visible.
+    raise NotImplementedError(
+        "Please install 'pymupdf' and 'llama_index' to use PDFMardownReader."
+    ) from err
+
+
+import pymupdf4llm
+
+
+class PDFMardownReader(BaseReader):
+    """Read a PDF with PyMuPDF and emit one Markdown document per page.
+
+    Attributes:
+        use_meta: When true, PDF metadata and page details are merged into
+            each document's extra information.
+        meta_filter: Optional callable applied to the merged metadata; the
+            dictionary it returns is what gets attached to the document.
+    """
+
+    use_meta: bool = True
+    meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None
+
+    def __init__(
+        self,
+        use_meta: bool = True,
+        meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
+    ):
+        self.use_meta = use_meta
+        self.meta_filter = meta_filter
+
+    def load_data(
+        self,
+        file_path: Union[Path, str],
+        extra_info: Optional[Dict] = None,
+        **load_kwargs: Any,
+    ) -> List[LlamaIndexDocument]:
+        """Load a PDF file as a list of documents, one per page.
+
+        Args:
+            file_path (Union[Path, str]): The path to the PDF file.
+            extra_info (Optional[Dict], optional): Extra information attached to
+                every document. It is copied, never modified. Defaults to None.
+            **load_kwargs (Any): Accepted but currently unused.
+
+        Returns:
+            List[LlamaIndexDocument]: One document per page, in page order.
+
+        Raises:
+            TypeError: If file_path or a non-empty extra_info has the wrong type.
+        """
+        extra_info = self._validate_inputs(file_path, extra_info)
+
+        # The context manager closes the PDF even when a page fails to convert.
+        with fitz.open(file_path) as doc:
+            return [
+                self._process_doc_page(doc, extra_info, file_path, page.number)
+                for page in doc
+            ]
+
+    async def aload_data(
+        self,
+        file_path: Union[Path, str],
+        extra_info: Optional[Dict] = None,
+        **load_kwargs: Any,
+    ) -> List[LlamaIndexDocument]:
+        """Asynchronous counterpart of load_data: same arguments, result, errors.
+
+        The page coroutines never await, so pages are still converted in turn.
+        """
+        extra_info = self._validate_inputs(file_path, extra_info)
+
+        # Keep the PDF open until every page task has finished, then close it.
+        with fitz.open(file_path) as doc:
+            tasks = [
+                self._aprocess_doc_page(doc, extra_info, file_path, page.number)
+                for page in doc
+            ]
+            return await asyncio.gather(*tasks)
+
+    # Helpers
+    # ---
+    @staticmethod
+    def _validate_inputs(
+        file_path: Union[Path, str], extra_info: Optional[Dict]
+    ) -> Dict[str, Any]:
+        """Type-check the loader arguments and return a usable extra_info."""
+        if not isinstance(file_path, (str, Path)):
+            raise TypeError("file_path must be a string or Path.")
+        if not extra_info:
+            return {}  # None and other empty values mean "no extra information"
+        if not isinstance(extra_info, dict):
+            raise TypeError("extra_info must be a dictionary.")
+        return extra_info
+
+    async def _aprocess_doc_page(
+        self,
+        doc: FitzDocument,
+        extra_info: Dict[str, Any],
+        file_path: Union[Path, str],
+        page_number: int,
+    ) -> LlamaIndexDocument:
+        """Coroutine wrapper around _process_doc_page; it never awaits."""
+        return self._process_doc_page(doc, extra_info, file_path, page_number)
+
+    def _process_doc_page(
+        self,
+        doc: FitzDocument,
+        extra_info: Dict[str, Any],
+        file_path: Union[Path, str],
+        page_number: int,
+    ) -> LlamaIndexDocument:
+        """Convert the page at index page_number into a Markdown document."""
+        if self.use_meta:
+            extra_info = self._process_meta(doc, file_path, page_number, extra_info)
+
+        text = pymupdf4llm.to_markdown(doc, [page_number])
+        return LlamaIndexDocument(text=text, extra_info=extra_info)
+
+    def _process_meta(
+        self,
+        doc: FitzDocument,
+        file_path: Union[Path, str],
+        page_number: int,
+        extra_info: Optional[Dict] = None,
+    ) -> Dict[str, Any]:
+        """Build the metadata of one page without touching the caller's dict."""
+        # Work on a copy: the same extra_info is reused for every page.
+        meta = {**(extra_info or {}), **(doc.metadata or {})}
+        meta["page_number"] = f"{page_number+1}"
+        meta["total_pages"] = len(doc)
+        meta["file_path"] = str(file_path)
+        return self.meta_filter(meta) if self.meta_filter else meta
diff --git a/tests/pymupdf4llm/llama_index/test_pdf_markdown_reader.py b/tests/pymupdf4llm/llama_index/test_pdf_markdown_reader.py
@@ -0,0 +1,124 @@
+import os
+
+import pytest
+from llama_index.core.schema import Document as LlamaIndexDocument
+
+from pymupdf4llm.pymupdf4llm.llama_index.pdf_markdown_reader import PDFMardownReader
+
+# Temporary.
+
+PDF = "input.pdf"
+
+
+def _get_test_file_path(file_name: str, __file__: str = __file__) -> str:
+    """Return the normalized path of ``file_name`` in the ``helpers`` folder.
+
+    ``helpers`` is resolved three levels above the directory of ``__file__``.
+    """
+    base_dir = os.path.dirname(os.path.abspath(__file__))
+    # Keep every part a separate argument: adjacent string literals such as
+    # '".." "helpers"' are concatenated into the single component "..helpers".
+    parts = (base_dir, "..", "..", "..", "helpers", file_name)
+    return os.path.normpath(os.path.join(*parts))
+
+
+def test_load_data():
+    """load_data returns a list made only of LlamaIndexDocument objects."""
+    # Arrange
+    # ---
+    reader = PDFMardownReader()
+    pdf_path = _get_test_file_path(PDF, __file__)
+    metadata = {"test_key": "test_value"}
+
+    # Act
+    # ---
+    result = reader.load_data(pdf_path, metadata)
+
+    # Assert
+    # ---
+    assert isinstance(result, list)
+    assert all(isinstance(item, LlamaIndexDocument) for item in result)
+
+
+def test_load_data_with_invalid_file_path():
+    """load_data raises for the bogus path "fake/path"."""
+    # Arrange
+    # ---
+    reader = PDFMardownReader()
+    missing_path = "fake/path"
+
+    # Act & Assert
+    # ---
+    with pytest.raises(Exception):
+        reader.load_data(missing_path, {"test_key": "test_value"})
+
+
+def test_load_data_with_invalid_extra_info():
+    """load_data rejects an extra_info that is not a dictionary."""
+    # Arrange
+    # ---
+    reader = PDFMardownReader()
+    pdf_path = _get_test_file_path(PDF, __file__)
+
+    # Act & Assert
+    # ---
+    with pytest.raises(TypeError):
+        reader.load_data(pdf_path, "invalid_extra_info")
+
+
+@pytest.mark.asyncio
+async def test_aload_data():
+    """aload_data keeps the caller's extra_info in every document's metadata."""
+    # Arrange
+    # ---
+    expected_key, expected_value = "test_key", "test_value"
+    reader = PDFMardownReader(use_meta=True)
+    pdf_path = _get_test_file_path(PDF, __file__)
+
+    # Act
+    # ---
+    result = await reader.aload_data(pdf_path, {expected_key: expected_value})
+
+    # Assert
+    # ---
+    assert isinstance(result, list)
+    # Every page document must carry the caller-supplied entry.
+    for item in result:
+        metadata = item.metadata
+        assert expected_key in metadata
+        assert expected_value in metadata.values()
+        assert isinstance(item, LlamaIndexDocument)
+
+
+@pytest.mark.asyncio
+async def test_aload_data_with_invalid_file_path():
+    """aload_data raises for the bogus path "Fake/path"."""
+    # Arrange
+    # ---
+    reader = PDFMardownReader()
+    metadata = {"test_key": "test_value"}
+    missing_path = "Fake/path"
+
+    # Act & Assert
+    # ---
+    # The call itself is expected to fail, so the "act" and "assert" steps
+    # coincide here.
+    with pytest.raises(Exception):
+        await reader.aload_data(missing_path, metadata)
+
+
+@pytest.mark.asyncio
+async def test_aload_data_with_invalid_extra_info():
+    """aload_data rejects an extra_info that is not a dictionary."""
+    # Arrange
+    # ---
+    reader = PDFMardownReader()
+    bad_extra_info = "invalid_extra_info"
+    pdf_path = _get_test_file_path(PDF, __file__)
+
+    # Act & Assert
+    # ---
+    # The call itself is expected to fail, so the "act" and "assert" steps
+    # coincide here.
+    with pytest.raises(TypeError):
+        await reader.aload_data(pdf_path, bad_extra_info)