Skip to content

Commit 6d55dd4

Browse files
authored
Merge pull request #7 from YanSte/feature/llama_index_reader
[PDFMardownReader] LlamaIndex Reader
2 parents f3c29c8 + 4578d2d commit 6d55dd4

File tree

2 files changed

+264
-0
lines changed

2 files changed

+264
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import asyncio
2+
from pathlib import Path
3+
from typing import Any, Callable, Dict, List, Optional, Union
4+
5+
import fitz
6+
from fitz import Document as FitzDocument
7+
8+
try:
9+
from llama_index.core.readers.base import BaseReader
10+
from llama_index.core.schema import Document as LlamaIndexDocument
11+
12+
print("All imports are successful.")
13+
except ImportError:
14+
raise NotImplementedError("Please install 'llama_index' is needed.")
15+
16+
17+
import pymupdf4llm
18+
19+
20+
class PDFMardownReader(BaseReader):
    """Read a PDF with PyMuPDF and return one Markdown document per page.

    Each page is converted to Markdown with ``pymupdf4llm.to_markdown`` and
    wrapped in a ``LlamaIndexDocument`` whose metadata is built from the
    caller's ``extra_info`` plus, optionally, the PDF's own metadata.

    Args:
        use_doc_meta: When true, the PDF metadata together with
            ``page_number``, ``total_pages`` and ``file_path`` is merged
            into every page's metadata.
        meta_filter: Optional callable that receives the assembled metadata
            dict of a page and returns the dict to store on the document.
    """

    use_doc_meta: bool = True
    meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None

    def __init__(
        self,
        use_doc_meta: bool = True,
        meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
    ):
        self.use_doc_meta = use_doc_meta
        self.meta_filter = meta_filter

    def load_data(
        self,
        file_path: Union[Path, str],
        extra_info: Optional[Dict] = None,
        **load_kwargs: Any,
    ) -> List[LlamaIndexDocument]:
        """Load a PDF file and return one document per page.

        Args:
            file_path: The path to the PDF file.
            extra_info: Extra metadata to attach to every page. Any falsy
                value (``None``, ``{}``) is treated as "no extra metadata".
                The dict passed in is not modified.
            **load_kwargs: Accepted for interface compatibility; not used
                by this reader.

        Returns:
            A list of ``LlamaIndexDocument`` objects, one per PDF page, in
            page order.

        Raises:
            TypeError: If ``file_path`` is not a ``str`` or ``Path``, or if
                ``extra_info`` is a truthy value that is not a dict.
        """
        if not isinstance(file_path, (str, Path)):
            raise TypeError("file_path must be a string or Path.")

        if not extra_info:
            extra_info = {}
        elif not isinstance(extra_info, dict):
            raise TypeError("extra_info must be a dictionary.")

        # The context manager closes the PDF even if a page fails to
        # convert; the returned documents only hold plain text and
        # metadata values, so they stay valid after the file is closed.
        with fitz.open(file_path) as doc:
            return [
                self._process_doc_page(doc, extra_info, file_path, page_number)
                for page_number in range(len(doc))
            ]

    async def aload_data(
        self,
        file_path: Union[Path, str],
        extra_info: Optional[Dict] = None,
        **load_kwargs: Any,
    ) -> List[LlamaIndexDocument]:
        """Coroutine variant of ``load_data`` with the same contract.

        Args:
            file_path: The path to the PDF file.
            extra_info: Extra metadata to attach to every page. Any falsy
                value is treated as "no extra metadata".
            **load_kwargs: Accepted for interface compatibility; not used
                by this reader.

        Returns:
            A list of ``LlamaIndexDocument`` objects, one per PDF page, in
            page order.

        Raises:
            TypeError: If ``file_path`` is not a ``str`` or ``Path``, or if
                ``extra_info`` is a truthy value that is not a dict.
        """
        # Page conversion is plain synchronous code, so its results are
        # finished documents rather than awaitables and must not be handed
        # to ``asyncio.gather`` (which raises TypeError for non-awaitables).
        # NOTE(review): the work is deliberately not pushed to a worker
        # thread; PyMuPDF's documentation advises against multithreaded
        # use - confirm before adding any offloading here.
        return self.load_data(file_path, extra_info, **load_kwargs)

    # Helpers
    # ---

    def _process_doc_page(
        self,
        doc: FitzDocument,
        extra_info: Dict[str, Any],
        file_path: Union[Path, str],
        page_number: int,
    ) -> LlamaIndexDocument:
        """Convert a single page of ``doc`` into a ``LlamaIndexDocument``.

        Args:
            doc: The open PDF document.
            extra_info: Base metadata shared by all pages; it is copied, so
                neither the caller's dict nor other pages are affected.
            file_path: Path the document was opened from (stored in the
                metadata when ``use_doc_meta`` is enabled).
            page_number: Zero-based index of the page to convert.

        Returns:
            The page's Markdown text wrapped in a ``LlamaIndexDocument``.
        """
        # Work on a per-page copy: the metadata helper writes page-specific
        # keys, which would otherwise leak between pages and into the
        # dictionary supplied by the caller.
        page_info = dict(extra_info)

        if self.use_doc_meta:
            page_info = self._process_doc_meta(doc, file_path, page_number, page_info)

        if self.meta_filter:
            page_info = self.meta_filter(page_info)

        text = pymupdf4llm.to_markdown(doc, [page_number])
        return LlamaIndexDocument(text=text, extra_info=page_info)

    def _process_doc_meta(
        self,
        doc: FitzDocument,
        file_path: Union[Path, str],
        page_number: int,
        extra_info: Optional[Dict] = None,
    ) -> Dict[str, Any]:
        """Merge document-level and page-level metadata into ``extra_info``.

        Args:
            doc: The open PDF document whose ``metadata`` is merged in.
            file_path: Path the document was opened from.
            page_number: Zero-based page index; stored one-based.
            extra_info: Dict to update in place. ``None`` starts from an
                empty dict.

        Returns:
            The updated metadata dict (the same object as ``extra_info``
            when one was supplied).
        """
        if extra_info is None:
            extra_info = {}

        extra_info.update(doc.metadata)
        # Stored as a one-based string, unlike the integer ``total_pages``.
        extra_info["page_number"] = f"{page_number+1}"
        extra_info["total_pages"] = len(doc)
        extra_info["file_path"] = str(file_path)

        return extra_info
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import os
2+
3+
import pytest
4+
from llama_index.core.schema import Document as LlamaIndexDocument
5+
6+
try:
7+
from llama_index.core.readers.base import BaseReader
8+
from llama_index.core.schema import Document as LlamaIndexDocument
9+
10+
print("All imports are successful.")
11+
except ImportError:
12+
raise NotImplementedError("Please install 'llama_index' is needed.")
13+
14+
15+
from pymupdf4llm.pymupdf4llm.llama_index.pdf_markdown_reader import PDFMardownReader
16+
17+
PDF = "input.pdf"
18+
19+
20+
def _get_test_file_path(file_name: str, __file__: str = __file__) -> str:
    """Return the normalized path of a fixture inside the ``helpers`` folder.

    The ``helpers`` directory is looked up three levels above the directory
    that contains ``__file__``.

    Args:
        file_name: Name of the fixture file, e.g. ``"input.pdf"``.
        __file__: Path of the module the lookup is relative to; defaults to
            this module's own ``__file__``.

    Returns:
        The normalized path to ``file_name`` inside ``helpers``.
    """
    file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..",
        "..",
        # The comma after this entry matters: without it Python joins the
        # adjacent literals into the single bogus component "..helpers".
        "..",
        "helpers",
        file_name,
    )
    return os.path.normpath(file_path)
30+
31+
32+
def test_load_data():
    """load_data returns a list that contains only LlamaIndexDocument objects."""
    # Arrange
    # ---
    reader = PDFMardownReader()
    pdf_path = _get_test_file_path(PDF, __file__)
    metadata = {"test_key": "test_value"}

    # Act
    # ---
    result = reader.load_data(pdf_path, metadata)

    # Assert
    # ---
    assert isinstance(result, list)
    assert all(isinstance(item, LlamaIndexDocument) for item in result)
48+
49+
50+
def test_load_data_with_invalid_file_path():
    """load_data raises when the given path does not point to a file."""
    # Arrange
    # ---
    reader = PDFMardownReader()
    missing_path = "fake/path"
    metadata = {"test_key": "test_value"}

    # Act & Assert
    # ---
    with pytest.raises(Exception):
        reader.load_data(missing_path, metadata)
61+
62+
63+
def test_load_data_with_invalid_extra_info():
    """load_data rejects an extra_info value that is not a dictionary."""
    # Arrange
    # ---
    reader = PDFMardownReader()
    pdf_path = _get_test_file_path(PDF, __file__)
    not_a_dict = "invalid_extra_info"

    # Act & Assert
    # ---
    with pytest.raises(TypeError):
        reader.load_data(pdf_path, not_a_dict)
74+
75+
76+
@pytest.mark.asyncio
async def test_aload_data():
    """aload_data returns documents that carry the caller's extra metadata."""
    # Arrange
    # ---
    # The constructor keyword is ``use_doc_meta``; any other name is
    # rejected with a TypeError before the reader can be exercised.
    pdf_reader = PDFMardownReader(use_doc_meta=True)
    extra_info = {"test_key": "test_value"}
    path = _get_test_file_path(PDF, __file__)

    # Act
    # ---
    documents = await pdf_reader.aload_data(path, extra_info)

    # Assert
    # ---
    expected_key = "test_key"
    expected_value = "test_value"

    assert isinstance(documents, list)
    for doc in documents:
        # Check the type first so a wrong result fails with a clear
        # message instead of an attribute error on ``metadata``.
        assert isinstance(doc, LlamaIndexDocument)
        assert expected_key in doc.metadata
        assert expected_value in doc.metadata.values()
98+
99+
100+
@pytest.mark.asyncio
async def test_aload_data_with_invalid_file_path():
    """aload_data raises when the given path does not point to a file."""
    # Arrange
    # ---
    reader = PDFMardownReader()
    metadata = {"test_key": "test_value"}
    missing_path = "Fake/path"

    # Act & Assert
    # ---
    with pytest.raises(Exception):
        await reader.aload_data(missing_path, metadata)
115+
116+
117+
@pytest.mark.asyncio
async def test_aload_data_with_invalid_extra_info():
    """aload_data rejects an extra_info value that is not a dictionary."""
    # Arrange
    # ---
    reader = PDFMardownReader()
    not_a_dict = "invalid_extra_info"
    pdf_path = _get_test_file_path(PDF, __file__)

    # Act & Assert
    # ---
    with pytest.raises(TypeError):
        await reader.aload_data(pdf_path, not_a_dict)

0 commit comments

Comments
 (0)