Skip to content

Commit 90fa4fb

Browse files
committed
[PDFMardownReader] Added LlamaIndex Reader
1 parent f3c29c8 commit 90fa4fb

File tree

2 files changed

+267
-0
lines changed

2 files changed

+267
-0
lines changed
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
import asyncio
2+
from pathlib import Path
3+
from typing import Any, Callable, Dict, List, Optional, Union
4+
5+
try:
6+
import fitz
7+
from fitz import Document as FitzDocument
8+
from llama_index.core.readers.base import BaseReader
9+
from llama_index.core.schema import Document as LlamaIndexDocument
10+
11+
print("All imports are successful.")
12+
except ImportError:
13+
raise NotImplementedError(
14+
"Please install 'llama_index' version 1.24.0 or later is needed."
15+
)
16+
17+
18+
import pymupdf4llm
19+
20+
21+
class PDFMardownReader(BaseReader):
    """Read a PDF with PyMuPDF and produce one Markdown document per page.

    Every page is converted with ``pymupdf4llm.to_markdown`` and wrapped in a
    ``LlamaIndexDocument``. When ``use_meta`` is enabled, the PDF metadata and
    the page number, page count and file path are merged into the
    document's ``extra_info``.
    """

    # Whether PDF/page metadata is merged into each document's extra_info.
    use_meta: bool = True
    # Optional hook that receives the assembled metadata dict; whatever it
    # returns is used as the document's metadata.
    meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None

    def __init__(
        self,
        use_meta: bool = True,
        meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
    ):
        """Store the reader configuration.

        Args:
            use_meta (bool): Merge PDF and page metadata into each document.
                Defaults to True.
            meta_filter (Optional[Callable]): Callable applied to the metadata
                dict of every page; its return value replaces the dict.
                Defaults to None.
        """
        self.use_meta = use_meta
        self.meta_filter = meta_filter

    def load_data(
        self,
        file_path: Union[Path, str],
        extra_info: Optional[Dict] = None,
        **load_kwargs: Any,
    ) -> List[LlamaIndexDocument]:
        """Load one document per page from a PDF file.

        Args:
            file_path (Union[Path, str]): The path to the PDF file.
            extra_info (Optional[Dict], optional): Extra metadata attached to
                every document. The dict is copied per page and never mutated.
                Defaults to None.
            **load_kwargs (Any): Accepted for interface compatibility; not
                used by this implementation.

        Returns:
            List[LlamaIndexDocument]: One document per page, in page order.

        Raises:
            TypeError: If ``file_path`` is not a ``str``/``Path`` or
                ``extra_info`` is a non-empty non-dict value.
        """
        extra_info = self._validate_inputs(file_path, extra_info)

        # The context manager closes the PDF even if a page fails to convert.
        with fitz.open(file_path) as doc:
            return [
                self._process_doc_page(doc, extra_info, file_path, page.number)
                for page in doc
            ]

    async def aload_data(
        self,
        file_path: Union[Path, str],
        extra_info: Optional[Dict] = None,
        **load_kwargs: Any,
    ) -> List[LlamaIndexDocument]:
        """Coroutine variant of ``load_data``.

        Each per-page coroutine calls the synchronous page processor
        directly, so this offers the same result through an awaitable API.

        Args:
            file_path (Union[Path, str]): The path to the PDF file.
            extra_info (Optional[Dict], optional): Extra metadata attached to
                every document. The dict is copied per page and never mutated.
                Defaults to None.
            **load_kwargs (Any): Accepted for interface compatibility; not
                used by this implementation.

        Returns:
            List[LlamaIndexDocument]: One document per page, in page order.

        Raises:
            TypeError: If ``file_path`` is not a ``str``/``Path`` or
                ``extra_info`` is a non-empty non-dict value.
        """
        extra_info = self._validate_inputs(file_path, extra_info)

        with fitz.open(file_path) as doc:
            tasks = [
                self._aprocess_doc_page(doc, extra_info, file_path, page.number)
                for page in doc
            ]
            # Await inside the ``with`` block so the PDF stays open until
            # every page has been processed.
            return await asyncio.gather(*tasks)

    # Helpers
    # ---
    @staticmethod
    def _validate_inputs(
        file_path: Union[Path, str],
        extra_info: Optional[Dict],
    ) -> Dict[str, Any]:
        """Check argument types and normalise a falsy ``extra_info`` to ``{}``."""
        if not isinstance(file_path, (str, Path)):
            raise TypeError("file_path must be a string or Path.")

        if not extra_info:
            return {}

        if not isinstance(extra_info, dict):
            raise TypeError("extra_info must be a dictionary.")

        return extra_info

    async def _aprocess_doc_page(
        self,
        doc: FitzDocument,
        extra_info: Dict[str, Any],
        file_path: Union[Path, str],
        page_number: int,
    ) -> LlamaIndexDocument:
        """Asynchronously processes a single page of a PDF document."""
        return self._process_doc_page(doc, extra_info, file_path, page_number)

    def _process_doc_page(
        self,
        doc: FitzDocument,
        extra_info: Dict[str, Any],
        file_path: Union[Path, str],
        page_number: int,
    ) -> LlamaIndexDocument:
        """Convert one page (0-based ``page_number``) into a document."""
        if self.use_meta:
            page_info = self._process_meta(doc, file_path, page_number, extra_info)
        else:
            # Copy so documents never share one metadata dict with each other
            # or with the caller.
            page_info = dict(extra_info)

        text = pymupdf4llm.to_markdown(doc, pages=[page_number])
        return LlamaIndexDocument(text=text, extra_info=page_info)

    def _process_meta(
        self,
        doc: FitzDocument,
        file_path: Union[Path, str],
        page_number: int,
        extra_info: Optional[Dict] = None,
    ) -> Dict[str, Any]:
        """Build the metadata dict for one page.

        Starts from a copy of ``extra_info``, overlays the PDF metadata, then
        adds ``page_number`` (1-based, as a string), ``total_pages`` and
        ``file_path``. If ``meta_filter`` is set, its return value is returned
        instead of the assembled dict.
        """
        # Work on a copy: the same ``extra_info`` dict is passed in for every
        # page, so updating it in place would leak values between pages and
        # into the caller's dict.
        meta: Dict[str, Any] = dict(extra_info) if extra_info else {}
        meta.update(doc.metadata or {})
        meta["page_number"] = f"{page_number+1}"
        meta["total_pages"] = len(doc)
        meta["file_path"] = str(file_path)

        if self.meta_filter:
            meta = self.meta_filter(meta)

        return meta
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import os
2+
3+
import pytest
4+
from gary.services.rag.reader.pdf import LlamaIndexDocument
5+
6+
from pymupdf4llm.pymupdf4llm.llama_index.pdf_markdown_reader import PDFMardownReader
7+
8+
# Temporary.
9+
10+
PDF = "input.pdf"
11+
12+
13+
def _get_test_file_path(file_name: str, __file__: str = __file__) -> str:
    """Return the normalised path of a fixture inside the ``helpers`` directory.

    The ``helpers`` directory is looked up three levels above the directory
    that contains ``__file__``.

    Args:
        file_name: Name of the fixture file inside ``helpers``.
        __file__: Path of the module used as the starting point; defaults to
            this module's own ``__file__``.

    Returns:
        The normalised path to the fixture file.
    """
    file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..",
        "..",
        # The comma after this ".." matters: without it Python concatenates
        # the adjacent literals into a single "..helpers" path component.
        "..",
        "helpers",
        file_name,
    )
    return os.path.normpath(file_path)
23+
24+
25+
def test_load_data():
    """``load_data`` returns a list whose items are all LlamaIndex documents."""
    # Arrange
    reader = PDFMardownReader()
    pdf_path = _get_test_file_path(PDF, __file__)
    metadata = {"test_key": "test_value"}

    # Act
    result = reader.load_data(pdf_path, metadata)

    # Assert
    assert isinstance(result, list)
    assert all(isinstance(item, LlamaIndexDocument) for item in result)
41+
42+
43+
def test_load_data_with_invalid_file_path():
    """``load_data`` raises when handed the bogus path ``fake/path``."""
    # Arrange
    bogus_path = "fake/path"
    metadata = {"test_key": "test_value"}
    reader = PDFMardownReader()

    # Act & Assert
    with pytest.raises(Exception):
        reader.load_data(bogus_path, metadata)
54+
55+
56+
def test_load_data_with_invalid_extra_info():
    """``load_data`` rejects a string ``extra_info`` with ``TypeError``."""
    # Arrange
    reader = PDFMardownReader()
    pdf_path = _get_test_file_path(PDF, __file__)
    not_a_dict = "invalid_extra_info"

    # Act & Assert
    with pytest.raises(TypeError):
        reader.load_data(pdf_path, not_a_dict)
67+
68+
69+
@pytest.mark.asyncio
async def test_aload_data():
    """``aload_data`` keeps the caller-supplied metadata on every document."""
    # Arrange
    reader = PDFMardownReader(use_meta=True)
    supplied = {"test_key": "test_value"}
    pdf_path = _get_test_file_path(PDF, __file__)

    # Act
    result = await reader.aload_data(pdf_path, supplied)

    # Assert
    wanted_key, wanted_value = "test_key", "test_value"

    assert isinstance(result, list)
    for item in result:
        assert wanted_key in item.metadata
        assert wanted_value in item.metadata.values()
        assert isinstance(item, LlamaIndexDocument)
91+
92+
93+
@pytest.mark.asyncio
async def test_aload_data_with_invalid_file_path():
    """``aload_data`` raises when handed the bogus path ``Fake/path``."""
    # Arrange
    bogus_path = "Fake/path"
    metadata = {"test_key": "test_value"}
    reader = PDFMardownReader()

    # Act & Assert
    with pytest.raises(Exception):
        await reader.aload_data(bogus_path, metadata)
108+
109+
110+
@pytest.mark.asyncio
async def test_aload_data_with_invalid_extra_info():
    """``aload_data`` rejects a string ``extra_info`` with ``TypeError``."""
    # Arrange
    reader = PDFMardownReader()
    not_a_dict = "invalid_extra_info"
    pdf_path = _get_test_file_path(PDF, __file__)

    # Act & Assert
    with pytest.raises(TypeError):
        await reader.aload_data(pdf_path, not_a_dict)

0 commit comments

Comments
 (0)