Skip to content

Commit f626f2a

Browse files
authored
Merge pull request #122 from zane-programs/main
Add show_progress option to to_markdown()
2 parents d5df039 + 91da91e commit f626f2a

File tree

2 files changed

+70
-0
lines changed

2 files changed

+70
-0
lines changed
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import sys
2+
from typing import List, Any
3+
4+
5+
class _ProgressBar:
    """Iterator over ``items`` that draws a text progress bar on stdout.

    A frame looks like ``[=====     ] ( 5/10)``. The empty frame is written
    when the object is created, the frame is redrawn in place after every
    item that is handed out, and a single newline is written once the
    underlying iterator is exhausted.

    Args:
        items: sized iterable to walk through; ``len(items)`` is the total
            shown in the counter.
        progress_width: number of character cells between the brackets.
    """

    def __init__(self, items: List[Any], progress_width: int = 40):
        self._len = len(items)
        self._iter = iter(items)
        self._len_digits = len(str(self._len))
        self._progress_width = progress_width
        self._current_index = 0
        # Guards the closing newline so that calling next() again on an
        # exhausted bar does not print anything more.
        self._finished = False

        # Draw the empty bar; the cursor stays at the end of the frame.
        self._write(self._render())

    def __iter__(self):
        return self

    def __next__(self):
        try:
            result = next(self._iter)
        except StopIteration:
            if not self._finished:
                self._finished = True
                # Close the bar line exactly once.
                self._write("\n")
            raise

        self._current_index += 1

        # Every frame has the same length, so backing up by exactly one frame
        # returns the cursor to the opening bracket without running past the
        # start of the bar.
        frame = self._render()
        self._write("\b" * len(frame) + frame)
        return result

    def _render(self) -> str:
        """Return the frame text for the current index (no cursor movement)."""
        if self._len:
            # Integer arithmetic: no float accumulation, and min() keeps the
            # bar inside its brackets.
            filled = min(
                self._current_index * self._progress_width // self._len,
                self._progress_width,
            )
        else:
            # Nothing to iterate: show an empty bar instead of dividing by 0.
            filled = 0
        padded_index = str(self._current_index).rjust(self._len_digits)
        return "[%s%s] (%s/%d)" % (
            "=" * filled,
            " " * (self._progress_width - filled),
            padded_index,
            self._len,
        )

    @staticmethod
    def _write(text: str) -> None:
        """Write ``text`` to the current ``sys.stdout`` and flush it."""
        sys.stdout.write(text)
        sys.stdout.flush()
61+
62+
63+
def ProgressBar(list: List[Any], progress_width: int = 40):
    """Return an iterator over ``list`` that reports progress as it advances.

    Args:
        list: the items to iterate over (the name shadows the builtin and is
            kept as-is so existing keyword callers continue to work).
        progress_width: width of the bar, forwarded to ``_ProgressBar``.

    Returns:
        The iterator obtained from a new ``_ProgressBar`` wrapping ``list``.
    """
    bar = _ProgressBar(list, progress_width)
    return iter(bar)

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
3535
from pymupdf4llm.helpers.multi_column import column_boxes
36+
from pymupdf4llm.helpers.progress import ProgressBar
3637

3738
bullet = tuple(
3839
[
@@ -227,6 +228,7 @@ def to_markdown(
227228
graphics_limit=None,
228229
ignore_code=False,
229230
extract_words=False,
231+
show_progress=True,
230232
) -> str:
231233
"""Process the document and return the text of the selected pages.
232234
@@ -245,6 +247,7 @@ def to_markdown(
245247
page_height: (float) assumption if page layout is variable.
246248
table_strategy: choose table detection strategy
247249
graphics_limit: (int) ignore page with too many vector graphics.
250+
show_progress: (bool) print progress as each page is processed.
248251
249252
"""
250253
if write_images is False and force_text is False:
@@ -795,6 +798,9 @@ def get_page_output(doc, pno, margins, textflags):
795798
# read the Table of Contents
796799
toc = doc.get_toc()
797800
textflags = pymupdf.TEXT_MEDIABOX_CLIP
801+
if show_progress:
802+
print(f"Processing {doc.name}...")
803+
pages = ProgressBar(pages)
798804
for pno in pages:
799805
page_output, images, tables, graphics, words = get_page_output(
800806
doc, pno, margins, textflags

0 commit comments

Comments
 (0)