Skip to content

Commit cd947bb

Browse files
committed
#feat: Report progress of the crawling #64
1 parent 7478f1d commit cd947bb

File tree

2 files changed

+71
-42
lines changed

2 files changed

+71
-42
lines changed

nodes.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,12 +56,23 @@ def exec(self, prep_res):
5656
)
5757
else:
5858
print(f"Crawling directory: {prep_res['local_dir']}...")
59+
60+
def progress_callback(processed, total):
61+
percentage = (processed / total) * 100 if total > 0 else 0
62+
rounded_percentage = int(percentage)
63+
if rounded_percentage > progress_callback.last_reported:
64+
progress_callback.last_reported = rounded_percentage
65+
print(f"\033[92mProgress: {processed}/{total} files ({rounded_percentage}%)\033[0m")
66+
67+
progress_callback.last_reported = -1
68+
5969
result = crawl_local_files(
6070
directory=prep_res["local_dir"],
6171
include_patterns=prep_res["include_patterns"],
6272
exclude_patterns=prep_res["exclude_patterns"],
6373
max_file_size=prep_res["max_file_size"],
64-
use_relative_paths=prep_res["use_relative_paths"]
74+
use_relative_paths=prep_res["use_relative_paths"],
75+
progress_callback=progress_callback
6576
)
6677

6778
# Convert dict to list of tuples: [(path, content), ...]

utils/crawl_local_files.py

Lines changed: 59 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import os
22
import fnmatch
33

4-
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
4+
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True, progress_callback=None):
55
"""
66
Crawl files in a local directory with similar interface as crawl_github_files.
77
@@ -11,6 +11,7 @@ def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, m
1111
exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
1212
max_file_size (int): Maximum file size in bytes
1313
use_relative_paths (bool): Whether to use paths relative to directory
14+
progress_callback (callable): Function to report progress, takes (processed, total) as arguments
1415
1516
Returns:
1617
dict: {"files": {filepath: content}}
@@ -19,49 +20,66 @@ def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, m
1920
raise ValueError(f"Directory does not exist: {directory}")
2021

2122
files_dict = {}
22-
23+
all_files = []
24+
25+
# Collect all files first to calculate total
2326
for root, _, files in os.walk(directory):
2427
for filename in files:
2528
filepath = os.path.join(root, filename)
26-
27-
# Get path relative to directory if requested
28-
if use_relative_paths:
29-
relpath = os.path.relpath(filepath, directory)
30-
else:
31-
relpath = filepath
32-
33-
# Check if file matches any include pattern
34-
included = False
35-
if include_patterns:
36-
for pattern in include_patterns:
37-
if fnmatch.fnmatch(relpath, pattern):
38-
included = True
39-
break
40-
else:
41-
included = True
42-
43-
# Check if file matches any exclude pattern
44-
excluded = False
45-
if exclude_patterns:
46-
for pattern in exclude_patterns:
47-
if fnmatch.fnmatch(relpath, pattern):
48-
excluded = True
49-
break
50-
51-
if not included or excluded:
52-
continue
53-
54-
# Check file size
55-
if max_file_size and os.path.getsize(filepath) > max_file_size:
56-
continue
57-
58-
try:
59-
with open(filepath, 'r', encoding='utf-8') as f:
60-
content = f.read()
61-
files_dict[relpath] = content
62-
except Exception as e:
63-
print(f"Warning: Could not read file {filepath}: {e}")
64-
29+
all_files.append(filepath)
30+
31+
total_files = len(all_files)
32+
processed_files = 0
33+
34+
for filepath in all_files:
35+
# Get path relative to directory if requested
36+
if use_relative_paths:
37+
relpath = os.path.relpath(filepath, directory)
38+
else:
39+
relpath = filepath
40+
41+
# Check if file matches any include pattern
42+
included = False
43+
if include_patterns:
44+
for pattern in include_patterns:
45+
if fnmatch.fnmatch(relpath, pattern):
46+
included = True
47+
break
48+
else:
49+
included = True
50+
51+
# Check if file matches any exclude pattern
52+
excluded = False
53+
if exclude_patterns:
54+
for pattern in exclude_patterns:
55+
if fnmatch.fnmatch(relpath, pattern):
56+
excluded = True
57+
break
58+
59+
if not included or excluded:
60+
processed_files += 1
61+
if progress_callback:
62+
progress_callback(processed_files, total_files)
63+
continue
64+
65+
# Check file size
66+
if max_file_size and os.path.getsize(filepath) > max_file_size:
67+
processed_files += 1
68+
if progress_callback:
69+
progress_callback(processed_files, total_files)
70+
continue
71+
72+
try:
73+
with open(filepath, 'r', encoding='utf-8') as f:
74+
content = f.read()
75+
files_dict[relpath] = content
76+
except Exception as e:
77+
print(f"Warning: Could not read file {filepath}: {e}")
78+
79+
processed_files += 1
80+
if progress_callback:
81+
progress_callback(processed_files, total_files)
82+
6583
return {"files": files_dict}
6684

6785
if __name__ == "__main__":

0 commit comments

Comments
 (0)