11import os
22import fnmatch
33
4- def crawl_local_files (directory , include_patterns = None , exclude_patterns = None , max_file_size = None , use_relative_paths = True ):
4+ def crawl_local_files (directory , include_patterns = None , exclude_patterns = None , max_file_size = None , use_relative_paths = True , progress_callback = None ):
55 """
66 Crawl files in a local directory with similar interface as crawl_github_files.
77
@@ -11,6 +11,7 @@ def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, m
1111 exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
1212 max_file_size (int): Maximum file size in bytes
1313 use_relative_paths (bool): Whether to use paths relative to directory
14+ progress_callback (callable): Function to report progress, takes (processed, total) as arguments
1415
1516 Returns:
1617 dict: {"files": {filepath: content}}
@@ -19,49 +20,66 @@ def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, m
1920 raise ValueError (f"Directory does not exist: { directory } " )
2021
2122 files_dict = {}
22-
23+ all_files = []
24+
25+ # Collect all files first to calculate total
2326 for root , _ , files in os .walk (directory ):
2427 for filename in files :
2528 filepath = os .path .join (root , filename )
26-
27- # Get path relative to directory if requested
28- if use_relative_paths :
29- relpath = os .path .relpath (filepath , directory )
30- else :
31- relpath = filepath
32-
33- # Check if file matches any include pattern
34- included = False
35- if include_patterns :
36- for pattern in include_patterns :
37- if fnmatch .fnmatch (relpath , pattern ):
38- included = True
39- break
40- else :
41- included = True
42-
43- # Check if file matches any exclude pattern
44- excluded = False
45- if exclude_patterns :
46- for pattern in exclude_patterns :
47- if fnmatch .fnmatch (relpath , pattern ):
48- excluded = True
49- break
50-
51- if not included or excluded :
52- continue
53-
54- # Check file size
55- if max_file_size and os .path .getsize (filepath ) > max_file_size :
56- continue
57-
58- try :
59- with open (filepath , 'r' , encoding = 'utf-8' ) as f :
60- content = f .read ()
61- files_dict [relpath ] = content
62- except Exception as e :
63- print (f"Warning: Could not read file { filepath } : { e } " )
64-
29+ all_files .append (filepath )
30+
31+ total_files = len (all_files )
32+ processed_files = 0
33+
34+ for filepath in all_files :
35+ # Get path relative to directory if requested
36+ if use_relative_paths :
37+ relpath = os .path .relpath (filepath , directory )
38+ else :
39+ relpath = filepath
40+
41+ # Check if file matches any include pattern
42+ included = False
43+ if include_patterns :
44+ for pattern in include_patterns :
45+ if fnmatch .fnmatch (relpath , pattern ):
46+ included = True
47+ break
48+ else :
49+ included = True
50+
51+ # Check if file matches any exclude pattern
52+ excluded = False
53+ if exclude_patterns :
54+ for pattern in exclude_patterns :
55+ if fnmatch .fnmatch (relpath , pattern ):
56+ excluded = True
57+ break
58+
59+ if not included or excluded :
60+ processed_files += 1
61+ if progress_callback :
62+ progress_callback (processed_files , total_files )
63+ continue
64+
65+ # Check file size
66+ if max_file_size and os .path .getsize (filepath ) > max_file_size :
67+ processed_files += 1
68+ if progress_callback :
69+ progress_callback (processed_files , total_files )
70+ continue
71+
72+ try :
73+ with open (filepath , 'r' , encoding = 'utf-8' ) as f :
74+ content = f .read ()
75+ files_dict [relpath ] = content
76+ except Exception as e :
77+ print (f"Warning: Could not read file { filepath } : { e } " )
78+
79+ processed_files += 1
80+ if progress_callback :
81+ progress_callback (processed_files , total_files )
82+
6583 return {"files" : files_dict }
6684
6785if __name__ == "__main__" :
0 commit comments