22import fnmatch
33import pathspec
44
5-
65def crawl_local_files (
76 directory ,
87 include_patterns = None ,
98 exclude_patterns = None ,
109 max_file_size = None ,
1110 use_relative_paths = True ,
11+ progress_callback = None ,
1212):
1313 """
1414 Crawl files in a local directory with similar interface as crawl_github_files.
@@ -18,6 +18,7 @@ def crawl_local_files(
1818 exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
1919 max_file_size (int): Maximum file size in bytes
2020 use_relative_paths (bool): Whether to use paths relative to directory
21+ progress_callback (callable): Function to report progress, takes (processed, total) as arguments
2122
2223 Returns:
2324 dict: {"files": {filepath: content}}
@@ -34,91 +35,84 @@ def crawl_local_files(
3435 try :
3536 with open (gitignore_path , "r" , encoding = "utf-8" ) as f :
3637 gitignore_patterns = f .readlines ()
37- gitignore_spec = pathspec .PathSpec .from_lines (
38- "gitwildmatch" , gitignore_patterns
39- )
38+ gitignore_spec = pathspec .PathSpec .from_lines ("gitwildmatch" , gitignore_patterns )
4039 print (f"Loaded .gitignore patterns from { gitignore_path } " )
4140 except Exception as e :
42- print (
43- f"Warning: Could not read or parse .gitignore file { gitignore_path } : { e } "
44- )
45- # --- End Load .gitignore ---
41+ print (f"Warning: Could not read or parse .gitignore file { gitignore_path } : { e } " )
4642
43+ all_files = []
4744 for root , dirs , files in os .walk (directory ):
48- # Filter directories using .gitignore and exclude_patterns early to avoid descending
49- # Need to process dirs list *in place* for os.walk to respect it
45+ # Filter directories using .gitignore and exclude_patterns early
5046 excluded_dirs = set ()
5147 for d in dirs :
5248 dirpath_rel = os .path .relpath (os .path .join (root , d ), directory )
5349
54- # Check against .gitignore (important for directories)
5550 if gitignore_spec and gitignore_spec .match_file (dirpath_rel ):
5651 excluded_dirs .add (d )
57- continue # Skip further checks if gitignored
52+ continue
5853
59- # Check against standard exclude_patterns
6054 if exclude_patterns :
6155 for pattern in exclude_patterns :
62- # Match pattern against full relative path or directory name itself
63- if fnmatch .fnmatch (dirpath_rel , pattern ) or fnmatch .fnmatch (
64- d , pattern
65- ):
56+ if fnmatch .fnmatch (dirpath_rel , pattern ) or fnmatch .fnmatch (d , pattern ):
6657 excluded_dirs .add (d )
6758 break
6859
69- # Modify dirs in-place: remove excluded ones
70- # Iterate over a copy (.copy()) because we are modifying the list during iteration
7160 for d in dirs .copy ():
7261 if d in excluded_dirs :
7362 dirs .remove (d )
7463
75- # Now process files in the non-excluded directories
7664 for filename in files :
7765 filepath = os .path .join (root , filename )
66+ all_files .append (filepath )
67+
68+ total_files = len (all_files )
69+ processed_files = 0
70+
71+ for filepath in all_files :
72+ relpath = os .path .relpath (filepath , directory ) if use_relative_paths else filepath
73+
74+ # --- Exclusion check ---
75+ excluded = False
76+ if gitignore_spec and gitignore_spec .match_file (relpath ):
77+ excluded = True
78+
79+ if not excluded and exclude_patterns :
80+ for pattern in exclude_patterns :
81+ if fnmatch .fnmatch (relpath , pattern ):
82+ excluded = True
83+ break
84+
85+ included = False
86+ if include_patterns :
87+ for pattern in include_patterns :
88+ if fnmatch .fnmatch (relpath , pattern ):
89+ included = True
90+ break
91+ else :
92+ included = True
93+
94+ if not included or excluded :
95+ processed_files += 1
96+ if progress_callback :
97+ progress_callback (processed_files , total_files )
98+ continue
99+
100+ if max_file_size and os .path .getsize (filepath ) > max_file_size :
101+ processed_files += 1
102+ if progress_callback :
103+ progress_callback (processed_files , total_files )
104+ continue
78105
79- # Get path relative to directory if requested
80- if use_relative_paths :
81- relpath = os .path .relpath (filepath , directory )
82- else :
83- relpath = filepath
84-
85- # --- Exclusion check ---
86- excluded = False
87- # 1. Check .gitignore first
88- if gitignore_spec and gitignore_spec .match_file (relpath ):
89- excluded = True
90-
91- # 2. Check standard exclude_patterns if not already excluded by .gitignore
92- if not excluded and exclude_patterns :
93- for pattern in exclude_patterns :
94- if fnmatch .fnmatch (relpath , pattern ):
95- excluded = True
96- break
97-
98- included = False
99- if include_patterns :
100- for pattern in include_patterns :
101- if fnmatch .fnmatch (relpath , pattern ):
102- included = True
103- break
104- else :
105- # If no include patterns, include everything *not excluded*
106- included = True
107-
108- # Skip if not included or if excluded (by either method)
109- if not included or excluded :
110- continue
111-
112- # Check file size
113- if max_file_size and os .path .getsize (filepath ) > max_file_size :
114- continue
106+ try :
107+ with open (filepath , "r" , encoding = "utf-8" ) as f :
108+ content = f .read ()
109+ files_dict [relpath ] = content
110+ except Exception as e :
111+ print (f"Warning: Could not read file { filepath } : { e } " )
115112
116- try :
117- with open (filepath , "r" , encoding = "utf-8" ) as f :
118- content = f .read ()
119- files_dict [relpath ] = content
120- except Exception as e :
121- print (f"Warning: Could not read file { filepath } : { e } " )
113+ processed_files += 1
114+ if progress_callback :
115+ progress_callback (processed_files , total_files )
122116
123117 return {"files" : files_dict }
124118
@@ -138,4 +132,4 @@ def crawl_local_files(
138132 )
139133 print (f"Found { len (files_data ['files' ])} files:" )
140134 for path in files_data ["files" ]:
141- print (f" { path } " )
135+ print (f" { path } " )
0 commit comments