11use scraper:: { Html , Selector } ;
2- use std:: { collections:: HashMap , fs:: { self , File , create_dir_all} , io:: { Write } , path:: { Path , PathBuf } } ; // Removed BufRead, BufReader
2+ use std:: { collections:: HashMap , fs:: { self , File , create_dir_all} , io:: { Write } , path:: { Path , PathBuf } , ffi :: OsStr } ; // Added OsStr
33use cargo:: core:: resolver:: features:: { CliFeatures } ;
44// use cargo::core::SourceId; // Removed unused import
55// use cargo::util::Filesystem; // Removed unused import
@@ -37,20 +37,19 @@ pub struct Document {
3737
3838
3939/// Generates documentation for a given crate in a temporary directory,
40- /// then loads and parses the HTML documents.
41- /// Extracts text content from the main content area of rustdoc generated HTML.
40+ /// then loads and parses the HTML and Markdown documents.
41+ /// Extracts text content from the main content area of rustdoc generated HTML,
42+ /// and uses raw content for Markdown files.
4243pub fn load_documents (
4344 crate_name : & str ,
4445 crate_version_req : & str ,
4546 features : Option < & Vec < String > > , // Add optional features parameter
4647) -> Result < Vec < Document > , DocLoaderError > {
47- let mut documents = Vec :: new ( ) ;
48-
48+ // --- Setup Temporary Environment ---
4949 let temp_dir = tempdir ( ) . map_err ( DocLoaderError :: TempDirCreationFailed ) ?;
5050 let temp_dir_path = temp_dir. path ( ) ;
5151 let temp_manifest_path = temp_dir_path. join ( "Cargo.toml" ) ;
5252
53-
5453 // Create a temporary Cargo.toml using the version requirement and features
5554 let features_string = features
5655 . filter ( |f| !f. is_empty ( ) ) // Only add features if provided and not empty
@@ -83,7 +82,7 @@ edition = "2021"
8382 temp_manifest_file. write_all ( cargo_toml_content. as_bytes ( ) ) ?;
8483
8584
86- // --- Use Cargo API ---
85+ // --- Use Cargo API to Generate Docs ---
8786 let mut config = GlobalContext :: default ( ) ?; // Make mutable
8887 // Configure context (set quiet to false for more detailed errors)
8988 config. configure (
@@ -122,31 +121,41 @@ edition = "2021"
122121 ops:: doc ( & ws, & doc_opts) . map_err ( DocLoaderError :: CargoLib ) ?; // Use ws
123122 // --- End Cargo API ---
124123
124+ // --- Find and Process Generated Docs ---
125125 let base_doc_path = temp_dir_path. join ( "doc" ) ;
126126 let docs_path = find_documentation_path ( & base_doc_path, crate_name) ?;
127127
128128 eprintln ! ( "Using documentation path: {}" , docs_path. display( ) ) ; // Log the path we are actually using
129129
130+ // Call the refactored processing function
131+ process_documentation_directory ( & docs_path)
132+ }
133+
134+ /// Processes files within a documentation directory, extracting content from HTML and MD files.
135+ fn process_documentation_directory ( docs_path : & Path ) -> Result < Vec < Document > , DocLoaderError > {
136+ let mut documents = Vec :: new ( ) ;
130137 // Define the CSS selector for the main content area in rustdoc HTML
131138 // This might need adjustment based on the exact rustdoc version/theme
132139 let content_selector = Selector :: parse ( "section#main-content.content" )
133140 . map_err ( |e| DocLoaderError :: Selector ( e. to_string ( ) ) ) ?;
134141
135- // --- Collect all HTML file paths first ---
136- let all_html_paths : Vec < PathBuf > = WalkDir :: new ( & docs_path)
142+ // --- Collect all relevant HTML and MD file paths first ---
143+ let relevant_files : Vec < PathBuf > = WalkDir :: new ( docs_path)
137144 . into_iter ( )
138145 . filter_map ( Result :: ok) // Ignore errors during iteration
139146 . filter ( |e| {
140- !e. file_type ( ) . is_dir ( ) && e. path ( ) . extension ( ) . is_some_and ( |ext| ext == "html" )
147+ if e. file_type ( ) . is_dir ( ) { return false ; }
148+ // Check if the extension is either "html" or "md"
149+ e. path ( ) . extension ( ) . map_or ( false , |ext| ext == "html" || ext == "md" )
141150 } )
142151 . map ( |e| e. into_path ( ) ) // Get the PathBuf
143152 . collect ( ) ;
144153
145- eprintln ! ( "[DEBUG] Found {} total HTML files initially." , all_html_paths . len( ) ) ;
154+ eprintln ! ( "[DEBUG] Found {} total HTML/MD files initially." , relevant_files . len( ) ) ;
146155
147- // --- Group files by basename ---
156+ // --- Group files by basename to handle duplicates (primarily for HTML) ---
148157 let mut basename_groups: HashMap < String , Vec < PathBuf > > = HashMap :: new ( ) ;
149- for path in all_html_paths {
158+ for path in relevant_files { // Use the combined list
150159 if let Some ( filename_osstr) = path. file_name ( ) {
151160 if let Some ( filename_str) = filename_osstr. to_str ( ) {
152161 basename_groups
@@ -162,35 +171,47 @@ edition = "2021"
162171 }
163172
164173 // --- Initialize paths_to_process and explicitly add the root index.html if it exists ---
174+ // This ensures the main crate page is always included if present.
165175 let mut paths_to_process: Vec < PathBuf > = Vec :: new ( ) ;
166176 let root_index_path = docs_path. join ( "index.html" ) ;
167177 if root_index_path. is_file ( ) {
168178 paths_to_process. push ( root_index_path) ;
169179 }
180+ // Also check for a root README.md
181+ let root_readme_path = docs_path. join ( "README.md" ) ;
182+ if root_readme_path. is_file ( ) && !paths_to_process. contains ( & root_readme_path) { // Avoid adding if index.html was README.md (unlikely)
183+ paths_to_process. push ( root_readme_path) ;
184+ }
170185
171- // --- Filter based on duplicates and size ---
172- // NOTE: Initialization of paths_to_process moved before this loop
186+
187+ // --- Filter based on duplicates (keep largest HTML) and ignore source view ---
173188 for ( basename, mut paths) in basename_groups {
174- // Always ignore index.html at this stage (except the root one added earlier)
175- if basename == "index.html" {
189+ // Always ignore index.html and README.md at this stage, as the root ones were handled above.
190+ // This prevents including module index pages or nested readmes multiple times if they share names.
191+ if basename == "index.html" || basename == "README.md" {
176192 continue ;
177193 }
178194
179- // Also ignore files within source code view directories
195+ // Also ignore files within source code view directories (e.g., `doc/src/...`)
180196 // Check the first path (they should share the problematic component if any)
181- if paths. first ( ) . map_or ( false , |p| p. components ( ) . any ( |comp| comp. as_os_str ( ) == "src" ) ) {
197+ if paths. first ( ) . map_or ( false , |p| p. components ( ) . any ( |comp| comp. as_os_str ( ) == OsStr :: new ( "src" ) ) ) {
198+ eprintln ! ( "[DEBUG] Ignoring file in src view: {}" , paths. first( ) . unwrap( ) . display( ) ) ;
182199 continue ;
183200 }
184201
185202
186203 if paths. len ( ) == 1 {
187- // Single file with this basename (and not index.html), keep it
204+ // Single file with this basename (and not index.html/README.md ), keep it
188205 paths_to_process. push ( paths. remove ( 0 ) ) ;
189206 } else {
190- // Multiple files with the same basename (duplicates)
191- // Find the largest one by file size
207+ // Multiple files with the same basename (likely HTML duplicates)
208+ // Find the largest one by file size - typically the main definition page vs. re-exports.
192209 // Explicit type annotation needed for the error type in try_fold
193210 let largest_path_result: Result < Option < ( PathBuf , u64 ) > , std:: io:: Error > = paths. into_iter ( ) . try_fold ( None :: < ( PathBuf , u64 ) > , |largest, current| {
211+ // Only consider HTML files for size comparison duplicate resolution
212+ if current. extension ( ) . map_or ( false , |ext| ext != "html" ) {
213+ return Ok ( largest) ; // Skip non-HTML files in this check
214+ }
194215 let current_meta = fs:: metadata ( & current) ?;
195216 let current_size = current_meta. len ( ) ;
196217 match largest {
@@ -211,68 +232,84 @@ edition = "2021"
211232 paths_to_process. push ( p) ;
212233 }
213234 Ok ( None ) => {
214- // This case should ideally not happen if the input `paths` was not empty,
215- // but handle it defensively.
216- eprintln ! ( "[WARN] No files found for basename '{}' during size comparison." , basename) ;
235+ // This case might happen if all duplicates were non-HTML
236+ eprintln ! ( "[WARN] No HTML files found for basename '{}' during size comparison, or group was empty." , basename) ;
217237 }
218238 Err ( e) => {
219- eprintln ! ( "[WARN] Error getting metadata for basename '{}', skipping: {}" , basename, e) ;
220- // Decide if you want to skip the whole group or handle differently
239+ eprintln ! ( "[WARN] Error getting metadata for basename '{}', skipping group : {}" , basename, e) ;
240+ // Skip the whole group if metadata fails
221241 }
222242 }
223243 }
224244 }
225245
226- eprintln ! ( "[DEBUG] Filtered down to {} files to process." , paths_to_process. len( ) ) ;
246+ eprintln ! ( "[DEBUG] Filtered down to {} unique files/paths to process." , paths_to_process. len( ) ) ;
227247
228248
229249 // --- Process the filtered list of files ---
230250 for path in paths_to_process {
231- // Calculate path relative to the docs_path root
232- let relative_path = match path. strip_prefix ( & docs_path) {
251+ // Calculate path relative to the docs_path root for storing in Document
252+ let relative_path = match path. strip_prefix ( docs_path) {
233253 Ok ( p) => p. to_path_buf ( ) ,
234- Err ( e) => {
254+ Err ( e) => {
235255 eprintln ! ( "[WARN] Failed to strip prefix {} from {}: {}" , docs_path. display( ) , path. display( ) , e) ;
236256 continue ; // Skip if path manipulation fails
237257 }
238258 } ;
239259 let path_str = relative_path. to_string_lossy ( ) . to_string ( ) ;
240260
241- let html_content = match fs:: read_to_string ( & path) { // Read from the absolute path
242- Ok ( content) => content,
243- Err ( e) => {
244- eprintln ! ( "[WARN] Failed to read file {}: {}" , path. display( ) , e) ;
245- continue ; // Skip this file if reading fails
246- }
261+ let file_content = match fs:: read_to_string ( & path) { // Read from the absolute path
262+ Ok ( content) => content,
263+ Err ( e) => {
264+ eprintln ! ( "[WARN] Failed to read file {}: {}" , path. display( ) , e) ;
265+ continue ; // Skip this file if reading fails
266+ }
247267 } ;
248268
249- let document = Html :: parse_document ( & html_content) ;
250-
251- if let Some ( main_content_element) = document. select ( & content_selector) . next ( ) {
252- let text_content: String = main_content_element
253- . text ( )
254- . map ( |s| s. trim ( ) )
255- . filter ( |s| !s. is_empty ( ) )
256- . collect :: < Vec < & str > > ( )
257- . join ( "\n " ) ;
258-
259- if !text_content. is_empty ( ) {
269+ // Check file extension to decide processing method
270+ if path. extension ( ) . map_or ( false , |ext| ext == "html" ) {
271+ // Process HTML using scraper
272+ let html_document = Html :: parse_document ( & file_content) ;
273+ if let Some ( main_content_element) = html_document. select ( & content_selector) . next ( ) {
274+ let text_content: String = main_content_element
275+ . text ( )
276+ . map ( |s| s. trim ( ) )
277+ . filter ( |s| !s. is_empty ( ) )
278+ . collect :: < Vec < & str > > ( )
279+ . join ( "\n " ) ;
280+
281+ if !text_content. is_empty ( ) {
282+ documents. push ( Document {
283+ path : path_str,
284+ content : text_content,
285+ } ) ;
286+ } else {
287+ // eprintln!("[DEBUG] No text content found in main section for HTML: {}", path.display());
288+ }
289+ } else {
290+ // eprintln!("[DEBUG] 'main-content' selector not found for HTML: {}", path.display());
291+ }
292+ } else if path. extension ( ) . map_or ( false , |ext| ext == "md" ) {
293+ // Process Markdown: Use raw content
294+ if !file_content. trim ( ) . is_empty ( ) {
260295 documents. push ( Document {
261296 path : path_str,
262- content : text_content ,
297+ content : file_content , // Store the raw Markdown content
263298 } ) ;
264299 } else {
265- // eprintln!("[DEBUG] No text content found in main section for : {}", path.display());
300+ eprintln ! ( "[DEBUG] Skipping empty Markdown file : {}" , path. display( ) ) ;
266301 }
267302 } else {
268- // eprintln!("[DEBUG] 'main-content' selector not found for: {}", path.display());
303+ // Should not happen due to WalkDir filter, but handle defensively
304+ eprintln ! ( "[WARN] Skipping file with unexpected extension: {}" , path. display( ) ) ;
269305 }
270306 }
271307
272308 eprintln ! ( "Finished document loading. Found {} final documents." , documents. len( ) ) ;
273309 Ok ( documents)
274310}
275311
312+
276313/// Finds the correct documentation directory for a specific crate within a base 'doc' directory.
277314///
278315/// Handles cases where multiple subdirectories might exist (e.g., due to dependencies)
@@ -361,9 +398,10 @@ fn find_documentation_path(base_doc_path: &Path, crate_name: &str) -> Result<Pat
361398#[ cfg( test) ]
362399mod tests {
363400 use super :: * ;
364- use std:: fs;
401+ use std:: fs:: { self , File } ;
365402 use std:: io:: Write ;
366403 use tempfile:: tempdir;
404+ use std:: path:: Path ; // Add Path import
367405
368406 // Helper to create dummy doc structure including index.html content
369407 fn setup_test_dir_with_titles ( base : & Path , dirs : & [ ( & str , Option < & str > ) ] ) -> std:: io:: Result < ( ) > {
@@ -392,6 +430,42 @@ mod tests {
392430 Ok ( ( ) )
393431 }
394432
433+ // Helper to create a mock documentation directory with HTML and MD files
434+ fn setup_mock_docs ( base_path : & Path ) -> std:: io:: Result < ( ) > {
435+ // Root index.html (should be processed)
436+ let mut index_file = File :: create ( base_path. join ( "index.html" ) ) ?;
437+ writeln ! ( index_file, "<!DOCTYPE html><html><head><title>Root Crate - Rust</title></head><body><section id='main-content' class='content'>Root Index Content</section></body></html>" ) ?;
438+
439+ // A regular HTML file (should be processed)
440+ let mod_path = base_path. join ( "module" ) ;
441+ fs:: create_dir_all ( & mod_path) ?;
442+ let mut mod_file = File :: create ( mod_path. join ( "struct.MyStruct.html" ) ) ?;
443+ writeln ! ( mod_file, "<!DOCTYPE html><html><head><title>MyStruct - Rust</title></head><body><section id='main-content' class='content'>MyStruct Content Larger</section></body></html>" ) ?; // Make slightly larger
444+
445+ // A Markdown file (should be processed, raw content)
446+ let mut md_file = File :: create ( base_path. join ( "README.md" ) ) ?;
447+ writeln ! ( md_file, "# Project Readme\n \n This is the content." ) ?;
448+
449+ // An HTML file inside a 'src' directory (should be ignored)
450+ let src_view_path = base_path. join ( "src" ) . join ( "my_crate" ) ;
451+ fs:: create_dir_all ( & src_view_path) ?;
452+ let mut src_html_file = File :: create ( src_view_path. join ( "lib.rs.html" ) ) ?;
453+ writeln ! ( src_html_file, "<html><body>Source Code View</body></html>" ) ?;
454+
455+ // A duplicate HTML file (only largest should be kept - module one is larger)
456+ // Create a smaller duplicate in another dir
457+ let dup_dir = base_path. join ( "duplicate" ) ;
458+ fs:: create_dir_all ( & dup_dir) ?;
459+ let mut dup_file = File :: create ( dup_dir. join ( "struct.MyStruct.html" ) ) ?;
460+ writeln ! ( dup_file, "<html><body>Smaller Duplicate Content</body></html>" ) ?; // Smaller content
461+
462+ // Another Markdown file in a subdirectory
463+ let mut sub_md_file = File :: create ( mod_path. join ( "GUIDE.md" ) ) ?;
464+ writeln ! ( sub_md_file, "## Guide\n \n More details here." ) ?;
465+
466+ Ok ( ( ) )
467+ }
468+
395469
396470 #[ test]
397471 fn test_find_docs_no_dirs ( ) -> Result < ( ) , Box < dyn std:: error:: Error > > {
@@ -522,4 +596,52 @@ mod tests {
522596 Ok ( ( ) )
523597 }
524598
599+ #[ test]
600+ fn test_process_documentation_directory_includes_md ( ) -> Result < ( ) , Box < dyn std:: error:: Error > > {
601+ let temp = tempdir ( ) ?;
602+ let docs_path = temp. path ( ) ;
603+ setup_mock_docs ( docs_path) ?;
604+
605+ let documents = process_documentation_directory ( docs_path) ?;
606+
607+ assert_eq ! ( documents. len( ) , 4 , "Should find root index.html, MyStruct.html, README.md, and GUIDE.md" ) ;
608+
609+ // Check for specific documents (order might vary)
610+ let mut found_index = false ;
611+ let mut found_struct = false ;
612+ let mut found_readme = false ;
613+ let mut found_guide = false ;
614+
615+ for doc in & documents {
616+ eprintln ! ( "Found doc: path='{}', content='{}'" , doc. path, doc. content. chars( ) . take( 50 ) . collect:: <String >( ) ) ; // Debug print
617+ if doc. path == "index.html" {
618+ assert ! ( doc. content. contains( "Root Index Content" ) ) ;
619+ found_index = true ;
620+ } else if doc. path == "module/struct.MyStruct.html" { // Path relative to docs_path
621+ assert ! ( doc. content. contains( "MyStruct Content Larger" ) ) ; // Check content of the larger one
622+ found_struct = true ;
623+ } else if doc. path == "README.md" {
624+ assert ! ( doc. content. contains( "# Project Readme" ) ) ;
625+ assert ! ( doc. content. contains( "This is the content." ) ) ;
626+ found_readme = true ;
627+ } else if doc. path == "module/GUIDE.md" { // Path relative to docs_path
628+ assert ! ( doc. content. contains( "## Guide" ) ) ;
629+ assert ! ( doc. content. contains( "More details here." ) ) ;
630+ found_guide = true ;
631+ }
632+ }
633+
634+ assert ! ( found_index, "Root index.html content not found or incorrect" ) ;
635+ assert ! ( found_struct, "MyStruct.html content not found or incorrect" ) ;
636+ assert ! ( found_readme, "README.md content not found or incorrect" ) ;
637+ assert ! ( found_guide, "module/GUIDE.md content not found or incorrect" ) ;
638+
639+ // Verify ignored files are not present
640+ assert ! ( !documents. iter( ) . any( |d| d. path. contains( "src/" ) ) , "Should ignore files in 'src/' directories" ) ;
641+ // Check that the smaller duplicate wasn't included
642+ assert ! ( !documents. iter( ) . any( |d| d. path == "duplicate/struct.MyStruct.html" ) , "Smaller duplicate should be ignored" ) ;
643+
644+ Ok ( ( ) )
645+ }
646+
525647}
0 commit comments