Skip to content

Commit 423f1f8

Browse files
committed
feat(doc_loader): include markdown files in loaded documentation
Refactors the documentation file processing logic from `load_documents` into a new `process_documentation_directory` function. This improves testability and separation of concerns. The new function now walks the documentation directory and collects both `.html` and `.md` files. HTML files have their main content extracted via CSS selectors, while Markdown files are included with their raw content. Adds a unit test (`test_process_documentation_directory_includes_md`) to verify the correct handling of Markdown files, HTML parsing, duplicate HTML file resolution (keeping the largest), and ignoring files within `src` directories.
1 parent 0a55551 commit 423f1f8

File tree

1 file changed

+174
-52
lines changed

1 file changed

+174
-52
lines changed

src/doc_loader.rs

Lines changed: 174 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use scraper::{Html, Selector};
2-
use std::{collections::HashMap, fs::{self, File, create_dir_all}, io::{Write}, path::{Path, PathBuf}}; // Removed BufRead, BufReader
2+
use std::{collections::HashMap, fs::{self, File, create_dir_all}, io::{Write}, path::{Path, PathBuf}, ffi::OsStr}; // Added OsStr
33
use cargo::core::resolver::features::{CliFeatures};
44
// use cargo::core::SourceId; // Removed unused import
55
// use cargo::util::Filesystem; // Removed unused import
@@ -37,20 +37,19 @@ pub struct Document {
3737

3838

3939
/// Generates documentation for a given crate in a temporary directory,
40-
/// then loads and parses the HTML documents.
41-
/// Extracts text content from the main content area of rustdoc generated HTML.
40+
/// then loads and parses the HTML and Markdown documents.
41+
/// Extracts text content from the main content area of rustdoc generated HTML,
42+
/// and uses raw content for Markdown files.
4243
pub fn load_documents(
4344
crate_name: &str,
4445
crate_version_req: &str,
4546
features: Option<&Vec<String>>, // Add optional features parameter
4647
) -> Result<Vec<Document>, DocLoaderError> {
47-
let mut documents = Vec::new();
48-
48+
// --- Setup Temporary Environment ---
4949
let temp_dir = tempdir().map_err(DocLoaderError::TempDirCreationFailed)?;
5050
let temp_dir_path = temp_dir.path();
5151
let temp_manifest_path = temp_dir_path.join("Cargo.toml");
5252

53-
5453
// Create a temporary Cargo.toml using the version requirement and features
5554
let features_string = features
5655
.filter(|f| !f.is_empty()) // Only add features if provided and not empty
@@ -83,7 +82,7 @@ edition = "2021"
8382
temp_manifest_file.write_all(cargo_toml_content.as_bytes())?;
8483

8584

86-
// --- Use Cargo API ---
85+
// --- Use Cargo API to Generate Docs ---
8786
let mut config = GlobalContext::default()?; // Make mutable
8887
// Configure context (set quiet to false for more detailed errors)
8988
config.configure(
@@ -122,31 +121,41 @@ edition = "2021"
122121
ops::doc(&ws, &doc_opts).map_err(DocLoaderError::CargoLib)?; // Use ws
123122
// --- End Cargo API ---
124123

124+
// --- Find and Process Generated Docs ---
125125
let base_doc_path = temp_dir_path.join("doc");
126126
let docs_path = find_documentation_path(&base_doc_path, crate_name)?;
127127

128128
eprintln!("Using documentation path: {}", docs_path.display()); // Log the path we are actually using
129129

130+
// Call the refactored processing function
131+
process_documentation_directory(&docs_path)
132+
}
133+
134+
/// Processes files within a documentation directory, extracting content from HTML and MD files.
135+
fn process_documentation_directory(docs_path: &Path) -> Result<Vec<Document>, DocLoaderError> {
136+
let mut documents = Vec::new();
130137
// Define the CSS selector for the main content area in rustdoc HTML
131138
// This might need adjustment based on the exact rustdoc version/theme
132139
let content_selector = Selector::parse("section#main-content.content")
133140
.map_err(|e| DocLoaderError::Selector(e.to_string()))?;
134141

135-
// --- Collect all HTML file paths first ---
136-
let all_html_paths: Vec<PathBuf> = WalkDir::new(&docs_path)
142+
// --- Collect all relevant HTML and MD file paths first ---
143+
let relevant_files: Vec<PathBuf> = WalkDir::new(docs_path)
137144
.into_iter()
138145
.filter_map(Result::ok) // Ignore errors during iteration
139146
.filter(|e| {
140-
!e.file_type().is_dir() && e.path().extension().is_some_and(|ext| ext == "html")
147+
if e.file_type().is_dir() { return false; }
148+
// Check if the extension is either "html" or "md"
149+
e.path().extension().map_or(false, |ext| ext == "html" || ext == "md")
141150
})
142151
.map(|e| e.into_path()) // Get the PathBuf
143152
.collect();
144153

145-
eprintln!("[DEBUG] Found {} total HTML files initially.", all_html_paths.len());
154+
eprintln!("[DEBUG] Found {} total HTML/MD files initially.", relevant_files.len());
146155

147-
// --- Group files by basename ---
156+
// --- Group files by basename to handle duplicates (primarily for HTML) ---
148157
let mut basename_groups: HashMap<String, Vec<PathBuf>> = HashMap::new();
149-
for path in all_html_paths {
158+
for path in relevant_files { // Use the combined list
150159
if let Some(filename_osstr) = path.file_name() {
151160
if let Some(filename_str) = filename_osstr.to_str() {
152161
basename_groups
@@ -162,35 +171,47 @@ edition = "2021"
162171
}
163172

164173
// --- Initialize paths_to_process and explicitly add the root index.html if it exists ---
174+
// This ensures the main crate page is always included if present.
165175
let mut paths_to_process: Vec<PathBuf> = Vec::new();
166176
let root_index_path = docs_path.join("index.html");
167177
if root_index_path.is_file() {
168178
paths_to_process.push(root_index_path);
169179
}
180+
// Also check for a root README.md
181+
let root_readme_path = docs_path.join("README.md");
182+
if root_readme_path.is_file() && !paths_to_process.contains(&root_readme_path) { // Avoid adding if index.html was README.md (unlikely)
183+
paths_to_process.push(root_readme_path);
184+
}
170185

171-
// --- Filter based on duplicates and size ---
172-
// NOTE: Initialization of paths_to_process moved before this loop
186+
187+
// --- Filter based on duplicates (keep largest HTML) and ignore source view ---
173188
for (basename, mut paths) in basename_groups {
174-
// Always ignore index.html at this stage (except the root one added earlier)
175-
if basename == "index.html" {
189+
// Always ignore index.html and README.md at this stage, as the root ones were handled above.
190+
// This prevents including module index pages or nested readmes multiple times if they share names.
191+
if basename == "index.html" || basename == "README.md" {
176192
continue;
177193
}
178194

179-
// Also ignore files within source code view directories
195+
// Also ignore files within source code view directories (e.g., `doc/src/...`)
180196
// Check the first path (they should share the problematic component if any)
181-
if paths.first().map_or(false, |p| p.components().any(|comp| comp.as_os_str() == "src")) {
197+
if paths.first().map_or(false, |p| p.components().any(|comp| comp.as_os_str() == OsStr::new("src"))) {
198+
eprintln!("[DEBUG] Ignoring file in src view: {}", paths.first().unwrap().display());
182199
continue;
183200
}
184201

185202

186203
if paths.len() == 1 {
187-
// Single file with this basename (and not index.html), keep it
204+
// Single file with this basename (and not index.html/README.md), keep it
188205
paths_to_process.push(paths.remove(0));
189206
} else {
190-
// Multiple files with the same basename (duplicates)
191-
// Find the largest one by file size
207+
// Multiple files with the same basename (likely HTML duplicates)
208+
// Find the largest one by file size - typically the main definition page vs. re-exports.
192209
// Explicit type annotation needed for the error type in try_fold
193210
let largest_path_result: Result<Option<(PathBuf, u64)>, std::io::Error> = paths.into_iter().try_fold(None::<(PathBuf, u64)>, |largest, current| {
211+
// Only consider HTML files for size comparison duplicate resolution
212+
if current.extension().map_or(false, |ext| ext != "html") {
213+
return Ok(largest); // Skip non-HTML files in this check
214+
}
194215
let current_meta = fs::metadata(&current)?;
195216
let current_size = current_meta.len();
196217
match largest {
@@ -211,68 +232,84 @@ edition = "2021"
211232
paths_to_process.push(p);
212233
}
213234
Ok(None) => {
214-
// This case should ideally not happen if the input `paths` was not empty,
215-
// but handle it defensively.
216-
eprintln!("[WARN] No files found for basename '{}' during size comparison.", basename);
235+
// This case might happen if all duplicates were non-HTML
236+
eprintln!("[WARN] No HTML files found for basename '{}' during size comparison, or group was empty.", basename);
217237
}
218238
Err(e) => {
219-
eprintln!("[WARN] Error getting metadata for basename '{}', skipping: {}", basename, e);
220-
// Decide if you want to skip the whole group or handle differently
239+
eprintln!("[WARN] Error getting metadata for basename '{}', skipping group: {}", basename, e);
240+
// Skip the whole group if metadata fails
221241
}
222242
}
223243
}
224244
}
225245

226-
eprintln!("[DEBUG] Filtered down to {} files to process.", paths_to_process.len());
246+
eprintln!("[DEBUG] Filtered down to {} unique files/paths to process.", paths_to_process.len());
227247

228248

229249
// --- Process the filtered list of files ---
230250
for path in paths_to_process {
231-
// Calculate path relative to the docs_path root
232-
let relative_path = match path.strip_prefix(&docs_path) {
251+
// Calculate path relative to the docs_path root for storing in Document
252+
let relative_path = match path.strip_prefix(docs_path) {
233253
Ok(p) => p.to_path_buf(),
234-
Err(e) => {
254+
Err(e) => {
235255
eprintln!("[WARN] Failed to strip prefix {} from {}: {}", docs_path.display(), path.display(), e);
236256
continue; // Skip if path manipulation fails
237257
}
238258
};
239259
let path_str = relative_path.to_string_lossy().to_string();
240260

241-
let html_content = match fs::read_to_string(&path) { // Read from the absolute path
242-
Ok(content) => content,
243-
Err(e) => {
244-
eprintln!("[WARN] Failed to read file {}: {}", path.display(), e);
245-
continue; // Skip this file if reading fails
246-
}
261+
let file_content = match fs::read_to_string(&path) { // Read from the absolute path
262+
Ok(content) => content,
263+
Err(e) => {
264+
eprintln!("[WARN] Failed to read file {}: {}", path.display(), e);
265+
continue; // Skip this file if reading fails
266+
}
247267
};
248268

249-
let document = Html::parse_document(&html_content);
250-
251-
if let Some(main_content_element) = document.select(&content_selector).next() {
252-
let text_content: String = main_content_element
253-
.text()
254-
.map(|s| s.trim())
255-
.filter(|s| !s.is_empty())
256-
.collect::<Vec<&str>>()
257-
.join("\n");
258-
259-
if !text_content.is_empty() {
269+
// Check file extension to decide processing method
270+
if path.extension().map_or(false, |ext| ext == "html") {
271+
// Process HTML using scraper
272+
let html_document = Html::parse_document(&file_content);
273+
if let Some(main_content_element) = html_document.select(&content_selector).next() {
274+
let text_content: String = main_content_element
275+
.text()
276+
.map(|s| s.trim())
277+
.filter(|s| !s.is_empty())
278+
.collect::<Vec<&str>>()
279+
.join("\n");
280+
281+
if !text_content.is_empty() {
282+
documents.push(Document {
283+
path: path_str,
284+
content: text_content,
285+
});
286+
} else {
287+
// eprintln!("[DEBUG] No text content found in main section for HTML: {}", path.display());
288+
}
289+
} else {
290+
// eprintln!("[DEBUG] 'main-content' selector not found for HTML: {}", path.display());
291+
}
292+
} else if path.extension().map_or(false, |ext| ext == "md") {
293+
// Process Markdown: Use raw content
294+
if !file_content.trim().is_empty() {
260295
documents.push(Document {
261296
path: path_str,
262-
content: text_content,
297+
content: file_content, // Store the raw Markdown content
263298
});
264299
} else {
265-
// eprintln!("[DEBUG] No text content found in main section for: {}", path.display());
300+
eprintln!("[DEBUG] Skipping empty Markdown file: {}", path.display());
266301
}
267302
} else {
268-
// eprintln!("[DEBUG] 'main-content' selector not found for: {}", path.display());
303+
// Should not happen due to WalkDir filter, but handle defensively
304+
eprintln!("[WARN] Skipping file with unexpected extension: {}", path.display());
269305
}
270306
}
271307

272308
eprintln!("Finished document loading. Found {} final documents.", documents.len());
273309
Ok(documents)
274310
}
275311

312+
276313
/// Finds the correct documentation directory for a specific crate within a base 'doc' directory.
277314
///
278315
/// Handles cases where multiple subdirectories might exist (e.g., due to dependencies)
@@ -361,9 +398,10 @@ fn find_documentation_path(base_doc_path: &Path, crate_name: &str) -> Result<Pat
361398
#[cfg(test)]
362399
mod tests {
363400
use super::*;
364-
use std::fs;
401+
use std::fs::{self, File};
365402
use std::io::Write;
366403
use tempfile::tempdir;
404+
use std::path::Path; // Add Path import
367405

368406
// Helper to create dummy doc structure including index.html content
369407
fn setup_test_dir_with_titles(base: &Path, dirs: &[(&str, Option<&str>)]) -> std::io::Result<()> {
@@ -392,6 +430,42 @@ mod tests {
392430
Ok(())
393431
}
394432

433+
/// Creates a mock documentation directory containing HTML and Markdown files.
///
/// The tree written under `base_path` is:
///
/// * `index.html` - root index with a `section#main-content.content` element.
/// * `README.md` - root Markdown file.
/// * `module/struct.MyStruct.html` - regular HTML page; deliberately the larger
///   of the two `struct.MyStruct.html` files.
/// * `module/GUIDE.md` - Markdown file in a subdirectory.
/// * `src/my_crate/lib.rs.html` - HTML page inside a `src` directory.
/// * `duplicate/struct.MyStruct.html` - smaller file sharing a basename with the
///   one in `module/`, and without a main-content section.
///
/// Every file's contents are written followed by a single trailing newline.
///
/// # Errors
///
/// Returns the first I/O error hit while creating a directory or writing a file.
fn setup_mock_docs(base_path: &Path) -> std::io::Result<()> {
    /// Writes `contents` plus a trailing newline to `dir/file_name`,
    /// creating `dir` (and any missing parents) first.
    fn write_fixture(dir: &Path, file_name: &str, contents: &str) -> std::io::Result<()> {
        // `create_dir_all` succeeds when the directory already exists, so it is
        // safe to call for every fixture, including those directly in the root.
        fs::create_dir_all(dir)?;
        let mut file = File::create(dir.join(file_name))?;
        writeln!(file, "{}", contents)
    }

    let module_dir = base_path.join("module");

    // Root index.html (should be processed)
    write_fixture(
        base_path,
        "index.html",
        "<!DOCTYPE html><html><head><title>Root Crate - Rust</title></head><body><section id='main-content' class='content'>Root Index Content</section></body></html>",
    )?;

    // A regular HTML file (should be processed); larger than its duplicate below
    write_fixture(
        &module_dir,
        "struct.MyStruct.html",
        "<!DOCTYPE html><html><head><title>MyStruct - Rust</title></head><body><section id='main-content' class='content'>MyStruct Content Larger</section></body></html>",
    )?;

    // A Markdown file (should be processed, raw content)
    write_fixture(
        base_path,
        "README.md",
        "# Project Readme\n\nThis is the content.",
    )?;

    // An HTML file inside a 'src' directory (should be ignored)
    write_fixture(
        &base_path.join("src").join("my_crate"),
        "lib.rs.html",
        "<html><body>Source Code View</body></html>",
    )?;

    // A smaller duplicate of struct.MyStruct.html in another directory
    // (only the largest file per basename should be kept)
    write_fixture(
        &base_path.join("duplicate"),
        "struct.MyStruct.html",
        "<html><body>Smaller Duplicate Content</body></html>",
    )?;

    // Another Markdown file in a subdirectory
    write_fixture(&module_dir, "GUIDE.md", "## Guide\n\nMore details here.")
}
468+
395469

396470
#[test]
397471
fn test_find_docs_no_dirs() -> Result<(), Box<dyn std::error::Error>> {
@@ -522,4 +596,52 @@ mod tests {
522596
Ok(())
523597
}
524598

599+
/// Verifies that `process_documentation_directory` returns both HTML and
/// Markdown documents for the mock tree built by `setup_mock_docs`.
///
/// Expected result: exactly four documents (root `index.html`,
/// `module/struct.MyStruct.html`, `README.md`, `module/GUIDE.md`), with nothing
/// from a `src` directory and without the smaller duplicate of
/// `struct.MyStruct.html`.
///
/// Paths are compared as `Path`s (component-wise) rather than as raw strings,
/// so the assertions do not depend on the platform's path separator.
#[test]
fn test_process_documentation_directory_includes_md() -> Result<(), Box<dyn std::error::Error>> {
    /// Returns the content of the document whose relative path equals `relative`.
    fn content_of<'a>(documents: &'a [Document], relative: &str) -> Option<&'a str> {
        documents
            .iter()
            .find(|doc| Path::new(&doc.path) == Path::new(relative))
            .map(|doc| doc.content.as_str())
    }

    let temp = tempdir()?;
    let docs_path = temp.path();
    setup_mock_docs(docs_path)?;

    let documents = process_documentation_directory(docs_path)?;

    // Debug print; the test harness only shows this output when the test fails.
    for doc in &documents {
        eprintln!("Found doc: path='{}', content='{}'", doc.path, doc.content.chars().take(50).collect::<String>());
    }

    assert_eq!(documents.len(), 4, "Should find root index.html, MyStruct.html, README.md, and GUIDE.md");

    // Check for specific documents (order might vary, so look each one up by path)
    let index = content_of(&documents, "index.html")
        .expect("Root index.html content not found or incorrect");
    assert!(index.contains("Root Index Content"), "Root index.html content not found or incorrect");

    // Path relative to docs_path; the content must come from the larger duplicate
    let my_struct = content_of(&documents, "module/struct.MyStruct.html")
        .expect("MyStruct.html content not found or incorrect");
    assert!(my_struct.contains("MyStruct Content Larger"), "MyStruct.html content not found or incorrect");

    let readme = content_of(&documents, "README.md")
        .expect("README.md content not found or incorrect");
    assert!(readme.contains("# Project Readme"), "README.md content not found or incorrect");
    assert!(readme.contains("This is the content."), "README.md content not found or incorrect");

    let guide = content_of(&documents, "module/GUIDE.md")
        .expect("module/GUIDE.md content not found or incorrect");
    assert!(guide.contains("## Guide"), "module/GUIDE.md content not found or incorrect");
    assert!(guide.contains("More details here."), "module/GUIDE.md content not found or incorrect");

    // Verify ignored files are not present
    assert!(
        !documents
            .iter()
            .any(|d| Path::new(&d.path).components().any(|part| part.as_os_str() == "src")),
        "Should ignore files in 'src/' directories"
    );
    // Check that the smaller duplicate wasn't included
    assert!(
        !documents
            .iter()
            .any(|d| Path::new(&d.path) == Path::new("duplicate/struct.MyStruct.html")),
        "Smaller duplicate should be ignored"
    );

    Ok(())
}
646+
525647
}

0 commit comments

Comments
 (0)