Skip to content

Commit 6830774

Browse files
committed
fix(doc_loader): preserve raw content for source files
Updates the document processing logic to treat rustdoc-generated source views (files ending in `.rs.html`) like Markdown files: their raw HTML content is preserved instead of having its tags stripped, so that the full source code context is available for embeddings. Regular HTML documentation files are unaffected and continue to have only the text of their main content section extracted.
1 parent 672165b commit 6830774

File tree

1 file changed

+35
-12
lines changed

1 file changed

+35
-12
lines changed

src/doc_loader.rs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -267,8 +267,19 @@ fn process_documentation_directory(docs_path: &Path) -> Result<Vec<Document>, Do
267267
};
268268

269269
// Check file extension to decide processing method
270-
if path.extension().map_or(false, |ext| ext == "html") {
271-
// Process HTML using scraper
270+
let extension = path.extension().and_then(OsStr::to_str);
271+
let path_str_for_check = path.to_string_lossy(); // For checking substrings
272+
273+
if extension == Some("md") {
274+
// Process Markdown: Use raw content
275+
if !file_content.trim().is_empty() {
276+
documents.push(Document {
277+
path: path_str,
278+
content: file_content, // Store the raw Markdown content
279+
});
280+
}
281+
} else if path_str_for_check.ends_with(".rs.html") { // Check for rust source files specifically
282+
// Process Rust source HTML view: Use raw content (like Markdown)
272283
let html_document = Html::parse_document(&file_content);
273284
if let Some(main_content_element) = html_document.select(&content_selector).next() {
274285
let text_content: String = main_content_element
@@ -289,19 +300,31 @@ fn process_documentation_directory(docs_path: &Path) -> Result<Vec<Document>, Do
289300
} else {
290301
// eprintln!("[DEBUG] 'main-content' selector not found for HTML: {}", path.display());
291302
}
292-
} else if path.extension().map_or(false, |ext| ext == "md") {
293-
// Process Markdown: Use raw content
294-
if !file_content.trim().is_empty() {
295-
documents.push(Document {
296-
path: path_str,
297-
content: file_content, // Store the raw Markdown content
298-
});
303+
} else if extension == Some("html") { // Process other HTML using scraper
304+
// Process regular HTML using scraper
305+
let html_document = Html::parse_document(&file_content);
306+
if let Some(main_content_element) = html_document.select(&content_selector).next() {
307+
let text_content: String = main_content_element
308+
.text()
309+
.map(|s| s.trim())
310+
.filter(|s| !s.is_empty())
311+
.collect::<Vec<&str>>()
312+
.join("\n");
313+
314+
if !text_content.is_empty() {
315+
documents.push(Document {
316+
path: path_str,
317+
content: text_content,
318+
});
319+
} else {
320+
// eprintln!("[DEBUG] No text content found in main section for HTML: {}", path.display());
321+
}
299322
} else {
300-
eprintln!("[DEBUG] Skipping empty Markdown file: {}", path.display());
323+
// eprintln!("[DEBUG] 'main-content' selector not found for HTML: {}", path.display());
301324
}
302325
} else {
303-
// Should not happen due to WalkDir filter, but handle defensively
304-
eprintln!("[WARN] Skipping file with unexpected extension: {}", path.display());
326+
// Should not happen due to WalkDir filter, but handle defensively
327+
eprintln!("[WARN] Skipping file with unexpected extension: {}", path.display());
305328
}
306329
}
307330

0 commit comments

Comments
 (0)