Skip to content

Commit 96f5274

Browse files
committed
stream sitemap xml to the client
1 parent 2682ed4 commit 96f5274

File tree

8 files changed

+114
-63
lines changed

8 files changed

+114
-63
lines changed

.sqlx/query-65b0ead56880b369931c3a5ec324910dde51096de4ee2ad868cc5025161ab466.json renamed to .sqlx/query-df1c002b7c4f5e2567eeefff56ee51003d90122f83429313966add5b224f5f6c.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/db/mimes.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ macro_rules! mime {
1010
mime!(APPLICATION_ZIP, "application/zip");
1111
mime!(APPLICATION_ZSTD, "application/zstd");
1212
mime!(APPLICATION_GZIP, "application/gzip");
13+
mime!(APPLICATION_XML, "application/xml");
1314
mime!(TEXT_MARKDOWN, "text/markdown");
1415
mime!(TEXT_RUST, "text/rust");
1516
mime!(TEXT_TOML, "text/toml");

src/web/sitemap.rs

Lines changed: 98 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,35 @@
11
use crate::{
22
Config,
3+
db::mimes,
34
docbuilder::Limits,
45
impl_axum_webpage,
5-
utils::{ConfigName, get_config},
6+
utils::{ConfigName, get_config, report_error},
67
web::{
78
AxumErrorPage,
89
error::{AxumNope, AxumResult},
910
extractors::{DbConnection, Path},
1011
page::templates::{RenderBrands, RenderSolid, filters},
1112
},
1213
};
14+
use anyhow::Context as _;
1315
use askama::Template;
14-
use axum::{extract::Extension, http::StatusCode, response::IntoResponse};
16+
use async_stream::stream;
17+
use axum::{
18+
body::{Body, Bytes},
19+
extract::Extension,
20+
http::StatusCode,
21+
response::IntoResponse,
22+
};
23+
use axum_extra::{TypedHeader, headers::ContentType};
1524
use chrono::{TimeZone, Utc};
16-
use futures_util::stream::TryStreamExt;
25+
use futures_util::{StreamExt as _, pin_mut};
1726
use std::sync::Arc;
27+
use tracing::{Span, error};
28+
use tracing_futures::Instrument as _;
1829

1930
/// sitemap index
2031
#[derive(Template)]
21-
#[template(path = "core/sitemapindex.xml")]
32+
#[template(path = "core/sitemap/index.xml")]
2233
#[derive(Debug, Clone, PartialEq, Eq)]
2334
struct SitemapIndexXml {
2435
sitemaps: Vec<char>,
@@ -35,25 +46,17 @@ pub(crate) async fn sitemapindex_handler() -> impl IntoResponse {
3546
SitemapIndexXml { sitemaps }
3647
}
3748

49+
#[derive(Template)]
50+
#[template(path = "core/sitemap/_item.xml")]
3851
#[derive(Debug, Clone, PartialEq, Eq)]
39-
struct SitemapRow {
52+
struct SitemapItemXml {
4053
crate_name: String,
4154
last_modified: String,
4255
target_name: String,
4356
}
4457

45-
/// The sitemap
46-
#[derive(Template)]
47-
#[template(path = "core/sitemap.xml")]
48-
#[derive(Debug, Clone, PartialEq, Eq)]
49-
struct SitemapXml {
50-
releases: Vec<SitemapRow>,
51-
}
52-
53-
impl_axum_webpage! {
54-
SitemapXml,
55-
content_type = "application/xml",
56-
}
58+
const SITEMAP_HEADER: &[u8] = include_bytes!("./../../templates/core/sitemap/_header.xml");
59+
const SITEMAP_FOOTER: &[u8] = include_bytes!("./../../templates/core/sitemap/_footer.xml");
5760

5861
pub(crate) async fn sitemap_handler(
5962
Path(letter): Path<String>,
@@ -67,37 +70,86 @@ pub(crate) async fn sitemap_handler(
6770
return Err(AxumNope::ResourceNotFound);
6871
}
6972

70-
let releases: Vec<_> = sqlx::query!(
71-
r#"SELECT crates.name,
72-
releases.target_name,
73-
MAX(releases.release_time) as "release_time!"
74-
FROM crates
75-
INNER JOIN releases ON releases.crate_id = crates.id
76-
WHERE
77-
rustdoc_status = true AND
78-
crates.name ILIKE $1
79-
GROUP BY crates.name, releases.target_name
80-
"#,
81-
format!("{letter}%"),
82-
)
83-
.fetch(&mut *conn)
84-
.map_ok(|row| SitemapRow {
85-
crate_name: row.name,
86-
target_name: row
87-
.target_name
88-
.expect("when we have rustdoc_status=true, this field is filled"),
89-
last_modified: row
90-
.release_time
91-
// On Aug 27 2022 we added `<link rel="canonical">` to all pages,
92-
// so they should all get recrawled if they haven't been since then.
93-
.max(Utc.with_ymd_and_hms(2022, 8, 28, 0, 0, 0).unwrap())
94-
.format("%+")
95-
.to_string(),
73+
let stream_span = Span::current();
74+
75+
let stream = stream!({
76+
let mut items: usize = 0;
77+
let mut streamed_bytes: usize = SITEMAP_HEADER.len();
78+
79+
yield Ok(Bytes::from_static(SITEMAP_HEADER));
80+
81+
let result = sqlx::query!(
82+
r#"SELECT crates.name,
83+
releases.target_name,
84+
MAX(releases.release_time) as "release_time!"
85+
FROM crates
86+
INNER JOIN releases ON releases.crate_id = crates.id
87+
WHERE
88+
rustdoc_status = true AND
89+
crates.name ILIKE $1
90+
GROUP BY crates.name, releases.target_name
91+
"#,
92+
format!("{letter}%"),
93+
)
94+
.fetch(&mut *conn);
95+
96+
pin_mut!(result);
97+
while let Some(row) = result.next().await {
98+
let row = match row.context("error fetching row from database") {
99+
Ok(row) => row,
100+
Err(err) => {
101+
report_error(&err);
102+
yield Err(AxumNope::InternalError(err));
103+
break;
104+
}
105+
};
106+
107+
match (SitemapItemXml {
108+
crate_name: row.name,
109+
target_name: row
110+
.target_name
111+
.expect("when we have rustdoc_status=true, this field is filled"),
112+
last_modified: row
113+
.release_time
114+
// On Aug 27 2022 we added `<link rel="canonical">` to all pages,
115+
// so they should all get recrawled if they haven't been since then.
116+
.max(Utc.with_ymd_and_hms(2022, 8, 28, 0, 0, 0).unwrap())
117+
.format("%+")
118+
.to_string(),
119+
}
120+
.render()
121+
.context("error when rendering sitemap item xml"))
122+
{
123+
Ok(item) => {
124+
let bytes = Bytes::from(item);
125+
items += 1;
126+
streamed_bytes += bytes.len();
127+
yield Ok(bytes);
128+
}
129+
Err(err) => {
130+
report_error(&err);
131+
yield Err(AxumNope::InternalError(err));
132+
break;
133+
}
134+
};
135+
}
136+
137+
streamed_bytes += SITEMAP_FOOTER.len();
138+
yield Ok(Bytes::from_static(SITEMAP_FOOTER));
139+
140+
if items > 50_000 || streamed_bytes > 50 * 1024 * 1024 {
141+
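// alert when sitemap limits are exceeded (more than 50_000 items or 50 MiB streamed).
// NOTE(review): each item renders two <url> entries (crate root + all.html), so the
// number of URLs is 2 * items — confirm whether the 50_000 threshold should count URLs.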
142+
// https://developers.google.com/search/docs/crawling-indexing/sitemaps/build-sitemap#general-guidelines
143+
error!(items, streamed_bytes, letter, "sitemap limits exceeded")
144+
}
96145
})
97-
.try_collect()
98-
.await?;
146+
.instrument(stream_span);
99147

100-
Ok(SitemapXml { releases })
148+
Ok((
149+
StatusCode::OK,
150+
TypedHeader(ContentType::from(mimes::APPLICATION_XML.clone())),
151+
Body::from_stream(stream),
152+
))
101153
}
102154

103155
#[derive(Template)]

templates/core/sitemap.xml

Lines changed: 0 additions & 15 deletions
This file was deleted.

templates/core/sitemap/_footer.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
</urlset>

templates/core/sitemap/_header.xml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

templates/core/sitemap/_item.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<url>
2+
<loc>https://docs.rs/{{ crate_name }}/latest/{{ target_name }}/</loc>
3+
<lastmod>{{ last_modified|escape_xml }}</lastmod>
4+
<priority>1.0</priority>
5+
</url>
6+
<url>
7+
<loc>https://docs.rs/{{ crate_name }}/latest/{{ target_name }}/all.html</loc>
8+
<lastmod>{{ last_modified|escape_xml }}</lastmod>
9+
<priority>0.8</priority>
10+
</url>

0 commit comments

Comments
 (0)