11use crate :: {
22 Config ,
3+ db:: mimes,
34 docbuilder:: Limits ,
45 impl_axum_webpage,
5- utils:: { ConfigName , get_config} ,
6+ utils:: { ConfigName , get_config, report_error } ,
67 web:: {
78 AxumErrorPage ,
89 error:: { AxumNope , AxumResult } ,
910 extractors:: { DbConnection , Path } ,
1011 page:: templates:: { RenderBrands , RenderSolid , filters} ,
1112 } ,
1213} ;
14+ use anyhow:: Context as _;
1315use askama:: Template ;
14- use axum:: { extract:: Extension , http:: StatusCode , response:: IntoResponse } ;
16+ use async_stream:: stream;
17+ use axum:: {
18+ body:: { Body , Bytes } ,
19+ extract:: Extension ,
20+ http:: StatusCode ,
21+ response:: IntoResponse ,
22+ } ;
23+ use axum_extra:: { TypedHeader , headers:: ContentType } ;
1524use chrono:: { TimeZone , Utc } ;
16- use futures_util:: stream :: TryStreamExt ;
25+ use futures_util:: { StreamExt as _ , pin_mut } ;
1726use std:: sync:: Arc ;
27+ use tracing:: { Span , error} ;
28+ use tracing_futures:: Instrument as _;
1829
1930/// sitemap index
2031#[ derive( Template ) ]
21- #[ template( path = "core/sitemapindex .xml" ) ]
32+ #[ template( path = "core/sitemap/index .xml" ) ]
2233#[ derive( Debug , Clone , PartialEq , Eq ) ]
2334struct SitemapIndexXml {
2435 sitemaps : Vec < char > ,
@@ -35,25 +46,17 @@ pub(crate) async fn sitemapindex_handler() -> impl IntoResponse {
3546 SitemapIndexXml { sitemaps }
3647}
3748
/// A single entry of a per-letter sitemap, rendered on its own through the
/// `core/sitemap/_item.xml` template.
///
/// `sitemap_handler` builds one value per database row and renders it to a
/// chunk of the streamed response body:
/// - `crate_name`: the crate's name as returned by the query.
/// - `last_modified`: the latest release time (never earlier than 2022-08-28),
///   formatted with chrono's `%+` specifier.
/// - `target_name`: the release's `target_name` column.
49+ #[ derive( Template ) ]
50+ #[ template( path = "core/sitemap/_item.xml" ) ]
3851#[ derive( Debug , Clone , PartialEq , Eq ) ]
39- struct SitemapRow {
52+ struct SitemapItemXml {
4053 crate_name : String ,
4154 last_modified : String ,
4255 target_name : String ,
4356}
4457
45- /// The sitemap
46- #[ derive( Template ) ]
47- #[ template( path = "core/sitemap.xml" ) ]
48- #[ derive( Debug , Clone , PartialEq , Eq ) ]
49- struct SitemapXml {
50- releases : Vec < SitemapRow > ,
51- }
52-
53- impl_axum_webpage ! {
54- SitemapXml ,
55- content_type = "application/xml" ,
56- }
/// Static opening part of a sitemap document, embedded into the binary at
/// compile time. `sitemap_handler` sends it verbatim as the first body chunk.
58+ const SITEMAP_HEADER : & [ u8 ] = include_bytes ! ( "./../../templates/core/sitemap/_header.xml" ) ;
/// Static closing part of a sitemap document, embedded at compile time.
/// `sitemap_handler` sends it verbatim after the last rendered item.
59+ const SITEMAP_FOOTER : & [ u8 ] = include_bytes ! ( "./../../templates/core/sitemap/_footer.xml" ) ;
5760
/// Serves the sitemap for all crates whose name starts with `letter`.
///
/// After an early `AxumNope::ResourceNotFound` return for rejected input, the
/// response is produced as a stream instead of being collected in memory:
/// the static `SITEMAP_HEADER`, then one rendered `SitemapItemXml` chunk per
/// database row, then the static `SITEMAP_FOOTER`.
///
/// Failures while fetching a row or rendering an item are passed to
/// `report_error` and surfaced to the body stream as
/// `AxumNope::InternalError`. The response itself is sent with status 200 and
/// an XML content type before any row has been read.
5861pub ( crate ) async fn sitemap_handler (
5962 Path ( letter) : Path < String > ,
@@ -67,37 +70,86 @@ pub(crate) async fn sitemap_handler(
6770 return Err ( AxumNope :: ResourceNotFound ) ;
6871 }
6972
70- let releases: Vec < _ > = sqlx:: query!(
71- r#"SELECT crates.name,
72- releases.target_name,
73- MAX(releases.release_time) as "release_time!"
74- FROM crates
75- INNER JOIN releases ON releases.crate_id = crates.id
76- WHERE
77- rustdoc_status = true AND
78- crates.name ILIKE $1
79- GROUP BY crates.name, releases.target_name
80- "# ,
81- format!( "{letter}%" ) ,
82- )
83- . fetch ( & mut * conn)
84- . map_ok ( |row| SitemapRow {
85- crate_name : row. name ,
86- target_name : row
87- . target_name
88- . expect ( "when we have rustdoc_status=true, this field is filled" ) ,
89- last_modified : row
90- . release_time
91- // On Aug 27 2022 we added `<link rel="canonical">` to all pages,
92- // so they should all get recrawled if they haven't been since then.
93- . max ( Utc . with_ymd_and_hms ( 2022 , 8 , 28 , 0 , 0 , 0 ) . unwrap ( ) )
94- . format ( "%+" )
95- . to_string ( ) ,
// Capture the handler's current tracing span; it is attached to the stream
// below via `.instrument(..)`. Presumably needed because the body stream is
// polled after this function has returned — TODO confirm.
73+ let stream_span = Span :: current ( ) ;
74+
75+ let stream = stream ! ( {
// Running totals, only used for the size-limit alert at the end.
// `streamed_bytes` starts at the header length because the header is
// emitted unconditionally right below.
76+ let mut items: usize = 0 ;
77+ let mut streamed_bytes: usize = SITEMAP_HEADER . len( ) ;
78+
79+ yield Ok ( Bytes :: from_static( SITEMAP_HEADER ) ) ;
80+
// One row per (crate name, target name) pair with the newest release time,
// restricted to releases with `rustdoc_status = true` and crate names
// matching `{letter}%` case-insensitively (ILIKE).
81+ let result = sqlx:: query!(
82+ r#"SELECT crates.name,
83+ releases.target_name,
84+ MAX(releases.release_time) as "release_time!"
85+ FROM crates
86+ INNER JOIN releases ON releases.crate_id = crates.id
87+ WHERE
88+ rustdoc_status = true AND
89+ crates.name ILIKE $1
90+ GROUP BY crates.name, releases.target_name
91+ "# ,
92+ format!( "{letter}%" ) ,
93+ )
94+ . fetch( & mut * conn) ;
95+
// Pin the row stream in place so it can be polled with `next()`.
96+ pin_mut!( result) ;
97+ while let Some ( row) = result. next( ) . await {
// Database error: report it, emit an error item into the body stream
// and stop reading further rows.
98+ let row = match row. context( "error fetching row from database" ) {
99+ Ok ( row) => row,
100+ Err ( err) => {
101+ report_error( & err) ;
102+ yield Err ( AxumNope :: InternalError ( err) ) ;
// NOTE(review): `break` only leaves the row loop; control then continues
// to the footer `yield` and the limit check below. If the consumer polls
// the stream again after an `Err`, the footer would follow the error —
// confirm whether ending the stream here is what is intended.
103+ break ;
104+ }
105+ } ;
106+
// Render this row as one XML item; the rendered string becomes one body chunk.
107+ match ( SitemapItemXml {
108+ crate_name: row. name,
109+ target_name: row
110+ . target_name
111+ . expect( "when we have rustdoc_status=true, this field is filled" ) ,
112+ last_modified: row
113+ . release_time
114+ // On Aug 27 2022 we added `<link rel="canonical">` to all pages,
115+ // so they should all get recrawled if they haven't been since then.
116+ . max( Utc . with_ymd_and_hms( 2022 , 8 , 28 , 0 , 0 , 0 ) . unwrap( ) )
117+ . format( "%+" )
118+ . to_string( ) ,
119+ }
120+ . render( )
121+ . context( "error when rendering sitemap item xml" ) )
122+ {
123+ Ok ( item) => {
124+ let bytes = Bytes :: from( item) ;
125+ items += 1 ;
126+ streamed_bytes += bytes. len( ) ;
127+ yield Ok ( bytes) ;
128+ }
// Render error: handled the same way as a fetch error above, including
// the fall-through to the footer after `break`.
129+ Err ( err) => {
130+ report_error( & err) ;
131+ yield Err ( AxumNope :: InternalError ( err) ) ;
132+ break ;
133+ }
134+ } ;
135+ }
136+
// Close the document. This is reached both after the last row and after a
// `break` from one of the error arms above.
137+ streamed_bytes += SITEMAP_FOOTER . len( ) ;
138+ yield Ok ( Bytes :: from_static( SITEMAP_FOOTER ) ) ;
139+
// The check runs after everything has already been streamed, so it can only
// log; it does not truncate or reject an oversized sitemap.
140+ if items > 50_000 || streamed_bytes > 50 * 1024 * 1024 {
141+ // alert when sitemap limits are reached
142+ // https://developers.google.com/search/docs/crawling-indexing/sitemaps/build-sitemap#general-guidelines
143+ error!( items, streamed_bytes, letter, "sitemap limits exceeded" )
144+ }
96145 } )
97- . try_collect ( )
98- . await ?;
146+ . instrument ( stream_span) ;
99147
100- Ok ( SitemapXml { releases } )
// Status and content type are fixed up front; the body is produced lazily
// from `stream` as the response is written.
148+ Ok ( (
149+ StatusCode :: OK ,
150+ TypedHeader ( ContentType :: from ( mimes:: APPLICATION_XML . clone ( ) ) ) ,
151+ Body :: from_stream ( stream) ,
152+ ) )
101153}
102154
103155#[ derive( Template ) ]
0 commit comments