From 9de3b0ce95c6908acf1943a73def276fc9e7bce3 Mon Sep 17 00:00:00 2001 From: Akshay Date: Thu, 9 Nov 2023 17:11:56 +0000 Subject: [PATCH 1/2] permit all docs to contribute metadata documents that failed to parse into markdown were typically ejected from the stream, and were therefore unable to contribute metadata in the form of page titles & document count. now, they contribute metadata and are subsequently ejected from the stream. this scenario previously had strange side-effects with https://stripe.com/docs, where the metadata was scraped from the next available page, setting random page titles instead of "Stripe Documentation". --- server/bleep/src/indexes/doc.rs | 93 ++++++++++++++++++++------------- server/bleep/src/scraper.rs | 19 +++---- 2 files changed, 66 insertions(+), 46 deletions(-) diff --git a/server/bleep/src/indexes/doc.rs b/server/bleep/src/indexes/doc.rs index 6a0415c8e9..e4f1cb9141 100644 --- a/server/bleep/src/indexes/doc.rs +++ b/server/bleep/src/indexes/doc.rs @@ -134,9 +134,9 @@ impl Doc { }) } - async fn set_title<'a, E>(&self, title: &str, id: i64, executor: E) -> Result<(), Error> + async fn set_title<'a, E>(&self, title: &str, id: i64, executor: &'a mut E) -> Result<(), Error> where - E: sqlx::Executor<'a, Database = sqlx::Sqlite>, + &'a mut E: sqlx::Executor<'a, Database = sqlx::Sqlite>, { sqlx::query! { "UPDATE docs SET name = ? WHERE id = ?", @@ -149,9 +149,14 @@ impl Doc { .map_err(Error::Sql) } - async fn set_favicon<'a, E>(&self, favicon: &str, id: i64, executor: E) -> Result<(), Error> + async fn set_favicon<'a, E>( + &self, + favicon: &str, + id: i64, + executor: &'a mut E, + ) -> Result<(), Error> where - E: sqlx::Executor<'a, Database = sqlx::Sqlite>, + &'a mut E: sqlx::Executor<'a, Database = sqlx::Sqlite>, { sqlx::query! { "UPDATE docs SET favicon = ? 
WHERE id = ?", @@ -168,10 +173,10 @@ impl Doc { &self, description: &str, id: i64, - executor: E, + executor: &'a mut E, ) -> Result<(), Error> where - E: sqlx::Executor<'a, Database = sqlx::Sqlite>, + &'a mut E: sqlx::Executor<'a, Database = sqlx::Sqlite>, { sqlx::query! { "UPDATE docs SET description = ? WHERE id = ?", @@ -184,6 +189,45 @@ impl Doc { .map_err(Error::Sql) } + async fn set_metadata<'a, E>( + &self, + metadata: &scraper::Meta, + id: i64, + doc_source: &url::Url, + executor: &'a mut E, + ) where + for<'t> &'t mut E: sqlx::Executor<'t, Database = sqlx::Sqlite>, + { + // set title + if let Some(title) = &metadata.title { + if let Err(e) = self.set_title(title, id, executor).await { + error!(%e, %title, %id, "failed to set doc title"); + } else { + info!(%id, %title, "doc title set"); + }; + } + + // set favicon + if let Some(favicon) = &metadata.icon { + let resolved_url = url::Url::parse(favicon) + .unwrap_or_else(|_| normalize_absolute_url(&doc_source, favicon)); + if let Err(e) = self.set_favicon(resolved_url.as_str(), id, executor).await { + error!(%e, %favicon, %id, "failed to set doc icon"); + } else { + info!(%id, %favicon, "doc icon set"); + }; + } + + // set description + if let Some(description) = &metadata.description { + if let Err(e) = self.set_description(description, id, executor).await { + error!(%e, %description, %id, "failed to set doc description"); + } else { + info!(%id, %description, "doc description set"); + }; + } + } + async fn set_index_status<'a, E>(&self, status: &str, id: i64, executor: E) -> Result<(), Error> where E: sqlx::Executor<'a, Database = sqlx::Sqlite>, @@ -236,38 +280,10 @@ impl Doc { // scraped url if let Progress::Update(update) = progress.clone() { discovered_count = update.discovered_count; - if !update.metadata.is_empty() && !is_meta_set { + if update.url == url || (!update.metadata.is_empty() && !is_meta_set) { // do not set meta for this doc provider in subsequent turns is_meta_set = true; - - // set title 
- if let Some(title) = &update.metadata.title { - if let Err(e) = self.set_title(title, id, &mut transaction).await { - error!(%e, %title, %id, "failed to set doc title"); - } else { - info!(%id, %title, "doc title set"); - }; - } - - // set favicon - if let Some(favicon) = &update.metadata.icon { - let resolved_url = url::Url::parse(favicon) - .unwrap_or_else(|_| normalize_absolute_url(&url, favicon)); - if let Err(e) = self.set_favicon(resolved_url.as_str(), id, &mut transaction).await { - error!(%e, %favicon, %id, "failed to set doc icon"); - } else { - info!(%id, %favicon, "doc icon set"); - }; - } - - // set description - if let Some(description) = &update.metadata.description { - if let Err(e) = self.set_description(description, id, &mut transaction).await { - error!(%e, %description, %id, "failed to set doc description"); - } else { - info!(%id, %description, "doc description set"); - }; - } + self.set_metadata(&update.metadata, id, &url, &mut transaction).await; }; } yield progress; @@ -708,6 +724,9 @@ impl Doc { metadata: doc.meta.clone(), }); yield progress; + if doc.is_empty() { + continue; + } let doc_source = doc_source.clone(); let section_schema = self.section_schema.clone(); let index_writer = Arc::clone(&index_writer); @@ -760,7 +779,7 @@ impl scraper::Document { doc_id = %id, "indexing doc", ); - scraper::chunk::by_section(&self.content) + scraper::chunk::by_section(self.content.as_deref().unwrap_or_default()) // this is an infallible unwrap however .into_par_iter() .filter_map(|section| { let point_id = { diff --git a/server/bleep/src/scraper.rs b/server/bleep/src/scraper.rs index 66f0a3672a..ea56d60c4d 100644 --- a/server/bleep/src/scraper.rs +++ b/server/bleep/src/scraper.rs @@ -79,11 +79,8 @@ impl Scraper { trace!("task finished"); match h.await { Ok(Ok(mut scraper_result)) => { - // insert doc into the stream if any - if let Some(d) = scraper_result.doc.take() { - self.visited_links.insert(d.url.to_string()); - yield d; - } + // insert doc 
into the stream + yield scraper_result.doc; // there could be dupes among the new urls, collect them into a set first let new_urls = scraper_result @@ -155,7 +152,7 @@ pub struct ScraperRequest { } pub struct ScraperResult { - pub doc: Option, + pub doc: Document, pub new_urls: Vec<(usize, Url)>, } @@ -182,7 +179,7 @@ impl Config { pub struct Document { pub url: Url, pub path: PathBuf, - pub content: String, + pub content: Option, pub meta: Meta, } @@ -196,6 +193,10 @@ impl Document { base.make_relative(&other) } + + pub fn is_empty(&self) -> bool { + self.content.is_none() + } } #[derive(Default, Clone, serde::Serialize)] @@ -257,12 +258,12 @@ async fn visit(ScraperRequest { url, depth }: ScraperRequest) -> Result Date: Fri, 10 Nov 2023 09:50:41 +0000 Subject: [PATCH 2/2] clippy --- server/bleep/src/indexes/doc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/bleep/src/indexes/doc.rs b/server/bleep/src/indexes/doc.rs index e4f1cb9141..32bb5f8792 100644 --- a/server/bleep/src/indexes/doc.rs +++ b/server/bleep/src/indexes/doc.rs @@ -210,7 +210,7 @@ impl Doc { // set favicon if let Some(favicon) = &metadata.icon { let resolved_url = url::Url::parse(favicon) - .unwrap_or_else(|_| normalize_absolute_url(&doc_source, favicon)); + .unwrap_or_else(|_| normalize_absolute_url(doc_source, favicon)); if let Err(e) = self.set_favicon(resolved_url.as_str(), id, executor).await { error!(%e, %favicon, %id, "failed to set doc icon"); } else {