Skip to content
This repository was archived by the owner on Jan 2, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 56 additions & 37 deletions server/bleep/src/indexes/doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ impl Doc {
})
}

async fn set_title<'a, E>(&self, title: &str, id: i64, executor: E) -> Result<(), Error>
async fn set_title<'a, E>(&self, title: &str, id: i64, executor: &'a mut E) -> Result<(), Error>
where
E: sqlx::Executor<'a, Database = sqlx::Sqlite>,
&'a mut E: sqlx::Executor<'a, Database = sqlx::Sqlite>,
{
sqlx::query! {
"UPDATE docs SET name = ? WHERE id = ?",
Expand All @@ -149,9 +149,14 @@ impl Doc {
.map_err(Error::Sql)
}

async fn set_favicon<'a, E>(&self, favicon: &str, id: i64, executor: E) -> Result<(), Error>
async fn set_favicon<'a, E>(
&self,
favicon: &str,
id: i64,
executor: &'a mut E,
) -> Result<(), Error>
where
E: sqlx::Executor<'a, Database = sqlx::Sqlite>,
&'a mut E: sqlx::Executor<'a, Database = sqlx::Sqlite>,
{
sqlx::query! {
"UPDATE docs SET favicon = ? WHERE id = ?",
Expand All @@ -168,10 +173,10 @@ impl Doc {
&self,
description: &str,
id: i64,
executor: E,
executor: &'a mut E,
) -> Result<(), Error>
where
E: sqlx::Executor<'a, Database = sqlx::Sqlite>,
&'a mut E: sqlx::Executor<'a, Database = sqlx::Sqlite>,
{
sqlx::query! {
"UPDATE docs SET description = ? WHERE id = ?",
Expand All @@ -184,6 +189,45 @@ impl Doc {
.map_err(Error::Sql)
}

/// Persist the scraped page metadata (title, favicon URL, description) for
/// the doc row `id`, issuing one UPDATE per present field via the `set_*`
/// helpers.
///
/// Errors from the individual updates are logged and swallowed — the
/// function returns `()` — so a partially-written metadata set is possible
/// by design.
///
/// The `for<'t> &'t mut E: Executor` bound lets the single `&mut E`
/// (e.g. a transaction) be implicitly reborrowed for each of the three
/// sequential queries instead of being moved into the first call.
async fn set_metadata<'a, E>(
    &self,
    metadata: &scraper::Meta,
    id: i64,
    doc_source: &url::Url,
    executor: &'a mut E,
) where
    for<'t> &'t mut E: sqlx::Executor<'t, Database = sqlx::Sqlite>,
{
    // set title
    if let Some(title) = &metadata.title {
        if let Err(e) = self.set_title(title, id, executor).await {
            error!(%e, %title, %id, "failed to set doc title");
        } else {
            info!(%id, %title, "doc title set");
        };
    }

    // set favicon; icon values that do not parse as absolute URLs are
    // resolved relative to the document's source URL
    if let Some(favicon) = &metadata.icon {
        let resolved_url = url::Url::parse(favicon)
            .unwrap_or_else(|_| normalize_absolute_url(doc_source, favicon));
        if let Err(e) = self.set_favicon(resolved_url.as_str(), id, executor).await {
            error!(%e, %favicon, %id, "failed to set doc icon");
        } else {
            info!(%id, %favicon, "doc icon set");
        };
    }

    // set description
    if let Some(description) = &metadata.description {
        if let Err(e) = self.set_description(description, id, executor).await {
            error!(%e, %description, %id, "failed to set doc description");
        } else {
            info!(%id, %description, "doc description set");
        };
    }
}

async fn set_index_status<'a, E>(&self, status: &str, id: i64, executor: E) -> Result<(), Error>
where
E: sqlx::Executor<'a, Database = sqlx::Sqlite>,
Expand Down Expand Up @@ -236,38 +280,10 @@ impl Doc {
// scraped url
if let Progress::Update(update) = progress.clone() {
discovered_count = update.discovered_count;
if !update.metadata.is_empty() && !is_meta_set {
if update.url == url || (!update.metadata.is_empty() && !is_meta_set) {
// do not set meta for this doc provider in subsequent turns
is_meta_set = true;

// set title
if let Some(title) = &update.metadata.title {
if let Err(e) = self.set_title(title, id, &mut transaction).await {
error!(%e, %title, %id, "failed to set doc title");
} else {
info!(%id, %title, "doc title set");
};
}

// set favicon
if let Some(favicon) = &update.metadata.icon {
let resolved_url = url::Url::parse(favicon)
.unwrap_or_else(|_| normalize_absolute_url(&url, favicon));
if let Err(e) = self.set_favicon(resolved_url.as_str(), id, &mut transaction).await {
error!(%e, %favicon, %id, "failed to set doc icon");
} else {
info!(%id, %favicon, "doc icon set");
};
}

// set description
if let Some(description) = &update.metadata.description {
if let Err(e) = self.set_description(description, id, &mut transaction).await {
error!(%e, %description, %id, "failed to set doc description");
} else {
info!(%id, %description, "doc description set");
};
}
self.set_metadata(&update.metadata, id, &url, &mut transaction).await;
};
}
yield progress;
Expand Down Expand Up @@ -708,6 +724,9 @@ impl Doc {
metadata: doc.meta.clone(),
});
yield progress;
if doc.is_empty() {
continue;
}
let doc_source = doc_source.clone();
let section_schema = self.section_schema.clone();
let index_writer = Arc::clone(&index_writer);
Expand Down Expand Up @@ -760,7 +779,7 @@ impl scraper::Document {
doc_id = %id,
"indexing doc",
);
scraper::chunk::by_section(&self.content)
scraper::chunk::by_section(self.content.as_deref().unwrap_or_default()) // this is an infallible unwrap however
.into_par_iter()
.filter_map(|section| {
let point_id = {
Expand Down
19 changes: 10 additions & 9 deletions server/bleep/src/scraper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,8 @@ impl Scraper {
trace!("task finished");
match h.await {
Ok(Ok(mut scraper_result)) => {
// insert doc into the stream if any
if let Some(d) = scraper_result.doc.take() {
self.visited_links.insert(d.url.to_string());
yield d;
}
// insert doc into the stream
yield scraper_result.doc;

// there could be dupes among the new urls, collect them into a set first
let new_urls = scraper_result
Expand Down Expand Up @@ -155,7 +152,7 @@ pub struct ScraperRequest {
}

pub struct ScraperResult {
pub doc: Option<Document>,
pub doc: Document,
pub new_urls: Vec<(usize, Url)>,
}

Expand All @@ -182,7 +179,7 @@ impl Config {
pub struct Document {
pub url: Url,
pub path: PathBuf,
pub content: String,
pub content: Option<String>,
pub meta: Meta,
}

Expand All @@ -196,6 +193,10 @@ impl Document {

base.make_relative(&other)
}

/// True when the scraper yielded no content for this document
/// (`content` is `None`).
pub fn is_empty(&self) -> bool {
    matches!(self.content, None)
}
}

#[derive(Default, Clone, serde::Serialize)]
Expand Down Expand Up @@ -257,12 +258,12 @@ async fn visit(ScraperRequest { url, depth }: ScraperRequest) -> Result<ScraperR
icon: article.content.icon.map(|c| c.to_string()),
};

let doc = content.map(|content| Document {
let doc = Document {
url,
path: doc_path,
content,
meta,
});
};

Ok(ScraperResult { doc, new_urls })
}