From 638615be3e920293d259ccef8434ea21e7587b25 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Mon, 20 Aug 2018 16:40:55 -0700 Subject: [PATCH 1/5] Combine thomcc/anonymize-places and thomcc/mentat-places-test into a single tool, and add it --- .gitignore | 4 + Cargo.toml | 1 + places-tool/Cargo.toml | 17 ++ places-tool/src/anonymize.rs | 122 ++++++++++++ places-tool/src/find_db.rs | 84 ++++++++ places-tool/src/main.rs | 131 +++++++++++++ places-tool/src/places-schema.edn | 51 +++++ places-tool/src/to_mentat.rs | 312 ++++++++++++++++++++++++++++++ 8 files changed, 722 insertions(+) create mode 100644 places-tool/Cargo.toml create mode 100644 places-tool/src/anonymize.rs create mode 100644 places-tool/src/find_db.rs create mode 100644 places-tool/src/main.rs create mode 100644 places-tool/src/places-schema.edn create mode 100644 places-tool/src/to_mentat.rs diff --git a/.gitignore b/.gitignore index 32f98fd2ce..b89b437177 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ Cargo.lock credentials.json *-engine.json .cargo +# Mentat database and journal files +*.db* +# Sqlite database and journal files +*.sqlite* diff --git a/Cargo.toml b/Cargo.toml index 929940e310..bad254dd3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "sync15-adapter", "sync15/passwords", "sync15/passwords/ffi", + "places-tool" ] # For RSA keys cloning. Remove once openssl 0.10.8+ is released. 
diff --git a/places-tool/Cargo.toml b/places-tool/Cargo.toml new file mode 100644 index 0000000000..791f162caa --- /dev/null +++ b/places-tool/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "places-tool" +version = "0.1.0" +authors = ["Thom Chiovoloni "] + +[dependencies] +mentat = { git = "https://github.com/mozilla/mentat.git", tag = "v0.11.0" } +rusqlite = { version = "0.13", features = ["bundled", "limits", "functions"] } +lazy_static = "1.1.0" +clap = "2.32.0" +log = "0.4.4" +env_logger = "0.5.12" +failure = "0.1.1" +tempfile = "3.0.3" +dirs = "1.0.3" +rand = "0.5.5" + diff --git a/places-tool/src/anonymize.rs b/places-tool/src/anonymize.rs new file mode 100644 index 0000000000..8b00d173e9 --- /dev/null +++ b/places-tool/src/anonymize.rs @@ -0,0 +1,122 @@ +use rand::{self, prelude::*}; +use std::collections::HashMap; +use rusqlite::{self, Connection, OpenFlags}; +use std::path::PathBuf; +use std::fs; +use failure; + + +#[derive(Default, Clone, Debug)] +struct StringAnonymizer { + table: HashMap, +} + +fn rand_string_of_len(len: usize) -> String { + let mut rng = thread_rng(); + rng.sample_iter(&rand::distributions::Alphanumeric).take(len).collect() +} + +impl StringAnonymizer { + fn anonymize(&mut self, s: &str) -> String { + if s.len() == 0 { + return "".into(); + } + if let Some(a) = self.table.get(s) { + return a.clone(); + } + for i in 0..10 { + let replacement = rand_string_of_len(s.len()); + // keep trying but force it at the last time + if self.table.get(&replacement).is_some() && i != 9 { + continue; + } + + self.table.insert(s.into(), replacement.clone()); + return replacement; + } + unreachable!("Bug in anonymize retry loop"); + } +} + +#[derive(Debug, Clone)] +struct TableInfo { + name: String, + cols: Vec +} + +impl TableInfo { + + fn for_table(name: String, conn: &Connection) -> Result { + let stmt = conn.prepare(&format!("SELECT * FROM {}", name))?; + let cols = stmt.column_names().into_iter().map(|x| x.to_owned()).collect(); + Ok(TableInfo { 
name, cols }) + } + + fn make_update(&self, updater_fn: &str) -> String { + let sets = self.cols.iter() + .map(|col| format!("{} = {}({})", col, updater_fn, col)) + .collect::>() + .join(",\n "); + format!("UPDATE {}\nSET {}", self.name, sets) + } +} + +fn anonymize(anon_places: &Connection) -> Result<(), failure::Error> { + { + let mut anonymizer = StringAnonymizer::default(); + anon_places.create_scalar_function("anonymize", 1, true, move |ctx| { + let arg = ctx.get::(0)?; + Ok(match arg { + rusqlite::types::Value::Text(s) => + rusqlite::types::Value::Text(anonymizer.anonymize(&s)), + not_text => not_text + }) + })?; + } + + let schema = { + let mut stmt = anon_places.prepare(" + SELECT name FROM sqlite_master + WHERE type = 'table' + AND name NOT LIKE 'sqlite_%' -- ('sqlite_sequence', 'sqlite_stat1', 'sqlite_master', anyt) + ")?; + let mut rows = stmt.query(&[])?; + let mut tables = vec![]; + while let Some(row_or_error) = rows.next() { + tables.push(TableInfo::for_table(row_or_error?.get("name"), &anon_places)?); + } + tables + }; + + for info in schema { + let sql = info.make_update("anonymize"); + debug!("Executing sql:\n{}", sql); + anon_places.execute(&sql, &[])?; + } + + debug!("Clearing places url_hash"); + anon_places.execute("UPDATE moz_places SET url_hash = 0", &[])?; + + Ok(()) +} + +#[derive(Debug, Clone)] +pub struct AnonymizePlaces { + pub input_path: PathBuf, + pub output_path: PathBuf, +} + +impl AnonymizePlaces { + + pub fn run(self) -> Result<(), failure::Error> { + fs::copy(&self.input_path, &self.output_path)?; + let anon_places = Connection::open_with_flags(&self.output_path, + OpenFlags::SQLITE_OPEN_READ_WRITE)?; + anonymize(&anon_places)?; + Ok(()) + } + +} + + + diff --git a/places-tool/src/find_db.rs b/places-tool/src/find_db.rs new file mode 100644 index 0000000000..6ab3c85b72 --- /dev/null +++ b/places-tool/src/find_db.rs @@ -0,0 +1,84 @@ +use failure; +use dirs; +use std::{process, fs, path::PathBuf}; + +#[derive(Clone, Debug, 
PartialEq)] +pub struct PlacesLocation { + pub profile_name: String, + pub path: PathBuf, + pub db_size: u64, +} + +impl PlacesLocation { + pub fn friendly_db_size(&self) -> String { + let sizes = [ + (1024 * 1024 * 1024, "Gb"), + (1024 * 1024, "Mb"), + (1024, "Kb"), + ]; + for (lim, suffix) in &sizes { + if self.db_size >= *lim { + + return format!("~{} {}", ((self.db_size as f64 / *lim as f64) * 10.0).round() / 10.0, suffix); + } + } + format!("{} bytes", self.db_size) + } +} + +pub fn get_all_places_dbs() -> Result, failure::Error> { + let mut path = match dirs::home_dir() { + Some(dir) => dir, + None => return Err(format_err!("No home directory found!")) + }; + if cfg!(windows) { + path.extend(&["AppData", "Roaming", "Mozilla", "Firefox", "Profiles"]); + } else { + let out = String::from_utf8( + process::Command::new("uname").args(&["-s"]).output()?.stdout)?; + info!("Uname says: {:?}", out); + if out.trim() == "Darwin" { + // ~/Library/Application Support/Firefox/Profiles + path.extend(&["Library", "Application Support", "Firefox", "Profiles"]); + } else { + // I'm not actually sure if this is true for all non-macos unix likes. + path.extend(&[".mozilla", "firefox"]); + } + } + debug!("Using profile path: {:?}", path); + let mut res = fs::read_dir(path)? 
+ .map(|entry_result| { + let entry = entry_result?; + trace!("Considering path {:?}", entry.path()); + if !entry.path().is_dir() { + trace!(" Not dir: {:?}", entry.path()); + return Ok(None); + } + let mut path = entry.path().to_owned(); + let profile_name = path.file_name().unwrap_or_default().to_str().ok_or_else(|| { + warn!(" Path has invalid UTF8: {:?}", path); + format_err!("Path has invalid UTF8: {:?}", path) + })?.into(); + path.push("places.sqlite"); + if !path.exists() { + return Ok(None); + } + let metadata = fs::metadata(&path)?; + let db_size = metadata.len(); + Ok(Some(PlacesLocation { + profile_name, + path, + db_size, + })) + }).filter_map(|result: Result, failure::Error>| { + match result { + Ok(val) => val, + Err(e) => { + debug!("Got error finding profile directory, skipping: {}", e); + None + } + } + }).collect::>(); + res.sort_by(|a, b| b.db_size.cmp(&a.db_size)); + Ok(res) +} diff --git a/places-tool/src/main.rs b/places-tool/src/main.rs new file mode 100644 index 0000000000..a1d3388da4 --- /dev/null +++ b/places-tool/src/main.rs @@ -0,0 +1,131 @@ +extern crate dirs; +#[macro_use] +extern crate failure; +#[macro_use] +extern crate log; +#[macro_use] +extern crate mentat; +extern crate rusqlite; + +#[macro_use] +extern crate lazy_static; +extern crate rand; +extern crate env_logger; +extern crate clap; +extern crate tempfile; + +use std::path::{Path, PathBuf}; +use std::fs; + +mod find_db; +mod anonymize; +mod to_mentat; + + +fn main() -> Result<(), failure::Error> { + let matches = clap::App::new("places-tool") + .subcommand(clap::SubCommand::with_name("to-mentat") + .about("Convert a places database to a mentat database") + .arg(clap::Arg::with_name("OUTPUT") + .index(1) + .help("Path where we should output the mentat db (defaults to ./mentat_places.db)")) + .arg(clap::Arg::with_name("PLACES") + .index(2) + .help("Path to places.sqlite. 
If not provided, we'll use the largest places.sqlite in your firefox profiles")) + .arg(clap::Arg::with_name("v") + .short("v") + .multiple(true) + .help("Sets the level of verbosity (pass up to 3 times for more verbosity -- e.g. -vvv enables trace logs)")) + .arg(clap::Arg::with_name("force") + .short("f") + .long("force") + .help("Overwrite OUTPUT if it already exists")) + .arg(clap::Arg::with_name("realistic") + .short("r") + .long("realistic") + .help("Insert everything with one transaction per visit. This is a lot slower, \ + but is a more realistic workload. It produces databases that are ~30% larger (for me)."))) + .subcommand(clap::SubCommand::with_name("anonymize") + .about("Anonymize a places database") + .arg(clap::Arg::with_name("OUTPUT") + .index(1) + .help("Path where we should output the anonymized db (defaults to places_anonymized.sqlite)")) + .arg(clap::Arg::with_name("PLACES") + .index(2) + .help("Path to places.sqlite. If not provided, we'll use the largest places.sqlite in your firefox profiles")) + .arg(clap::Arg::with_name("v") + .short("v") + .multiple(true) + .help("Sets the level of verbosity (pass up to 3 times for more verbosity -- e.g. 
-vvv enables trace logs)")) + .arg(clap::Arg::with_name("force") + .short("f") + .long("force") + .help("Overwrite OUTPUT if it already exists"))) + .get_matches(); + + let subcommand = matches.subcommand_name().map(|s| s.to_owned()).expect("Must provide subcommand"); + let is_anon = subcommand == "anonymize"; + let subcmd_matches = matches.subcommand_matches(&subcommand).unwrap(); + + env_logger::init_from_env(match subcmd_matches.occurrences_of("v") { + 0 => env_logger::Env::default().filter_or("RUST_LOG", "warn"), + 1 => env_logger::Env::default().filter_or("RUST_LOG", "info"), + 2 => env_logger::Env::default().filter_or("RUST_LOG", "debug"), + 3 | _ => env_logger::Env::default().filter_or("RUST_LOG", "trace"), + }); + + let places_db = if let Some(places) = subcmd_matches.value_of("PLACES") { + let meta = fs::metadata(&places)?; + find_db::PlacesLocation { + profile_name: "".into(), + path: fs::canonicalize(places)?, + db_size: meta.len(), + } + } else { + let mut dbs = find_db::get_all_places_dbs()?; + if dbs.len() == 0 { + error!("No dbs found!"); + return Err(format_err!("No dbs found!")); + } + for p in &dbs { + debug!("Found: profile {:?} with a {} places.sqlite", p.profile_name, p.friendly_db_size()) + } + info!("Using profile {:?}", dbs[0].profile_name); + dbs.into_iter().next().unwrap() + }; + + let out_db_path = subcmd_matches.value_of("OUTPUT").unwrap_or_else(|| { + if is_anon { + "./places_anonymized.sqlite" + } else { + "./mentat_places.db" + } + }).to_owned(); + + if Path::new(&out_db_path).exists() { + if subcmd_matches.is_present("force") { + info!("Deleting previous `{}` because -f was passed", out_db_path); + fs::remove_file(&out_db_path)?; + } else { + error!("{} already exists but `-f` argument was not provided", out_db_path); + return Err(format_err!("Output path already exists")); + } + } + + if is_anon { + let cmd = anonymize::AnonymizePlaces { + input_path: places_db.path, + output_path: PathBuf::from(out_db_path) + }; + cmd.run()?; + } 
else { + let cmd = to_mentat::PlacesToMentat { + mentat_db_path: PathBuf::from(out_db_path), + places_db_path: places_db.path, + one_tx_per_visit: subcmd_matches.is_present("realistic"), + }; + cmd.run()?; + } + + Ok(()) +} diff --git a/places-tool/src/places-schema.edn b/places-tool/src/places-schema.edn new file mode 100644 index 0000000000..8a0b55b783 --- /dev/null +++ b/places-tool/src/places-schema.edn @@ -0,0 +1,51 @@ +[ + { :db/ident :place/url + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + { :db/ident :place/url_hash + :db/valueType :db.type/long + :db/cardinality :db.cardinality/one + } + + { :db/ident :place/title + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + { :db/ident :place/description + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + { :db/ident :place/frecency + :db/valueType :db.type/long + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit/place + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit/type + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit/date + :db/valueType :db.type/instant + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit.type/link } + { :db/ident :visit.type/typed } + { :db/ident :visit.type/bookmark } + { :db/ident :visit.type/embed } + { :db/ident :visit.type/redirect_permanent } + { :db/ident :visit.type/redirect_temporary } + { :db/ident :visit.type/download } + { :db/ident :visit.type/framed_link } + { :db/ident :visit.type/reload } +] diff --git a/places-tool/src/to_mentat.rs b/places-tool/src/to_mentat.rs new file mode 100644 index 0000000000..82cf95cb42 --- /dev/null +++ b/places-tool/src/to_mentat.rs @@ -0,0 +1,312 @@ + +use failure; +use std::fs; +use std::io::{Write, self}; +use std::fmt::{Write as FmtWrite}; +use std::path::PathBuf; +use tempfile; + +use rusqlite::{ + Connection, + OpenFlags, + Row, +}; + +use mentat::{ 
+ self, + Store, + Keyword, + errors::Result as MentatResult, +}; + +#[derive(Debug, Clone)] +struct TransactBuilder { + counter: u64, + data: String, + total_terms: u64, + terms: u64, + max_buffer_size: usize +} + +impl TransactBuilder { + #[inline] + pub fn new_with_size(max_buffer_size: usize) -> Self { + Self { counter: 0, data: "[\n".into(), terms: 0, total_terms: 0, max_buffer_size } + } + + #[inline] + pub fn next_tempid(&mut self) -> u64 { + self.counter += 1; + self.counter + } + + #[inline] + pub fn add_ref_to_tmpid(&mut self, tmpid: u64, attr: &Keyword, ref_tmpid: u64) { + write!(self.data, " [:db/add \"{}\" {} \"{}\"]\n", tmpid, attr, ref_tmpid).unwrap(); + self.terms += 1; + self.total_terms += 1; + } + + #[inline] + pub fn add_inst(&mut self, tmpid: u64, attr: &Keyword, micros: i64) { + write!(self.data, " [:db/add \"{}\" {} #instmicros {}]\n", tmpid, attr, micros).unwrap(); + self.terms += 1; + self.total_terms += 1; + } + + #[inline] + pub fn add_kw(&mut self, tmpid: u64, attr: &Keyword, val: &Keyword) { + write!(self.data, " [:db/add \"{}\" {} {}]\n", tmpid, attr, val).unwrap(); + self.terms += 1; + self.total_terms += 1; + } + + #[inline] + pub fn add_str(&mut self, tmpid: u64, attr: &Keyword, val: &str) { + // {:?} escapes some chars EDN can't parse (e.g. \'...) 
+ let s = val.replace("\\", "\\\\").replace("\"", "\\\""); + write!(self.data, " [:db/add \"{}\" {} \"{}\"]\n", tmpid, attr, s).unwrap(); + self.terms += 1; + self.total_terms += 1; + } + + #[inline] + pub fn add_long(&mut self, tmpid: u64, attr: &Keyword, val: i64) { + write!(self.data, " [:db/add \"{}\" {} {}]\n", tmpid, attr, val).unwrap(); + self.terms += 1; + self.total_terms += 1; + } + + #[inline] + pub fn finish(&mut self) -> &str { + self.data.push(']'); + &self.data + } + + #[inline] + pub fn reset(&mut self) { + self.terms = 0; + self.data.clear(); + self.data.push_str("[\n") + } + + #[inline] + pub fn should_finish(&self) -> bool { + self.data.len() >= self.max_buffer_size + } + + #[inline] + pub fn maybe_transact(&mut self, store: &mut Store) -> MentatResult> { + if self.should_finish() { + Ok(self.transact(store)?) + } else { + Ok(None) + } + } + + #[inline] + pub fn transact(&mut self, store: &mut Store) -> MentatResult> { + if self.terms != 0 { + debug!("\nTransacting {} terms (total = {})", self.terms, self.total_terms); + let res = store.transact(self.finish()); + if res.is_err() { error!("Error transacting:\n{}", self.data); } + let report = res?; + self.reset(); + Ok(Some(report)) + } else { + Ok(None) + } + } +} + +lazy_static! 
{ + static ref PLACE_URL: Keyword = kw!(:place/url); + static ref PLACE_URL_HASH: Keyword = kw!(:place/url_hash); + static ref PLACE_TITLE: Keyword = kw!(:place/title); + static ref PLACE_DESCRIPTION: Keyword = kw!(:place/description); + static ref PLACE_FRECENCY: Keyword = kw!(:place/frecency); + static ref VISIT_PLACE: Keyword = kw!(:visit/place); + static ref VISIT_DATE: Keyword = kw!(:visit/date); + static ref VISIT_TYPE: Keyword = kw!(:visit/type); + + static ref VISIT_TYPES: Vec = vec![ + kw!(:visit.type/link), + kw!(:visit.type/typed), + kw!(:visit.type/bookmark), + kw!(:visit.type/embed), + kw!(:visit.type/redirect_permanent), + kw!(:visit.type/redirect_temporary), + kw!(:visit.type/download), + kw!(:visit.type/framed_link), + kw!(:visit.type/reload), + ]; +} + +#[derive(Debug, Clone)] +struct PlaceEntry { + pub id: i64, + pub url: String, + pub url_hash: i64, + pub description: Option, + pub title: String, + pub frecency: i64, + pub visits: Vec<(i64, &'static Keyword)>, +} + +impl PlaceEntry { + pub fn add(&self, builder: &mut TransactBuilder, store: &mut Store) -> Result<(), failure::Error> { + let place_id = builder.next_tempid(); + builder.add_str(place_id, &*PLACE_URL, &self.url); + builder.add_long(place_id, &*PLACE_URL_HASH, self.url_hash); + builder.add_str(place_id, &*PLACE_TITLE, &self.title); + if let Some(desc) = &self.description { + builder.add_str(place_id, &*PLACE_DESCRIPTION, desc); + } + + builder.add_long(place_id, &*PLACE_FRECENCY, self.frecency); + + assert!(self.visits.len() > 0); + + if builder.max_buffer_size == 0 { + let report = builder.transact(store)?.unwrap(); + let place_eid = report.tempids.get(&format!("{}", place_id)).unwrap(); + // One transaction per visit. 
+ for (microtime, visit_type) in &self.visits { + let visit_id = builder.next_tempid(); + builder.add_long(visit_id, &*VISIT_PLACE, *place_eid); + builder.add_inst(visit_id, &*VISIT_DATE, *microtime); + builder.add_kw(visit_id, &*VISIT_TYPE, visit_type); + builder.transact(store)?; + } + } else { + for (microtime, visit_type) in &self.visits { + let visit_id = builder.next_tempid(); + builder.add_ref_to_tmpid(visit_id, &*VISIT_PLACE, place_id); + builder.add_inst(visit_id, &*VISIT_DATE, *microtime); + builder.add_kw(visit_id, &*VISIT_TYPE, visit_type); + } + builder.maybe_transact(store)?; + } + Ok(()) + } + + pub fn from_row(row: &Row) -> PlaceEntry { + let transition_type: i64 = row.get("visit_type"); + PlaceEntry { + id: row.get("place_id"), + url: row.get("place_url"), + url_hash: row.get("place_url_hash"), + description: row.get("place_description"), + title: row.get::<_, Option>("place_title").unwrap_or("".into()), + frecency: row.get("place_frecency"), + visits: vec![(row.get("visit_date"), &VISIT_TYPES[(transition_type as usize).saturating_sub(1)])], + } + } +} + +#[derive(Debug, Clone)] +pub struct PlacesToMentat { + pub mentat_db_path: PathBuf, + pub places_db_path: PathBuf, + pub one_tx_per_visit: bool, +} + + +static SCHEMA: &'static str = include_str!("places-schema.edn"); + + +impl PlacesToMentat { + pub fn run(self) -> Result<(), failure::Error> { + + debug!("Copying places.sqlite to a temp file for reading"); + let temp_dir = tempfile::tempdir()?; + let temp_places_path = temp_dir.path().join("places.sqlite"); + + fs::copy(&self.places_db_path, &temp_places_path)?; + let places = Connection::open_with_flags(&temp_places_path, OpenFlags::SQLITE_OPEN_READ_ONLY)?; + + let mut store = Store::open_empty(self.mentat_db_path.to_str().unwrap())?; + + debug!("Transacting initial schema"); + store.transact(SCHEMA)?; + + let mut stmt = places.prepare(" + SELECT + p.id as place_id, + p.url as place_url, + p.url_hash as place_url_hash, + p.description as 
place_description, + p.title as place_title, + p.frecency as place_frecency, + v.visit_date as visit_date, + v.visit_type as visit_type + FROM moz_places p + JOIN moz_historyvisits v + ON p.id = v.place_id + ORDER BY p.id + ").unwrap(); + + let (place_count, visit_count) = { + let mut stmt = places.prepare("select count(*) from moz_places").unwrap(); + let mut rows = stmt.query(&[]).unwrap(); + let ps: i64 = rows.next().unwrap()?.get(0); + + let mut stmt = places.prepare("select count(*) from moz_historyvisits").unwrap(); + let mut rows = stmt.query(&[]).unwrap(); + let vs: i64 = rows.next().unwrap()?.get(0); + (ps, vs) + }; + + println!("Querying {} places ({} visits)", place_count, visit_count); + + let mut current_place = PlaceEntry { + id: -1, + url: "".into(), + url_hash: 0, + description: None, + title: "".into(), + frecency: 0, + visits: vec![], + }; + + let max_buffer_size = if self.one_tx_per_visit { 0 } else { 1024 * 1024 * 1024 * 1024 }; + + let mut builder = TransactBuilder::new_with_size(max_buffer_size); + + let mut so_far = 0; + let mut rows = stmt.query(&[])?; + + while let Some(row_or_error) = rows.next() { + let row = row_or_error?; + let id: i64 = row.get("place_id"); + if current_place.id == id { + let tty: i64 = row.get("visit_type"); + current_place.visits.push(( + row.get("visit_date"), + &VISIT_TYPES.get((tty.max(0) as usize).saturating_sub(1)) + .unwrap_or_else(|| &VISIT_TYPES[0]) + )); + continue; + } + + if current_place.id >= 0 { + current_place.add(&mut builder, &mut store)?; + // builder.maybe_transact(&mut store)?; + print!("\rProcessing {} / {} places (approx.)", so_far, place_count); + io::stdout().flush()?; + so_far += 1; + } + current_place = PlaceEntry::from_row(&row); + } + + if current_place.id >= 0 { + current_place.add(&mut builder, &mut store)?; + // builder.maybe_transact(&mut store)?; + println!("\rProcessing {} / {} places (approx.)", so_far + 1, place_count); + } + builder.transact(&mut store)?; + println!("Done!"); + 
Ok(()) + } +} + From 64557ea577449fc933b1364bf8fddfa73190751d Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Mon, 20 Aug 2018 20:59:24 -0700 Subject: [PATCH 2/5] Fixes for mentat/sqlite dependency hell --- places-tool/Cargo.toml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/places-tool/Cargo.toml b/places-tool/Cargo.toml index 791f162caa..3b09e1d767 100644 --- a/places-tool/Cargo.toml +++ b/places-tool/Cargo.toml @@ -4,8 +4,6 @@ version = "0.1.0" authors = ["Thom Chiovoloni "] [dependencies] -mentat = { git = "https://github.com/mozilla/mentat.git", tag = "v0.11.0" } -rusqlite = { version = "0.13", features = ["bundled", "limits", "functions"] } lazy_static = "1.1.0" clap = "2.32.0" log = "0.4.4" @@ -15,3 +13,15 @@ tempfile = "3.0.3" dirs = "1.0.3" rand = "0.5.5" +[dependencies.mentat] +git = "https://github.com/mozilla/mentat.git" +tag = "v0.11.0" +# It seems like we need to use sqlcipher since other projects in this +# workspace are using sqlcipher. Otherwise we get conflicts... 
+default_features = false +features = ["sqlcipher"] + +[dependencies.rusqlite] +version = "0.13" +features = ["sqlcipher", "limits", "functions"] + From 39529113beddcce0afdfa5d6f3ba90a4ae0b1917 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Tue, 21 Aug 2018 14:04:48 -0700 Subject: [PATCH 3/5] Make the schema closer to our whiteboard schema --- places-tool/src/initial-data.edn | 14 ++ places-tool/src/main.rs | 2 +- places-tool/src/places-schema.edn | 137 ++++++++++++++-- places-tool/src/to_mentat.rs | 254 +++++++++++++++++------------- 4 files changed, 283 insertions(+), 124 deletions(-) create mode 100644 places-tool/src/initial-data.edn diff --git a/places-tool/src/initial-data.edn b/places-tool/src/initial-data.edn new file mode 100644 index 0000000000..db042d0254 --- /dev/null +++ b/places-tool/src/initial-data.edn @@ -0,0 +1,14 @@ + +[ + {:db/id "container0" :container/name "Default"} + {:db/id "container1" :container/name "Personal"} + + {:db/id "device0" :device/name "My very cool computer" :device/type :device.type/desktop} + {:db/id "device1" :device/name "My cool phone" :device/type :device.type/mobile} + + ; We randomly assign one of these to each visit. 
+ {:context/id 0 :context/device "device0" :context/container "container0"} + {:context/id 1 :context/device "device1" :context/container "container0"} + {:context/id 2 :context/device "device0" :context/container "container1"} + {:context/id 3 :context/device "device1" :context/container "container1"} +] diff --git a/places-tool/src/main.rs b/places-tool/src/main.rs index a1d3388da4..7ef2e257d1 100644 --- a/places-tool/src/main.rs +++ b/places-tool/src/main.rs @@ -122,7 +122,7 @@ fn main() -> Result<(), failure::Error> { let cmd = to_mentat::PlacesToMentat { mentat_db_path: PathBuf::from(out_db_path), places_db_path: places_db.path, - one_tx_per_visit: subcmd_matches.is_present("realistic"), + realistic: subcmd_matches.is_present("realistic"), }; cmd.run()?; } diff --git a/places-tool/src/places-schema.edn b/places-tool/src/places-schema.edn index 8a0b55b783..6932802b83 100644 --- a/places-tool/src/places-schema.edn +++ b/places-tool/src/places-schema.edn @@ -1,35 +1,98 @@ [ - { :db/ident :place/url + + { :db/ident :origin/prefix :db/valueType :db.type/string :db/cardinality :db.cardinality/one } - { :db/ident :place/url_hash + { :db/ident :origin/host + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + ; used for lookup refs so that we aren't forced to insert everything in + ; one enormous transaction (making --realistic impossible) + + { :db/ident :origin/places_id :db/valueType :db.type/long :db/cardinality :db.cardinality/one + :db/unique :db.unique/identity + :db/index true + } + + ; Pages + + { :db/ident :page/url + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + :db/unique :db.unique/value + :db/index true ; required for unique/value. + ; TODO: should this be fulltext? 
+ } + + { :db/ident :page/origin + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one } - { :db/ident :place/title + ; Page metadata (frequently stuff from the HEAD of the page) + + { :db/ident :page_meta/title :db/valueType :db.type/string :db/cardinality :db.cardinality/one + ; TODO this probably should have :db/index true :db/fulltext true } - { :db/ident :place/description + ; This is stored in favicons.sqlite, which we don't read, so in practice + ; it's always empty >_>. + { :db/ident :page_meta/favicon_url :db/valueType :db.type/string :db/cardinality :db.cardinality/one } - { :db/ident :place/frecency + { :db/ident :page_meta/description + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + ; TODO: should this have index or fulltext? + } + + { :db/ident :page_meta/preview_image_url + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + ; A "context" is a tuple of (device, container). + + ; This ID is just so that we can insert with a lookup ref. + { :db/ident :context/id :db/valueType :db.type/long :db/cardinality :db.cardinality/one + :db/unique :db.unique/identity + :db/index true } - { :db/ident :visit/place + { :db/ident :context/device :db/valueType :db.type/ref :db/cardinality :db.cardinality/one } - { :db/ident :visit/type + { :db/ident :context/container + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + ; Visits + { :db/ident :visit/page_meta + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit/context + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit/page :db/valueType :db.type/ref :db/cardinality :db.cardinality/one } @@ -37,15 +100,55 @@ { :db/ident :visit/date :db/valueType :db.type/instant :db/cardinality :db.cardinality/one + :db/index true + } + + ; In the future we probably will always have sources. In practice, at the + ; moment, we only sometimes have them. 
Unfortunately, even if we do have them, + ; we fake it (and make `:visit/source_visit` point to the visit itself) + ; because otherwise this would be really tricky to do in --realistic mode. + ; + { :db/ident :visit/source_visit + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one } - { :db/ident :visit.type/link } - { :db/ident :visit.type/typed } - { :db/ident :visit.type/bookmark } - { :db/ident :visit.type/embed } - { :db/ident :visit.type/redirect_permanent } - { :db/ident :visit.type/redirect_temporary } - { :db/ident :visit.type/download } - { :db/ident :visit.type/framed_link } - { :db/ident :visit.type/reload } -] + ; { :db/ident :visit/source_redirect + ; :db/valueType :db.type/ref + ; :db/cardinality :db.cardinality/one + ; } + + ; Never used at the moment because I'm too lazy to try and model bookmarks. + ; { :db/ident :visit/source_bookmark + ; :db/valueType :db.type/ref + ; :db/cardinality :db.cardinality/one + ; } + + ; Device + + { :db/ident :device/name + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + { :db/ident :device/type + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + ; Several other things... + + ; (Open) enumeration of possible device types + { :db/ident :device.type/desktop } + { :db/ident :device.type/mobile } + + ; Container + { :db/ident :container/name + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + ; ... etc. We're omitting color, the fact that some origins will open in a + ; specific container by default, etc. 
+ + +] \ No newline at end of file diff --git a/places-tool/src/to_mentat.rs b/places-tool/src/to_mentat.rs index 82cf95cb42..435b70a981 100644 --- a/places-tool/src/to_mentat.rs +++ b/places-tool/src/to_mentat.rs @@ -5,6 +5,7 @@ use std::io::{Write, self}; use std::fmt::{Write as FmtWrite}; use std::path::PathBuf; use tempfile; +use rand::prelude::*; use rusqlite::{ Connection, @@ -35,44 +36,47 @@ impl TransactBuilder { } #[inline] - pub fn next_tempid(&mut self) -> u64 { + pub fn next_tempid(&mut self) -> String { self.counter += 1; - self.counter + self.counter.to_string() } #[inline] - pub fn add_ref_to_tmpid(&mut self, tmpid: u64, attr: &Keyword, ref_tmpid: u64) { - write!(self.data, " [:db/add \"{}\" {} \"{}\"]\n", tmpid, attr, ref_tmpid).unwrap(); + pub fn add_ref_to_tmpid(&mut self, tmpid: &str, attr: &Keyword, ref_tmpid: &str) { + write!(self.data, " [:db/add {:?} {} {:?}]\n", tmpid, attr, ref_tmpid).unwrap(); self.terms += 1; self.total_terms += 1; } #[inline] - pub fn add_inst(&mut self, tmpid: u64, attr: &Keyword, micros: i64) { - write!(self.data, " [:db/add \"{}\" {} #instmicros {}]\n", tmpid, attr, micros).unwrap(); + pub fn add_ref_to_lookup_ref_long(&mut self, + tmpid: &str, attr: &Keyword, + lookup_ref_attr: &Keyword, lookup_ref_val: i64) { + write!(self.data, " [:db/add {:?} {} (lookup-ref {} {})]\n", + tmpid, attr, lookup_ref_attr, lookup_ref_val).unwrap(); self.terms += 1; self.total_terms += 1; } #[inline] - pub fn add_kw(&mut self, tmpid: u64, attr: &Keyword, val: &Keyword) { - write!(self.data, " [:db/add \"{}\" {} {}]\n", tmpid, attr, val).unwrap(); + pub fn add_inst(&mut self, tmpid: &str, attr: &Keyword, micros: i64) { + write!(self.data, " [:db/add {:?} {} #instmicros {}]\n", tmpid, attr, micros).unwrap(); self.terms += 1; self.total_terms += 1; } #[inline] - pub fn add_str(&mut self, tmpid: u64, attr: &Keyword, val: &str) { + pub fn add_str(&mut self, tmpid: &str, attr: &Keyword, val: &str) { // {:?} escapes some chars EDN can't parse 
(e.g. \'...) let s = val.replace("\\", "\\\\").replace("\"", "\\\""); - write!(self.data, " [:db/add \"{}\" {} \"{}\"]\n", tmpid, attr, s).unwrap(); + write!(self.data, " [:db/add {:?} {} \"{}\"]\n", tmpid, attr, s).unwrap(); self.terms += 1; self.total_terms += 1; } #[inline] - pub fn add_long(&mut self, tmpid: u64, attr: &Keyword, val: i64) { - write!(self.data, " [:db/add \"{}\" {} {}]\n", tmpid, attr, val).unwrap(); + pub fn add_long(&mut self, tmpid: &str, attr: &Keyword, val: i64) { + write!(self.data, " [:db/add {:?} {} {}]\n", tmpid, attr, val).unwrap(); self.terms += 1; self.total_terms += 1; } @@ -120,86 +124,112 @@ impl TransactBuilder { } lazy_static! { - static ref PLACE_URL: Keyword = kw!(:place/url); - static ref PLACE_URL_HASH: Keyword = kw!(:place/url_hash); - static ref PLACE_TITLE: Keyword = kw!(:place/title); - static ref PLACE_DESCRIPTION: Keyword = kw!(:place/description); - static ref PLACE_FRECENCY: Keyword = kw!(:place/frecency); - static ref VISIT_PLACE: Keyword = kw!(:visit/place); + + static ref ORIGIN_PREFIX: Keyword = kw!(:origin/prefix); + static ref ORIGIN_HOST: Keyword = kw!(:origin/host); + static ref ORIGIN_PLACES_ID: Keyword = kw!(:origin/places_id); + + + static ref PAGE_URL: Keyword = kw!(:page/url); + static ref PAGE_ORIGIN: Keyword = kw!(:page/origin); + + static ref PAGE_META_TITLE: Keyword = kw!(:page_meta/title); + // static ref PAGE_META_FAVICON_URL: Keyword = kw!(:page_meta/favicon_url); + static ref PAGE_META_DESCRIPTION: Keyword = kw!(:page_meta/description); + static ref PAGE_META_PREVIEW_IMAGE_URL: Keyword = kw!(:page_meta/preview_image_url); + + // static ref CONTEXT_DEVICE: Keyword = kw!(:context/device); + // static ref CONTEXT_CONTAINER: Keyword = kw!(:context/container); + static ref CONTEXT_ID: Keyword = kw!(:context/id); + + static ref VISIT_PAGE_META: Keyword = kw!(:visit/page_meta); + static ref VISIT_CONTEXT: Keyword = kw!(:visit/context); + static ref VISIT_PAGE: Keyword = kw!(:visit/page); static ref 
VISIT_DATE: Keyword = kw!(:visit/date); - static ref VISIT_TYPE: Keyword = kw!(:visit/type); - - static ref VISIT_TYPES: Vec = vec![ - kw!(:visit.type/link), - kw!(:visit.type/typed), - kw!(:visit.type/bookmark), - kw!(:visit.type/embed), - kw!(:visit.type/redirect_permanent), - kw!(:visit.type/redirect_temporary), - kw!(:visit.type/download), - kw!(:visit.type/framed_link), - kw!(:visit.type/reload), - ]; + + static ref VISIT_SOURCE_VISIT: Keyword = kw!(:visit/source_visit); + + // static ref VISIT_SOURCE_REDIRECT: Keyword = kw!(:visit/source_redirect); + // static ref VISIT_SOURCE_BOOKMARK: Keyword = kw!(:visit/source_bookmark); + + // Only used in `initial-data.edn` + // + // static ref DEVICE_NAME: Keyword = kw!(:device/name) + // static ref DEVICE_TYPE: Keyword = kw!(:device/type) + // static ref DEVICE_TYPE_DESKTOP: Keyword = kw!(:device.type/desktop) + // static ref DEVICE_TYPE_MOBILE: Keyword = kw!(:device.type/mobile) + // static ref CONTAINER_NAME: Keyword = kw!(:container/name) + } -#[derive(Debug, Clone)] +const MAX_CONTEXT_ID: i64 = 4; + + +#[derive(Debug, Clone, Default)] +struct VisitInfo { + // Everything else we fabricate (for reasons). 
+ date: i64, +} + +#[derive(Debug, Clone, Default)] struct PlaceEntry { pub id: i64, pub url: String, - pub url_hash: i64, pub description: Option, + pub preview_image_url: Option, pub title: String, - pub frecency: i64, - pub visits: Vec<(i64, &'static Keyword)>, + pub origin_id: i64, + pub visits: Vec, } impl PlaceEntry { pub fn add(&self, builder: &mut TransactBuilder, store: &mut Store) -> Result<(), failure::Error> { - let place_id = builder.next_tempid(); - builder.add_str(place_id, &*PLACE_URL, &self.url); - builder.add_long(place_id, &*PLACE_URL_HASH, self.url_hash); - builder.add_str(place_id, &*PLACE_TITLE, &self.title); - if let Some(desc) = &self.description { - builder.add_str(place_id, &*PLACE_DESCRIPTION, desc); - } + let page_id = builder.next_tempid(); + builder.add_str(&page_id, &*PAGE_URL, &self.url); + builder.add_ref_to_lookup_ref_long(&page_id, &*PAGE_ORIGIN, &*ORIGIN_PLACES_ID, self.origin_id); - builder.add_long(place_id, &*PLACE_FRECENCY, self.frecency); + let page_meta_id = builder.next_tempid(); - assert!(self.visits.len() > 0); + builder.add_str(&page_meta_id, &*PAGE_META_TITLE, &self.title); + if let Some(desc) = &self.description { + builder.add_str(&page_meta_id, &*PAGE_META_DESCRIPTION, &desc); + } + if let Some(preview) = &self.preview_image_url { + builder.add_str(&page_meta_id, &*PAGE_META_PREVIEW_IMAGE_URL, &preview); + } - if builder.max_buffer_size == 0 { - let report = builder.transact(store)?.unwrap(); - let place_eid = report.tempids.get(&format!("{}", place_id)).unwrap(); - // One transaction per visit. 
- for (microtime, visit_type) in &self.visits { - let visit_id = builder.next_tempid(); - builder.add_long(visit_id, &*VISIT_PLACE, *place_eid); - builder.add_inst(visit_id, &*VISIT_DATE, *microtime); - builder.add_kw(visit_id, &*VISIT_TYPE, visit_type); - builder.transact(store)?; - } - } else { - for (microtime, visit_type) in &self.visits { - let visit_id = builder.next_tempid(); - builder.add_ref_to_tmpid(visit_id, &*VISIT_PLACE, place_id); - builder.add_inst(visit_id, &*VISIT_DATE, *microtime); - builder.add_kw(visit_id, &*VISIT_TYPE, visit_type); - } - builder.maybe_transact(store)?; + let mut rng = thread_rng(); + for visit in &self.visits { + let visit_id = builder.next_tempid(); + builder.add_ref_to_tmpid(&visit_id, &*VISIT_PAGE, &page_id); + builder.add_ref_to_tmpid(&visit_id, &*VISIT_PAGE_META, &page_meta_id); + // unwrap is safe, only None for an empty slice. + builder.add_ref_to_lookup_ref_long(&visit_id, &*VISIT_CONTEXT, + &*CONTEXT_ID, + rng.gen_range(0, MAX_CONTEXT_ID)); + builder.add_inst(&visit_id, &*VISIT_DATE, visit.date); + // Point the visit at itself. This doesn't really matter, but + // pointing at another visit would require us keep a huge hashmap in + // memory, or to keep the places id on the visit as a unique + // identity which we use as a lookup ref, which will affect the db + // size a lot in a way we wouldn't need to in reality. + builder.add_ref_to_tmpid(&visit_id, &*VISIT_SOURCE_VISIT, &visit_id); } + // not one tx per visit anymore (and doing per place instead) because + // the bookkeeping/separation required is too annoying. 
+ builder.maybe_transact(store)?; Ok(()) } pub fn from_row(row: &Row) -> PlaceEntry { - let transition_type: i64 = row.get("visit_type"); PlaceEntry { id: row.get("place_id"), url: row.get("place_url"), - url_hash: row.get("place_url_hash"), + origin_id: row.get("place_origin_id"), description: row.get("place_description"), + preview_image_url: row.get("place_preview_image_url"), title: row.get::<_, Option>("place_title").unwrap_or("".into()), - frecency: row.get("place_frecency"), - visits: vec![(row.get("visit_date"), &VISIT_TYPES[(transition_type as usize).saturating_sub(1)])], + visits: vec![VisitInfo { date: row.get("visit_date") }], } } } @@ -208,12 +238,11 @@ impl PlaceEntry { pub struct PlacesToMentat { pub mentat_db_path: PathBuf, pub places_db_path: PathBuf, - pub one_tx_per_visit: bool, + pub realistic: bool, } - static SCHEMA: &'static str = include_str!("places-schema.edn"); - +static INITIAL_DATA: &'static str = include_str!("initial-data.edn"); impl PlacesToMentat { pub fn run(self) -> Result<(), failure::Error> { @@ -225,33 +254,42 @@ impl PlacesToMentat { fs::copy(&self.places_db_path, &temp_places_path)?; let places = Connection::open_with_flags(&temp_places_path, OpenFlags::SQLITE_OPEN_READ_ONLY)?; - let mut store = Store::open_empty(self.mentat_db_path.to_str().unwrap())?; + // New versions of mentat kill open_empty, and we already know this is empty. 
+ let mut store = Store::open(self.mentat_db_path.to_str().unwrap())?; debug!("Transacting initial schema"); store.transact(SCHEMA)?; + store.transact(INITIAL_DATA)?; + + let max_buffer_size = if self.realistic { 0 } else { 1024 * 1024 * 1024 * 1024 }; + let mut builder = TransactBuilder::new_with_size(max_buffer_size); - let mut stmt = places.prepare(" - SELECT - p.id as place_id, - p.url as place_url, - p.url_hash as place_url_hash, - p.description as place_description, - p.title as place_title, - p.frecency as place_frecency, - v.visit_date as visit_date, - v.visit_type as visit_type - FROM moz_places p - JOIN moz_historyvisits v - ON p.id = v.place_id - ORDER BY p.id - ").unwrap(); + { + let mut origins_stmt = places.prepare("SELECT id, prefix, host FROM moz_origins")?; + let origins = origins_stmt.query_map(&[], |row| { + (row.get::<_, i64>("id"), + row.get::<_, String>("prefix"), + row.get::<_, String>("host")) + })?.collect::, _>>()?; + + println!("Adding {} origins...", origins.len()); + for (id, prefix, host) in origins { + let tmpid = builder.next_tempid(); + builder.add_long(&tmpid, &*ORIGIN_PLACES_ID, id); + builder.add_str(&tmpid, &*ORIGIN_PREFIX, &host); + builder.add_str(&tmpid, &*ORIGIN_HOST, &prefix); + builder.maybe_transact(&mut store)?; + } + // Force a transaction so that lookup refs work. 
+ builder.transact(&mut store)?; + } let (place_count, visit_count) = { - let mut stmt = places.prepare("select count(*) from moz_places").unwrap(); + let mut stmt = places.prepare("SELECT count(*) FROM moz_places").unwrap(); let mut rows = stmt.query(&[]).unwrap(); let ps: i64 = rows.next().unwrap()?.get(0); - let mut stmt = places.prepare("select count(*) from moz_historyvisits").unwrap(); + let mut stmt = places.prepare("SELECT count(*) FROM moz_historyvisits").unwrap(); let mut rows = stmt.query(&[]).unwrap(); let vs: i64 = rows.next().unwrap()?.get(0); (ps, vs) @@ -259,19 +297,22 @@ impl PlacesToMentat { println!("Querying {} places ({} visits)", place_count, visit_count); - let mut current_place = PlaceEntry { - id: -1, - url: "".into(), - url_hash: 0, - description: None, - title: "".into(), - frecency: 0, - visits: vec![], - }; - - let max_buffer_size = if self.one_tx_per_visit { 0 } else { 1024 * 1024 * 1024 * 1024 }; + let mut stmt = places.prepare(" + SELECT + p.id as place_id, + p.url as place_url, + p.description as place_description, + p.preview_image_url as place_preview_image_url, + p.title as place_title, + p.origin_id as place_origin_id, + v.visit_date as visit_date + FROM moz_places p + JOIN moz_historyvisits v + ON p.id = v.place_id + ORDER BY p.id + ")?; - let mut builder = TransactBuilder::new_with_size(max_buffer_size); + let mut current_place = PlaceEntry { id: -1, .. 
PlaceEntry::default() }; let mut so_far = 0; let mut rows = stmt.query(&[])?; @@ -280,12 +321,7 @@ impl PlacesToMentat { let row = row_or_error?; let id: i64 = row.get("place_id"); if current_place.id == id { - let tty: i64 = row.get("visit_type"); - current_place.visits.push(( - row.get("visit_date"), - &VISIT_TYPES.get((tty.max(0) as usize).saturating_sub(1)) - .unwrap_or_else(|| &VISIT_TYPES[0]) - )); + current_place.visits.push(VisitInfo { date: row.get("visit_date") }); continue; } @@ -305,8 +341,14 @@ impl PlacesToMentat { println!("\rProcessing {} / {} places (approx.)", so_far + 1, place_count); } builder.transact(&mut store)?; + + println!("Vacuuming mentat DB"); + + let mentat_sqlite_conn = store.dismantle().0; + mentat_sqlite_conn.execute("VACUUM", &[])?; println!("Done!"); Ok(()) } + } From d87738442302d8ebd4c17bfd7012b063ca4869da Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Tue, 21 Aug 2018 14:37:47 -0700 Subject: [PATCH 4/5] Avoid lookup refs since they're too slow for `-r` (even though they're convenient) --- places-tool/src/places-schema.edn | 12 ++--- places-tool/src/to_mentat.rs | 75 +++++++++++++++++-------------- 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/places-tool/src/places-schema.edn b/places-tool/src/places-schema.edn index 6932802b83..2416e41605 100644 --- a/places-tool/src/places-schema.edn +++ b/places-tool/src/places-schema.edn @@ -13,12 +13,12 @@ ; used for lookup refs so that we aren't force to insert everything in ; one enormous transaction (making --realistic impossible) - { :db/ident :origin/places_id - :db/valueType :db.type/long - :db/cardinality :db.cardinality/one - :db/unique :db.unique/identity - :db/index true - } + ; { :db/ident :origin/places_id + ; :db/valueType :db.type/long + ; :db/cardinality :db.cardinality/one + ; :db/unique :db.unique/identity + ; :db/index true + ; } ; Pages diff --git a/places-tool/src/to_mentat.rs b/places-tool/src/to_mentat.rs index 435b70a981..335f909b2d 100644 --- 
a/places-tool/src/to_mentat.rs +++ b/places-tool/src/to_mentat.rs @@ -4,6 +4,7 @@ use std::fs; use std::io::{Write, self}; use std::fmt::{Write as FmtWrite}; use std::path::PathBuf; +use std::collections::HashMap; use tempfile; use rand::prelude::*; @@ -17,6 +18,7 @@ use mentat::{ self, Store, Keyword, + Queryable, errors::Result as MentatResult, }; @@ -48,16 +50,6 @@ impl TransactBuilder { self.total_terms += 1; } - #[inline] - pub fn add_ref_to_lookup_ref_long(&mut self, - tmpid: &str, attr: &Keyword, - lookup_ref_attr: &Keyword, lookup_ref_val: i64) { - write!(self.data, " [:db/add {:?} {} (lookup-ref {} {})]\n", - tmpid, attr, lookup_ref_attr, lookup_ref_val).unwrap(); - self.terms += 1; - self.total_terms += 1; - } - #[inline] pub fn add_inst(&mut self, tmpid: &str, attr: &Keyword, micros: i64) { write!(self.data, " [:db/add {:?} {} #instmicros {}]\n", tmpid, attr, micros).unwrap(); @@ -138,10 +130,6 @@ lazy_static! { static ref PAGE_META_DESCRIPTION: Keyword = kw!(:page_meta/description); static ref PAGE_META_PREVIEW_IMAGE_URL: Keyword = kw!(:page_meta/preview_image_url); - // static ref CONTEXT_DEVICE: Keyword = kw!(:context/device); - // static ref CONTEXT_CONTAINER: Keyword = kw!(:context/container); - static ref CONTEXT_ID: Keyword = kw!(:context/id); - static ref VISIT_PAGE_META: Keyword = kw!(:visit/page_meta); static ref VISIT_CONTEXT: Keyword = kw!(:visit/context); static ref VISIT_PAGE: Keyword = kw!(:visit/page); @@ -160,10 +148,11 @@ lazy_static! 
{ // static ref DEVICE_TYPE_MOBILE: Keyword = kw!(:device.type/mobile) // static ref CONTAINER_NAME: Keyword = kw!(:container/name) -} - -const MAX_CONTEXT_ID: i64 = 4; + // static ref CONTEXT_DEVICE: Keyword = kw!(:context/device); + // static ref CONTEXT_CONTAINER: Keyword = kw!(:context/container); + // static ref CONTEXT_ID: Keyword = kw!(:context/id); +} #[derive(Debug, Clone, Default)] struct VisitInfo { @@ -183,10 +172,20 @@ struct PlaceEntry { } impl PlaceEntry { - pub fn add(&self, builder: &mut TransactBuilder, store: &mut Store) -> Result<(), failure::Error> { + pub fn add( + &self, + builder: &mut TransactBuilder, + store: &mut Store, + context_ids: &[i64], + origin_ids: &HashMap + ) -> Result<(), failure::Error> { let page_id = builder.next_tempid(); builder.add_str(&page_id, &*PAGE_URL, &self.url); - builder.add_ref_to_lookup_ref_long(&page_id, &*PAGE_ORIGIN, &*ORIGIN_PLACES_ID, self.origin_id); + if let Some(origin_entid) = origin_ids.get(&self.origin_id) { + builder.add_long(&page_id, &*PAGE_ORIGIN, *origin_entid); + } else { + warn!("Unknown entid? {}", self.origin_id); + } let page_meta_id = builder.next_tempid(); @@ -204,9 +203,7 @@ impl PlaceEntry { builder.add_ref_to_tmpid(&visit_id, &*VISIT_PAGE, &page_id); builder.add_ref_to_tmpid(&visit_id, &*VISIT_PAGE_META, &page_meta_id); // unwrap is safe, only None for an empty slice. - builder.add_ref_to_lookup_ref_long(&visit_id, &*VISIT_CONTEXT, - &*CONTEXT_ID, - rng.gen_range(0, MAX_CONTEXT_ID)); + builder.add_long(&visit_id, &*VISIT_CONTEXT, *rng.choose(context_ids).unwrap()); builder.add_inst(&visit_id, &*VISIT_DATE, visit.date); // Point the visit at itself. 
This doesn't really matter, but // pointing at another visit would require us keep a huge hashmap in @@ -264,7 +261,7 @@ impl PlacesToMentat { let max_buffer_size = if self.realistic { 0 } else { 1024 * 1024 * 1024 * 1024 }; let mut builder = TransactBuilder::new_with_size(max_buffer_size); - { + let origin_ids = { let mut origins_stmt = places.prepare("SELECT id, prefix, host FROM moz_origins")?; let origins = origins_stmt.query_map(&[], |row| { (row.get::<_, i64>("id"), @@ -273,16 +270,30 @@ impl PlacesToMentat { })?.collect::, _>>()?; println!("Adding {} origins...", origins.len()); - for (id, prefix, host) in origins { + let temp_ids = origins.into_iter().map(|(id, prefix, host)| { let tmpid = builder.next_tempid(); - builder.add_long(&tmpid, &*ORIGIN_PLACES_ID, id); builder.add_str(&tmpid, &*ORIGIN_PREFIX, &host); builder.add_str(&tmpid, &*ORIGIN_HOST, &prefix); - builder.maybe_transact(&mut store)?; + (id, tmpid) + }).collect::>(); + if let Some(tx_report) = builder.transact(&mut store)? { + let mut table: HashMap = HashMap::with_capacity(temp_ids.len()); + for (origin_id, tmpid) in temp_ids { + let entid = tx_report.tempids.get(&tmpid).unwrap(); + table.insert(origin_id, *entid); + } + table + } else { + HashMap::default() } - // Force a transaction so that lookup refs work. - builder.transact(&mut store)?; - } + }; + + let context_ids = store.q_once("[:find [?e ...] :where [?e :context/device _]]", None)? + .results + .into_coll()? 
+ .into_iter() + .map(|binding| binding.into_entid().unwrap()) + .collect::>(); let (place_count, visit_count) = { let mut stmt = places.prepare("SELECT count(*) FROM moz_places").unwrap(); @@ -326,8 +337,7 @@ impl PlacesToMentat { } if current_place.id >= 0 { - current_place.add(&mut builder, &mut store)?; - // builder.maybe_transact(&mut store)?; + current_place.add(&mut builder, &mut store, &context_ids, &origin_ids)?; print!("\rProcessing {} / {} places (approx.)", so_far, place_count); io::stdout().flush()?; so_far += 1; @@ -336,8 +346,7 @@ impl PlacesToMentat { } if current_place.id >= 0 { - current_place.add(&mut builder, &mut store)?; - // builder.maybe_transact(&mut store)?; + current_place.add(&mut builder, &mut store, &context_ids, &origin_ids)?; println!("\rProcessing {} / {} places (approx.)", so_far + 1, place_count); } builder.transact(&mut store)?; From 294261af0be8b92c7875f88bcd1fef01943d5f14 Mon Sep 17 00:00:00 2001 From: Thom Chiovoloni Date: Wed, 22 Aug 2018 16:38:59 -0700 Subject: [PATCH 5/5] Combine comments in application-services 191 to make a README.md --- places-tool/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/places-tool/Cargo.toml b/places-tool/Cargo.toml index 3b09e1d767..9c1bb76a93 100644 --- a/places-tool/Cargo.toml +++ b/places-tool/Cargo.toml @@ -19,9 +19,9 @@ tag = "v0.11.0" # It seems like we need to use sqlcipher since other projects in this # workspace are using sqlcipher. Otherwise we get conflicts... default_features = false -features = ["sqlcipher"] +features = ["bundled"] [dependencies.rusqlite] version = "0.13" -features = ["sqlcipher", "limits", "functions"] +features = ["bundled", "limits", "functions"]