diff --git a/.gitignore b/.gitignore index 32f98fd2ce..b89b437177 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ Cargo.lock credentials.json *-engine.json .cargo +# Mentat database and journal files +*.db* +# Sqlite database and journal files +*.sqlite* diff --git a/Cargo.toml b/Cargo.toml index 929940e310..bad254dd3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "sync15-adapter", "sync15/passwords", "sync15/passwords/ffi", + "places-tool" ] # For RSA keys cloning. Remove once openssl 0.10.8+ is released. diff --git a/places-tool/Cargo.toml b/places-tool/Cargo.toml new file mode 100644 index 0000000000..9c1bb76a93 --- /dev/null +++ b/places-tool/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "places-tool" +version = "0.1.0" +authors = ["Thom Chiovoloni "] + +[dependencies] +lazy_static = "1.1.0" +clap = "2.32.0" +log = "0.4.4" +env_logger = "0.5.12" +failure = "0.1.1" +tempfile = "3.0.3" +dirs = "1.0.3" +rand = "0.5.5" + +[dependencies.mentat] +git = "https://github.com/mozilla/mentat.git" +tag = "v0.11.0" +# It seems like we need to use sqlcipher since other projects in this +# workspace are using sqlcipher. Otherwise we get conflicts... 
+default_features = false
+features = ["bundled"]
+
+[dependencies.rusqlite]
+version = "0.13"
+features = ["bundled", "limits", "functions"]
+
diff --git a/places-tool/src/anonymize.rs b/places-tool/src/anonymize.rs
new file mode 100644
index 0000000000..8b00d173e9
--- /dev/null
+++ b/places-tool/src/anonymize.rs
@@ -0,0 +1,169 @@
+//! Produces an anonymized copy of a `places.sqlite` database.
+
+use failure;
+use rand::{self, prelude::*};
+use rusqlite::{self, Connection, OpenFlags};
+use std::collections::{HashMap, HashSet};
+use std::fs;
+use std::path::PathBuf;
+
+/// Number of random candidates tried before a colliding one is accepted.
+const MAX_ANONYMIZE_ATTEMPTS: usize = 10;
+
+/// Maps strings to random alphanumeric replacements of the same byte length.
+///
+/// The same input always yields the same replacement for the lifetime of the
+/// anonymizer, so values that were equal before anonymization stay equal.
+#[derive(Default, Clone, Debug)]
+struct StringAnonymizer {
+    /// Original string -> replacement handed out for it.
+    table: HashMap<String, String>,
+    /// Every replacement handed out so far, so that two different originals
+    /// are not mapped onto the same replacement.
+    used: HashSet<String>,
+}
+
+/// Returns `len` random ASCII alphanumeric characters.
+fn rand_string_of_len(len: usize) -> String {
+    let mut rng = thread_rng();
+    rng.sample_iter(&rand::distributions::Alphanumeric)
+        .take(len)
+        .collect()
+}
+
+impl StringAnonymizer {
+    /// Returns the replacement for `s`, generating and remembering a new one
+    /// the first time `s` is seen. The empty string maps to itself.
+    fn anonymize(&mut self, s: &str) -> String {
+        if s.is_empty() {
+            return String::new();
+        }
+        if let Some(existing) = self.table.get(s) {
+            return existing.clone();
+        }
+        let mut replacement = rand_string_of_len(s.len());
+        for _ in 1..MAX_ANONYMIZE_ATTEMPTS {
+            // Reject a candidate that was already issued for another original,
+            // or that happens to equal an original we have already seen.
+            let taken =
+                self.used.contains(&replacement) || self.table.contains_key(&replacement);
+            if !taken {
+                break;
+            }
+            replacement = rand_string_of_len(s.len());
+        }
+        // Very short strings have few possible replacements, so after the last
+        // attempt we accept the candidate even if it collides.
+        self.used.insert(replacement.clone());
+        self.table.insert(s.into(), replacement.clone());
+        replacement
+    }
+}
+
+/// Quotes `name` as an SQL identifier, doubling any embedded double quotes.
+fn quote_ident(name: &str) -> String {
+    format!("\"{}\"", name.replace('"', "\"\""))
+}
+
+/// The name of a table together with the names of all of its columns.
+#[derive(Debug, Clone)]
+struct TableInfo {
+    name: String,
+    cols: Vec<String>,
+}
+
+impl TableInfo {
+    /// Reads the column names of table `name` from a prepared `SELECT *`.
+    fn for_table(name: String, conn: &Connection) -> Result<TableInfo, failure::Error> {
+        let stmt = conn.prepare(&format!("SELECT * FROM {}", quote_ident(&name)))?;
+        let cols = stmt
+            .column_names()
+            .into_iter()
+            .map(|col| col.to_owned())
+            .collect();
+        Ok(TableInfo { name, cols })
+    }
+
+    /// Builds an `UPDATE` statement that replaces every column of the table
+    /// with the result of calling the SQL function `updater_fn` on it.
+    fn make_update(&self, updater_fn: &str) -> String {
+        let sets = self
+            .cols
+            .iter()
+            .map(|col| {
+                let col = quote_ident(col);
+                format!("{} = {}({})", col, updater_fn, col)
+            })
+            .collect::<Vec<_>>()
+            .join(",\n ");
+        format!("UPDATE {}\nSET {}", quote_ident(&self.name), sets)
+    }
+}
+
+/// Anonymizes, in place, every text value in every non-`sqlite_` table of the
+/// database behind `anon_places`.
+fn anonymize(anon_places: &Connection) -> Result<(), failure::Error> {
+    {
+        let mut anonymizer = StringAnonymizer::default();
+        // Expose the anonymizer to SQL. Non-text values are returned unchanged.
+        anon_places.create_scalar_function("anonymize", 1, true, move |ctx| {
+            let arg = ctx.get::<rusqlite::types::Value>(0)?;
+            Ok(match arg {
+                rusqlite::types::Value::Text(s) => {
+                    rusqlite::types::Value::Text(anonymizer.anonymize(&s))
+                }
+                not_text => not_text,
+            })
+        })?;
+    }
+
+    let schema = {
+        let mut stmt = anon_places.prepare("
+            SELECT name FROM sqlite_master
+            WHERE type = 'table'
+              AND name NOT LIKE 'sqlite_%' -- ('sqlite_sequence', 'sqlite_stat1', 'sqlite_master', anyt)
+        ")?;
+        let mut rows = stmt.query(&[])?;
+        let mut tables = vec![];
+        while let Some(row_or_error) = rows.next() {
+            tables.push(TableInfo::for_table(row_or_error?.get("name"), anon_places)?);
+        }
+        tables
+    };
+
+    for info in schema {
+        let sql = info.make_update("anonymize");
+        debug!("Executing sql:\n{}", sql);
+        anon_places.execute(&sql, &[])?;
+    }
+
+    debug!("Clearing places url_hash");
+    anon_places.execute("UPDATE moz_places SET url_hash = 0", &[])?;
+
+    // Overwritten values can remain in unused parts of the database file, so
+    // rebuild the file to keep the original text out of the anonymized copy.
+    debug!("Vacuuming anonymized db");
+    anon_places.execute("VACUUM", &[])?;
+
+    Ok(())
+}
+
+/// The `anonymize` subcommand: copies a places database and anonymizes the copy.
+#[derive(Debug, Clone)]
+pub struct AnonymizePlaces {
+    /// Database to read; it is only ever copied from.
+    pub input_path: PathBuf,
+    /// Where the anonymized copy is written.
+    pub output_path: PathBuf,
+}
+
+impl AnonymizePlaces {
+    /// Copies `input_path` to `output_path` and anonymizes the copy in place.
+    pub fn run(self) -> Result<(), failure::Error> {
+        fs::copy(&self.input_path, &self.output_path)?;
+        let anon_places =
+            Connection::open_with_flags(&self.output_path, OpenFlags::SQLITE_OPEN_READ_WRITE)?;
+        anonymize(&anon_places)?;
+        Ok(())
+    }
+}
diff --git a/places-tool/src/find_db.rs b/places-tool/src/find_db.rs
new file mode 100644
index 0000000000..6ab3c85b72
--- /dev/null
+++ b/places-tool/src/find_db.rs
@@ -0,0 +1,92 @@
+//! Locates `places.sqlite` databases in the user's Firefox profiles.
+
+use dirs;
+use failure;
+use std::{fs, path::PathBuf};
+
+/// A `places.sqlite` found in a Firefox profile directory.
+#[derive(Clone, Debug, PartialEq)]
+pub struct PlacesLocation {
+    /// Name of the profile directory that contains the database.
+    pub profile_name: String,
+    /// Path to the `places.sqlite` file itself.
+    pub path: PathBuf,
+    /// Size of the database file in bytes.
+    pub db_size: u64,
+}
+
+impl PlacesLocation {
+    /// Formats `db_size` for display: rounded to one decimal place in the
+    /// largest of Gb/Mb/Kb that it reaches, or as a plain byte count.
+    pub fn friendly_db_size(&self) -> String {
+        const SIZES: [(u64, &str); 3] = [
+            (1024 * 1024 * 1024, "Gb"),
+            (1024 * 1024, "Mb"),
+            (1024, "Kb"),
+        ];
+        for &(lim, suffix) in &SIZES {
+            if self.db_size >= lim {
+                let scaled = self.db_size as f64 / lim as f64;
+                return format!("~{} {}", (scaled * 10.0).round() / 10.0, suffix);
+            }
+        }
+        format!("{} bytes", self.db_size)
+    }
+}
+
+/// Finds every Firefox profile under the user's home directory that contains
+/// a `places.sqlite`, sorted from the largest database to the smallest.
+///
+/// Profile directories that cannot be inspected are logged and skipped.
+pub fn get_all_places_dbs() -> Result<Vec<PlacesLocation>, failure::Error> {
+    let mut path = dirs::home_dir().ok_or_else(|| format_err!("No home directory found!"))?;
+    // Pick the profile root at compile time rather than shelling out to `uname`.
+    if cfg!(windows) {
+        path.extend(&["AppData", "Roaming", "Mozilla", "Firefox", "Profiles"]);
+    } else if cfg!(target_os = "macos") {
+        // ~/Library/Application Support/Firefox/Profiles
+        path.extend(&["Library", "Application Support", "Firefox", "Profiles"]);
+    } else {
+        // I'm not actually sure if this is true for all non-macos unix likes.
+        path.extend(&[".mozilla", "firefox"]);
+    }
+    debug!("Using profile path: {:?}", path);
+    let mut res = fs::read_dir(path)?
+        .map(|entry_result| {
+            let mut path = entry_result?.path();
+            trace!("Considering path {:?}", path);
+            if !path.is_dir() {
+                trace!(" Not dir: {:?}", path);
+                return Ok(None);
+            }
+            let profile_name = path
+                .file_name()
+                .unwrap_or_default()
+                .to_str()
+                .ok_or_else(|| {
+                    warn!(" Path has invalid UTF8: {:?}", path);
+                    format_err!("Path has invalid UTF8: {:?}", path)
+                })?
+                .into();
+            path.push("places.sqlite");
+            if !path.exists() {
+                return Ok(None);
+            }
+            let db_size = fs::metadata(&path)?.len();
+            Ok(Some(PlacesLocation {
+                profile_name,
+                path,
+                db_size,
+            }))
+        })
+        .filter_map(|result: Result<Option<PlacesLocation>, failure::Error>| match result {
+            Ok(val) => val,
+            Err(e) => {
+                debug!("Got error finding profile directory, skipping: {}", e);
+                None
+            }
+        })
+        .collect::<Vec<_>>();
+    res.sort_by(|a, b| b.db_size.cmp(&a.db_size));
+    Ok(res)
+}
diff --git a/places-tool/src/initial-data.edn b/places-tool/src/initial-data.edn
new file mode 100644
index 0000000000..db042d0254
--- /dev/null
+++ b/places-tool/src/initial-data.edn
@@ -0,0 +1,14 @@
+
+[
+  {:db/id "container0" :container/name "Default"}
+  {:db/id "container1" :container/name "Personal"}
+
+  {:db/id "device0" :device/name "My very cool computer" :device/type :device.type/desktop}
+  {:db/id "device1" :device/name "My cool phone" :device/type :device.type/mobile}
+
+  ; We randomly assign one of these to each visit.
+  {:context/id 0 :context/device "device0" :context/container "container0"}
+  {:context/id 1 :context/device "device1" :context/container "container0"}
+  {:context/id 2 :context/device "device0" :context/container "container1"}
+  {:context/id 3 :context/device "device1" :context/container "container1"}
+]
diff --git a/places-tool/src/main.rs b/places-tool/src/main.rs
new file mode 100644
index 0000000000..7ef2e257d1
--- /dev/null
+++ b/places-tool/src/main.rs
@@ -0,0 +1,142 @@
+extern crate dirs;
+#[macro_use]
+extern crate failure;
+#[macro_use]
+extern crate log;
+#[macro_use]
+extern crate mentat;
+extern crate rusqlite;
+
+#[macro_use]
+extern crate lazy_static;
+extern crate clap;
+extern crate env_logger;
+extern crate rand;
+extern crate tempfile;
+
+use std::fs;
+use std::path::{Path, PathBuf};
+
+mod anonymize;
+mod find_db;
+mod to_mentat;
+
+const PLACES_HELP: &str = "Path to places.sqlite. If not provided, we'll use the largest places.sqlite in your firefox profiles";
+const VERBOSE_HELP: &str = "Sets the level of verbosity (pass up to 3 times for more verbosity -- e.g. -vvv enables trace logs)";
+const FORCE_HELP: &str = "Overwrite OUTPUT if it already exists";
+
+/// Builds a subcommand with the arguments shared by every subcommand:
+/// `OUTPUT`, `PLACES`, `-v` and `-f`/`--force`.
+fn base_subcommand(
+    name: &'static str,
+    about: &'static str,
+    output_help: &'static str,
+) -> clap::App<'static, 'static> {
+    clap::SubCommand::with_name(name)
+        .about(about)
+        .arg(clap::Arg::with_name("OUTPUT").index(1).help(output_help))
+        .arg(clap::Arg::with_name("PLACES").index(2).help(PLACES_HELP))
+        .arg(clap::Arg::with_name("v").short("v").multiple(true).help(VERBOSE_HELP))
+        .arg(clap::Arg::with_name("force").short("f").long("force").help(FORCE_HELP))
+}
+
+/// Parses the command line and runs the selected subcommand.
+fn main() -> Result<(), failure::Error> {
+    let matches = clap::App::new("places-tool")
+        // Print help instead of failing later when no subcommand is given.
+        .setting(clap::AppSettings::SubcommandRequiredElseHelp)
+        .subcommand(
+            base_subcommand(
+                "to-mentat",
+                "Convert a places database to a mentat database",
+                "Path where we should output the mentat db (defaults to ./mentat_places.db)",
+            ).arg(clap::Arg::with_name("realistic")
+                .short("r")
+                .long("realistic")
+                .help("Insert everything with one transaction per place. This is a lot slower, \
+                       but is a more realistic workload. It produces databases that are ~30% larger (for me).")),
+        )
+        .subcommand(base_subcommand(
+            "anonymize",
+            "Anonymize a places database",
+            "Path where we should output the anonymized db (defaults to places_anonymized.sqlite)",
+        ))
+        .get_matches();
+
+    let (subcommand, subcmd_matches) = match matches.subcommand() {
+        (name, Some(sub_matches)) => (name, sub_matches),
+        (_, None) => return Err(format_err!("Must provide subcommand")),
+    };
+    let is_anon = subcommand == "anonymize";
+
+    let default_level = match subcmd_matches.occurrences_of("v") {
+        0 => "warn",
+        1 => "info",
+        2 => "debug",
+        _ => "trace",
+    };
+    env_logger::init_from_env(env_logger::Env::default().filter_or("RUST_LOG", default_level));
+
+    let places_db = if let Some(places) = subcmd_matches.value_of("PLACES") {
+        let meta = fs::metadata(places)?;
+        find_db::PlacesLocation {
+            profile_name: "".into(),
+            path: fs::canonicalize(places)?,
+            db_size: meta.len(),
+        }
+    } else {
+        let dbs = find_db::get_all_places_dbs()?;
+        for p in &dbs {
+            debug!("Found: profile {:?} with a {} places.sqlite", p.profile_name, p.friendly_db_size());
+        }
+        // `dbs` is sorted largest first.
+        match dbs.into_iter().next() {
+            Some(largest) => {
+                info!("Using profile {:?}", largest.profile_name);
+                largest
+            }
+            None => {
+                error!("No dbs found!");
+                return Err(format_err!("No dbs found!"));
+            }
+        }
+    };
+
+    let out_db_path = subcmd_matches
+        .value_of("OUTPUT")
+        .unwrap_or(if is_anon { "./places_anonymized.sqlite" } else { "./mentat_places.db" })
+        .to_owned();
+
+    if Path::new(&out_db_path).exists() {
+        // `-f` deletes OUTPUT below, so make sure it is not the database we
+        // are about to read from.
+        if fs::canonicalize(&out_db_path)? == fs::canonicalize(&places_db.path)? {
+            error!("{} is the input database, refusing to overwrite it", out_db_path);
+            return Err(format_err!("Output path is the same as the input database"));
+        }
+        if subcmd_matches.is_present("force") {
+            info!("Deleting previous `{}` because -f was passed", out_db_path);
+            fs::remove_file(&out_db_path)?;
+        } else {
+            error!("{} already exists but `-f` argument was not provided", out_db_path);
+            return Err(format_err!("Output path already exists"));
+        }
+    }
+
+    if is_anon {
+        let cmd = anonymize::AnonymizePlaces {
+            input_path: places_db.path,
+            output_path: PathBuf::from(out_db_path),
+        };
+        cmd.run()?;
+    } else {
+        let cmd = to_mentat::PlacesToMentat {
+            mentat_db_path: PathBuf::from(out_db_path),
+            places_db_path: places_db.path,
+            realistic: subcmd_matches.is_present("realistic"),
+        };
+        cmd.run()?;
+    }
+
+    Ok(())
+}
diff --git a/places-tool/src/places-schema.edn b/places-tool/src/places-schema.edn
new file mode 100644
index 0000000000..2416e41605
--- /dev/null
+++ b/places-tool/src/places-schema.edn
@@ -0,0 +1,154 @@
+[
+
+  { :db/ident :origin/prefix
+    :db/valueType :db.type/string
+    :db/cardinality :db.cardinality/one
+  }
+
+  { :db/ident :origin/host
+    :db/valueType :db.type/string
+    :db/cardinality :db.cardinality/one
+  }
+
+  ; used for lookup refs so that we aren't forced to insert everything in
+  ; one enormous transaction (making --realistic impossible)
+
+  ; { :db/ident :origin/places_id
+  ;   :db/valueType :db.type/long
+  ;   :db/cardinality :db.cardinality/one
+  ;   :db/unique :db.unique/identity
+  ;   :db/index true
+  ; }
+
+  ; Pages
+
+  { :db/ident :page/url
+    :db/valueType :db.type/string
+    :db/cardinality :db.cardinality/one
+ 
:db/unique :db.unique/value + :db/index true ; required for unique/value. + ; TODO: should this be fulltext? + } + + { :db/ident :page/origin + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + ; Page metadata (frequently stuff from the HEAD of the page) + + { :db/ident :page_meta/title + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + ; TODO this probably should have :db/index true :db/fulltext true + } + + ; This is stored in favicons.sqlite, which we don't read, so in practice + ; it's always empty >_>. + { :db/ident :page_meta/favicon_url + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + { :db/ident :page_meta/description + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + ; TODO: should this have index or fulltext? + } + + { :db/ident :page_meta/preview_image_url + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + ; A "context" is a tuple of (device, container). + + ; This ID is just so that we can insert with a lookup ref. + { :db/ident :context/id + :db/valueType :db.type/long + :db/cardinality :db.cardinality/one + :db/unique :db.unique/identity + :db/index true + } + + { :db/ident :context/device + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + { :db/ident :context/container + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + ; Visits + { :db/ident :visit/page_meta + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit/context + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit/page + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + { :db/ident :visit/date + :db/valueType :db.type/instant + :db/cardinality :db.cardinality/one + :db/index true + } + + ; In the future we probably will always have sources. In practice, at the + ; moment, we only sometimes have them. 
Unfortunately, even if we do have them, + ; we fake it (and make `:visit/source_visit` point to the visit itself) + ; because otherwise this would be really tricky to do in --realistic mode. + ; + { :db/ident :visit/source_visit + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + ; { :db/ident :visit/source_redirect + ; :db/valueType :db.type/ref + ; :db/cardinality :db.cardinality/one + ; } + + ; Never used at the moment because I'm too lazy to try and model bookmarks. + ; { :db/ident :visit/source_bookmark + ; :db/valueType :db.type/ref + ; :db/cardinality :db.cardinality/one + ; } + + ; Device + + { :db/ident :device/name + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + + { :db/ident :device/type + :db/valueType :db.type/ref + :db/cardinality :db.cardinality/one + } + + ; Several other things... + + ; (Open) enumeration of possible device types + { :db/ident :device.type/desktop } + { :db/ident :device.type/mobile } + + ; Container + { :db/ident :container/name + :db/valueType :db.type/string + :db/cardinality :db.cardinality/one + } + ; ... etc. We're omitting color, the fact that some origins will open in a + ; specific container by default, etc. 
+
+
+]
\ No newline at end of file
diff --git a/places-tool/src/to_mentat.rs b/places-tool/src/to_mentat.rs
new file mode 100644
index 0000000000..335f909b2d
--- /dev/null
+++ b/places-tool/src/to_mentat.rs
@@ -0,0 +1,413 @@
+//! Converts a `places.sqlite` database into a mentat store.
+
+use failure;
+use rand::prelude::*;
+use std::collections::HashMap;
+use std::fmt::Write as FmtWrite;
+use std::fs;
+use std::io::{self, Write};
+use std::path::PathBuf;
+use tempfile;
+
+use rusqlite::{
+    Connection,
+    OpenFlags,
+    Row,
+};
+
+use mentat::{
+    self,
+    Store,
+    Keyword,
+    Queryable,
+    errors::Result as MentatResult,
+};
+
+/// Accumulates `[:db/add ...]` assertions as EDN text and transacts them
+/// against a mentat store in batches.
+#[derive(Debug, Clone)]
+struct TransactBuilder {
+    /// The most recent temp id handed out by `next_tempid`.
+    counter: u64,
+    /// EDN text of the transaction currently being built.
+    data: String,
+    /// Assertions added over the lifetime of the builder.
+    total_terms: u64,
+    /// Assertions added since the last successful transaction.
+    terms: u64,
+    /// `maybe_transact` transacts once `data` is at least this many bytes.
+    max_buffer_size: usize,
+}
+
+impl TransactBuilder {
+    /// Creates an empty builder with the given flush threshold, in bytes.
+    #[inline]
+    pub fn new_with_size(max_buffer_size: usize) -> Self {
+        Self { counter: 0, data: "[\n".into(), terms: 0, total_terms: 0, max_buffer_size }
+    }
+
+    /// Returns a temp id this builder has not handed out before.
+    #[inline]
+    pub fn next_tempid(&mut self) -> String {
+        self.counter += 1;
+        self.counter.to_string()
+    }
+
+    /// Records that one more assertion has been appended to `data`.
+    #[inline]
+    fn count_term(&mut self) {
+        self.terms += 1;
+        self.total_terms += 1;
+    }
+
+    /// Asserts that `attr` of `tmpid` refers to the entity `ref_tmpid`.
+    #[inline]
+    pub fn add_ref_to_tmpid(&mut self, tmpid: &str, attr: &Keyword, ref_tmpid: &str) {
+        write!(self.data, " [:db/add {:?} {} {:?}]\n", tmpid, attr, ref_tmpid).unwrap();
+        self.count_term();
+    }
+
+    /// Asserts that `attr` of `tmpid` is the `#instmicros` value `micros`.
+    #[inline]
+    pub fn add_inst(&mut self, tmpid: &str, attr: &Keyword, micros: i64) {
+        write!(self.data, " [:db/add {:?} {} #instmicros {}]\n", tmpid, attr, micros).unwrap();
+        self.count_term();
+    }
+
+    /// Asserts that `attr` of `tmpid` is the string `val`.
+    #[inline]
+    pub fn add_str(&mut self, tmpid: &str, attr: &Keyword, val: &str) {
+        // {:?} escapes some chars EDN can't parse (e.g. \'...)
+        let s = val.replace("\\", "\\\\").replace("\"", "\\\"");
+        write!(self.data, " [:db/add {:?} {} \"{}\"]\n", tmpid, attr, s).unwrap();
+        self.count_term();
+    }
+
+    /// Asserts that `attr` of `tmpid` is the integer `val`.
+    #[inline]
+    pub fn add_long(&mut self, tmpid: &str, attr: &Keyword, val: i64) {
+        write!(self.data, " [:db/add {:?} {} {}]\n", tmpid, attr, val).unwrap();
+        self.count_term();
+    }
+
+    /// Closes the EDN vector and returns the finished transaction text.
+    #[inline]
+    pub fn finish(&mut self) -> &str {
+        self.data.push(']');
+        &self.data
+    }
+
+    /// Discards the buffered text and starts a new, empty transaction.
+    #[inline]
+    pub fn reset(&mut self) {
+        self.terms = 0;
+        self.data.clear();
+        self.data.push_str("[\n")
+    }
+
+    /// Whether the buffered text has reached `max_buffer_size`.
+    #[inline]
+    pub fn should_finish(&self) -> bool {
+        self.data.len() >= self.max_buffer_size
+    }
+
+    /// Transacts the buffered assertions if `should_finish` says so.
+    #[inline]
+    pub fn maybe_transact(&mut self, store: &mut Store) -> MentatResult<Option<mentat::TxReport>> {
+        if self.should_finish() {
+            self.transact(store)
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Transacts the buffered assertions, if there are any, and resets the
+    /// builder. Returns `None` when there was nothing to transact.
+    #[inline]
+    pub fn transact(&mut self, store: &mut Store) -> MentatResult<Option<mentat::TxReport>> {
+        if self.terms == 0 {
+            return Ok(None);
+        }
+        debug!("\nTransacting {} terms (total = {})", self.terms, self.total_terms);
+        let res = store.transact(self.finish());
+        if res.is_err() {
+            error!("Error transacting:\n{}", self.data);
+        }
+        let report = res?;
+        self.reset();
+        Ok(Some(report))
+    }
+}
+
+lazy_static! {
+    static ref ORIGIN_PREFIX: Keyword = kw!(:origin/prefix);
+    static ref ORIGIN_HOST: Keyword = kw!(:origin/host);
+    static ref ORIGIN_PLACES_ID: Keyword = kw!(:origin/places_id);
+
+    static ref PAGE_URL: Keyword = kw!(:page/url);
+    static ref PAGE_ORIGIN: Keyword = kw!(:page/origin);
+
+    static ref PAGE_META_TITLE: Keyword = kw!(:page_meta/title);
+    // static ref PAGE_META_FAVICON_URL: Keyword = kw!(:page_meta/favicon_url);
+    static ref PAGE_META_DESCRIPTION: Keyword = kw!(:page_meta/description);
+    static ref PAGE_META_PREVIEW_IMAGE_URL: Keyword = kw!(:page_meta/preview_image_url);
+
+    static ref VISIT_PAGE_META: Keyword = kw!(:visit/page_meta);
+    static ref VISIT_CONTEXT: Keyword = kw!(:visit/context);
+    static ref VISIT_PAGE: Keyword = kw!(:visit/page);
+    static ref VISIT_DATE: Keyword = kw!(:visit/date);
+
+    static ref VISIT_SOURCE_VISIT: Keyword = kw!(:visit/source_visit);
+
+    // static ref VISIT_SOURCE_REDIRECT: Keyword = kw!(:visit/source_redirect);
+    // static ref VISIT_SOURCE_BOOKMARK: Keyword = kw!(:visit/source_bookmark);
+
+    // Only used in `initial-data.edn`
+    //
+    // static ref DEVICE_NAME: Keyword = kw!(:device/name)
+    // static ref DEVICE_TYPE: Keyword = kw!(:device/type)
+    // static ref DEVICE_TYPE_DESKTOP: Keyword = kw!(:device.type/desktop)
+    // static ref DEVICE_TYPE_MOBILE: Keyword = kw!(:device.type/mobile)
+    // static ref CONTAINER_NAME: Keyword = kw!(:container/name)
+
+    // static ref CONTEXT_DEVICE: Keyword = kw!(:context/device);
+    // static ref CONTEXT_CONTAINER: Keyword = kw!(:context/container);
+    // static ref CONTEXT_ID: Keyword = kw!(:context/id);
+}
+
+/// A single visit to a place.
+#[derive(Debug, Clone, Default)]
+struct VisitInfo {
+    // Everything else we fabricate (for reasons).
+    date: i64,
+}
+
+/// A row of `moz_places` together with all of its visits.
+#[derive(Debug, Clone, Default)]
+struct PlaceEntry {
+    pub id: i64,
+    pub url: String,
+    pub description: Option<String>,
+    pub preview_image_url: Option<String>,
+    pub title: String,
+    pub origin_id: i64,
+    pub visits: Vec<VisitInfo>,
+}
+
+impl PlaceEntry {
+    /// Appends the assertions for this place and its visits to `builder`,
+    /// then gives the builder a chance to transact.
+    ///
+    /// `context_ids` are the entids each visit's context is picked from (it
+    /// must not be empty); `origin_ids` maps places origin ids to entids.
+    pub fn add(
+        &self,
+        builder: &mut TransactBuilder,
+        store: &mut Store,
+        context_ids: &[i64],
+        origin_ids: &HashMap<i64, i64>,
+    ) -> Result<(), failure::Error> {
+        let page_id = builder.next_tempid();
+        builder.add_str(&page_id, &*PAGE_URL, &self.url);
+        if let Some(origin_entid) = origin_ids.get(&self.origin_id) {
+            builder.add_long(&page_id, &*PAGE_ORIGIN, *origin_entid);
+        } else {
+            warn!("Unknown entid? {}", self.origin_id);
+        }
+
+        let page_meta_id = builder.next_tempid();
+
+        builder.add_str(&page_meta_id, &*PAGE_META_TITLE, &self.title);
+        if let Some(desc) = &self.description {
+            builder.add_str(&page_meta_id, &*PAGE_META_DESCRIPTION, desc);
+        }
+        if let Some(preview) = &self.preview_image_url {
+            builder.add_str(&page_meta_id, &*PAGE_META_PREVIEW_IMAGE_URL, preview);
+        }
+
+        let mut rng = thread_rng();
+        for visit in &self.visits {
+            let visit_id = builder.next_tempid();
+            builder.add_ref_to_tmpid(&visit_id, &*VISIT_PAGE, &page_id);
+            builder.add_ref_to_tmpid(&visit_id, &*VISIT_PAGE_META, &page_meta_id);
+            // unwrap is safe, only None for an empty slice.
+            builder.add_long(&visit_id, &*VISIT_CONTEXT, *rng.choose(context_ids).unwrap());
+            builder.add_inst(&visit_id, &*VISIT_DATE, visit.date);
+            // Point the visit at itself. This doesn't really matter, but
+            // pointing at another visit would require us keep a huge hashmap in
+            // memory, or to keep the places id on the visit as a unique
+            // identity which we use as a lookup ref, which will affect the db
+            // size a lot in a way we wouldn't need to in reality.
+            builder.add_ref_to_tmpid(&visit_id, &*VISIT_SOURCE_VISIT, &visit_id);
+        }
+        // not one tx per visit anymore (and doing per place instead) because
+        // the bookkeeping/separation required is too annoying.
+        builder.maybe_transact(store)?;
+        Ok(())
+    }
+
+    /// Builds a place with a single visit from one row of the places/visits join.
+    pub fn from_row(row: &Row) -> PlaceEntry {
+        PlaceEntry {
+            id: row.get("place_id"),
+            url: row.get("place_url"),
+            origin_id: row.get("place_origin_id"),
+            description: row.get("place_description"),
+            preview_image_url: row.get("place_preview_image_url"),
+            title: row.get::<_, Option<String>>("place_title").unwrap_or_default(),
+            visits: vec![VisitInfo { date: row.get("visit_date") }],
+        }
+    }
+}
+
+/// The `to-mentat` subcommand: imports a places database into a new mentat store.
+#[derive(Debug, Clone)]
+pub struct PlacesToMentat {
+    /// Where the mentat database is created.
+    pub mentat_db_path: PathBuf,
+    /// The `places.sqlite` to import; it is only ever copied from.
+    pub places_db_path: PathBuf,
+    /// Transact after every place instead of once at the end.
+    pub realistic: bool,
+}
+
+static SCHEMA: &'static str = include_str!("places-schema.edn");
+static INITIAL_DATA: &'static str = include_str!("initial-data.edn");
+
+/// Returns the number of rows in `table`. `table` is spliced into the SQL
+/// as-is, so it must be a trusted identifier.
+fn count_rows(conn: &Connection, table: &str) -> Result<i64, failure::Error> {
+    let mut stmt = conn.prepare(&format!("SELECT count(*) FROM {}", table))?;
+    let mut rows = stmt.query(&[])?;
+    let row = rows
+        .next()
+        .ok_or_else(|| format_err!("count(*) on {} returned no rows", table))??;
+    let count: i64 = row.get(0);
+    Ok(count)
+}
+
+impl PlacesToMentat {
+    /// Copies the places database to a temp file, then transacts the schema,
+    /// the initial data, every origin and every visited place into the mentat
+    /// store, and finally vacuums the result.
+    pub fn run(self) -> Result<(), failure::Error> {
+        debug!("Copying places.sqlite to a temp file for reading");
+        let temp_dir = tempfile::tempdir()?;
+        let temp_places_path = temp_dir.path().join("places.sqlite");
+
+        fs::copy(&self.places_db_path, &temp_places_path)?;
+        let places = Connection::open_with_flags(&temp_places_path, OpenFlags::SQLITE_OPEN_READ_ONLY)?;
+
+        // New versions of mentat kill open_empty, and we already know this is empty.
+        let mentat_db_path = self.mentat_db_path.to_str().ok_or_else(|| {
+            format_err!("Output path is not valid UTF-8: {:?}", self.mentat_db_path)
+        })?;
+        let mut store = Store::open(mentat_db_path)?;
+
+        debug!("Transacting initial schema");
+        store.transact(SCHEMA)?;
+        store.transact(INITIAL_DATA)?;
+
+        // In realistic mode every `maybe_transact` transacts; otherwise none
+        // does and everything is written by the explicit `transact` calls.
+        let max_buffer_size = if self.realistic { 0 } else { usize::max_value() };
+        let mut builder = TransactBuilder::new_with_size(max_buffer_size);
+
+        let origin_ids = {
+            let mut origins_stmt = places.prepare("SELECT id, prefix, host FROM moz_origins")?;
+            let origins = origins_stmt.query_map(&[], |row| {
+                (row.get::<_, i64>("id"),
+                 row.get::<_, String>("prefix"),
+                 row.get::<_, String>("host"))
+            })?.collect::<Result<Vec<_>, _>>()?;
+
+            println!("Adding {} origins...", origins.len());
+            let temp_ids = origins.into_iter().map(|(id, prefix, host)| {
+                let tmpid = builder.next_tempid();
+                builder.add_str(&tmpid, &*ORIGIN_PREFIX, &prefix);
+                builder.add_str(&tmpid, &*ORIGIN_HOST, &host);
+                (id, tmpid)
+            }).collect::<Vec<_>>();
+            if let Some(tx_report) = builder.transact(&mut store)? {
+                let mut table: HashMap<i64, i64> = HashMap::with_capacity(temp_ids.len());
+                for (origin_id, tmpid) in temp_ids {
+                    let entid = tx_report.tempids.get(&tmpid).ok_or_else(|| {
+                        format_err!("No entid was assigned to origin {}", origin_id)
+                    })?;
+                    table.insert(origin_id, *entid);
+                }
+                table
+            } else {
+                HashMap::default()
+            }
+        };
+
+        let context_ids = store.q_once("[:find [?e ...] :where [?e :context/device _]]", None)?
+            .results
+            .into_coll()?
+            .into_iter()
+            .map(|binding| binding.into_entid().unwrap())
+            .collect::<Vec<_>>();
+        if context_ids.is_empty() {
+            return Err(format_err!("No contexts found in the initial data"));
+        }
+
+        let place_count = count_rows(&places, "moz_places")?;
+        let visit_count = count_rows(&places, "moz_historyvisits")?;
+
+        println!("Querying {} places ({} visits)", place_count, visit_count);
+
+        let mut stmt = places.prepare("
+            SELECT
+                p.id as place_id,
+                p.url as place_url,
+                p.description as place_description,
+                p.preview_image_url as place_preview_image_url,
+                p.title as place_title,
+                p.origin_id as place_origin_id,
+                v.visit_date as visit_date
+            FROM moz_places p
+            JOIN moz_historyvisits v
+                ON p.id = v.place_id
+            ORDER BY p.id
+        ")?;
+
+        // Rows arrive grouped by place, so a place is complete once a row
+        // for a different place (or the end of the rows) is reached.
+        let mut current_place: Option<PlaceEntry> = None;
+        let mut so_far = 0;
+        let mut rows = stmt.query(&[])?;
+
+        while let Some(row_or_error) = rows.next() {
+            let row = row_or_error?;
+            let id: i64 = row.get("place_id");
+            if let Some(place) = current_place.as_mut() {
+                if place.id == id {
+                    place.visits.push(VisitInfo { date: row.get("visit_date") });
+                    continue;
+                }
+            }
+
+            if let Some(finished) = current_place.take() {
+                finished.add(&mut builder, &mut store, &context_ids, &origin_ids)?;
+                so_far += 1;
+                print!("\rProcessing {} / {} places (approx.)", so_far, place_count);
+                io::stdout().flush()?;
+            }
+            current_place = Some(PlaceEntry::from_row(&row));
+        }
+
+        if let Some(finished) = current_place.take() {
+            finished.add(&mut builder, &mut store, &context_ids, &origin_ids)?;
+            println!("\rProcessing {} / {} places (approx.)", so_far + 1, place_count);
+        }
+        builder.transact(&mut store)?;
+
+        println!("Vacuuming mentat DB");
+
+        let mentat_sqlite_conn = store.dismantle().0;
+        mentat_sqlite_conn.execute("VACUUM", &[])?;
+        println!("Done!");
+        Ok(())
+    }
+}