diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 428a6613e44..bbf5ea0998d 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3182,6 +3182,7 @@ dependencies = [ "prost-types", "rand 0.9.1", "roaring", + "semver", "serde", "serde_json", "snafu", @@ -3568,6 +3569,7 @@ dependencies = [ "rand 0.9.1", "rangemap", "roaring", + "semver", "serde", "serde_json", "snafu", diff --git a/protos/table.proto b/protos/table.proto index d8f637a5d04..580abeaf954 100644 --- a/protos/table.proto +++ b/protos/table.proto @@ -58,7 +58,33 @@ message Manifest { // that the library is semantically versioned, this is a string. However, if it // is semantically versioned, it should be a valid semver string without any 'v' // prefix. For example: `2.0.0`, `2.0.0-rc.1`. + // + // For forward compatibility with older readers, when writing new manifests this + // field should contain only the core version (major.minor.patch) without any + // prerelease or build metadata. The prerelease/build info should be stored in + // the separate prerelease and build_metadata fields instead. string version = 2; + // Optional semver prerelease identifier. + // + // This field stores the prerelease portion of a semantic version separately + // from the core version number. For example, if the full version is "2.0.0-rc.1", + // the version field would contain "2.0.0" and prerelease would contain "rc.1". + // + // This separation ensures forward compatibility: older readers can parse the + // clean version field without errors, while newer readers can reconstruct the + // full semantic version by combining version, prerelease, and build_metadata. + // + // If absent, the version field is used as-is. + optional string prerelease = 3; + // Optional semver build metadata. + // + // This field stores the build metadata portion of a semantic version separately + // from the core version number. For example, if the full version is + // "2.0.0-rc.1+build.123", the version field would contain "2.0.0", prerelease + // would contain "rc.1", and build_metadata would contain "build.123". + // + // If absent, no build metadata is present. + optional string build_metadata = 4; } // The version of the writer that created this file. diff --git a/python/python/tests/forward_compat/test_compat.py b/python/python/tests/forward_compat/test_compat.py index a9b3ec8fbee..5e57dac5c71 100644 --- a/python/python/tests/forward_compat/test_compat.py +++ b/python/python/tests/forward_compat/test_compat.py @@ -6,6 +6,8 @@ # This file will be run on older versions of Lance to test that the # current version of Lance can read the test data generated by datagen.py. +import shutil + import lance import pyarrow as pa import pyarrow.compute as pc @@ -116,3 +118,52 @@ def test_list_indices_ignores_new_fts_index_version(): indices = ds.list_indices() # the new index version should be ignored assert len(indices) == 0 + + +@pytest.mark.forward +@pytest.mark.skipif( + Version(lance.__version__) < Version("0.20.0"), + reason="Version is too old to read index files stored with Lance 2.0 file format", +) +def test_write_scalar_index(tmp_path: str): + path = get_path("scalar_index") + # copy to tmp path to avoid modifying original + shutil.copytree(path, tmp_path, dirs_exist_ok=True) + + ds = lance.dataset(tmp_path) + data = pa.table( + { + "idx": pa.array([1000]), + "btree": pa.array([1000]), + "bitmap": pa.array([1000]), + "label_list": pa.array([["label1000"]]), + "ngram": pa.array(["word1000"]), + "zonemap": pa.array([1000]), + "bloomfilter": pa.array([1000]), + } + ) + ds.insert(data) + # ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@pytest.mark.forward +@pytest.mark.skipif( + Version(lance.__version__) < Version("0.36.0"), + reason="FTS token set format was introduced in 0.36.0", +) +def test_write_fts(tmp_path: str): + path = get_path("fts_index") + # copy to tmp path to avoid modifying original + shutil.copytree(path, tmp_path, dirs_exist_ok=True) + + ds = lance.dataset(tmp_path) + data = pa.table( + { + "idx": pa.array([1000]), + "text": pa.array(["new document to index"]), + } + ) + ds.insert(data) + # ds.optimize.optimize_indices() + ds.optimize.compact_files() diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 11512065c7c..722a9205a71 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -617,6 +617,8 @@ impl DeepSizeOf for BasePath { pub struct WriterVersion { pub library: String, pub version: String, + pub prerelease: Option, + pub build_metadata: Option, } #[derive(Debug, Clone, PartialEq, DeepSizeOf)] @@ -680,6 +682,36 @@ fn bump_version(version: &mut semver::Version, part: VersionPart) { } impl WriterVersion { + /// Split a version string into clean version (major.minor.patch), prerelease, and build metadata. + /// + /// Returns None if the input is not a valid semver string. + /// + /// For example: + /// - "2.0.0-rc.1" -> Some(("2.0.0", Some("rc.1"), None)) + /// - "2.0.0-rc.1+build.123" -> Some(("2.0.0", Some("rc.1"), Some("build.123"))) + /// - "2.0.0+build.123" -> Some(("2.0.0", None, Some("build.123"))) + /// - "not-a-version" -> None + fn split_version(full_version: &str) -> Option<(String, Option, Option)> { + let mut parsed = semver::Version::parse(full_version).ok()?; + + let prerelease = if parsed.pre.is_empty() { + None + } else { + Some(parsed.pre.to_string()) + }; + + let build_metadata = if parsed.build.is_empty() { + None + } else { + Some(parsed.build.to_string()) + }; + + // Remove prerelease and build metadata to get clean version + parsed.pre = semver::Prerelease::EMPTY; + parsed.build = semver::BuildMetadata::EMPTY; + Some((parsed.to_string(), prerelease, build_metadata)) + } + /// Try to parse the version string as a semver string. Returns None if /// not successful. #[deprecated(note = "Use `lance_lib_version()` instead")] @@ -704,12 +736,26 @@ impl WriterVersion { /// If the library is "lance", parse the version as semver and return it. /// Returns None if the library is not "lance" or the version cannot be parsed as semver. + /// + /// This method reconstructs the full semantic version by combining the version field + /// with the prerelease and build_metadata fields (if present). For example: + /// - version="2.0.0" + prerelease=Some("rc.1") -> "2.0.0-rc.1" + /// - version="2.0.0" + prerelease=Some("rc.1") + build_metadata=Some("build.123") -> "2.0.0-rc.1+build.123" pub fn lance_lib_version(&self) -> Option { if self.library != "lance" { return None; } - let version = semver::Version::parse(&self.version).ok()?; + let mut version = semver::Version::parse(&self.version).ok()?; + + if let Some(ref prerelease) = self.prerelease { + version.pre = semver::Prerelease::new(prerelease).ok()?; + } + + if let Some(ref build_metadata) = self.build_metadata { + version.build = semver::BuildMetadata::new(build_metadata).ok()?; + } + Some(version) } @@ -749,9 +795,13 @@ impl WriterVersion { if !keep_tag { version.pre = semver::Prerelease::EMPTY; } + let (clean_version, prerelease, build_metadata) = Self::split_version(&version.to_string()) + .expect("Bumped version should be valid semver"); Self { library: self.library.clone(), - version: version.to_string(), + version: clean_version, + prerelease, + build_metadata, } } } @@ -759,9 +809,14 @@ impl WriterVersion { impl Default for WriterVersion { #[cfg(not(test))] fn default() -> Self { + let full_version = env!("CARGO_PKG_VERSION"); + let (version, prerelease, build_metadata) = + Self::split_version(full_version).expect("CARGO_PKG_VERSION should be valid semver"); Self { library: "lance".to_string(), - version: env!("CARGO_PKG_VERSION").to_string(), + version, + prerelease, + build_metadata, } } @@ -769,9 +824,14 @@ impl Default for WriterVersion { #[cfg(test)] #[allow(deprecated)] fn default() -> Self { + let full_version = env!("CARGO_PKG_VERSION"); + let (version, prerelease, build_metadata) = + Self::split_version(full_version).expect("CARGO_PKG_VERSION should be valid semver"); Self { library: "lance".to_string(), - version: env!("CARGO_PKG_VERSION").to_string(), + version, + prerelease, + build_metadata, } .bump(VersionPart::Patch, true) } @@ -809,9 +869,17 @@ impl TryFrom for Manifest { }); // We only use the writer version if it is fully set. let writer_version = match p.writer_version { - Some(pb::manifest::WriterVersion { library, version }) => { - Some(WriterVersion { library, version }) - } + Some(pb::manifest::WriterVersion { + library, + version, + prerelease, + build_metadata, + }) => Some(WriterVersion { + library, + version, + prerelease, + build_metadata, + }), _ => None, }; let fragments = Arc::new( @@ -922,6 +990,8 @@ impl From<&Manifest> for pb::Manifest { .map(|wv| pb::manifest::WriterVersion { library: wv.library.clone(), version: wv.version.clone(), + prerelease: wv.prerelease.clone(), + build_metadata: wv.build_metadata.clone(), }), fragments: m.fragments.iter().map(pb::DataFragment::from).collect(), table_metadata: m.table_metadata.clone(), @@ -1031,7 +1101,6 @@ mod tests { fn test_writer_version() { let wv = WriterVersion::default(); assert_eq!(wv.library, "lance"); - let version = wv.lance_lib_version().unwrap(); // Parse the actual cargo version to check if it has a pre-release tag let cargo_version = env!("CARGO_PKG_VERSION"); @@ -1041,6 +1110,25 @@ mod tests { None }; + // Verify the version field contains only major.minor.patch + let version_parts: Vec<&str> = wv.version.split('.').collect(); + assert_eq!( + version_parts.len(), + 3, + "Version should be major.minor.patch" + ); + assert!( + !wv.version.contains('-'), + "Version field should not contain prerelease" + ); + + // Verify the prerelease field matches the expected tag + assert_eq!(wv.prerelease.as_deref(), expected_tag); + // Build metadata should be None for default version + assert_eq!(wv.build_metadata, None); + + // Verify lance_lib_version() reconstructs the full semver correctly + let version = wv.lance_lib_version().unwrap(); assert_eq!( version.major, env!("CARGO_PKG_VERSION_MAJOR").parse::().unwrap() @@ -1063,6 +1151,132 @@ mod tests { } } + #[test] + fn test_writer_version_split() { + // Test splitting version with prerelease + let (version, prerelease, build_metadata) = + WriterVersion::split_version("2.0.0-rc.1").unwrap(); + assert_eq!(version, "2.0.0"); + assert_eq!(prerelease, Some("rc.1".to_string())); + assert_eq!(build_metadata, None); + + // Test splitting version without prerelease + let (version, prerelease, build_metadata) = WriterVersion::split_version("2.0.0").unwrap(); + assert_eq!(version, "2.0.0"); + assert_eq!(prerelease, None); + assert_eq!(build_metadata, None); + + // Test splitting version with prerelease and build metadata + let (version, prerelease, build_metadata) = + WriterVersion::split_version("2.0.0-rc.1+build.123").unwrap(); + assert_eq!(version, "2.0.0"); + assert_eq!(prerelease, Some("rc.1".to_string())); + assert_eq!(build_metadata, Some("build.123".to_string())); + + // Test splitting version with only build metadata + let (version, prerelease, build_metadata) = + WriterVersion::split_version("2.0.0+build.123").unwrap(); + assert_eq!(version, "2.0.0"); + assert_eq!(prerelease, None); + assert_eq!(build_metadata, Some("build.123".to_string())); + + // Test with invalid version returns None + assert!(WriterVersion::split_version("not-a-version").is_none()); + } + + #[test] + fn test_writer_version_comparison_with_prerelease() { + let v1 = WriterVersion { + library: "lance".to_string(), + version: "2.0.0".to_string(), + prerelease: Some("rc.1".to_string()), + build_metadata: None, + }; + + let v2 = WriterVersion { + library: "lance".to_string(), + version: "2.0.0".to_string(), + prerelease: None, + build_metadata: None, + }; + + let semver1 = v1.lance_lib_version().unwrap(); + let semver2 = v2.lance_lib_version().unwrap(); + + // rc.1 should be less than the release version + assert!(semver1 < semver2); + } + + #[test] + fn test_writer_version_with_build_metadata() { + let v = WriterVersion { + library: "lance".to_string(), + version: "2.0.0".to_string(), + prerelease: Some("rc.1".to_string()), + build_metadata: Some("build.123".to_string()), + }; + + let semver = v.lance_lib_version().unwrap(); + assert_eq!(semver.to_string(), "2.0.0-rc.1+build.123"); + assert_eq!(semver.major, 2); + assert_eq!(semver.minor, 0); + assert_eq!(semver.patch, 0); + assert_eq!(semver.pre.as_str(), "rc.1"); + assert_eq!(semver.build.as_str(), "build.123"); + } + + #[test] + fn test_writer_version_non_semver() { + // Test that Lance library can have non-semver version strings + let v = WriterVersion { + library: "lance".to_string(), + version: "custom-build-v1".to_string(), + prerelease: None, + build_metadata: None, + }; + + // lance_lib_version should return None for non-semver + assert!(v.lance_lib_version().is_none()); + + // But the WriterVersion itself should still be valid and usable + assert_eq!(v.library, "lance"); + assert_eq!(v.version, "custom-build-v1"); + } + + #[test] + #[allow(deprecated)] + fn test_older_than_with_prerelease() { + // Test that older_than correctly handles prerelease + let v_rc = WriterVersion { + library: "lance".to_string(), + version: "2.0.0".to_string(), + prerelease: Some("rc.1".to_string()), + build_metadata: None, + }; + + // 2.0.0-rc.1 should be older than 2.0.0 + assert!(v_rc.older_than(2, 0, 0)); + + // 2.0.0-rc.1 should be older than 2.0.1 + assert!(v_rc.older_than(2, 0, 1)); + + // 2.0.0-rc.1 should not be older than 1.9.9 + assert!(!v_rc.older_than(1, 9, 9)); + + let v_release = WriterVersion { + library: "lance".to_string(), + version: "2.0.0".to_string(), + prerelease: None, + build_metadata: None, + }; + + // 2.0.0 should not be older than 2.0.0 + assert!(!v_release.older_than(2, 0, 0)); + + // 2.0.0 should be older than 2.0.1 + assert!(v_release.older_than(2, 0, 1)); + } + #[test] fn test_fragments_by_offset_range() { let arrow_schema = ArrowSchema::new(vec![ArrowField::new(