diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index ca033780a0e..7105c9234bb 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -51,7 +51,8 @@ def __init__(self, dataset, column: str): the dataset containing the data column: str The vector column to index, must be a fixed size list of floats - or 1-dimensional fixed-shape tensor column. + (or unsigned integers for hamming distance) or 1-dimensional + fixed-shape tensor column. """ self.dataset = dataset self.column = self._normalize_column(column) @@ -89,7 +90,7 @@ def train_ivf( overtraining, reduced recall, and require large nprobes values. If not specified the default will be the integer nearest the square root of the number of rows. - distance_type: "l2" | "dot" | "cosine" + distance_type: "l2" | "dot" | "cosine" | "hamming" The distance type to used. This is defined in more detail in the LanceDB documentation on creating indices. accelerator: str | torch.Device @@ -529,6 +530,7 @@ def _normalize_distance_type(self, distance_type): "cosine", "euclidean", "dot", + "hamming", ]: raise ValueError(f"Distance type {distance_type} not supported.") return distance_type.lower() @@ -555,10 +557,13 @@ def _normalize_column(self, column): f"Vector column {c} must be FixedSizeListArray " f"1-dimensional FixedShapeTensorArray, got {field.type}" ) - if not pa.types.is_floating(field.type.value_type): + if not ( + pa.types.is_floating(field.type.value_type) + or pa.types.is_unsigned_integer(field.type.value_type) + ): raise TypeError( - f"Vector column {c} must have floating value type, " - f"got {field.type.value_type}" + f"Vector column {c} must have floating or unsigned integer " + f"value type, got {field.type.value_type}" ) return column diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index e29f02705e2..f0444e4abcb 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -77,6 +77,31 @@ def test_ivf_centroids(tmpdir, rand_dataset): assert ivf.centroids == reloaded.centroids +def test_ivf_centroids_hamming(tmpdir): + num_rows = NUM_ROWS + vectors = np.random.randint(0, 256, size=(num_rows, DIMENSION), dtype=np.uint8) + vectors_flat = vectors.reshape(-1) + vectors_arr = pa.FixedSizeListArray.from_arrays( + pa.array(vectors_flat, type=pa.uint8()), DIMENSION + ) + table = pa.Table.from_arrays([vectors_arr], names=["vectors"]) + uri = str(tmpdir / "hamming_dataset") + ds = lance.write_dataset(table, uri, max_rows_per_file=NUM_ROWS_PER_FRAGMENT) + + ivf = IndicesBuilder(ds, "vectors").train_ivf( + sample_rate=16, distance_type="hamming" + ) + + assert ivf.distance_type == "hamming" + expected_partitions = round(math.sqrt(num_rows)) + assert len(ivf.centroids) == expected_partitions + + ivf.save(str(tmpdir / "ivf_hamming")) + reloaded = IvfModel.load(str(tmpdir / "ivf_hamming")) + assert reloaded.distance_type == "hamming" + assert ivf.centroids == reloaded.centroids + + @pytest.mark.parametrize("distance_type", ["l2", "cosine", "dot"]) def test_ivf_centroids_mostly_null(mostly_null_dataset, distance_type): ivf = IndicesBuilder(mostly_null_dataset, "vectors").train_ivf(