From c922a17ae27ad127db380111632d108b34e3ae52 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Sat, 15 Nov 2025 14:59:27 -0500 Subject: [PATCH 1/6] Update by metadata --- pinecone/db_data/index.py | 2 + pinecone/db_data/index_asyncio.py | 2 + pinecone/db_data/index_asyncio_interface.py | 13 +++ pinecone/db_data/interfaces.py | 7 ++ pinecone/db_data/request_factory.py | 2 + pinecone/grpc/index_grpc.py | 13 +++ tests/unit/data/test_request_factory.py | 114 ++++++++++++++++++++ tests/unit/test_index.py | 50 +++++++++ tests/unit_grpc/test_grpc_index_update.py | 49 +++++++++ 9 files changed, 252 insertions(+) diff --git a/pinecone/db_data/index.py b/pinecone/db_data/index.py index 9a5ae9d42..7e4b88b06 100644 --- a/pinecone/db_data/index.py +++ b/pinecone/db_data/index.py @@ -657,6 +657,7 @@ def update( set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + filter: Optional[FilterTypedDict] = None, **kwargs, ) -> UpdateResponse: result = self._vector_api.update_vector( @@ -666,6 +667,7 @@ def update( set_metadata=set_metadata, namespace=namespace, sparse_values=sparse_values, + filter=filter, **kwargs, ), **self._openapi_kwargs(kwargs), diff --git a/pinecone/db_data/index_asyncio.py b/pinecone/db_data/index_asyncio.py index a274e4925..32a3bcf49 100644 --- a/pinecone/db_data/index_asyncio.py +++ b/pinecone/db_data/index_asyncio.py @@ -628,6 +628,7 @@ async def update( set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + filter: Optional[FilterTypedDict] = None, **kwargs, ) -> UpdateResponse: result = await self._vector_api.update_vector( @@ -637,6 +638,7 @@ async def update( set_metadata=set_metadata, namespace=namespace, sparse_values=sparse_values, + filter=filter, **kwargs, ), **self._openapi_kwargs(kwargs), diff --git a/pinecone/db_data/index_asyncio_interface.py b/pinecone/db_data/index_asyncio_interface.py index c125afb34..5094acfd8 100644 --- a/pinecone/db_data/index_asyncio_interface.py +++ b/pinecone/db_data/index_asyncio_interface.py @@ -530,6 +530,7 @@ async def update( set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + filter: Optional[FilterTypedDict] = None, **kwargs, ) -> UpdateResponse: """ @@ -544,6 +545,10 @@ async def update( sparse_values: (Dict[str, Union[List[float], List[int]]]): sparse values to update for the vector. Expected to be either a SparseValues object or a dict of the form: {'indices': List[int], 'values': List[float]} where the lists each have the same length. + filter (Dict[str, Union[str, float, int, bool, List, dict]]): A metadata filter expression. + When updating metadata across records in a namespace, the update is applied to all records + that match the filter. See `metadata filtering _`. + [optional] If a value is included, it will overwrite the previous value. 
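        For the ``filter`` argument above, the accepted expression shapes (a
        hedged sketch; these mirror the operators exercised by the unit tests
        added later in this patch):

        .. code-block:: python

            # implicit equality
            flt = {'genre': 'drama'}
            # comparison and membership operators
            flt = {'year': {'$gte': 2020}}
            flt = {'genre': {'$in': ['action', 'comedy']}}
            # boolean composition of sub-filters
            flt = {
                '$or': [
                    {'$and': [{'genre': 'drama'}, {'year': {'$gte': 2020}}]},
                    {'$and': [{'genre': 'comedy'}, {'year': {'$lt': 2000}}]},
                ]
            }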
If a set_metadata is included, @@ -588,6 +593,14 @@ async def main(): namespace='my_namespace' ) + # Update by metadata filter + await idx.update( + id='id1', + set_metadata={'status': 'active'}, + filter={'genre': {'$eq': 'drama'}}, + namespace='my_namespace' + ) + asyncio.run(main()) """ diff --git a/pinecone/db_data/interfaces.py b/pinecone/db_data/interfaces.py index 2a33d4779..af48a9e0c 100644 --- a/pinecone/db_data/interfaces.py +++ b/pinecone/db_data/interfaces.py @@ -715,6 +715,7 @@ def update( set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + filter: Optional[FilterTypedDict] = None, **kwargs, ) -> UpdateResponse: """ @@ -733,6 +734,8 @@ def update( >>> namespace='my_namespace') >>> index.update(id='id1', values=[1, 2, 3], sparse_values=SparseValues(indices=[1, 2], values=[0.2, 0.4]), >>> namespace='my_namespace') + >>> index.update(id='id1', set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, + >>> namespace='my_namespace') Args: id (str): Vector's unique id. @@ -743,6 +746,10 @@ def update( sparse_values: (Dict[str, Union[List[float], List[int]]]): sparse values to update for the vector. Expected to be either a SparseValues object or a dict of the form: {'indices': List[int], 'values': List[float]} where the lists each have the same length. + filter (Dict[str, Union[str, float, int, bool, List, dict]]): A metadata filter expression. + When updating metadata across records in a namespace, the update is applied to all records + that match the filter. See `metadata filtering _`. + [optional] Returns: An empty dictionary if the update was successful. diff --git a/pinecone/db_data/request_factory.py b/pinecone/db_data/request_factory.py index 64bb65d9c..ac0a27336 100644 --- a/pinecone/db_data/request_factory.py +++ b/pinecone/db_data/request_factory.py @@ -140,6 +140,7 @@ def update_request( set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + filter: Optional[FilterTypedDict] = None, **kwargs, ) -> UpdateRequest: _check_type = kwargs.pop("_check_type", False) @@ -150,6 +151,7 @@ def update_request( ("set_metadata", set_metadata), ("namespace", namespace), ("sparse_values", sparse_values_normalized), + ("filter", filter), ] ) diff --git a/pinecone/grpc/index_grpc.py b/pinecone/grpc/index_grpc.py index ee5e86b83..bae88b7a4 100644 --- a/pinecone/grpc/index_grpc.py +++ b/pinecone/grpc/index_grpc.py @@ -687,6 +687,7 @@ def update( set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, sparse_values: Optional[Union[GRPCSparseValues, SparseVectorTypedDict]] = None, + filter: Optional[FilterTypedDict] = None, **kwargs, ) -> Union[UpdateResponse, PineconeGrpcFuture]: """ @@ -705,6 +706,8 @@ def update( >>> namespace='my_namespace') >>> index.update(id='id1', values=[1, 2, 3], sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]), >>> namespace='my_namespace') + >>> index.update(id='id1', set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, + >>> namespace='my_namespace') Args: id (str): Vector's unique id. @@ -717,6 +720,10 @@ def update( sparse_values: (Dict[str, Union[List[float], List[int]]]): sparse values to update for the vector. 
Expected to be either a GRPCSparseValues object or a dict of the form: {'indices': List[int], 'values': List[float]} where the lists each have the same length. + filter (Dict[str, Union[str, float, int, bool, List, dict]]): A metadata filter expression. + When updating metadata across records in a namespace, the update is applied to all records + that match the filter. See `metadata filtering _`. + [optional] Returns: UpdateResponse (contains no data) or a PineconeGrpcFuture object if async_req is True. @@ -726,6 +733,11 @@ def update( else: set_metadata_struct = None + if filter is not None: + filter_struct = dict_to_proto_struct(filter) + else: + filter_struct = None + timeout = kwargs.pop("timeout", None) sparse_values = SparseValuesFactory.build(sparse_values) args_dict = self._parse_non_empty_args( @@ -734,6 +746,7 @@ def update( ("set_metadata", set_metadata_struct), ("namespace", namespace), ("sparse_values", sparse_values), + ("filter", filter_struct), ] ) diff --git a/tests/unit/data/test_request_factory.py b/tests/unit/data/test_request_factory.py index 0092bc921..a5ccda6a4 100644 --- a/tests/unit/data/test_request_factory.py +++ b/tests/unit/data/test_request_factory.py @@ -447,3 +447,117 @@ def test_fetch_by_metadata_request_without_optional_params(self): assert request.namespace is None assert request.limit is None assert request.pagination_token is None + + # region: update request tests + + def test_update_request_with_filter(self): + request = IndexRequestFactory.update_request(id="vec1", filter={"genre": {"$eq": "action"}}) + assert request.id == "vec1" + assert request.filter == {"genre": {"$eq": "action"}} + + def test_update_request_with_filter_and_set_metadata(self): + request = IndexRequestFactory.update_request( + id="vec1", set_metadata={"status": "active"}, filter={"genre": {"$eq": "drama"}} + ) + assert request.id == "vec1" + assert request.set_metadata == {"status": "active"} + assert request.filter == {"genre": {"$eq": "drama"}} + + def test_update_request_with_filter_and_values(self): + values = [0.1, 0.2, 0.3] + request = IndexRequestFactory.update_request( + id="vec1", values=values, filter={"year": {"$gte": 2020}} + ) + assert request.id == "vec1" + assert request.values == values + assert request.filter == {"year": {"$gte": 2020}} + + def test_update_request_with_filter_and_namespace(self): + request = IndexRequestFactory.update_request( + id="vec1", filter={"status": "active"}, namespace="my_namespace" + ) + assert request.id == "vec1" + assert request.filter == {"status": "active"} + assert request.namespace == "my_namespace" + + def test_update_request_with_filter_and_sparse_values(self): + sparse_values = {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]} + request = IndexRequestFactory.update_request( + id="vec1", sparse_values=sparse_values, filter={"genre": {"$in": ["action", "comedy"]}} + ) + assert request.id == "vec1" + assert request.sparse_values is not None + assert request.filter == {"genre": {"$in": ["action", "comedy"]}} + + def test_update_request_with_all_params_including_filter(self): + values = [0.1, 0.2, 0.3] + set_metadata = {"status": "active", "updated": True} + sparse_values = {"indices": [1, 2], "values": [0.4, 0.5]} + filter_dict = {"genre": {"$eq": "action"}, "year": {"$gte": 2020}} + request = IndexRequestFactory.update_request( + id="vec1", + values=values, + set_metadata=set_metadata, + namespace="my_namespace", + sparse_values=sparse_values, + filter=filter_dict, + ) + assert request.id == "vec1" + assert request.values == 
values + assert request.set_metadata == set_metadata + assert request.namespace == "my_namespace" + assert request.sparse_values is not None + assert request.filter == filter_dict + + def test_update_request_without_filter_backward_compatibility(self): + """Test that update_request still works without filter parameter (backward compatibility).""" + request = IndexRequestFactory.update_request( + id="vec1", values=[0.1, 0.2, 0.3], namespace="ns" + ) + assert request.id == "vec1" + assert request.values == [0.1, 0.2, 0.3] + assert request.namespace == "ns" + # Filter should not be set when not provided + assert not hasattr(request, "filter") or request.filter is None + + def test_update_request_with_simple_equality_filter(self): + """Test update_request with simple equality filter.""" + request = IndexRequestFactory.update_request(id="vec1", filter={"genre": "action"}) + assert request.id == "vec1" + assert request.filter == {"genre": "action"} + + def test_update_request_with_filter_operators(self): + """Test update_request with various filter operators.""" + # Test $in operator + request1 = IndexRequestFactory.update_request( + id="vec1", filter={"genre": {"$in": ["action", "comedy", "drama"]}} + ) + assert request1.filter == {"genre": {"$in": ["action", "comedy", "drama"]}} + + # Test $gte operator + request2 = IndexRequestFactory.update_request(id="vec1", filter={"year": {"$gte": 2020}}) + assert request2.filter == {"year": {"$gte": 2020}} + + # Test $lte operator + request3 = IndexRequestFactory.update_request(id="vec1", filter={"rating": {"$lte": 4.5}}) + assert request3.filter == {"rating": {"$lte": 4.5}} + + # Test $ne operator + request4 = IndexRequestFactory.update_request( + id="vec1", filter={"status": {"$ne": "deleted"}} + ) + assert request4.filter == {"status": {"$ne": "deleted"}} + + def test_update_request_with_complex_nested_filter(self): + """Test update_request with complex nested filters using $and and $or.""" + complex_filter = { + "$or": [ + {"$and": [{"genre": "drama"}, {"year": {"$gte": 2020}}]}, + {"$and": [{"genre": "comedy"}, {"year": {"$lt": 2000}}]}, + ] + } + request = IndexRequestFactory.update_request(id="vec1", filter=complex_filter) + assert request.id == "vec1" + assert request.filter == complex_filter + + # endregion diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 9284a0cda..849a34d12 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -513,6 +513,56 @@ def test_update_byIdAnValuesAndMetadata_updateByIdAndValuesAndMetadata(self, moc oai.UpdateRequest(id="vec1", values=self.vals1, metadata=self.md1) ) + def test_update_withFilter_updateWithFilter(self, mocker): + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update(id="vec1", filter=self.filter1, namespace="ns") + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(id="vec1", filter=self.filter1, namespace="ns") + ) + + def test_update_withFilterAndSetMetadata_updateWithFilterAndSetMetadata(self, mocker): + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update(id="vec1", set_metadata=self.md1, filter=self.filter1, namespace="ns") + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(id="vec1", set_metadata=self.md1, filter=self.filter1, namespace="ns") + ) + + def test_update_withFilterAndValues_updateWithFilterAndValues(self, mocker): + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + 
self.index.update(id="vec1", values=self.vals1, filter=self.filter1, namespace="ns") + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(id="vec1", values=self.vals1, filter=self.filter1, namespace="ns") + ) + + def test_update_withFilterAndAllParams_updateWithFilterAndAllParams(self, mocker): + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update( + id="vec1", + values=self.vals1, + set_metadata=self.md1, + sparse_values=self.sv1, + filter=self.filter1, + namespace="ns", + ) + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest( + id="vec1", + values=self.vals1, + set_metadata=self.md1, + sparse_values=oai.SparseValues(indices=self.svi1, values=self.svv1), + filter=self.filter1, + namespace="ns", + ) + ) + + def test_update_withoutFilter_backwardCompatibility(self, mocker): + """Test that update without filter still works (backward compatibility).""" + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update(id="vec1", values=self.vals1, namespace="ns") + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(id="vec1", values=self.vals1, namespace="ns") + ) + # endregion # region: describe index tests diff --git a/tests/unit_grpc/test_grpc_index_update.py b/tests/unit_grpc/test_grpc_index_update.py index d6579d32d..3f3f656e4 100644 --- a/tests/unit_grpc/test_grpc_index_update.py +++ b/tests/unit_grpc/test_grpc_index_update.py @@ -41,3 +41,52 @@ def test_update_byIdAnValuesAndMetadata_updateByIdAndValuesAndMetadata( UpdateRequest(id="vec1", values=vals1, set_metadata=dict_to_proto_struct(md1)), timeout=None, ) + + def test_update_withFilter_updateWithFilter(self, mocker, filter1): + mock_response = UpdateResponse() + mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) + self.index.update(id="vec1", filter=filter1, namespace="ns") + self.index.runner.run.assert_called_once_with( + self.index.stub.Update, + UpdateRequest(id="vec1", filter=dict_to_proto_struct(filter1), namespace="ns"), + timeout=None, + ) + + def test_update_withFilterAndSetMetadata_updateWithFilterAndSetMetadata( + self, mocker, vals1, md1, filter1 + ): + mock_response = UpdateResponse() + mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) + self.index.update(id="vec1", values=vals1, set_metadata=md1, filter=filter1, namespace="ns") + self.index.runner.run.assert_called_once_with( + self.index.stub.Update, + UpdateRequest( + id="vec1", + values=vals1, + set_metadata=dict_to_proto_struct(md1), + filter=dict_to_proto_struct(filter1), + namespace="ns", + ), + timeout=None, + ) + + def test_update_withFilterAndValues_updateWithFilterAndValues(self, mocker, vals1, filter1): + mock_response = UpdateResponse() + mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) + self.index.update(id="vec1", values=vals1, filter=filter1, namespace="ns") + self.index.runner.run.assert_called_once_with( + self.index.stub.Update, + UpdateRequest( + id="vec1", values=vals1, filter=dict_to_proto_struct(filter1), namespace="ns" + ), + timeout=None, + ) + + def test_update_withFilter_asyncReq_updateWithFilterAsyncReq(self, mocker, filter1): + mocker.patch.object(self.index.runner, "run", autospec=True) + self.index.update(id="vec1", filter=filter1, namespace="ns", async_req=True) + self.index.runner.run.assert_called_once_with( + self.index.stub.Update.future, + UpdateRequest(id="vec1", 
filter=dict_to_proto_struct(filter1), namespace="ns"), + timeout=None, + ) From e480e65040fa27268e3712f4a8c2762f3ff4e87a Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Sat, 15 Nov 2025 16:17:17 -0500 Subject: [PATCH 2/6] Update by metadata --- pinecone/db_data/index.py | 9 +- pinecone/db_data/index_asyncio.py | 9 +- pinecone/db_data/index_asyncio_interface.py | 60 +- pinecone/db_data/interfaces.py | 60 +- pinecone/db_data/request_factory.py | 7 +- .../resources/asyncio/record_asyncio.py | 172 ++++ .../resources/asyncio/vector_asyncio.py | 741 +++++++++++++++ pinecone/db_data/resources/sync/record.py | 170 ++++ pinecone/db_data/resources/sync/vector.py | 791 ++++++++++++++++ pinecone/grpc/index_grpc.py | 72 +- pinecone/grpc/resources/vector_grpc.py | 858 ++++++++++++++++++ tests/unit/data/test_request_factory.py | 18 + tests/unit/test_index.py | 34 +- tests/unit_grpc/test_grpc_index_update.py | 48 +- 14 files changed, 2965 insertions(+), 84 deletions(-) create mode 100644 pinecone/db_data/resources/asyncio/record_asyncio.py create mode 100644 pinecone/db_data/resources/asyncio/vector_asyncio.py create mode 100644 pinecone/db_data/resources/sync/record.py create mode 100644 pinecone/db_data/resources/sync/vector.py create mode 100644 pinecone/grpc/resources/vector_grpc.py diff --git a/pinecone/db_data/index.py b/pinecone/db_data/index.py index 7e4b88b06..559bb1e17 100644 --- a/pinecone/db_data/index.py +++ b/pinecone/db_data/index.py @@ -652,7 +652,7 @@ def query_namespaces( @validate_and_convert_errors def update( self, - id: str, + id: Optional[str] = None, values: Optional[List[float]] = None, set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, @@ -660,6 +660,13 @@ def update( filter: Optional[FilterTypedDict] = None, **kwargs, ) -> UpdateResponse: + # Validate that exactly one of id or filter is provided + if id is None and filter is None: + raise ValueError("Either 'id' or 'filter' must be provided to update vectors.") + if id is not None and filter is not None: + raise ValueError( + "Cannot provide both 'id' and 'filter' in the same update call. Use 'id' for single vector updates or 'filter' for bulk updates." + ) result = self._vector_api.update_vector( IndexRequestFactory.update_request( id=id, diff --git a/pinecone/db_data/index_asyncio.py b/pinecone/db_data/index_asyncio.py index 32a3bcf49..08097d5f8 100644 --- a/pinecone/db_data/index_asyncio.py +++ b/pinecone/db_data/index_asyncio.py @@ -623,7 +623,7 @@ async def query_namespaces( @validate_and_convert_errors async def update( self, - id: str, + id: Optional[str] = None, values: Optional[List[float]] = None, set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, @@ -631,6 +631,13 @@ async def update( filter: Optional[FilterTypedDict] = None, **kwargs, ) -> UpdateResponse: + # Validate that exactly one of id or filter is provided + if id is None and filter is None: + raise ValueError("Either 'id' or 'filter' must be provided to update vectors.") + if id is not None and filter is not None: + raise ValueError( + "Cannot provide both 'id' and 'filter' in the same update call. Use 'id' for single vector updates or 'filter' for bulk updates." 
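+                # Hedged note: the same either/or check appears on the sync path
+                # in index.py above. An id-based update touches exactly one
+                # record, while a filter-based update touches every matching
+                # record in the namespace; for filter-based updates the response
+                # reports the count via matched_records.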
+ ) result = await self._vector_api.update_vector( IndexRequestFactory.update_request( id=id, diff --git a/pinecone/db_data/index_asyncio_interface.py b/pinecone/db_data/index_asyncio_interface.py index 5094acfd8..553480fda 100644 --- a/pinecone/db_data/index_asyncio_interface.py +++ b/pinecone/db_data/index_asyncio_interface.py @@ -525,7 +525,7 @@ async def main(): @abstractmethod async def update( self, - id: str, + id: Optional[str] = None, values: Optional[List[float]] = None, set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, @@ -534,29 +534,26 @@ async def update( **kwargs, ) -> UpdateResponse: """ - The Update operation updates vector in a namespace. + The Update operation updates vectors in a namespace. - Args: - id (str): Vector's unique id. - values (List[float]): vector values to set. [optional] - set_metadata (Dict[str, Union[str, float, int, bool, List[int], List[float], List[str]]]]): - metadata to set for vector. [optional] - namespace (str): Namespace name where to update the vector.. [optional] - sparse_values: (Dict[str, Union[List[float], List[int]]]): sparse values to update for the vector. - Expected to be either a SparseValues object or a dict of the form: - {'indices': List[int], 'values': List[float]} where the lists each have the same length. - filter (Dict[str, Union[str, float, int, bool, List, dict]]): A metadata filter expression. - When updating metadata across records in a namespace, the update is applied to all records - that match the filter. See `metadata filtering _`. - [optional] + This method supports two update modes: - If a value is included, it will overwrite the previous value. - If a set_metadata is included, - the values of the fields specified in it will be added or overwrite the previous value. + 1. **Single vector update by ID**: Provide `id` to update a specific vector. + - Updates the vector with the given ID + - If `values` is included, it will overwrite the previous vector values + - If `set_metadata` is included, the values of the fields specified will be added or overwrite the previous metadata + 2. **Bulk update by metadata filter**: Provide `filter` to update all vectors matching the filter criteria. + - Updates all vectors in the namespace that match the filter expression + - Useful for updating metadata across multiple vectors at once + - The response includes `matched_records` indicating how many vectors were updated + + Either `id` or `filter` must be provided (but not both in the same call). Examples: + **Single vector update by ID:** + .. code-block:: python import asyncio @@ -593,16 +590,37 @@ async def main(): namespace='my_namespace' ) - # Update by metadata filter - await idx.update( - id='id1', + **Bulk update by metadata filter:** + + .. code-block:: python + + # Update metadata for all vectors matching the filter + response = await idx.update( set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, namespace='my_namespace' ) + print(f"Updated {response.matched_records} vectors") asyncio.run(main()) + Args: + id (str): Vector's unique id. Required for single vector updates. Must not be provided when using filter. [optional] + values (List[float]): Vector values to set. [optional] + set_metadata (Dict[str, Union[str, float, int, bool, List[int], List[float], List[str]]]]): + Metadata to set for the vector(s). [optional] + namespace (str): Namespace name where to update the vector(s). 
[optional] + sparse_values: (Dict[str, Union[List[float], List[int]]]): Sparse values to update for the vector. + Expected to be either a SparseValues object or a dict of the form: + {'indices': List[int], 'values': List[float]} where the lists each have the same length. [optional] + filter (Dict[str, Union[str, float, int, bool, List, dict]]): A metadata filter expression. + When provided, updates all vectors in the namespace that match the filter criteria. + See `metadata filtering _`. + Must not be provided when using id. Either `id` or `filter` must be provided. [optional] + + Returns: + UpdateResponse: An UpdateResponse object. When using filter-based updates, the response includes + `matched_records` indicating the number of vectors that were updated. """ pass diff --git a/pinecone/db_data/interfaces.py b/pinecone/db_data/interfaces.py index af48a9e0c..6e3505e4a 100644 --- a/pinecone/db_data/interfaces.py +++ b/pinecone/db_data/interfaces.py @@ -710,7 +710,7 @@ def query_namespaces( @abstractmethod def update( self, - id: str, + id: Optional[str] = None, values: Optional[List[float]] = None, set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, @@ -719,40 +719,64 @@ def update( **kwargs, ) -> UpdateResponse: """ - The Update operation updates vector in a namespace. - If a value is included, it will overwrite the previous value. - If a set_metadata is included, - the values of the fields specified in it will be added or overwrite the previous value. + The Update operation updates vectors in a namespace. + + This method supports two update modes: + + 1. **Single vector update by ID**: Provide `id` to update a specific vector. + - Updates the vector with the given ID + - If `values` is included, it will overwrite the previous vector values + - If `set_metadata` is included, the values of the fields specified will be added or overwrite the previous metadata + + 2. **Bulk update by metadata filter**: Provide `filter` to update all vectors matching the filter criteria. + - Updates all vectors in the namespace that match the filter expression + - Useful for updating metadata across multiple vectors at once + - The response includes `matched_records` indicating how many vectors were updated + + Either `id` or `filter` must be provided (but not both in the same call). Examples: + **Single vector update by ID:** + .. code-block:: python + >>> # Update vector values >>> index.update(id='id1', values=[1, 2, 3], namespace='my_namespace') + >>> # Update vector metadata >>> index.update(id='id1', set_metadata={'key': 'value'}, namespace='my_namespace') + >>> # Update vector values and sparse values >>> index.update(id='id1', values=[1, 2, 3], sparse_values={'indices': [1, 2], 'values': [0.2, 0.4]}, >>> namespace='my_namespace') >>> index.update(id='id1', values=[1, 2, 3], sparse_values=SparseValues(indices=[1, 2], values=[0.2, 0.4]), >>> namespace='my_namespace') - >>> index.update(id='id1', set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, - >>> namespace='my_namespace') + + **Bulk update by metadata filter:** + + .. code-block:: python + + >>> # Update metadata for all vectors matching the filter + >>> response = index.update(set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, + >>> namespace='my_namespace') + >>> print(f"Updated {response.matched_records} vectors") Args: - id (str): Vector's unique id. - values (List[float]): vector values to set. [optional] + id (str): Vector's unique id. 
Required for single vector updates. Must not be provided when using filter. [optional] + values (List[float]): Vector values to set. [optional] set_metadata (Dict[str, Union[str, float, int, bool, List[int], List[float], List[str]]]]): - metadata to set for vector. [optional] - namespace (str): Namespace name where to update the vector.. [optional] - sparse_values: (Dict[str, Union[List[float], List[int]]]): sparse values to update for the vector. + Metadata to set for the vector(s). [optional] + namespace (str): Namespace name where to update the vector(s). [optional] + sparse_values: (Dict[str, Union[List[float], List[int]]]): Sparse values to update for the vector. Expected to be either a SparseValues object or a dict of the form: - {'indices': List[int], 'values': List[float]} where the lists each have the same length. + {'indices': List[int], 'values': List[float]} where the lists each have the same length. [optional] filter (Dict[str, Union[str, float, int, bool, List, dict]]): A metadata filter expression. - When updating metadata across records in a namespace, the update is applied to all records - that match the filter. See `metadata filtering _`. - [optional] - + When provided, updates all vectors in the namespace that match the filter criteria. + See `metadata filtering _`. + Must not be provided when using id. Either `id` or `filter` must be provided. [optional] - Returns: An empty dictionary if the update was successful. + Returns: + UpdateResponse: An UpdateResponse object. When using filter-based updates, the response includes + `matched_records` indicating the number of vectors that were updated. """ pass diff --git a/pinecone/db_data/request_factory.py b/pinecone/db_data/request_factory.py index ac0a27336..d1b64bcca 100644 --- a/pinecone/db_data/request_factory.py +++ b/pinecone/db_data/request_factory.py @@ -135,7 +135,7 @@ def fetch_by_metadata_request( @staticmethod def update_request( - id: str, + id: Optional[str] = None, values: Optional[List[float]] = None, set_metadata: Optional[VectorMetadataTypedDict] = None, namespace: Optional[str] = None, @@ -147,6 +147,7 @@ def update_request( sparse_values_normalized = SparseValuesFactory.build(sparse_values) args_dict = parse_non_empty_args( [ + ("id", id), ("values", values), ("set_metadata", set_metadata), ("namespace", namespace), @@ -155,9 +156,7 @@ def update_request( ] ) - return UpdateRequest( - id=id, **args_dict, _check_type=_check_type, **non_openapi_kwargs(kwargs) - ) + return UpdateRequest(**args_dict, _check_type=_check_type, **non_openapi_kwargs(kwargs)) @staticmethod def describe_index_stats_request( diff --git a/pinecone/db_data/resources/asyncio/record_asyncio.py b/pinecone/db_data/resources/asyncio/record_asyncio.py new file mode 100644 index 000000000..14cd6b28d --- /dev/null +++ b/pinecone/db_data/resources/asyncio/record_asyncio.py @@ -0,0 +1,172 @@ +from typing import Union, List, Optional, Dict +import logging + +from pinecone.core.openapi.db_data.api.vector_operations_api import AsyncioVectorOperationsApi +from pinecone.core.openapi.db_data.models import SearchRecordsResponse +from pinecone.db_data.dataclasses import SearchQuery, SearchRerank, UpsertResponse +from pinecone.db_data.request_factory import IndexRequestFactory +from pinecone.db_data.types import SearchQueryTypedDict, SearchRerankTypedDict +from pinecone.utils import validate_and_convert_errors, PluginAware + +logger = logging.getLogger(__name__) +""" :meta private: """ + + +class RecordResourceAsyncio(PluginAware): + """Resource for 
record operations on a Pinecone index (async).""" + + def __init__(self, vector_api: AsyncioVectorOperationsApi, config, openapi_config): + self._vector_api = vector_api + """ :meta private: """ + self._config = config + """ :meta private: """ + self._openapi_config = openapi_config + """ :meta private: """ + super().__init__() + + @validate_and_convert_errors + async def upsert_records(self, namespace: str, records: List[Dict]) -> UpsertResponse: + """Upsert records to a namespace. + + A record is a dictionary that contains either an `id` or `_id` field along with + other fields that will be stored as metadata. The `id` or `_id` field is used + as the unique identifier for the record. At least one field in the record should + correspond to a field mapping in the index's embed configuration. + + When records are upserted, Pinecone converts mapped fields into embeddings and + upserts them into the specified namespace of the index. + + Args: + namespace: The namespace of the index to upsert records to. + records: The records to upsert into the index. Each record must have an 'id' + or '_id' field. + + Returns: + UpsertResponse object which contains the number of records upserted. + + Raises: + ValueError: If namespace is not provided or if no records are provided, or + if a record is missing an 'id' or '_id' field. + + Examples: + >>> await index.record.upsert_records( + ... namespace='my-namespace', + ... records=[ + ... { + ... "_id": "test1", + ... "my_text_field": "Apple is a popular fruit known for its sweetness.", + ... }, + ... { + ... "_id": "test2", + ... "my_text_field": "The tech company Apple is known for its innovative products.", + ... }, + ... ] + ... ) + """ + args = IndexRequestFactory.upsert_records_args(namespace=namespace, records=records) + # Use _return_http_data_only=False to get headers for LSN extraction + result = await self._vector_api.upsert_records_namespace( + _return_http_data_only=False, **args + ) + # result is a tuple: (data, status, headers) when _return_http_data_only=False + response_info = None + if isinstance(result, tuple) and len(result) >= 3: + headers = result[2] + if headers: + from pinecone.utils.response_info import extract_response_info + + response_info = extract_response_info(headers) + # response_info may contain raw_headers even without LSN values + + # Ensure response_info is always present + if response_info is None: + from pinecone.utils.response_info import extract_response_info + + response_info = extract_response_info({}) + + # Count records (could be len(records) but we don't know if any failed) + # For now, assume all succeeded + return UpsertResponse(upserted_count=len(records), _response_info=response_info) + + @validate_and_convert_errors + async def search( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + """Search for records. + + This operation converts a query to a vector embedding and then searches a namespace. + You can optionally provide a reranking operation as part of the search. + + Args: + namespace: The namespace in the index to search. + query: The SearchQuery to use for the search. The query can include a + ``match_terms`` field to specify which terms must be present in the text + of each search hit. The match_terms should be a dict with ``strategy`` + (str) and ``terms`` (List[str]) keys, e.g. 
+ ``{"strategy": "all", "terms": ["term1", "term2"]}``. Currently only + "all" strategy is supported, which means all specified terms must be + present. **Note:** match_terms is only supported for sparse indexes with + integrated embedding configured to use the pinecone-sparse-english-v0 + model. + rerank: The SearchRerank to use with the search request. [optional] + fields: List of fields to return in the response. Defaults to ["*"] which + returns all fields. [optional] + + Returns: + SearchRecordsResponse containing the records that match the search. + + Raises: + Exception: If namespace is not provided. + + Examples: + >>> from pinecone import SearchQuery, SearchRerank, RerankModel + >>> await index.record.search( + ... namespace='my-namespace', + ... query=SearchQuery( + ... inputs={ + ... "text": "Apple corporation", + ... }, + ... top_k=3, + ... ), + ... rerank=SearchRerank( + ... model=RerankModel.Bge_Reranker_V2_M3, + ... rank_fields=["my_text_field"], + ... top_n=3, + ... ), + ... ) + """ + if namespace is None: + raise Exception("Namespace is required when searching records") + + request = IndexRequestFactory.search_request(query=query, rerank=rerank, fields=fields) + + return await self._vector_api.search_records_namespace(namespace, request) + + @validate_and_convert_errors + async def search_records( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + """Search for records (alias for search method). + + This is an alias for the ``search`` method. See :meth:`search` for full + documentation. + + Args: + namespace: The namespace in the index to search. + query: The SearchQuery to use for the search. + rerank: The SearchRerank to use with the search request. [optional] + fields: List of fields to return in the response. Defaults to ["*"] which + returns all fields. [optional] + + Returns: + SearchRecordsResponse containing the records that match the search. 
+ """ + return await self.search(namespace, query=query, rerank=rerank, fields=fields) diff --git a/pinecone/db_data/resources/asyncio/vector_asyncio.py b/pinecone/db_data/resources/asyncio/vector_asyncio.py new file mode 100644 index 000000000..e4d953314 --- /dev/null +++ b/pinecone/db_data/resources/asyncio/vector_asyncio.py @@ -0,0 +1,741 @@ +from pinecone.utils.tqdm import tqdm +import logging +import asyncio +import json +from typing import Union, List, Optional, Dict, Any, Literal, AsyncIterator + +from pinecone.core.openapi.db_data.api.vector_operations_api import AsyncioVectorOperationsApi +from pinecone.core.openapi.db_data.models import ( + QueryResponse as OpenAPIQueryResponse, + IndexDescription as DescribeIndexStatsResponse, + ListResponse, + UpsertRequest, + DeleteRequest, +) +from pinecone.db_data.dataclasses import ( + Vector, + SparseValues, + FetchResponse, + FetchByMetadataResponse, + Pagination, + QueryResponse, + UpsertResponse, + UpdateResponse, +) +from pinecone.db_data.request_factory import IndexRequestFactory +from pinecone.db_data.types import ( + SparseVectorTypedDict, + VectorTypedDict, + VectorMetadataTypedDict, + VectorTuple, + VectorTupleWithMetadata, + FilterTypedDict, +) +from pinecone.utils import ( + validate_and_convert_errors, + filter_dict, + parse_non_empty_args, + PluginAware, +) +from pinecone.db_data.query_results_aggregator import QueryResultsAggregator, QueryNamespacesResults +from pinecone.db_data.vector_factory import VectorFactory + +logger = logging.getLogger(__name__) +""" :meta private: """ + +_OPENAPI_ENDPOINT_PARAMS = ( + "_return_http_data_only", + "_preload_content", + "_request_timeout", + "_check_input_type", + "_check_return_type", +) +""" :meta private: """ + + +def parse_query_response(response: OpenAPIQueryResponse): + """:meta private:""" + # Convert OpenAPI QueryResponse to dataclass QueryResponse + from pinecone.utils.response_info import extract_response_info + + response_info = None + if hasattr(response, "_response_info"): + response_info = response._response_info + + if response_info is None: + response_info = extract_response_info({}) + + # Remove deprecated 'results' field if present + if hasattr(response, "_data_store"): + response._data_store.pop("results", None) + + return QueryResponse( + matches=response.matches, + namespace=response.namespace or "", + usage=response.usage if hasattr(response, "usage") and response.usage else None, + _response_info=response_info, + ) + + +class VectorResourceAsyncio(PluginAware): + """Resource for vector operations on a Pinecone index (async).""" + + def __init__(self, vector_api: AsyncioVectorOperationsApi, config, openapi_config): + self._vector_api = vector_api + """ :meta private: """ + self._config = config + """ :meta private: """ + self._openapi_config = openapi_config + """ :meta private: """ + super().__init__() + + def _openapi_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]: + return filter_dict(kwargs, _OPENAPI_ENDPOINT_PARAMS) + + @validate_and_convert_errors + async def upsert( + self, + vectors: Union[ + List[Vector], List[VectorTuple], List[VectorTupleWithMetadata], List[VectorTypedDict] + ], + namespace: Optional[str] = None, + batch_size: Optional[int] = None, + show_progress: bool = True, + **kwargs, + ) -> UpsertResponse: + """Upsert vectors into the index. + + The upsert operation writes vectors into a namespace. If a new value is upserted + for an existing vector id, it will overwrite the previous value. + + Args: + vectors: A list of vectors to upsert. 
Each vector can be a Vector object, + tuple, or dictionary. + namespace: The namespace to write to. If not specified, the default namespace + is used. [optional] + batch_size: The number of vectors to upsert in each batch. If not specified, + all vectors will be upserted in a single batch. [optional] + show_progress: Whether to show a progress bar using tqdm. Applied only if + batch_size is provided. Default is True. + **kwargs: Additional keyword arguments. + + Returns: + UpsertResponse containing the number of vectors upserted. + + Examples: + >>> await index.vector.upsert( + ... vectors=[ + ... ('id1', [1.0, 2.0, 3.0], {'key': 'value'}), + ... ('id2', [1.0, 2.0, 3.0]) + ... ], + ... namespace='ns1' + ... ) + """ + _check_type = kwargs.pop("_check_type", True) + + if batch_size is None: + return await self._upsert_batch(vectors, namespace, _check_type, **kwargs) + + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError("batch_size must be a positive integer") + + upsert_tasks = [ + self._upsert_batch(vectors[i : i + batch_size], namespace, _check_type, **kwargs) + for i in range(0, len(vectors), batch_size) + ] + + total_upserted = 0 + last_result = None + with tqdm(total=len(vectors), desc="Upserted vectors", disable=not show_progress) as pbar: + for task in asyncio.as_completed(upsert_tasks): + res = await task + pbar.update(res.upserted_count) + total_upserted += res.upserted_count + last_result = res + + # Create aggregated response with metadata from last completed batch + # Note: For parallel batches, this uses the last completed result (order may vary) + from pinecone.utils.response_info import extract_response_info + + response_info = None + if last_result and hasattr(last_result, "_response_info"): + response_info = last_result._response_info + if response_info is None: + response_info = extract_response_info({}) + + return UpsertResponse(upserted_count=total_upserted, _response_info=response_info) + + @validate_and_convert_errors + async def _upsert_batch( + self, + vectors: Union[ + List[Vector], List[VectorTuple], List[VectorTupleWithMetadata], List[VectorTypedDict] + ], + namespace: Optional[str], + _check_type: bool, + **kwargs, + ) -> UpsertResponse: + args_dict = parse_non_empty_args([("namespace", namespace)]) + + def vec_builder(v): + return VectorFactory.build(v, check_type=_check_type) + + # Convert OpenAPI UpsertResponse to dataclass UpsertResponse + result = await self._vector_api.upsert_vectors( + UpsertRequest( + vectors=list(map(vec_builder, vectors)), + **args_dict, + _check_type=_check_type, + **{k: v for k, v in kwargs.items() if k not in _OPENAPI_ENDPOINT_PARAMS}, + ), + **{k: v for k, v in kwargs.items() if k in _OPENAPI_ENDPOINT_PARAMS}, + ) + + from pinecone.utils.response_info import extract_response_info + + response_info = None + if hasattr(result, "_response_info"): + response_info = result._response_info + if response_info is None: + response_info = extract_response_info({}) + + return UpsertResponse(upserted_count=result.upserted_count, _response_info=response_info) + + @validate_and_convert_errors + async def upsert_from_dataframe( + self, df, namespace: Optional[str] = None, batch_size: int = 500, show_progress: bool = True + ): + """Upsert vectors from a pandas DataFrame. + + Args: + df: A pandas DataFrame with vector data. + namespace: The namespace to write to. If not specified, the default namespace + is used. [optional] + batch_size: The number of rows to upsert in each batch. Default is 500. 
+ show_progress: Whether to show a progress bar. Default is True. + + Returns: + UpsertResponse containing the number of vectors upserted. + + Raises: + NotImplementedError: This method is not implemented for asyncio. + """ + raise NotImplementedError("upsert_from_dataframe is not implemented for asyncio") + + @validate_and_convert_errors + async def delete( + self, + ids: Optional[List[str]] = None, + delete_all: Optional[bool] = None, + namespace: Optional[str] = None, + filter: Optional[FilterTypedDict] = None, + **kwargs, + ) -> Dict[str, Any]: + """Delete vectors from the index. + + The Delete operation deletes vectors from the index, from a single namespace. + No error is raised if the vector id does not exist. + + Args: + ids: Vector ids to delete. [optional] + delete_all: If True, all vectors in the index namespace will be deleted. + Default is False. [optional] + namespace: The namespace to delete vectors from. If not specified, the default + namespace is used. [optional] + filter: Metadata filter expression to select vectors to delete. This is mutually + exclusive with specifying ids or using delete_all=True. [optional] + **kwargs: Additional keyword arguments. + + Returns: + Dict containing the deletion response. + + Examples: + >>> await index.vector.delete(ids=['id1', 'id2'], namespace='my_namespace') + >>> await index.vector.delete(delete_all=True, namespace='my_namespace') + >>> await index.vector.delete(filter={'key': 'value'}, namespace='my_namespace') + """ + _check_type = kwargs.pop("_check_type", False) + args_dict = parse_non_empty_args( + [("ids", ids), ("delete_all", delete_all), ("namespace", namespace), ("filter", filter)] + ) + + return await self._vector_api.delete_vectors( + DeleteRequest( + **args_dict, + **{ + k: v + for k, v in kwargs.items() + if k not in _OPENAPI_ENDPOINT_PARAMS and v is not None + }, + _check_type=_check_type, + ), + **{k: v for k, v in kwargs.items() if k in _OPENAPI_ENDPOINT_PARAMS}, + ) + + @validate_and_convert_errors + async def fetch( + self, ids: List[str], namespace: Optional[str] = None, **kwargs + ) -> FetchResponse: + """Fetch vectors by ID. + + The fetch operation looks up and returns vectors, by ID, from a single namespace. + The returned vectors include the vector data and/or metadata. + + Args: + ids: The vector IDs to fetch. + namespace: The namespace to fetch vectors from. If not specified, the default + namespace is used. [optional] + **kwargs: Additional keyword arguments. + + Returns: + FetchResponse object containing the fetched vectors and namespace name. 
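+
+        A hedged sketch of consuming the response (attribute names follow the
+        ``FetchResponse`` and ``Vector`` dataclasses as constructed in this
+        patch):
+
+            >>> resp = await index.vector.fetch(ids=['id1'], namespace='ns')
+            >>> for vid, vec in resp.vectors.items():
+            ...     print(vid, vec.values, vec.metadata)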
+ + Examples: + >>> await index.vector.fetch(ids=['id1', 'id2'], namespace='my_namespace') + >>> await index.vector.fetch(ids=['id1', 'id2']) + """ + args_dict = parse_non_empty_args([("namespace", namespace)]) + result = await self._vector_api.fetch_vectors(ids=ids, **args_dict, **kwargs) + # Copy response info from OpenAPI response if present + from pinecone.utils.response_info import extract_response_info + + response_info = None + if hasattr(result, "_response_info"): + response_info = result._response_info + if response_info is None: + response_info = extract_response_info({}) + + fetch_response = FetchResponse( + namespace=result.namespace, + vectors={k: Vector.from_dict(v) for k, v in result.vectors.items()}, + usage=result.usage, + _response_info=response_info, + ) + return fetch_response + + @validate_and_convert_errors + async def fetch_by_metadata( + self, + filter: FilterTypedDict, + namespace: Optional[str] = None, + limit: Optional[int] = None, + pagination_token: Optional[str] = None, + **kwargs, + ) -> FetchByMetadataResponse: + """Fetch vectors by metadata filter. + + Look up and return vectors by metadata filter from a single namespace. + The returned vectors include the vector data and/or metadata. + + Args: + filter: Metadata filter expression to select vectors. + See `metadata filtering _` + namespace: The namespace to fetch vectors from. If not specified, the default + namespace is used. [optional] + limit: Max number of vectors to return. Defaults to 100. [optional] + pagination_token: Pagination token to continue a previous listing operation. + [optional] + **kwargs: Additional keyword arguments. + + Returns: + FetchByMetadataResponse: Object containing the fetched vectors, namespace, + usage, and pagination token. + + Examples: + >>> await index.vector.fetch_by_metadata( + ... filter={'genre': {'$in': ['comedy', 'drama']}, 'year': {'$eq': 2019}}, + ... namespace='my_namespace', + ... limit=50 + ... ) + >>> await index.vector.fetch_by_metadata( + ... filter={'status': 'active'}, + ... pagination_token='token123' + ... 
) + """ + request = IndexRequestFactory.fetch_by_metadata_request( + filter=filter, + namespace=namespace, + limit=limit, + pagination_token=pagination_token, + **kwargs, + ) + result = await self._vector_api.fetch_vectors_by_metadata( + request, **{k: v for k, v in kwargs.items() if k in _OPENAPI_ENDPOINT_PARAMS} + ) + + pagination = None + if result.pagination and result.pagination.next: + pagination = Pagination(next=result.pagination.next) + + # Copy response info from OpenAPI response if present + from pinecone.utils.response_info import extract_response_info + + response_info = None + if hasattr(result, "_response_info"): + response_info = result._response_info + if response_info is None: + response_info = extract_response_info({}) + + fetch_by_metadata_response = FetchByMetadataResponse( + namespace=result.namespace or "", + vectors={k: Vector.from_dict(v) for k, v in result.vectors.items()}, + usage=result.usage, + pagination=pagination, + _response_info=response_info, + ) + return fetch_by_metadata_response + + @validate_and_convert_errors + async def query( + self, + *args, + top_k: int, + vector: Optional[List[float]] = None, + id: Optional[str] = None, + namespace: Optional[str] = None, + filter: Optional[FilterTypedDict] = None, + include_values: Optional[bool] = None, + include_metadata: Optional[bool] = None, + sparse_vector: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + **kwargs, + ) -> QueryResponse: + """Query the index. + + The Query operation searches a namespace, using a query vector. It retrieves the + ids of the most similar items in a namespace, along with their similarity scores. + + Args: + top_k: The number of results to return for each query. Must be an integer + greater than 1. + vector: The query vector. This should be the same length as the dimension of + the index being queried. Each query request can contain only one of the + parameters id or vector. [optional] + id: The unique ID of the vector to be used as a query vector. Each query request + can contain only one of the parameters vector or id. [optional] + namespace: The namespace to query. If not specified, the default namespace is + used. [optional] + filter: The filter to apply. You can use vector metadata to limit your search. + See `metadata filtering _` + [optional] + include_values: Indicates whether vector values are included in the response. + If omitted the server will use the default value of False. [optional] + include_metadata: Indicates whether metadata is included in the response as well + as the ids. If omitted the server will use the default value of False. + [optional] + sparse_vector: Sparse values of the query vector. Expected to be either a + SparseValues object or a dict of the form {'indices': List[int], + 'values': List[float]}, where the lists each have the same length. + [optional] + **kwargs: Additional keyword arguments. + + Returns: + QueryResponse object which contains the list of the closest vectors as + ScoredVector objects, and namespace name. + + Examples: + >>> await index.vector.query(vector=[1, 2, 3], top_k=10, namespace='my_namespace') + >>> await index.vector.query(id='id1', top_k=10, namespace='my_namespace') + >>> await index.vector.query(vector=[1, 2, 3], top_k=10, namespace='my_namespace', + ... 
filter={'key': 'value'}) + """ + response = await self._query( + *args, + top_k=top_k, + vector=vector, + id=id, + namespace=namespace, + filter=filter, + include_values=include_values, + include_metadata=include_metadata, + sparse_vector=sparse_vector, + **kwargs, + ) + return parse_query_response(response) + + async def _query( + self, + *args, + top_k: int, + vector: Optional[List[float]] = None, + id: Optional[str] = None, + namespace: Optional[str] = None, + filter: Optional[FilterTypedDict] = None, + include_values: Optional[bool] = None, + include_metadata: Optional[bool] = None, + sparse_vector: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + **kwargs, + ) -> OpenAPIQueryResponse: + if len(args) > 0: + raise ValueError( + "Please use keyword arguments instead of positional arguments. Example: index.query(vector=[0.1, 0.2, 0.3], top_k=10, namespace='my_namespace')" + ) + + request = IndexRequestFactory.query_request( + top_k=top_k, + vector=vector, + id=id, + namespace=namespace, + filter=filter, + include_values=include_values, + include_metadata=include_metadata, + sparse_vector=sparse_vector, + **kwargs, + ) + return await self._vector_api.query_vectors( + request, **{k: v for k, v in kwargs.items() if k in _OPENAPI_ENDPOINT_PARAMS} + ) + + @validate_and_convert_errors + async def query_namespaces( + self, + namespaces: List[str], + metric: Literal["cosine", "euclidean", "dotproduct"], + top_k: Optional[int] = None, + filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, + include_values: Optional[bool] = None, + include_metadata: Optional[bool] = None, + vector: Optional[List[float]] = None, + sparse_vector: Optional[ + Union[SparseValues, Dict[str, Union[List[float], List[int]]]] + ] = None, + **kwargs, + ) -> QueryNamespacesResults: + """Query across multiple namespaces. + + Performs a query operation across multiple namespaces and aggregates the results. + + Args: + vector: The query vector. [optional] + namespaces: List of namespace names to query. + metric: The similarity metric to use for aggregation. Must be one of "cosine", + "euclidean", or "dotproduct". + top_k: The number of results to return. If not specified, defaults to 10. + [optional] + filter: The filter to apply. You can use vector metadata to limit your search. + [optional] + include_values: Indicates whether vector values are included in the response. + [optional] + include_metadata: Indicates whether metadata is included in the response. + [optional] + sparse_vector: Sparse values of the query vector. [optional] + **kwargs: Additional keyword arguments. + + Returns: + QueryNamespacesResults containing aggregated results from all namespaces. + + Raises: + ValueError: If no namespaces are specified or if vector is empty. + + Examples: + >>> await index.vector.query_namespaces( + ... vector=[1, 2, 3], + ... namespaces=['ns1', 'ns2'], + ... metric='cosine', + ... top_k=10 + ... 
) + """ + if namespaces is None or len(namespaces) == 0: + raise ValueError("At least one namespace must be specified") + if sparse_vector is None and vector is not None and len(vector) == 0: + # If querying with a vector, it must not be empty + raise ValueError("Query vector must not be empty") + + overall_topk = top_k if top_k is not None else 10 + aggregator = QueryResultsAggregator(top_k=overall_topk, metric=metric) + + target_namespaces = set(namespaces) # dedup namespaces + tasks = [ + self._query( + top_k=overall_topk, + vector=vector, + namespace=ns, + filter=filter, # type: ignore[arg-type] + include_values=include_values, + include_metadata=include_metadata, + sparse_vector=sparse_vector, # type: ignore[arg-type] + async_threadpool_executor=True, + _preload_content=False, + **kwargs, + ) + for ns in target_namespaces + ] + + for task in asyncio.as_completed(tasks): + raw_result = await task + # When _preload_content=False, _query returns a RESTResponse object + from pinecone.openapi_support.rest_utils import RESTResponse + + if isinstance(raw_result, RESTResponse): + response = json.loads(raw_result.data.decode("utf-8")) + aggregator.add_results(response) + else: + # Fallback: if somehow we got an OpenAPIQueryResponse, parse it + response = json.loads(raw_result.to_dict()) + aggregator.add_results(response) + + final_results = aggregator.get_results() + return final_results + + @validate_and_convert_errors + async def update( + self, + id: str, + values: Optional[List[float]] = None, + set_metadata: Optional[VectorMetadataTypedDict] = None, + namespace: Optional[str] = None, + sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + **kwargs, + ) -> UpdateResponse: + """Update a vector in the index. + + The Update operation updates vector in a namespace. If a value is included, it + will overwrite the previous value. If a set_metadata is included, the values of + the fields specified in it will be added or overwrite the previous value. + + Args: + id: Vector's unique id. + values: Vector values to set. [optional] + set_metadata: Metadata to set for vector. [optional] + namespace: Namespace name where to update the vector. If not specified, the + default namespace is used. [optional] + sparse_values: Sparse values to update for the vector. Expected to be either + a SparseValues object or a dict of the form {'indices': List[int], + 'values': List[float]} where the lists each have the same length. + [optional] + **kwargs: Additional keyword arguments. + + Returns: + UpdateResponse (contains no data). + + Examples: + >>> await index.vector.update(id='id1', values=[1, 2, 3], namespace='my_namespace') + >>> await index.vector.update(id='id1', set_metadata={'key': 'value'}, + ... 
namespace='my_namespace') + """ + result = await self._vector_api.update_vector( + IndexRequestFactory.update_request( + id=id, + values=values, + set_metadata=set_metadata, + namespace=namespace, + sparse_values=sparse_values, + **kwargs, + ), + **self._openapi_kwargs(kwargs), + ) + # Extract response info from result if it's an OpenAPI model with _response_info + response_info = None + if hasattr(result, "_response_info"): + response_info = result._response_info + else: + # If result is a dict or empty, create default response_info + from pinecone.utils.response_info import extract_response_info + + response_info = extract_response_info({}) + + return UpdateResponse(_response_info=response_info) + + @validate_and_convert_errors + async def describe_index_stats( + self, filter: Optional[FilterTypedDict] = None, **kwargs + ) -> DescribeIndexStatsResponse: + """Describe index statistics. + + The DescribeIndexStats operation returns statistics about the index's contents. + For example: The vector count per namespace and the number of dimensions. + + Args: + filter: If this parameter is present, the operation only returns statistics + for vectors that satisfy the filter. See `metadata filtering + _` [optional] + **kwargs: Additional keyword arguments. + + Returns: + DescribeIndexStatsResponse object which contains stats about the index. + + Examples: + >>> await index.vector.describe_index_stats() + >>> await index.vector.describe_index_stats(filter={'key': 'value'}) + """ + return await self._vector_api.describe_index_stats( + IndexRequestFactory.describe_index_stats_request(filter, **kwargs), + **self._openapi_kwargs(kwargs), + ) + + @validate_and_convert_errors + async def list_paginated( + self, + prefix: Optional[str] = None, + limit: Optional[int] = None, + pagination_token: Optional[str] = None, + namespace: Optional[str] = None, + **kwargs, + ) -> ListResponse: + """List vectors with pagination. + + The list_paginated operation finds vectors based on an id prefix within a single + namespace. It returns matching ids in a paginated form, with a pagination token to + fetch the next page of results. + + Args: + prefix: The id prefix to match. If unspecified, an empty string prefix will + be used with the effect of listing all ids in a namespace. [optional] + limit: The maximum number of ids to return. If unspecified, the server will + use a default value. [optional] + pagination_token: A token needed to fetch the next page of results. This token + is returned in the response if additional results are available. [optional] + namespace: The namespace to list vectors from. If not specified, the default + namespace is used. [optional] + **kwargs: Additional keyword arguments. + + Returns: + ListResponse object which contains the list of ids, the namespace name, + pagination information, and usage showing the number of read_units consumed. + + Examples: + >>> results = await index.vector.list_paginated(prefix='99', limit=5, + ... namespace='my_namespace') + >>> results.pagination.next + 'eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9' + """ + args_dict = IndexRequestFactory.list_paginated_args( + prefix=prefix, + limit=limit, + pagination_token=pagination_token, + namespace=namespace, + **kwargs, + ) + return await self._vector_api.list_vectors(**args_dict, **kwargs) + + @validate_and_convert_errors + async def list(self, **kwargs) -> AsyncIterator[List[str]]: + """List vectors. 
+ + The list operation accepts all of the same arguments as list_paginated, and returns + an async generator that yields a list of the matching vector ids in each page of results. + It automatically handles pagination tokens on your behalf. + + Args: + **kwargs: Same arguments as list_paginated (prefix, limit, pagination_token, + namespace). + + Yields: + List of vector ids for each page of results. + + Examples: + >>> async for ids in index.vector.list(prefix='99', limit=5, + ... namespace='my_namespace'): + ... print(ids) + ['99', '990', '991', '992', '993'] + ['994', '995', '996', '997', '998'] + """ + done = False + while not done: + results = await self.list_paginated(**kwargs) + if len(results.vectors) > 0: + yield [v.id for v in results.vectors] + + if results.pagination: + kwargs.update({"pagination_token": results.pagination.next}) + else: + done = True diff --git a/pinecone/db_data/resources/sync/record.py b/pinecone/db_data/resources/sync/record.py new file mode 100644 index 000000000..447071b94 --- /dev/null +++ b/pinecone/db_data/resources/sync/record.py @@ -0,0 +1,170 @@ +from typing import Union, List, Optional, Dict +import logging + +from pinecone.core.openapi.db_data.api.vector_operations_api import VectorOperationsApi +from pinecone.core.openapi.db_data.models import SearchRecordsResponse +from pinecone.db_data.dataclasses import SearchQuery, SearchRerank, UpsertResponse +from pinecone.db_data.request_factory import IndexRequestFactory +from pinecone.db_data.types import SearchQueryTypedDict, SearchRerankTypedDict +from pinecone.utils import validate_and_convert_errors, PluginAware + +logger = logging.getLogger(__name__) +""" :meta private: """ + + +class RecordResource(PluginAware): + """Resource for record operations on a Pinecone index.""" + + def __init__(self, vector_api: VectorOperationsApi, config, openapi_config): + self._vector_api = vector_api + """ :meta private: """ + self._config = config + """ :meta private: """ + self._openapi_config = openapi_config + """ :meta private: """ + super().__init__() + + @validate_and_convert_errors + def upsert_records(self, namespace: str, records: List[Dict]) -> UpsertResponse: + """Upsert records to a namespace. + + A record is a dictionary that contains either an `id` or `_id` field along with + other fields that will be stored as metadata. The `id` or `_id` field is used + as the unique identifier for the record. At least one field in the record should + correspond to a field mapping in the index's embed configuration. + + When records are upserted, Pinecone converts mapped fields into embeddings and + upserts them into the specified namespace of the index. + + Args: + namespace: The namespace of the index to upsert records to. + records: The records to upsert into the index. Each record must have an 'id' + or '_id' field. + + Returns: + UpsertResponse object which contains the number of records upserted. + + Raises: + ValueError: If namespace is not provided or if no records are provided, or + if a record is missing an 'id' or '_id' field. + + Examples: + >>> index.record.upsert_records( + ... namespace='my-namespace', + ... records=[ + ... { + ... "_id": "test1", + ... "my_text_field": "Apple is a popular fruit known for its sweetness.", + ... }, + ... { + ... "_id": "test2", + ... "my_text_field": "The tech company Apple is known for its innovative products.", + ... }, + ... ] + ... 
) + """ + args = IndexRequestFactory.upsert_records_args(namespace=namespace, records=records) + # Use _return_http_data_only=False to get headers for LSN extraction + result = self._vector_api.upsert_records_namespace(_return_http_data_only=False, **args) + # result is a tuple: (data, status, headers) when _return_http_data_only=False + response_info = None + if isinstance(result, tuple) and len(result) >= 3: + headers = result[2] + if headers: + from pinecone.utils.response_info import extract_response_info + + response_info = extract_response_info(headers) + # response_info may contain raw_headers even without LSN values + + # Ensure response_info is always present + if response_info is None: + from pinecone.utils.response_info import extract_response_info + + response_info = extract_response_info({}) + + # Count records (could be len(records) but we don't know if any failed) + # For now, assume all succeeded + return UpsertResponse(upserted_count=len(records), _response_info=response_info) + + @validate_and_convert_errors + def search( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + """Search for records. + + This operation converts a query to a vector embedding and then searches a namespace. + You can optionally provide a reranking operation as part of the search. + + Args: + namespace: The namespace in the index to search. + query: The SearchQuery to use for the search. The query can include a + ``match_terms`` field to specify which terms must be present in the text + of each search hit. The match_terms should be a dict with ``strategy`` + (str) and ``terms`` (List[str]) keys, e.g. + ``{"strategy": "all", "terms": ["term1", "term2"]}``. Currently only + "all" strategy is supported, which means all specified terms must be + present. **Note:** match_terms is only supported for sparse indexes with + integrated embedding configured to use the pinecone-sparse-english-v0 + model. + rerank: The SearchRerank to use with the search request. [optional] + fields: List of fields to return in the response. Defaults to ["*"] which + returns all fields. [optional] + + Returns: + SearchRecordsResponse containing the records that match the search. + + Raises: + Exception: If namespace is not provided. + + Examples: + >>> from pinecone import SearchQuery, SearchRerank, RerankModel + >>> index.record.search( + ... namespace='my-namespace', + ... query=SearchQuery( + ... inputs={ + ... "text": "Apple corporation", + ... }, + ... top_k=3, + ... ), + ... rerank=SearchRerank( + ... model=RerankModel.Bge_Reranker_V2_M3, + ... rank_fields=["my_text_field"], + ... top_n=3, + ... ), + ... ) + """ + if namespace is None: + raise Exception("Namespace is required when searching records") + + request = IndexRequestFactory.search_request(query=query, rerank=rerank, fields=fields) + + return self._vector_api.search_records_namespace(namespace, request) + + @validate_and_convert_errors + def search_records( + self, + namespace: str, + query: Union[SearchQueryTypedDict, SearchQuery], + rerank: Optional[Union[SearchRerankTypedDict, SearchRerank]] = None, + fields: Optional[List[str]] = ["*"], # Default to returning all fields + ) -> SearchRecordsResponse: + """Search for records (alias for search method). + + This is an alias for the ``search`` method. See :meth:`search` for full + documentation. 
+ + Args: + namespace: The namespace in the index to search. + query: The SearchQuery to use for the search. + rerank: The SearchRerank to use with the search request. [optional] + fields: List of fields to return in the response. Defaults to ["*"] which + returns all fields. [optional] + + Returns: + SearchRecordsResponse containing the records that match the search. + """ + return self.search(namespace, query=query, rerank=rerank, fields=fields) diff --git a/pinecone/db_data/resources/sync/vector.py b/pinecone/db_data/resources/sync/vector.py new file mode 100644 index 000000000..1162eff41 --- /dev/null +++ b/pinecone/db_data/resources/sync/vector.py @@ -0,0 +1,791 @@ +from pinecone.utils.tqdm import tqdm +import logging +import json +from typing import Union, List, Optional, Dict, Any, Literal +from multiprocessing.pool import ApplyResult +from concurrent.futures import as_completed + +from pinecone.core.openapi.db_data.api.vector_operations_api import VectorOperationsApi +from pinecone.core.openapi.db_data.models import ( + QueryResponse as OpenAPIQueryResponse, + IndexDescription as DescribeIndexStatsResponse, + ListResponse, +) +from pinecone.db_data.dataclasses import ( + Vector, + SparseValues, + FetchResponse, + FetchByMetadataResponse, + Pagination, + QueryResponse, + UpsertResponse, + UpdateResponse, +) +from pinecone.db_data.request_factory import IndexRequestFactory +from pinecone.db_data.types import ( + SparseVectorTypedDict, + VectorTypedDict, + VectorMetadataTypedDict, + VectorTuple, + VectorTupleWithMetadata, + FilterTypedDict, +) +from pinecone.utils import ( + validate_and_convert_errors, + filter_dict, + parse_non_empty_args, + PluginAware, +) +from pinecone.db_data.query_results_aggregator import QueryResultsAggregator, QueryNamespacesResults +from pinecone.openapi_support import OPENAPI_ENDPOINT_PARAMS + +logger = logging.getLogger(__name__) +""" :meta private: """ + + +def parse_query_response(response: OpenAPIQueryResponse): + """:meta private:""" + # Convert OpenAPI QueryResponse to dataclass QueryResponse + from pinecone.utils.response_info import extract_response_info + + response_info = None + if hasattr(response, "_response_info"): + response_info = response._response_info + + if response_info is None: + response_info = extract_response_info({}) + + # Remove deprecated 'results' field if present + if hasattr(response, "_data_store"): + response._data_store.pop("results", None) + + return QueryResponse( + matches=response.matches, + namespace=response.namespace or "", + usage=response.usage if hasattr(response, "usage") and response.usage else None, + _response_info=response_info, + ) + + +class VectorResource(PluginAware): + """Resource for vector operations on a Pinecone index.""" + + def __init__(self, vector_api: VectorOperationsApi, config, openapi_config, pool_threads: int): + self._vector_api = vector_api + """ :meta private: """ + self._config = config + """ :meta private: """ + self._openapi_config = openapi_config + """ :meta private: """ + self._pool_threads = pool_threads + """ :meta private: """ + super().__init__() + + def _openapi_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]: + return filter_dict(kwargs, OPENAPI_ENDPOINT_PARAMS) + + @validate_and_convert_errors + def upsert( + self, + vectors: Union[ + List[Vector], List[VectorTuple], List[VectorTupleWithMetadata], List[VectorTypedDict] + ], + namespace: Optional[str] = None, + batch_size: Optional[int] = None, + show_progress: bool = True, + **kwargs, + ) -> Union[UpsertResponse, 
ApplyResult]:
+        """Upsert vectors into the index.
+
+        The upsert operation writes vectors into a namespace. If a new value is upserted
+        for an existing vector id, it will overwrite the previous value.
+
+        Args:
+            vectors: A list of vectors to upsert. Each vector can be a Vector object,
+                tuple, or dictionary.
+            namespace: The namespace to write to. If not specified, the default namespace
+                is used. [optional]
+            batch_size: The number of vectors to upsert in each batch. If not specified,
+                all vectors will be upserted in a single batch. [optional]
+            show_progress: Whether to show a progress bar using tqdm. Applied only if
+                batch_size is provided. Default is True.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            UpsertResponse containing the number of vectors upserted, or ApplyResult if
+            async_req=True.
+
+        Examples:
+            >>> index.vector.upsert(
+            ...     vectors=[
+            ...         ('id1', [1.0, 2.0, 3.0], {'key': 'value'}),
+            ...         ('id2', [1.0, 2.0, 3.0])
+            ...     ],
+            ...     namespace='ns1'
+            ... )
+        """
+        _check_type = kwargs.pop("_check_type", True)
+
+        if kwargs.get("async_req", False) and batch_size is not None:
+            raise ValueError(
+                "async_req is not supported when batch_size is provided. "
+                "To upsert in parallel, please follow: "
+                "https://docs.pinecone.io/docs/insert-data#sending-upserts-in-parallel"
+            )
+
+        if batch_size is None:
+            result = self._upsert_batch(vectors, namespace, _check_type, **kwargs)
+            # If async_req=True, result is an ApplyResult[OpenAPIUpsertResponse]
+            # We need to wrap it to convert to our dataclass when .get() is called
+            if kwargs.get("async_req", False):
+                # Create a wrapper that transforms the OpenAPI response to our dataclass
+                class UpsertResponseTransformer:
+                    def __init__(self, apply_result: ApplyResult):
+                        self._apply_result = apply_result
+
+                    def get(self, timeout=None):
+                        openapi_response = self._apply_result.get(timeout)
+                        from pinecone.utils.response_info import extract_response_info
+
+                        response_info = None
+                        if hasattr(openapi_response, "_response_info"):
+                            response_info = openapi_response._response_info
+                        if response_info is None:
+                            response_info = extract_response_info({})
+                        return UpsertResponse(
+                            upserted_count=openapi_response.upserted_count,
+                            _response_info=response_info,
+                        )
+
+                    def __getattr__(self, name):
+                        # Delegate other methods to the underlying ApplyResult
+                        return getattr(self._apply_result, name)
+
+                # result is ApplyResult when async_req=True
+                return UpsertResponseTransformer(result)  # type: ignore[arg-type, return-value]
+            # result is UpsertResponse when async_req=False
+            return result  # type: ignore[return-value]
+
+        if not isinstance(batch_size, int) or batch_size <= 0:
+            raise ValueError("batch_size must be a positive integer")
+
+        pbar = tqdm(total=len(vectors), disable=not show_progress, desc="Upserted vectors")
+        total_upserted = 0
+        for i in range(0, len(vectors), batch_size):
+            batch_result = self._upsert_batch(
+                vectors[i : i + batch_size], namespace, _check_type, **kwargs
+            )
+            # When batch_size is provided, async_req cannot be True (checked above),
+            # so batch_result is always UpsertResponse, not ApplyResult
+            assert isinstance(
+                batch_result, UpsertResponse
+            ), "batch_result must be UpsertResponse when batch_size is provided"
+            pbar.update(batch_result.upserted_count)
+            # we can't rely on pbar.n here because it is not updated when show_progress=False
+            total_upserted += batch_result.upserted_count
+
+        # _response_info may be attached if LSN headers were present in the last batch
+        # Create dataclass UpsertResponse from the last batch result
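+        # Note: only the final batch's _response_info is inspected below; LSN
+        # metadata from earlier batches is not aggregated into the response.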
+ from pinecone.utils.response_info import extract_response_info + + response_info = None + if batch_result and hasattr(batch_result, "_response_info"): + response_info = batch_result._response_info + if response_info is None: + response_info = extract_response_info({}) + + return UpsertResponse(upserted_count=total_upserted, _response_info=response_info) + + def _upsert_batch( + self, + vectors: Union[ + List[Vector], List[VectorTuple], List[VectorTupleWithMetadata], List[VectorTypedDict] + ], + namespace: Optional[str], + _check_type: bool, + **kwargs, + ) -> Union[UpsertResponse, ApplyResult]: + # Convert OpenAPI UpsertResponse to dataclass UpsertResponse + result = self._vector_api.upsert_vectors( + IndexRequestFactory.upsert_request(vectors, namespace, _check_type, **kwargs), + **self._openapi_kwargs(kwargs), + ) + + # If async_req=True, result is an ApplyResult[OpenAPIUpsertResponse] + # We need to wrap it in a transformer that converts to our dataclass + if kwargs.get("async_req", False): + # Return ApplyResult - it will be unwrapped by the caller + # The ApplyResult contains OpenAPIUpsertResponse which will be converted when .get() is called + return result # type: ignore[return-value] # ApplyResult is not tracked through OpenAPI layers + + from pinecone.utils.response_info import extract_response_info + + response_info = None + if hasattr(result, "_response_info"): + response_info = result._response_info + if response_info is None: + response_info = extract_response_info({}) + + return UpsertResponse(upserted_count=result.upserted_count, _response_info=response_info) + + @staticmethod + def _iter_dataframe(df, batch_size): + for i in range(0, len(df), batch_size): + batch = df.iloc[i : i + batch_size].to_dict(orient="records") + yield batch + + @validate_and_convert_errors + def upsert_from_dataframe( + self, df, namespace: Optional[str] = None, batch_size: int = 500, show_progress: bool = True + ) -> UpsertResponse: + """Upsert vectors from a pandas DataFrame. + + Args: + df: A pandas DataFrame with vector data. + namespace: The namespace to write to. If not specified, the default namespace + is used. [optional] + batch_size: The number of rows to upsert in each batch. Default is 500. + show_progress: Whether to show a progress bar. Default is True. + + Returns: + UpsertResponse containing the number of vectors upserted. + + Raises: + RuntimeError: If pandas is not installed. + ValueError: If df is not a pandas DataFrame. + """ + try: + import pandas as pd + except ImportError: + raise RuntimeError( + "The `pandas` package is not installed. Please install pandas to use `upsert_from_dataframe()`" + ) + + if not isinstance(df, pd.DataFrame): + raise ValueError(f"Only pandas dataframes are supported. 
Found: {type(df)}") + + pbar = tqdm(total=len(df), disable=not show_progress, desc="sending upsert requests") + results = [] + for chunk in self._iter_dataframe(df, batch_size=batch_size): + res = self.upsert(vectors=chunk, namespace=namespace) + pbar.update(len(chunk)) + results.append(res) + + upserted_count = 0 + last_result = None + for res in results: + upserted_count += res.upserted_count + last_result = res + + # Create aggregated response with metadata from final batch + from pinecone.utils.response_info import extract_response_info + + response_info = None + if last_result and hasattr(last_result, "_response_info"): + response_info = last_result._response_info + if response_info is None: + response_info = extract_response_info({}) + + return UpsertResponse(upserted_count=upserted_count, _response_info=response_info) + + @validate_and_convert_errors + def delete( + self, + ids: Optional[List[str]] = None, + delete_all: Optional[bool] = None, + namespace: Optional[str] = None, + filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, + **kwargs, + ) -> Dict[str, Any]: + """Delete vectors from the index. + + The Delete operation deletes vectors from the index, from a single namespace. + No error is raised if the vector id does not exist. + + Args: + ids: Vector ids to delete. [optional] + delete_all: If True, all vectors in the index namespace will be deleted. + Default is False. [optional] + namespace: The namespace to delete vectors from. If not specified, the default + namespace is used. [optional] + filter: Metadata filter expression to select vectors to delete. This is mutually + exclusive with specifying ids or using delete_all=True. [optional] + **kwargs: Additional keyword arguments. + + Returns: + Dict containing the deletion response. + + Examples: + >>> index.vector.delete(ids=['id1', 'id2'], namespace='my_namespace') + >>> index.vector.delete(delete_all=True, namespace='my_namespace') + >>> index.vector.delete(filter={'key': 'value'}, namespace='my_namespace') + """ + return self._vector_api.delete_vectors( + IndexRequestFactory.delete_request( + ids=ids, delete_all=delete_all, namespace=namespace, filter=filter, **kwargs + ), + **self._openapi_kwargs(kwargs), + ) + + @validate_and_convert_errors + def fetch(self, ids: List[str], namespace: Optional[str] = None, **kwargs) -> FetchResponse: + """Fetch vectors by ID. + + The fetch operation looks up and returns vectors, by ID, from a single namespace. + The returned vectors include the vector data and/or metadata. + + Args: + ids: The vector IDs to fetch. + namespace: The namespace to fetch vectors from. If not specified, the default + namespace is used. [optional] + **kwargs: Additional keyword arguments. + + Returns: + FetchResponse object containing the fetched vectors and namespace name. 
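+            Vectors are returned as Vector dataclass objects keyed by vector id.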
+ + Examples: + >>> index.vector.fetch(ids=['id1', 'id2'], namespace='my_namespace') + >>> index.vector.fetch(ids=['id1', 'id2']) + """ + args_dict = parse_non_empty_args([("namespace", namespace)]) + result = self._vector_api.fetch_vectors(ids=ids, **args_dict, **kwargs) + # Copy response info from OpenAPI response if present + from pinecone.utils.response_info import extract_response_info + + response_info = None + if hasattr(result, "_response_info"): + response_info = result._response_info + if response_info is None: + response_info = extract_response_info({}) + + fetch_response = FetchResponse( + namespace=result.namespace, + vectors={k: Vector.from_dict(v) for k, v in result.vectors.items()}, + usage=result.usage, + _response_info=response_info, + ) + return fetch_response + + @validate_and_convert_errors + def fetch_by_metadata( + self, + filter: FilterTypedDict, + namespace: Optional[str] = None, + limit: Optional[int] = None, + pagination_token: Optional[str] = None, + **kwargs, + ) -> FetchByMetadataResponse: + """Fetch vectors by metadata filter. + + Look up and return vectors by metadata filter from a single namespace. + The returned vectors include the vector data and/or metadata. + + Args: + filter: Metadata filter expression to select vectors. + See `metadata filtering _` + namespace: The namespace to fetch vectors from. If not specified, the default + namespace is used. [optional] + limit: Max number of vectors to return. Defaults to 100. [optional] + pagination_token: Pagination token to continue a previous listing operation. + [optional] + **kwargs: Additional keyword arguments. + + Returns: + FetchByMetadataResponse: Object containing the fetched vectors, namespace, + usage, and pagination token. + + Examples: + >>> index.vector.fetch_by_metadata( + ... filter={'genre': {'$in': ['comedy', 'drama']}, 'year': {'$eq': 2019}}, + ... namespace='my_namespace', + ... limit=50 + ... ) + >>> index.vector.fetch_by_metadata( + ... filter={'status': 'active'}, + ... pagination_token='token123' + ... ) + """ + request = IndexRequestFactory.fetch_by_metadata_request( + filter=filter, + namespace=namespace, + limit=limit, + pagination_token=pagination_token, + **kwargs, + ) + result = self._vector_api.fetch_vectors_by_metadata(request, **self._openapi_kwargs(kwargs)) + + pagination = None + if result.pagination and result.pagination.next: + pagination = Pagination(next=result.pagination.next) + + # Copy response info from OpenAPI response if present + from pinecone.utils.response_info import extract_response_info + + response_info = None + if hasattr(result, "_response_info"): + response_info = result._response_info + if response_info is None: + response_info = extract_response_info({}) + + fetch_by_metadata_response = FetchByMetadataResponse( + namespace=result.namespace or "", + vectors={k: Vector.from_dict(v) for k, v in result.vectors.items()}, + usage=result.usage, + pagination=pagination, + _response_info=response_info, + ) + return fetch_by_metadata_response + + @validate_and_convert_errors + def query( + self, + *args, + top_k: int, + vector: Optional[List[float]] = None, + id: Optional[str] = None, + namespace: Optional[str] = None, + filter: Optional[FilterTypedDict] = None, + include_values: Optional[bool] = None, + include_metadata: Optional[bool] = None, + sparse_vector: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + **kwargs, + ) -> Union[QueryResponse, ApplyResult]: + """Query the index. + + The Query operation searches a namespace, using a query vector. 
It retrieves the
+        ids of the most similar items in a namespace, along with their similarity scores.
+
+        Args:
+            top_k: The number of results to return for each query. Must be an integer
+                greater than or equal to 1.
+            vector: The query vector. This should be the same length as the dimension of
+                the index being queried. Each query request can contain only one of the
+                parameters id or vector. [optional]
+            id: The unique ID of the vector to be used as a query vector. Each query request
+                can contain only one of the parameters vector or id. [optional]
+            namespace: The namespace to query. If not specified, the default namespace is
+                used. [optional]
+            filter: The filter to apply. You can use vector metadata to limit your search.
+                See `metadata filtering _`
+                [optional]
+            include_values: Indicates whether vector values are included in the response.
+                If omitted the server will use the default value of False. [optional]
+            include_metadata: Indicates whether metadata is included in the response as well
+                as the ids. If omitted the server will use the default value of False.
+                [optional]
+            sparse_vector: Sparse values of the query vector. Expected to be either a
+                SparseValues object or a dict of the form {'indices': List[int],
+                'values': List[float]}, where the lists each have the same length.
+                [optional]
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            QueryResponse object which contains the list of the closest vectors as
+            ScoredVector objects, and namespace name, or ApplyResult if async_req=True.
+
+        Examples:
+            >>> index.vector.query(vector=[1, 2, 3], top_k=10, namespace='my_namespace')
+            >>> index.vector.query(id='id1', top_k=10, namespace='my_namespace')
+            >>> index.vector.query(vector=[1, 2, 3], top_k=10, namespace='my_namespace',
+            ...                    filter={'key': 'value'})
+        """
+        response = self._query(
+            *args,
+            top_k=top_k,
+            vector=vector,
+            id=id,
+            namespace=namespace,
+            filter=filter,
+            include_values=include_values,
+            include_metadata=include_metadata,
+            sparse_vector=sparse_vector,
+            **kwargs,
+        )
+
+        if kwargs.get("async_req", False) or kwargs.get("async_threadpool_executor", False):
+            # For async requests, the OpenAPI client wraps the response in ApplyResult
+            # The response is already an ApplyResult[OpenAPIQueryResponse]
+            return response  # type: ignore[return-value] # ApplyResult is not tracked through OpenAPI layers
+        else:
+            return parse_query_response(response)
+
+    def _query(
+        self,
+        *args,
+        top_k: int,
+        vector: Optional[List[float]] = None,
+        id: Optional[str] = None,
+        namespace: Optional[str] = None,
+        filter: Optional[FilterTypedDict] = None,
+        include_values: Optional[bool] = None,
+        include_metadata: Optional[bool] = None,
+        sparse_vector: Optional[Union[SparseValues, SparseVectorTypedDict]] = None,
+        **kwargs,
+    ) -> OpenAPIQueryResponse:
+        if len(args) > 0:
+            raise ValueError(
+                "The argument order for `query()` has changed; please use keyword arguments instead of positional arguments. 
Example: index.query(vector=[0.1, 0.2, 0.3], top_k=10, namespace='my_namespace')" + ) + + if top_k < 1: + raise ValueError("top_k must be a positive integer") + + request = IndexRequestFactory.query_request( + top_k=top_k, + vector=vector, + id=id, + namespace=namespace, + filter=filter, + include_values=include_values, + include_metadata=include_metadata, + sparse_vector=sparse_vector, + **kwargs, + ) + return self._vector_api.query_vectors(request, **self._openapi_kwargs(kwargs)) + + @validate_and_convert_errors + def query_namespaces( + self, + vector: Optional[List[float]], + namespaces: List[str], + metric: Literal["cosine", "euclidean", "dotproduct"], + top_k: Optional[int] = None, + filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, + include_values: Optional[bool] = None, + include_metadata: Optional[bool] = None, + sparse_vector: Optional[ + Union[SparseValues, Dict[str, Union[List[float], List[int]]]] + ] = None, + **kwargs, + ) -> QueryNamespacesResults: + """Query across multiple namespaces. + + Performs a query operation across multiple namespaces and aggregates the results. + + Args: + vector: The query vector. [optional] + namespaces: List of namespace names to query. + metric: The similarity metric to use for aggregation. Must be one of "cosine", + "euclidean", or "dotproduct". + top_k: The number of results to return. If not specified, defaults to 10. + [optional] + filter: The filter to apply. You can use vector metadata to limit your search. + [optional] + include_values: Indicates whether vector values are included in the response. + [optional] + include_metadata: Indicates whether metadata is included in the response. + [optional] + sparse_vector: Sparse values of the query vector. [optional] + **kwargs: Additional keyword arguments. + + Returns: + QueryNamespacesResults containing aggregated results from all namespaces. + + Raises: + ValueError: If no namespaces are specified or if vector is empty. + + Examples: + >>> index.vector.query_namespaces( + ... vector=[1, 2, 3], + ... namespaces=['ns1', 'ns2'], + ... metric='cosine', + ... top_k=10 + ... ) + """ + if namespaces is None or len(namespaces) == 0: + raise ValueError("At least one namespace must be specified") + if sparse_vector is None and vector is not None and len(vector) == 0: + # If querying with a vector, it must not be empty + raise ValueError("Query vector must not be empty") + + overall_topk = top_k if top_k is not None else 10 + aggregator = QueryResultsAggregator(top_k=overall_topk, metric=metric) + + target_namespaces = set(namespaces) # dedup namespaces + async_futures = [ + self.query( + vector=vector, + namespace=ns, + top_k=overall_topk, + filter=filter, + include_values=include_values, + include_metadata=include_metadata, + sparse_vector=sparse_vector, + async_threadpool_executor=True, + _preload_content=False, + **kwargs, + ) + for ns in target_namespaces + ] + + for result in as_completed(async_futures): + raw_result = result.result() + response = json.loads(raw_result.data.decode("utf-8")) + aggregator.add_results(response) + + final_results = aggregator.get_results() + return final_results + + @validate_and_convert_errors + def update( + self, + id: str, + values: Optional[List[float]] = None, + set_metadata: Optional[VectorMetadataTypedDict] = None, + namespace: Optional[str] = None, + sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, + **kwargs, + ) -> UpdateResponse: + """Update a vector in the index. 
+
+        The Update operation updates a vector in a namespace. If a value is included, it
+        will overwrite the previous value. If a set_metadata is included, the values of
+        the fields specified in it will be added or overwrite the previous value.
+
+        Args:
+            id: Vector's unique id.
+            values: Vector values to set. [optional]
+            set_metadata: Metadata to set for vector. [optional]
+            namespace: Namespace name where to update the vector. If not specified, the
+                default namespace is used. [optional]
+            sparse_values: Sparse values to update for the vector. Expected to be either
+                a SparseValues object or a dict of the form {'indices': List[int],
+                'values': List[float]} where the lists each have the same length.
+                [optional]
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            UpdateResponse (contains no data).
+
+        Examples:
+            >>> index.vector.update(id='id1', values=[1, 2, 3], namespace='my_namespace')
+            >>> index.vector.update(id='id1', set_metadata={'key': 'value'},
+            ...                     namespace='my_namespace')
+        """
+        result = self._vector_api.update_vector(
+            IndexRequestFactory.update_request(
+                id=id,
+                values=values,
+                set_metadata=set_metadata,
+                namespace=namespace,
+                sparse_values=sparse_values,
+                **kwargs,
+            ),
+            **self._openapi_kwargs(kwargs),
+        )
+        # Extract response info from result if it's an OpenAPI model with _response_info
+        response_info = None
+        if hasattr(result, "_response_info"):
+            response_info = result._response_info
+        else:
+            # If result is a dict or empty, create default response_info
+            from pinecone.utils.response_info import extract_response_info
+
+            response_info = extract_response_info({})
+
+        return UpdateResponse(_response_info=response_info)
+
+    @validate_and_convert_errors
+    def describe_index_stats(
+        self, filter: Optional[FilterTypedDict] = None, **kwargs
+    ) -> DescribeIndexStatsResponse:
+        """Describe index statistics.
+
+        The DescribeIndexStats operation returns statistics about the index's contents.
+        For example: the vector count per namespace and the number of dimensions.
+
+        Args:
+            filter: If this parameter is present, the operation only returns statistics
+                for vectors that satisfy the filter. See `metadata filtering
+                _` [optional]
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            DescribeIndexStatsResponse object which contains stats about the index.
+
+        Examples:
+            >>> index.vector.describe_index_stats()
+            >>> index.vector.describe_index_stats(filter={'key': 'value'})
+        """
+        return self._vector_api.describe_index_stats(
+            IndexRequestFactory.describe_index_stats_request(filter, **kwargs),
+            **self._openapi_kwargs(kwargs),
+        )
+
+    @validate_and_convert_errors
+    def list_paginated(
+        self,
+        prefix: Optional[str] = None,
+        limit: Optional[int] = None,
+        pagination_token: Optional[str] = None,
+        namespace: Optional[str] = None,
+        **kwargs,
+    ) -> ListResponse:
+        """List vectors with pagination.
+
+        The list_paginated operation finds vectors based on an id prefix within a single
+        namespace. It returns matching ids in a paginated form, with a pagination token to
+        fetch the next page of results.
+
+        Args:
+            prefix: The id prefix to match. If unspecified, an empty string prefix will
+                be used with the effect of listing all ids in a namespace. [optional]
+            limit: The maximum number of ids to return. If unspecified, the server will
+                use a default value. [optional]
+            pagination_token: A token needed to fetch the next page of results. This token
+                is returned in the response if additional results are available. 
[optional] + namespace: The namespace to list vectors from. If not specified, the default + namespace is used. [optional] + **kwargs: Additional keyword arguments. + + Returns: + ListResponse object which contains the list of ids, the namespace name, + pagination information, and usage showing the number of read_units consumed. + + Examples: + >>> results = index.vector.list_paginated(prefix='99', limit=5, + ... namespace='my_namespace') + >>> results.pagination.next + 'eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9' + """ + args_dict = IndexRequestFactory.list_paginated_args( + prefix=prefix, + limit=limit, + pagination_token=pagination_token, + namespace=namespace, + **kwargs, + ) + return self._vector_api.list_vectors(**args_dict, **kwargs) + + @validate_and_convert_errors + def list(self, **kwargs): + """List vectors. + + The list operation accepts all of the same arguments as list_paginated, and returns + a generator that yields a list of the matching vector ids in each page of results. + It automatically handles pagination tokens on your behalf. + + Args: + **kwargs: Same arguments as list_paginated (prefix, limit, pagination_token, + namespace). + + Yields: + List of vector ids for each page of results. + + Examples: + >>> for ids in index.vector.list(prefix='99', limit=5, + ... namespace='my_namespace'): + ... print(ids) + ['99', '990', '991', '992', '993'] + ['994', '995', '996', '997', '998'] + """ + done = False + while not done: + results = self.list_paginated(**kwargs) + if len(results.vectors) > 0: + yield [v.id for v in results.vectors] + + if results.pagination: + kwargs.update({"pagination_token": results.pagination.next}) + else: + done = True diff --git a/pinecone/grpc/index_grpc.py b/pinecone/grpc/index_grpc.py index bae88b7a4..b75f97834 100644 --- a/pinecone/grpc/index_grpc.py +++ b/pinecone/grpc/index_grpc.py @@ -681,7 +681,7 @@ def query_namespaces( def update( self, - id: str, + id: Optional[str] = None, async_req: bool = False, values: Optional[List[float]] = None, set_metadata: Optional[VectorMetadataTypedDict] = None, @@ -691,43 +691,76 @@ def update( **kwargs, ) -> Union[UpdateResponse, PineconeGrpcFuture]: """ - The Update operation updates vector in a namespace. - If a value is included, it will overwrite the previous value. - If a set_metadata is included, - the values of the fields specified in it will be added or overwrite the previous value. + The Update operation updates vectors in a namespace. + + This method supports two update modes: + + 1. **Single vector update by ID**: Provide `id` to update a specific vector. + - Updates the vector with the given ID + - If `values` is included, it will overwrite the previous vector values + - If `set_metadata` is included, the values of the fields specified will be added or overwrite the previous metadata + + 2. **Bulk update by metadata filter**: Provide `filter` to update all vectors matching the filter criteria. + - Updates all vectors in the namespace that match the filter expression + - Useful for updating metadata across multiple vectors at once + - The response includes `matched_records` indicating how many vectors were updated + + Either `id` or `filter` must be provided (but not both in the same call). Examples: + **Single vector update by ID:** + .. 
code-block:: python + >>> # Update vector values >>> index.update(id='id1', values=[1, 2, 3], namespace='my_namespace') + >>> # Update vector metadata >>> index.update(id='id1', set_metadata={'key': 'value'}, namespace='my_namespace', async_req=True) + >>> # Update vector values and sparse values >>> index.update(id='id1', values=[1, 2, 3], sparse_values={'indices': [1, 2], 'values': [0.2, 0.4]}, >>> namespace='my_namespace') >>> index.update(id='id1', values=[1, 2, 3], sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]), >>> namespace='my_namespace') - >>> index.update(id='id1', set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, - >>> namespace='my_namespace') + + **Bulk update by metadata filter:** + + .. code-block:: python + + >>> # Update metadata for all vectors matching the filter + >>> response = index.update(set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, + >>> namespace='my_namespace') + >>> print(f"Updated {response.matched_records} vectors") Args: - id (str): Vector's unique id. + id (str): Vector's unique id. Required for single vector updates. Must not be provided when using filter. [optional] async_req (bool): If True, the update operation will be performed asynchronously. Defaults to False. [optional] - values (List[float]): vector values to set. [optional] + values (List[float]): Vector values to set. [optional] set_metadata (Dict[str, Union[str, float, int, bool, List[int], List[float], List[str]]]]): - metadata to set for vector. [optional] - namespace (str): Namespace name where to update the vector.. [optional] - sparse_values: (Dict[str, Union[List[float], List[int]]]): sparse values to update for the vector. + Metadata to set for the vector(s). [optional] + namespace (str): Namespace name where to update the vector(s). [optional] + sparse_values: (Dict[str, Union[List[float], List[int]]]): Sparse values to update for the vector. Expected to be either a GRPCSparseValues object or a dict of the form: - {'indices': List[int], 'values': List[float]} where the lists each have the same length. + {'indices': List[int], 'values': List[float]} where the lists each have the same length. [optional] filter (Dict[str, Union[str, float, int, bool, List, dict]]): A metadata filter expression. - When updating metadata across records in a namespace, the update is applied to all records - that match the filter. See `metadata filtering _`. - [optional] + When provided, updates all vectors in the namespace that match the filter criteria. + See `metadata filtering _`. + Must not be provided when using id. Either `id` or `filter` must be provided. [optional] - - Returns: UpdateResponse (contains no data) or a PineconeGrpcFuture object if async_req is True. + Returns: + UpdateResponse or PineconeGrpcFuture: When using filter-based updates, the UpdateResponse includes + `matched_records` indicating the number of vectors that were updated. If `async_req=True`, returns + a PineconeGrpcFuture object instead. """ + # Validate that exactly one of id or filter is provided + if id is None and filter is None: + raise ValueError("Either 'id' or 'filter' must be provided to update vectors.") + if id is not None and filter is not None: + raise ValueError( + "Cannot provide both 'id' and 'filter' in the same update call. Use 'id' for single vector updates or 'filter' for bulk updates." 
+ ) + if set_metadata is not None: set_metadata_struct = dict_to_proto_struct(set_metadata) else: @@ -742,6 +775,7 @@ def update( sparse_values = SparseValuesFactory.build(sparse_values) args_dict = self._parse_non_empty_args( [ + ("id", id), ("values", values), ("set_metadata", set_metadata_struct), ("namespace", namespace), @@ -750,7 +784,7 @@ def update( ] ) - request = UpdateRequest(id=id, **args_dict) + request = UpdateRequest(**args_dict) if async_req: future_result = self.runner.run(self.stub.Update.future, request, timeout=timeout) # For .future calls, runner returns (future, None, None) since .future doesn't support with_call diff --git a/pinecone/grpc/resources/vector_grpc.py b/pinecone/grpc/resources/vector_grpc.py new file mode 100644 index 000000000..ab14a3aed --- /dev/null +++ b/pinecone/grpc/resources/vector_grpc.py @@ -0,0 +1,858 @@ +import logging +from typing import Optional, Dict, Union, List, Tuple, Any, Iterable, cast, Literal + +from google.protobuf import json_format + +from pinecone.utils.tqdm import tqdm +from concurrent.futures import as_completed, Future + +from ..utils import ( + dict_to_proto_struct, + parse_fetch_response, + parse_fetch_by_metadata_response, + parse_query_response, + parse_stats_response, + parse_upsert_response, + parse_update_response, + parse_delete_response, +) +from ..vector_factory_grpc import VectorFactoryGRPC +from ..sparse_values_factory import SparseValuesFactory + +from pinecone.core.openapi.db_data.models import ( + FetchResponse, + QueryResponse, + IndexDescription as DescribeIndexStatsResponse, +) +from pinecone.db_data.dataclasses import FetchByMetadataResponse, UpdateResponse, UpsertResponse +from pinecone.db_control.models.list_response import ListResponse as SimpleListResponse, Pagination +from pinecone.core.grpc.protos.db_data_2025_10_pb2 import ( + Vector as GRPCVector, + UpsertRequest, + DeleteRequest, + QueryRequest, + FetchRequest, + FetchByMetadataRequest, + UpdateRequest, + ListRequest, + DescribeIndexStatsRequest, + DeleteResponse, + SparseValues as GRPCSparseValues, +) +from pinecone import Vector, SparseValues +from pinecone.db_data.query_results_aggregator import QueryNamespacesResults, QueryResultsAggregator +from ..future import PineconeGrpcFuture +from ...db_data.types import ( + SparseVectorTypedDict, + VectorTypedDict, + VectorTuple, + FilterTypedDict, + VectorMetadataTypedDict, +) +from ...utils import PluginAware + +logger = logging.getLogger(__name__) +""" :meta private: """ + + +class VectorResourceGRPC(PluginAware): + """Resource for vector operations on a Pinecone index (GRPC).""" + + def __init__(self, stub, runner, threadpool_executor): + self._stub = stub + """ :meta private: """ + self._runner = runner + """ :meta private: """ + self._threadpool_executor = threadpool_executor + """ :meta private: """ + super().__init__() + + @staticmethod + def _parse_non_empty_args(args: List[Tuple[str, Any]]) -> Dict[str, Any]: + return {arg_name: val for arg_name, val in args if val is not None} + + def upsert( + self, + vectors: Union[List[Vector], List[GRPCVector], List[VectorTuple], List[VectorTypedDict]], + async_req: bool = False, + namespace: Optional[str] = None, + batch_size: Optional[int] = None, + show_progress: bool = True, + **kwargs, + ) -> Union[UpsertResponse, PineconeGrpcFuture]: + """Upsert vectors into the index. + + The upsert operation writes vectors into a namespace. If a new value is upserted + for an existing vector id, it will overwrite the previous value. 
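+
+        When async_req=True, the call returns a PineconeGrpcFuture immediately; call
+        .result() on the future to block until the UpsertResponse is available.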
+
+        Args:
+            vectors: A list of vectors to upsert. Each vector can be a GRPCVector object,
+                tuple, or dictionary.
+            async_req: If True, the upsert operation will be performed asynchronously.
+                Cannot be used with batch_size. Defaults to False.
+            namespace: The namespace to write to. If not specified, the default namespace
+                is used. [optional]
+            batch_size: The number of vectors to upsert in each batch. Cannot be used
+                with async_req=True. If not specified, all vectors will be upserted in
+                a single batch. [optional]
+            show_progress: Whether to show a progress bar using tqdm. Applied only if
+                batch_size is provided. Default is True.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            UpsertResponse containing the number of vectors upserted, or
+            PineconeGrpcFuture if async_req=True.
+
+        Examples:
+            >>> index.vector.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}),
+            ...                      ('id2', [1.0, 2.0, 3.0])],
+            ...                     namespace='ns1', async_req=True)
+        """
+        if async_req and batch_size is not None:
+            raise ValueError(
+                "async_req is not supported when batch_size is provided. "
+                "To upsert in parallel, please follow: "
+                "https://docs.pinecone.io/docs/performance-tuning"
+            )
+
+        timeout = kwargs.pop("timeout", None)
+
+        vectors = list(map(VectorFactoryGRPC.build, vectors))
+        if async_req:
+            args_dict = self._parse_non_empty_args([("namespace", namespace)])
+            request = UpsertRequest(vectors=vectors, **args_dict, **kwargs)
+            future_result = self._runner.run(self._stub.Upsert.future, request, timeout=timeout)
+            # For .future calls, runner returns (future, None, None) since .future doesn't support with_call
+            # The future itself will provide metadata when it completes
+            future = future_result[0] if isinstance(future_result, tuple) else future_result
+            return PineconeGrpcFuture(
+                future, timeout=timeout, result_transformer=parse_upsert_response
+            )
+
+        if batch_size is None:
+            return self._upsert_batch(vectors, namespace, timeout=timeout, **kwargs)
+
+        if not isinstance(batch_size, int) or batch_size <= 0:
+            raise ValueError("batch_size must be a positive integer")
+
+        pbar = tqdm(total=len(vectors), disable=not show_progress, desc="Upserted vectors")
+        total_upserted = 0
+        last_batch_result = None
+        for i in range(0, len(vectors), batch_size):
+            batch_result = self._upsert_batch(
+                vectors[i : i + batch_size], namespace, timeout=timeout, **kwargs
+            )
+            pbar.update(batch_result.upserted_count)
+            # we can't rely on pbar.n here because it is not updated when show_progress=False
+            total_upserted += batch_result.upserted_count
+            last_batch_result = batch_result
+
+        # Create aggregated response with metadata from final batch
+        from pinecone.db_data.dataclasses import UpsertResponse
+
+        response_info = None
+        if last_batch_result and hasattr(last_batch_result, "_response_info"):
+            response_info = last_batch_result._response_info
+        else:
+            from pinecone.utils.response_info import extract_response_info
+
+            response_info = extract_response_info({})
+
+        return UpsertResponse(upserted_count=total_upserted, _response_info=response_info)
+
+    def _upsert_batch(
+        self, vectors: List[GRPCVector], namespace: Optional[str], timeout: Optional[int], **kwargs
+    ) -> UpsertResponse:
+        args_dict = self._parse_non_empty_args([("namespace", namespace)])
+        request = UpsertRequest(vectors=vectors, **args_dict)
+        response, initial_metadata = self._runner.run(
+            self._stub.Upsert, request, timeout=timeout, **kwargs
+        )
+        return parse_upsert_response(response, initial_metadata=initial_metadata)
+
+    def upsert_from_dataframe(
+        self,
+        df,
+        
namespace: str = "", + batch_size: int = 500, + use_async_requests: bool = True, + show_progress: bool = True, + ) -> UpsertResponse: + """Upsert vectors from a pandas DataFrame. + + Args: + df: A pandas DataFrame with vector data. + namespace: The namespace to upsert into. + batch_size: The number of rows to upsert in a single batch. + use_async_requests: Whether to upsert multiple requests at the same time + using asynchronous request mechanism. + show_progress: Whether to show a progress bar. + + Returns: + UpsertResponse containing the number of vectors upserted. + + Raises: + RuntimeError: If pandas is not installed. + ValueError: If df is not a pandas DataFrame. + """ + try: + import pandas as pd + except ImportError: + raise RuntimeError( + "The `pandas` package is not installed. Please install pandas to use `upsert_from_dataframe()`" + ) + + if not isinstance(df, pd.DataFrame): + raise ValueError(f"Only pandas dataframes are supported. Found: {type(df)}") + + pbar = tqdm(total=len(df), disable=not show_progress, desc="sending upsert requests") + results = [] + for chunk in self._iter_dataframe(df, batch_size=batch_size): + res = self.upsert(vectors=chunk, namespace=namespace, async_req=use_async_requests) + pbar.update(len(chunk)) + results.append(res) + + if use_async_requests: + cast_results = cast(List[PineconeGrpcFuture], results) + results = [ + async_result.result() + for async_result in tqdm( + iterable=cast_results, + disable=not show_progress, + desc="collecting async responses", + ) + ] + + upserted_count = 0 + last_result = None + for res in results: + if hasattr(res, "upserted_count") and isinstance(res.upserted_count, int): + upserted_count += res.upserted_count + last_result = res + + response_info = None + if last_result and hasattr(last_result, "_response_info"): + response_info = last_result._response_info + else: + from pinecone.utils.response_info import extract_response_info + + response_info = extract_response_info({}) + + return UpsertResponse(upserted_count=upserted_count, _response_info=response_info) + + @staticmethod + def _iter_dataframe(df, batch_size): + for i in range(0, len(df), batch_size): + batch = df.iloc[i : i + batch_size].to_dict(orient="records") + yield batch + + def delete( + self, + ids: Optional[List[str]] = None, + delete_all: Optional[bool] = None, + namespace: Optional[str] = None, + filter: Optional[FilterTypedDict] = None, + async_req: bool = False, + **kwargs, + ) -> Union[DeleteResponse, PineconeGrpcFuture]: + """Delete vectors from the index. + + The Delete operation deletes vectors from the index, from a single namespace. + No error is raised if the vector id does not exist. + + Args: + ids: Vector ids to delete. [optional] + delete_all: If True, all vectors in the index namespace will be deleted. + Default is False. [optional] + namespace: The namespace to delete vectors from. If not specified, the default + namespace is used. [optional] + filter: Metadata filter expression to select vectors to delete. This is mutually + exclusive with specifying ids or using delete_all=True. [optional] + async_req: If True, the delete operation will be performed asynchronously. + Defaults to False. [optional] + **kwargs: Additional keyword arguments. + + Returns: + DeleteResponse (contains no data) or a PineconeGrpcFuture object if + async_req is True. 
+ + Examples: + >>> index.vector.delete(ids=['id1', 'id2'], namespace='my_namespace') + >>> index.vector.delete(delete_all=True, namespace='my_namespace') + >>> index.vector.delete(filter={'key': 'value'}, namespace='my_namespace', async_req=True) + """ + if filter is not None: + filter_struct = dict_to_proto_struct(filter) + else: + filter_struct = None + + args_dict = self._parse_non_empty_args( + [ + ("ids", ids), + ("delete_all", delete_all), + ("namespace", namespace), + ("filter", filter_struct), + ] + ) + timeout = kwargs.pop("timeout", None) + + request = DeleteRequest(**args_dict, **kwargs) + if async_req: + future_result = self._runner.run(self._stub.Delete.future, request, timeout=timeout) + # For .future calls, runner returns (future, None, None) since .future doesn't support with_call + future = future_result[0] if isinstance(future_result, tuple) else future_result + return PineconeGrpcFuture( + future, timeout=timeout, result_transformer=parse_delete_response + ) + else: + response, initial_metadata = self._runner.run( + self._stub.Delete, request, timeout=timeout + ) + return parse_delete_response(response, initial_metadata=initial_metadata) + + def fetch( + self, + ids: Optional[List[str]], + namespace: Optional[str] = None, + async_req: Optional[bool] = False, + **kwargs, + ) -> Union[FetchResponse, PineconeGrpcFuture]: + """Fetch vectors by ID. + + The fetch operation looks up and returns vectors, by ID, from a single namespace. + The returned vectors include the vector data and/or metadata. + + Args: + ids: The vector IDs to fetch. + namespace: The namespace to fetch vectors from. If not specified, the default + namespace is used. [optional] + async_req: If True, the fetch operation will be performed asynchronously. + Defaults to False. [optional] + **kwargs: Additional keyword arguments. + + Returns: + FetchResponse object which contains the list of Vector objects, and namespace name. + + Examples: + >>> index.vector.fetch(ids=['id1', 'id2'], namespace='my_namespace') + >>> index.vector.fetch(ids=['id1', 'id2']) + """ + timeout = kwargs.pop("timeout", None) + + args_dict = self._parse_non_empty_args([("namespace", namespace)]) + + request = FetchRequest(ids=ids, **args_dict, **kwargs) + + if async_req: + future_result = self._runner.run(self._stub.Fetch.future, request, timeout=timeout) + # For .future calls, runner returns (future, None, None) since .future doesn't support with_call + future = future_result[0] if isinstance(future_result, tuple) else future_result + return PineconeGrpcFuture( + future, result_transformer=parse_fetch_response, timeout=timeout + ) + else: + response, initial_metadata = self._runner.run( + self._stub.Fetch, request, timeout=timeout + ) + return parse_fetch_response(response, initial_metadata=initial_metadata) + + def fetch_by_metadata( + self, + filter: FilterTypedDict, + namespace: Optional[str] = None, + limit: Optional[int] = None, + pagination_token: Optional[str] = None, + async_req: Optional[bool] = False, + **kwargs, + ) -> Union[FetchByMetadataResponse, PineconeGrpcFuture]: + """Fetch vectors by metadata filter. + + Look up and return vectors by metadata filter from a single namespace. + The returned vectors include the vector data and/or metadata. + + Args: + filter: Metadata filter expression to select vectors. + See `metadata filtering _` + namespace: The namespace to fetch vectors from. If not specified, the default + namespace is used. [optional] + limit: Max number of vectors to return. Defaults to 100. 
[optional] + pagination_token: Pagination token to continue a previous listing operation. + [optional] + async_req: If True, the fetch operation will be performed asynchronously. + Defaults to False. [optional] + **kwargs: Additional keyword arguments. + + Returns: + FetchByMetadataResponse: Object containing the fetched vectors, namespace, + usage, and pagination token. + + Examples: + >>> index.vector.fetch_by_metadata( + ... filter={'genre': {'$in': ['comedy', 'drama']}, 'year': {'$eq': 2019}}, + ... namespace='my_namespace', + ... limit=50 + ... ) + >>> index.vector.fetch_by_metadata( + ... filter={'status': 'active'}, + ... pagination_token='token123' + ... ) + """ + timeout = kwargs.pop("timeout", None) + + if filter is not None: + filter_struct = dict_to_proto_struct(filter) + else: + filter_struct = None + + args_dict = self._parse_non_empty_args( + [ + ("namespace", namespace), + ("filter", filter_struct), + ("limit", limit), + ("pagination_token", pagination_token), + ] + ) + + request = FetchByMetadataRequest(**args_dict, **kwargs) + + if async_req: + future_result = self._runner.run( + self._stub.FetchByMetadata.future, request, timeout=timeout + ) + # For .future calls, runner returns (future, None, None) since .future doesn't support with_call + future = future_result[0] if isinstance(future_result, tuple) else future_result + return PineconeGrpcFuture( + future, result_transformer=parse_fetch_by_metadata_response, timeout=timeout + ) + else: + response, initial_metadata = self._runner.run( + self._stub.FetchByMetadata, request, timeout=timeout + ) + return parse_fetch_by_metadata_response(response, initial_metadata=initial_metadata) + + def _query( + self, + vector: Optional[List[float]] = None, + id: Optional[str] = None, + namespace: Optional[str] = None, + top_k: Optional[int] = None, + filter: Optional[FilterTypedDict] = None, + include_values: Optional[bool] = None, + include_metadata: Optional[bool] = None, + sparse_vector: Optional[ + Union[SparseValues, GRPCSparseValues, SparseVectorTypedDict] + ] = None, + **kwargs, + ) -> Tuple[Dict[str, Any], Optional[Dict[str, str]]]: + """ + Low-level query method that returns raw JSON dict and initial metadata without parsing. + Used internally by query() and query_namespaces() for performance. + + Returns: + Tuple of (json_dict, initial_metadata). initial_metadata may be None. 
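
Note: since _query exists to feed query_namespaces, a short usage sketch of the public method may help. It assumes the same illustrative idx client as above; the vector, namespaces, and top_k are made-up values, and the metric should match the index.

    results = idx.vector.query_namespaces(
        vector=[0.1, 0.2, 0.3],      # length must equal the index dimension
        namespaces=["ns1", "ns2"],
        metric="cosine",
        top_k=5,
        include_metadata=True,
    )
    for match in results.matches:    # results are merged across namespaces
        print(match.id, match.score)
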
+ """ + if vector is not None and id is not None: + raise ValueError("Cannot specify both `id` and `vector`") + + if filter is not None: + filter_struct = dict_to_proto_struct(filter) + else: + filter_struct = None + + sparse_vector = SparseValuesFactory.build(sparse_vector) + args_dict = self._parse_non_empty_args( + [ + ("vector", vector), + ("id", id), + ("namespace", namespace), + ("top_k", top_k), + ("filter", filter_struct), + ("include_values", include_values), + ("include_metadata", include_metadata), + ("sparse_vector", sparse_vector), + ] + ) + + request = QueryRequest(**args_dict) + + timeout = kwargs.pop("timeout", None) + response, initial_metadata = self._runner.run(self._stub.Query, request, timeout=timeout) + return json_format.MessageToDict(response), initial_metadata + + def query( + self, + vector: Optional[List[float]] = None, + id: Optional[str] = None, + namespace: Optional[str] = None, + top_k: Optional[int] = None, + filter: Optional[FilterTypedDict] = None, + include_values: Optional[bool] = None, + include_metadata: Optional[bool] = None, + sparse_vector: Optional[ + Union[SparseValues, GRPCSparseValues, SparseVectorTypedDict] + ] = None, + async_req: Optional[bool] = False, + **kwargs, + ) -> Union[QueryResponse, PineconeGrpcFuture]: + """Query the index. + + The Query operation searches a namespace, using a query vector. It retrieves the + ids of the most similar items in a namespace, along with their similarity scores. + + Args: + vector: The query vector. This should be the same length as the dimension of + the index being queried. Each query request can contain only one of the + parameters id or vector. [optional] + id: The unique ID of the vector to be used as a query vector. Each query request + can contain only one of the parameters vector or id. [optional] + top_k: The number of results to return for each query. Must be an integer + greater than 1. + namespace: The namespace to query. If not specified, the default namespace is + used. [optional] + filter: The filter to apply. You can use vector metadata to limit your search. + See `metadata filtering _` + [optional] + include_values: Indicates whether vector values are included in the response. + If omitted the server will use the default value of False. [optional] + include_metadata: Indicates whether metadata is included in the response as well + as the ids. If omitted the server will use the default value of False. + [optional] + sparse_vector: Sparse values of the query vector. Expected to be either a + SparseValues object or a dict of the form {'indices': List[int], + 'values': List[float]}, where the lists each have the same length. + [optional] + async_req: If True, the query operation will be performed asynchronously. + Defaults to False. [optional] + **kwargs: Additional keyword arguments. + + Returns: + QueryResponse object which contains the list of the closest vectors as + ScoredVector objects, and namespace name, or PineconeGrpcFuture if + async_req=True. + + Examples: + >>> index.vector.query(vector=[1, 2, 3], top_k=10, namespace='my_namespace') + >>> index.vector.query(id='id1', top_k=10, namespace='my_namespace') + >>> index.vector.query(vector=[1, 2, 3], top_k=10, namespace='my_namespace', + ... 
filter={'key': 'value'}) + """ + timeout = kwargs.pop("timeout", None) + + if async_req: + # For async requests, we need to build the request manually + if vector is not None and id is not None: + raise ValueError("Cannot specify both `id` and `vector`") + + if filter is not None: + filter_struct = dict_to_proto_struct(filter) + else: + filter_struct = None + + sparse_vector = SparseValuesFactory.build(sparse_vector) + args_dict = self._parse_non_empty_args( + [ + ("vector", vector), + ("id", id), + ("namespace", namespace), + ("top_k", top_k), + ("filter", filter_struct), + ("include_values", include_values), + ("include_metadata", include_metadata), + ("sparse_vector", sparse_vector), + ] + ) + + request = QueryRequest(**args_dict) + future_result = self._runner.run(self._stub.Query.future, request, timeout=timeout) + # For .future calls, runner returns (future, None) since .future doesn't support with_call + future = future_result[0] if isinstance(future_result, tuple) else future_result + return PineconeGrpcFuture( + future, result_transformer=parse_query_response, timeout=timeout + ) + else: + # For sync requests, use _query to get raw dict and metadata, then parse it + json_response, initial_metadata = self._query( + vector=vector, + id=id, + namespace=namespace, + top_k=top_k, + filter=filter, + include_values=include_values, + include_metadata=include_metadata, + sparse_vector=sparse_vector, + timeout=timeout, + **kwargs, + ) + return parse_query_response( + json_response, _check_type=False, initial_metadata=initial_metadata + ) + + def query_namespaces( + self, + vector: List[float], + namespaces: List[str], + metric: Literal["cosine", "euclidean", "dotproduct"], + top_k: Optional[int] = None, + filter: Optional[FilterTypedDict] = None, + include_values: Optional[bool] = None, + include_metadata: Optional[bool] = None, + sparse_vector: Optional[Union[GRPCSparseValues, SparseVectorTypedDict]] = None, + **kwargs, + ) -> QueryNamespacesResults: + """Query across multiple namespaces. + + Performs a query operation across multiple namespaces and aggregates the results. + + Args: + vector: The query vector. + namespaces: List of namespace names to query. + metric: The similarity metric to use for aggregation. Must be one of "cosine", + "euclidean", or "dotproduct". + top_k: The number of results to return. If not specified, defaults to 10. + [optional] + filter: The filter to apply. You can use vector metadata to limit your search. + [optional] + include_values: Indicates whether vector values are included in the response. + [optional] + include_metadata: Indicates whether metadata is included in the response. + [optional] + sparse_vector: Sparse values of the query vector. [optional] + **kwargs: Additional keyword arguments. + + Returns: + QueryNamespacesResults containing aggregated results from all namespaces. + + Raises: + ValueError: If no namespaces are specified or if vector is empty. + + Examples: + >>> index.vector.query_namespaces( + ... vector=[1, 2, 3], + ... namespaces=['ns1', 'ns2'], + ... metric='cosine', + ... top_k=10 + ... 
)
+        """
+        if namespaces is None or len(namespaces) == 0:
+            raise ValueError("At least one namespace must be specified")
+        if len(vector) == 0:
+            raise ValueError("Query vector must not be empty")
+
+        overall_topk = top_k if top_k is not None else 10
+        aggregator = QueryResultsAggregator(top_k=overall_topk, metric=metric)
+
+        target_namespaces = set(namespaces)  # dedup namespaces
+        futures = [
+            self._threadpool_executor.submit(
+                self._query,
+                vector=vector,
+                namespace=ns,
+                top_k=overall_topk,
+                filter=filter,
+                include_values=include_values,
+                include_metadata=include_metadata,
+                sparse_vector=sparse_vector,
+                **kwargs,
+            )
+            for ns in target_namespaces
+        ]
+
+        only_futures = cast(Iterable[Future], futures)
+        for response in as_completed(only_futures):
+            json_response, _ = response.result()  # Ignore initial_metadata for query_namespaces
+            # Pass raw dict directly to aggregator - no parsing needed
+            aggregator.add_results(json_response)
+
+        final_results = aggregator.get_results()
+        return final_results
+
+    def update(
+        self,
+        id: str,
+        async_req: bool = False,
+        values: Optional[List[float]] = None,
+        set_metadata: Optional[VectorMetadataTypedDict] = None,
+        namespace: Optional[str] = None,
+        sparse_values: Optional[Union[GRPCSparseValues, SparseVectorTypedDict]] = None,
+        **kwargs,
+    ) -> Union[UpdateResponse, PineconeGrpcFuture]:
+        """Update a vector in the index.
+
+        The Update operation updates a vector in a namespace. If values are included, they
+        overwrite the previous vector values. If set_metadata is included, the metadata
+        fields specified in it are added to or overwrite the previous metadata.
+
+        Args:
+            id: Vector's unique id.
+            async_req: If True, the update operation will be performed asynchronously.
+                Defaults to False.
+            values: Vector values to set. [optional]
+            set_metadata: Metadata to set for the vector. [optional]
+            namespace: Name of the namespace in which to update the vector. If not specified,
+                the default namespace is used. [optional]
+            sparse_values: Sparse values to update for the vector. Expected to be either
+                a GRPCSparseValues object or a dict of the form {'indices': List[int],
+                'values': List[float]} where the lists each have the same length.
+                [optional]
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            UpdateResponse (contains no data), or PineconeGrpcFuture if async_req=True.
+
+        Examples:
+            >>> index.vector.update(id='id1', values=[1, 2, 3], namespace='my_namespace')
+            >>> index.vector.update(id='id1', set_metadata={'key': 'value'},
+            ...
namespace='my_namespace')
+        """
+        timeout = kwargs.pop("timeout", None)
+
+        sparse_values = SparseValuesFactory.build(sparse_values)
+        args_dict = self._parse_non_empty_args(
+            [
+                ("id", id),
+                ("values", values),
+                ("set_metadata", dict_to_proto_struct(set_metadata) if set_metadata else None),
+                ("namespace", namespace),
+                ("sparse_values", sparse_values),
+            ]
+        )
+
+        request = UpdateRequest(**args_dict, **kwargs)
+
+        if async_req:
+            future_result = self._runner.run(self._stub.Update.future, request, timeout=timeout)
+            # For .future calls, runner returns (future, None, None) since .future doesn't support with_call
+            future = future_result[0] if isinstance(future_result, tuple) else future_result
+            return PineconeGrpcFuture(
+                future, timeout=timeout, result_transformer=parse_update_response
+            )
+        else:
+            response, initial_metadata = self._runner.run(
+                self._stub.Update, request, timeout=timeout
+            )
+            return parse_update_response(response, initial_metadata=initial_metadata)
+
+    def list_paginated(
+        self,
+        prefix: Optional[str] = None,
+        limit: Optional[int] = None,
+        pagination_token: Optional[str] = None,
+        namespace: Optional[str] = None,
+        **kwargs,
+    ) -> SimpleListResponse:
+        """List vectors with pagination.
+
+        The list_paginated operation finds vectors based on an id prefix within a single
+        namespace. It returns matching ids in a paginated form, with a pagination token to
+        fetch the next page of results.
+
+        Args:
+            prefix: The id prefix to match. If unspecified, an empty string prefix will
+                be used with the effect of listing all ids in a namespace. [optional]
+            limit: The maximum number of ids to return. If unspecified, the server will
+                use a default value. [optional]
+            pagination_token: A token needed to fetch the next page of results. This token
+                is returned in the response if additional results are available. [optional]
+            namespace: The namespace to list vectors from. If not specified, the default
+                namespace is used. [optional]
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            SimpleListResponse object which contains the list of ids, the namespace name,
+            pagination information, and usage showing the number of read_units consumed.
+
+        Examples:
+            >>> results = index.vector.list_paginated(prefix='99', limit=5,
+            ...     namespace='my_namespace')
+            >>> results.pagination.next
+            'eyJza2lwX3Bhc3QiOiI5OTMiLCJwcmVmaXgiOiI5OSJ9'
+        """
+        args_dict = self._parse_non_empty_args(
+            [
+                ("prefix", prefix),
+                ("limit", limit),
+                ("namespace", namespace),
+                ("pagination_token", pagination_token),
+            ]
+        )
+        # Pop timeout before building the request so it is not forwarded to ListRequest
+        timeout = kwargs.pop("timeout", None)
+        request = ListRequest(**args_dict, **kwargs)
+        response, _ = self._runner.run(self._stub.List, request, timeout=timeout)
+
+        if response.pagination and response.pagination.next != "":
+            pagination = Pagination(next=response.pagination.next)
+        else:
+            pagination = None
+
+        return SimpleListResponse(
+            namespace=response.namespace, vectors=response.vectors, pagination=pagination
+        )
+
+    def list(self, **kwargs):
+        """List vectors.
+
+        The list operation accepts all of the same arguments as list_paginated, and returns
+        a generator that yields a list of the matching vector ids in each page of results.
+        It automatically handles pagination tokens on your behalf.
+
+        Args:
+            **kwargs: Same arguments as list_paginated (prefix, limit, pagination_token,
+                namespace).
+
+        Yields:
+            List of vector ids for each page of results.
+
+        Examples:
+            >>> for ids in index.vector.list(prefix='99', limit=5,
+            ...     namespace='my_namespace'):
+            ...
print(ids)
+            ['99', '990', '991', '992', '993']
+            ['994', '995', '996', '997', '998']
+        """
+        done = False
+        while not done:
+            results = self.list_paginated(**kwargs)
+
+            if len(results.vectors) > 0:
+                yield [v.id for v in results.vectors]
+
+            if results.pagination and results.pagination.next:
+                kwargs.update({"pagination_token": results.pagination.next})
+            else:
+                done = True
+
+    def describe_index_stats(
+        self, filter: Optional[FilterTypedDict] = None, **kwargs
+    ) -> DescribeIndexStatsResponse:
+        """Describe index statistics.
+
+        The DescribeIndexStats operation returns statistics about the index's contents.
+        For example: the vector count per namespace and the number of dimensions.
+
+        Args:
+            filter: If this parameter is present, the operation only returns statistics
+                for vectors that satisfy the filter. See `metadata filtering
+                _` [optional]
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            DescribeIndexStatsResponse object which contains stats about the index.
+
+        Examples:
+            >>> index.vector.describe_index_stats()
+            >>> index.vector.describe_index_stats(filter={'key': 'value'})
+        """
+        if filter is not None:
+            filter_struct = dict_to_proto_struct(filter)
+        else:
+            filter_struct = None
+        args_dict = self._parse_non_empty_args([("filter", filter_struct)])
+        timeout = kwargs.pop("timeout", None)
+
+        request = DescribeIndexStatsRequest(**args_dict)
+        response, _ = self._runner.run(self._stub.DescribeIndexStats, request, timeout=timeout)
+        json_response = json_format.MessageToDict(response)
+        return parse_stats_response(json_response)
diff --git a/tests/unit/data/test_request_factory.py b/tests/unit/data/test_request_factory.py
index a5ccda6a4..42f3d6d30 100644
--- a/tests/unit/data/test_request_factory.py
+++ b/tests/unit/data/test_request_factory.py
@@ -520,6 +520,24 @@ def test_update_request_without_filter_backward_compatibility(self):
        # Filter should not be set when not provided
        assert not hasattr(request, "filter") or request.filter is None

+    def test_update_request_with_filter_only_no_id(self):
+        """Test update_request with filter only (no id) for bulk updates."""
+        request = IndexRequestFactory.update_request(
+            filter={"genre": {"$eq": "action"}}, set_metadata={"status": "active"}
+        )
+        assert request.filter == {"genre": {"$eq": "action"}}
+        assert request.set_metadata == {"status": "active"}
+        # id should not be set when not provided
+        assert not hasattr(request, "id") or request.id is None
+
+    def test_update_request_with_id_only_no_filter(self):
+        """Test update_request with id only (no filter) - backward compatibility."""
+        request = IndexRequestFactory.update_request(id="vec1", values=[0.1, 0.2, 0.3])
+        assert request.id == "vec1"
+        assert request.values == [0.1, 0.2, 0.3]
+        # Filter should not be set when not provided
+        assert not hasattr(request, "filter") or request.filter is None
+
    def test_update_request_with_simple_equality_filter(self):
        """Test update_request with simple equality filter."""
        request = IndexRequestFactory.update_request(id="vec1", filter={"genre": "action"})
diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py
index 849a34d12..691f0c66e 100644
--- a/tests/unit/test_index.py
+++ b/tests/unit/test_index.py
@@ -515,29 +515,28 @@ def test_update_byIdAnValuesAndMetadata_updateByIdAndValuesAndMetadata(self, moc

    def test_update_withFilter_updateWithFilter(self, mocker):
        mocker.patch.object(self.index._vector_api, "update_vector", autospec=True)
-        self.index.update(id="vec1", filter=self.filter1,
namespace="ns") + self.index.update(filter=self.filter1, namespace="ns") self.index._vector_api.update_vector.assert_called_once_with( - oai.UpdateRequest(id="vec1", filter=self.filter1, namespace="ns") + oai.UpdateRequest(filter=self.filter1, namespace="ns") ) def test_update_withFilterAndSetMetadata_updateWithFilterAndSetMetadata(self, mocker): mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) - self.index.update(id="vec1", set_metadata=self.md1, filter=self.filter1, namespace="ns") + self.index.update(set_metadata=self.md1, filter=self.filter1, namespace="ns") self.index._vector_api.update_vector.assert_called_once_with( - oai.UpdateRequest(id="vec1", set_metadata=self.md1, filter=self.filter1, namespace="ns") + oai.UpdateRequest(set_metadata=self.md1, filter=self.filter1, namespace="ns") ) def test_update_withFilterAndValues_updateWithFilterAndValues(self, mocker): mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) - self.index.update(id="vec1", values=self.vals1, filter=self.filter1, namespace="ns") + self.index.update(values=self.vals1, filter=self.filter1, namespace="ns") self.index._vector_api.update_vector.assert_called_once_with( - oai.UpdateRequest(id="vec1", values=self.vals1, filter=self.filter1, namespace="ns") + oai.UpdateRequest(values=self.vals1, filter=self.filter1, namespace="ns") ) def test_update_withFilterAndAllParams_updateWithFilterAndAllParams(self, mocker): mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) self.index.update( - id="vec1", values=self.vals1, set_metadata=self.md1, sparse_values=self.sv1, @@ -546,7 +545,6 @@ def test_update_withFilterAndAllParams_updateWithFilterAndAllParams(self, mocker ) self.index._vector_api.update_vector.assert_called_once_with( oai.UpdateRequest( - id="vec1", values=self.vals1, set_metadata=self.md1, sparse_values=oai.SparseValues(indices=self.svi1, values=self.svv1), @@ -563,6 +561,26 @@ def test_update_withoutFilter_backwardCompatibility(self, mocker): oai.UpdateRequest(id="vec1", values=self.vals1, namespace="ns") ) + def test_update_withFilterOnly_noId(self, mocker): + """Test update with filter only (no id) for bulk updates.""" + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update(set_metadata=self.md1, filter=self.filter1, namespace="ns") + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(set_metadata=self.md1, filter=self.filter1, namespace="ns") + ) + + def test_update_withNeitherIdNorFilter_raisesError(self, mocker): + """Test that update raises error when neither id nor filter is provided.""" + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + with pytest.raises(ValueError, match="Either 'id' or 'filter' must be provided"): + self.index.update(values=self.vals1, namespace="ns") + + def test_update_withBothIdAndFilter_raisesError(self, mocker): + """Test that update raises error when both id and filter are provided.""" + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + with pytest.raises(ValueError, match="Cannot provide both 'id' and 'filter'"): + self.index.update(id="vec1", filter=self.filter1, values=self.vals1, namespace="ns") + # endregion # region: describe index tests diff --git a/tests/unit_grpc/test_grpc_index_update.py b/tests/unit_grpc/test_grpc_index_update.py index 3f3f656e4..f856d9ea0 100644 --- a/tests/unit_grpc/test_grpc_index_update.py +++ b/tests/unit_grpc/test_grpc_index_update.py @@ -1,3 
+1,4 @@ +import pytest from pinecone import Config from pinecone.grpc import GRPCIndex from pinecone.core.grpc.protos.db_data_2025_10_pb2 import UpdateRequest, UpdateResponse @@ -45,24 +46,22 @@ def test_update_byIdAnValuesAndMetadata_updateByIdAndValuesAndMetadata( def test_update_withFilter_updateWithFilter(self, mocker, filter1): mock_response = UpdateResponse() mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) - self.index.update(id="vec1", filter=filter1, namespace="ns") + self.index.update(filter=filter1, namespace="ns") self.index.runner.run.assert_called_once_with( self.index.stub.Update, - UpdateRequest(id="vec1", filter=dict_to_proto_struct(filter1), namespace="ns"), + UpdateRequest(filter=dict_to_proto_struct(filter1), namespace="ns"), timeout=None, ) def test_update_withFilterAndSetMetadata_updateWithFilterAndSetMetadata( - self, mocker, vals1, md1, filter1 + self, mocker, md1, filter1 ): mock_response = UpdateResponse() mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) - self.index.update(id="vec1", values=vals1, set_metadata=md1, filter=filter1, namespace="ns") + self.index.update(set_metadata=md1, filter=filter1, namespace="ns") self.index.runner.run.assert_called_once_with( self.index.stub.Update, UpdateRequest( - id="vec1", - values=vals1, set_metadata=dict_to_proto_struct(md1), filter=dict_to_proto_struct(filter1), namespace="ns", @@ -73,20 +72,45 @@ def test_update_withFilterAndSetMetadata_updateWithFilterAndSetMetadata( def test_update_withFilterAndValues_updateWithFilterAndValues(self, mocker, vals1, filter1): mock_response = UpdateResponse() mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) - self.index.update(id="vec1", values=vals1, filter=filter1, namespace="ns") + self.index.update(values=vals1, filter=filter1, namespace="ns") self.index.runner.run.assert_called_once_with( self.index.stub.Update, - UpdateRequest( - id="vec1", values=vals1, filter=dict_to_proto_struct(filter1), namespace="ns" - ), + UpdateRequest(values=vals1, filter=dict_to_proto_struct(filter1), namespace="ns"), timeout=None, ) def test_update_withFilter_asyncReq_updateWithFilterAsyncReq(self, mocker, filter1): mocker.patch.object(self.index.runner, "run", autospec=True) - self.index.update(id="vec1", filter=filter1, namespace="ns", async_req=True) + self.index.update(filter=filter1, namespace="ns", async_req=True) self.index.runner.run.assert_called_once_with( self.index.stub.Update.future, - UpdateRequest(id="vec1", filter=dict_to_proto_struct(filter1), namespace="ns"), + UpdateRequest(filter=dict_to_proto_struct(filter1), namespace="ns"), + timeout=None, + ) + + def test_update_withFilterOnly_noId(self, mocker, filter1, md1): + """Test update with filter only (no id) for bulk updates.""" + mock_response = UpdateResponse() + mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) + self.index.update(set_metadata=md1, filter=filter1, namespace="ns") + self.index.runner.run.assert_called_once_with( + self.index.stub.Update, + UpdateRequest( + set_metadata=dict_to_proto_struct(md1), + filter=dict_to_proto_struct(filter1), + namespace="ns", + ), timeout=None, ) + + def test_update_withNeitherIdNorFilter_raisesError(self, mocker, vals1): + """Test that update raises error when neither id nor filter is provided.""" + mocker.patch.object(self.index.runner, "run", autospec=True) + with pytest.raises(ValueError, match="Either 'id' or 'filter' must be provided"): + 
self.index.update(values=vals1, namespace="ns") + + def test_update_withBothIdAndFilter_raisesError(self, mocker, vals1, filter1): + """Test that update raises error when both id and filter are provided.""" + mocker.patch.object(self.index.runner, "run", autospec=True) + with pytest.raises(ValueError, match="Cannot provide both 'id' and 'filter'"): + self.index.update(id="vec1", filter=filter1, values=vals1, namespace="ns") From a25bbd523747249981768b007b8863d86769a439 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Sat, 15 Nov 2025 16:34:56 -0500 Subject: [PATCH 3/6] Add dry_run --- pinecone/db_data/index.py | 2 + pinecone/db_data/index_asyncio.py | 2 + pinecone/db_data/index_asyncio_interface.py | 15 +++++- pinecone/db_data/interfaces.py | 11 +++- pinecone/db_data/request_factory.py | 2 + pinecone/grpc/index_grpc.py | 13 ++++- tests/unit/data/test_request_factory.py | 56 +++++++++++++++++++++ tests/unit/test_index.py | 48 ++++++++++++++++++ tests/unit_grpc/test_grpc_index_update.py | 50 ++++++++++++++++++ 9 files changed, 195 insertions(+), 4 deletions(-) diff --git a/pinecone/db_data/index.py b/pinecone/db_data/index.py index 559bb1e17..7e790a2b5 100644 --- a/pinecone/db_data/index.py +++ b/pinecone/db_data/index.py @@ -658,6 +658,7 @@ def update( namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, filter: Optional[FilterTypedDict] = None, + dry_run: Optional[bool] = None, **kwargs, ) -> UpdateResponse: # Validate that exactly one of id or filter is provided @@ -675,6 +676,7 @@ def update( namespace=namespace, sparse_values=sparse_values, filter=filter, + dry_run=dry_run, **kwargs, ), **self._openapi_kwargs(kwargs), diff --git a/pinecone/db_data/index_asyncio.py b/pinecone/db_data/index_asyncio.py index 08097d5f8..bcfa57641 100644 --- a/pinecone/db_data/index_asyncio.py +++ b/pinecone/db_data/index_asyncio.py @@ -629,6 +629,7 @@ async def update( namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, filter: Optional[FilterTypedDict] = None, + dry_run: Optional[bool] = None, **kwargs, ) -> UpdateResponse: # Validate that exactly one of id or filter is provided @@ -646,6 +647,7 @@ async def update( namespace=namespace, sparse_values=sparse_values, filter=filter, + dry_run=dry_run, **kwargs, ), **self._openapi_kwargs(kwargs), diff --git a/pinecone/db_data/index_asyncio_interface.py b/pinecone/db_data/index_asyncio_interface.py index 553480fda..a838bbfea 100644 --- a/pinecone/db_data/index_asyncio_interface.py +++ b/pinecone/db_data/index_asyncio_interface.py @@ -531,6 +531,7 @@ async def update( namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, filter: Optional[FilterTypedDict] = None, + dry_run: Optional[bool] = None, **kwargs, ) -> UpdateResponse: """ @@ -601,6 +602,14 @@ async def main(): namespace='my_namespace' ) print(f"Updated {response.matched_records} vectors") + # Preview how many vectors would be updated (dry run) + response = await idx.update( + set_metadata={'status': 'active'}, + filter={'genre': {'$eq': 'drama'}}, + namespace='my_namespace', + dry_run=True + ) + print(f"Would update {response.matched_records} vectors") asyncio.run(main()) @@ -617,10 +626,14 @@ async def main(): When provided, updates all vectors in the namespace that match the filter criteria. See `metadata filtering _`. Must not be provided when using id. Either `id` or `filter` must be provided. 
[optional] + dry_run (bool): If `True`, return the number of records that match the `filter` without executing + the update. Only meaningful when using `filter` (not with `id`). Useful for previewing + the impact of a bulk update before applying changes. Defaults to `False`. [optional] Returns: UpdateResponse: An UpdateResponse object. When using filter-based updates, the response includes - `matched_records` indicating the number of vectors that were updated. + `matched_records` indicating the number of vectors that were updated (or would be updated if + `dry_run=True`). """ pass diff --git a/pinecone/db_data/interfaces.py b/pinecone/db_data/interfaces.py index 6e3505e4a..c7ecbf560 100644 --- a/pinecone/db_data/interfaces.py +++ b/pinecone/db_data/interfaces.py @@ -716,6 +716,7 @@ def update( namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, filter: Optional[FilterTypedDict] = None, + dry_run: Optional[bool] = None, **kwargs, ) -> UpdateResponse: """ @@ -759,6 +760,10 @@ def update( >>> response = index.update(set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, >>> namespace='my_namespace') >>> print(f"Updated {response.matched_records} vectors") + >>> # Preview how many vectors would be updated (dry run) + >>> response = index.update(set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, + >>> namespace='my_namespace', dry_run=True) + >>> print(f"Would update {response.matched_records} vectors") Args: id (str): Vector's unique id. Required for single vector updates. Must not be provided when using filter. [optional] @@ -773,10 +778,14 @@ def update( When provided, updates all vectors in the namespace that match the filter criteria. See `metadata filtering _`. Must not be provided when using id. Either `id` or `filter` must be provided. [optional] + dry_run (bool): If `True`, return the number of records that match the `filter` without executing + the update. Only meaningful when using `filter` (not with `id`). Useful for previewing + the impact of a bulk update before applying changes. Defaults to `False`. [optional] Returns: UpdateResponse: An UpdateResponse object. When using filter-based updates, the response includes - `matched_records` indicating the number of vectors that were updated. + `matched_records` indicating the number of vectors that were updated (or would be updated if + `dry_run=True`). 
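
Note: the dry_run flag enables a preview-then-apply workflow for bulk updates. A minimal sketch against the REST client (assumes an index client idx; the filter, metadata, and 10_000 threshold are illustrative):

    preview = idx.update(
        filter={"genre": {"$eq": "drama"}},
        set_metadata={"status": "archived"},
        namespace="my_namespace",
        dry_run=True,                   # counts matches, changes nothing
    )
    if preview.matched_records is not None and preview.matched_records < 10_000:
        # The same call without dry_run applies the update for real.
        idx.update(
            filter={"genre": {"$eq": "drama"}},
            set_metadata={"status": "archived"},
            namespace="my_namespace",
        )
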
""" pass diff --git a/pinecone/db_data/request_factory.py b/pinecone/db_data/request_factory.py index d1b64bcca..23125abb5 100644 --- a/pinecone/db_data/request_factory.py +++ b/pinecone/db_data/request_factory.py @@ -141,6 +141,7 @@ def update_request( namespace: Optional[str] = None, sparse_values: Optional[Union[SparseValues, SparseVectorTypedDict]] = None, filter: Optional[FilterTypedDict] = None, + dry_run: Optional[bool] = None, **kwargs, ) -> UpdateRequest: _check_type = kwargs.pop("_check_type", False) @@ -153,6 +154,7 @@ def update_request( ("namespace", namespace), ("sparse_values", sparse_values_normalized), ("filter", filter), + ("dry_run", dry_run), ] ) diff --git a/pinecone/grpc/index_grpc.py b/pinecone/grpc/index_grpc.py index b75f97834..4dd0f9749 100644 --- a/pinecone/grpc/index_grpc.py +++ b/pinecone/grpc/index_grpc.py @@ -688,6 +688,7 @@ def update( namespace: Optional[str] = None, sparse_values: Optional[Union[GRPCSparseValues, SparseVectorTypedDict]] = None, filter: Optional[FilterTypedDict] = None, + dry_run: Optional[bool] = None, **kwargs, ) -> Union[UpdateResponse, PineconeGrpcFuture]: """ @@ -731,6 +732,10 @@ def update( >>> response = index.update(set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, >>> namespace='my_namespace') >>> print(f"Updated {response.matched_records} vectors") + >>> # Preview how many vectors would be updated (dry run) + >>> response = index.update(set_metadata={'status': 'active'}, filter={'genre': {'$eq': 'drama'}}, + >>> namespace='my_namespace', dry_run=True) + >>> print(f"Would update {response.matched_records} vectors") Args: id (str): Vector's unique id. Required for single vector updates. Must not be provided when using filter. [optional] @@ -747,11 +752,14 @@ def update( When provided, updates all vectors in the namespace that match the filter criteria. See `metadata filtering _`. Must not be provided when using id. Either `id` or `filter` must be provided. [optional] + dry_run (bool): If `True`, return the number of records that match the `filter` without executing + the update. Only meaningful when using `filter` (not with `id`). Useful for previewing + the impact of a bulk update before applying changes. Defaults to `False`. [optional] Returns: UpdateResponse or PineconeGrpcFuture: When using filter-based updates, the UpdateResponse includes - `matched_records` indicating the number of vectors that were updated. If `async_req=True`, returns - a PineconeGrpcFuture object instead. + `matched_records` indicating the number of vectors that were updated (or would be updated if + `dry_run=True`). If `async_req=True`, returns a PineconeGrpcFuture object instead. 
""" # Validate that exactly one of id or filter is provided if id is None and filter is None: @@ -781,6 +789,7 @@ def update( ("namespace", namespace), ("sparse_values", sparse_values), ("filter", filter_struct), + ("dry_run", dry_run), ] ) diff --git a/tests/unit/data/test_request_factory.py b/tests/unit/data/test_request_factory.py index 42f3d6d30..bd5620394 100644 --- a/tests/unit/data/test_request_factory.py +++ b/tests/unit/data/test_request_factory.py @@ -578,4 +578,60 @@ def test_update_request_with_complex_nested_filter(self): assert request.id == "vec1" assert request.filter == complex_filter + def test_update_request_with_dry_run(self): + """Test update_request with dry_run parameter.""" + request = IndexRequestFactory.update_request( + filter={"genre": {"$eq": "action"}}, dry_run=True + ) + assert request.filter == {"genre": {"$eq": "action"}} + assert request.dry_run is True + + def test_update_request_with_dry_run_false(self): + """Test update_request with dry_run=False.""" + request = IndexRequestFactory.update_request( + filter={"genre": {"$eq": "action"}}, dry_run=False + ) + assert request.filter == {"genre": {"$eq": "action"}} + assert request.dry_run is False + + def test_update_request_with_dry_run_and_set_metadata(self): + """Test update_request with dry_run and set_metadata.""" + request = IndexRequestFactory.update_request( + filter={"genre": {"$eq": "drama"}}, set_metadata={"status": "active"}, dry_run=True + ) + assert request.filter == {"genre": {"$eq": "drama"}} + assert request.set_metadata == {"status": "active"} + assert request.dry_run is True + + def test_update_request_with_dry_run_and_all_params(self): + """Test update_request with dry_run and all parameters.""" + values = [0.1, 0.2, 0.3] + set_metadata = {"status": "active"} + sparse_values = {"indices": [1, 2], "values": [0.4, 0.5]} + filter_dict = {"genre": {"$eq": "action"}} + request = IndexRequestFactory.update_request( + values=values, + set_metadata=set_metadata, + namespace="my_namespace", + sparse_values=sparse_values, + filter=filter_dict, + dry_run=True, + ) + assert request.values == values + assert request.set_metadata == set_metadata + assert request.namespace == "my_namespace" + assert request.sparse_values is not None + assert request.filter == filter_dict + assert request.dry_run is True + + def test_update_request_without_dry_run_not_included(self): + """Test that dry_run is not included in request when not provided.""" + request = IndexRequestFactory.update_request( + filter={"genre": {"$eq": "action"}}, set_metadata={"status": "active"} + ) + assert request.filter == {"genre": {"$eq": "action"}} + assert request.set_metadata == {"status": "active"} + # dry_run should not be set when not provided (defaults to False in OpenAPI) + assert not hasattr(request, "dry_run") or request.dry_run is False + # endregion diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 691f0c66e..0080d06d4 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -581,6 +581,54 @@ def test_update_withBothIdAndFilter_raisesError(self, mocker): with pytest.raises(ValueError, match="Cannot provide both 'id' and 'filter'"): self.index.update(id="vec1", filter=self.filter1, values=self.vals1, namespace="ns") + def test_update_withDryRun_updateWithDryRun(self, mocker): + """Test update with dry_run parameter.""" + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update(filter=self.filter1, dry_run=True, namespace="ns") + 
self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(filter=self.filter1, dry_run=True, namespace="ns") + ) + + def test_update_withDryRunAndSetMetadata_updateWithDryRunAndSetMetadata(self, mocker): + """Test update with dry_run and set_metadata.""" + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update(set_metadata=self.md1, filter=self.filter1, dry_run=True, namespace="ns") + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest( + set_metadata=self.md1, filter=self.filter1, dry_run=True, namespace="ns" + ) + ) + + def test_update_withDryRunFalse_updateWithDryRunFalse(self, mocker): + """Test update with dry_run=False.""" + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update(filter=self.filter1, dry_run=False, namespace="ns") + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(filter=self.filter1, dry_run=False, namespace="ns") + ) + + def test_update_withDryRunAndAllParams_updateWithDryRunAndAllParams(self, mocker): + """Test update with dry_run and all parameters.""" + mocker.patch.object(self.index._vector_api, "update_vector", autospec=True) + self.index.update( + values=self.vals1, + set_metadata=self.md1, + sparse_values=self.sv1, + filter=self.filter1, + dry_run=True, + namespace="ns", + ) + self.index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest( + values=self.vals1, + set_metadata=self.md1, + sparse_values=oai.SparseValues(indices=self.svi1, values=self.svv1), + filter=self.filter1, + dry_run=True, + namespace="ns", + ) + ) + # endregion # region: describe index tests diff --git a/tests/unit_grpc/test_grpc_index_update.py b/tests/unit_grpc/test_grpc_index_update.py index f856d9ea0..0afd09571 100644 --- a/tests/unit_grpc/test_grpc_index_update.py +++ b/tests/unit_grpc/test_grpc_index_update.py @@ -114,3 +114,53 @@ def test_update_withBothIdAndFilter_raisesError(self, mocker, vals1, filter1): mocker.patch.object(self.index.runner, "run", autospec=True) with pytest.raises(ValueError, match="Cannot provide both 'id' and 'filter'"): self.index.update(id="vec1", filter=filter1, values=vals1, namespace="ns") + + def test_update_withDryRun_updateWithDryRun(self, mocker, filter1): + """Test update with dry_run parameter.""" + mock_response = UpdateResponse() + mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) + self.index.update(filter=filter1, dry_run=True, namespace="ns") + self.index.runner.run.assert_called_once_with( + self.index.stub.Update, + UpdateRequest(filter=dict_to_proto_struct(filter1), dry_run=True, namespace="ns"), + timeout=None, + ) + + def test_update_withDryRunAndSetMetadata_updateWithDryRunAndSetMetadata( + self, mocker, md1, filter1 + ): + """Test update with dry_run and set_metadata.""" + mock_response = UpdateResponse() + mocker.patch.object(self.index.runner, "run", return_value=(mock_response, None)) + self.index.update(set_metadata=md1, filter=filter1, dry_run=True, namespace="ns") + self.index.runner.run.assert_called_once_with( + self.index.stub.Update, + UpdateRequest( + set_metadata=dict_to_proto_struct(md1), + filter=dict_to_proto_struct(filter1), + dry_run=True, + namespace="ns", + ), + timeout=None, + ) + + def test_update_withDryRunFalse_updateWithDryRunFalse(self, mocker, filter1): + """Test update with dry_run=False.""" + mock_response = UpdateResponse() + mocker.patch.object(self.index.runner, "run", 
return_value=(mock_response, None)) + self.index.update(filter=filter1, dry_run=False, namespace="ns") + self.index.runner.run.assert_called_once_with( + self.index.stub.Update, + UpdateRequest(filter=dict_to_proto_struct(filter1), dry_run=False, namespace="ns"), + timeout=None, + ) + + def test_update_withDryRun_asyncReq_updateWithDryRunAsyncReq(self, mocker, filter1): + """Test update with dry_run and async_req=True.""" + mocker.patch.object(self.index.runner, "run", autospec=True) + self.index.update(filter=filter1, dry_run=True, namespace="ns", async_req=True) + self.index.runner.run.assert_called_once_with( + self.index.stub.Update.future, + UpdateRequest(filter=dict_to_proto_struct(filter1), dry_run=True, namespace="ns"), + timeout=None, + ) From 466eb04f89ca3f8120bf9b9a58afe21b5da0b4d0 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Sat, 15 Nov 2025 17:02:34 -0500 Subject: [PATCH 4/6] Implement dry_run --- .../db_data/dataclasses/update_response.py | 6 +- pinecone/db_data/index.py | 27 ++++++- pinecone/db_data/index_asyncio.py | 27 ++++++- pinecone/grpc/utils.py | 24 ++++++- .../rest_asyncio/db/data/test_update.py | 72 +++++++++++++++++++ .../rest_sync/db/data/test_update.py | 70 ++++++++++++++++++ tests/unit/test_index.py | 66 ++++++++++++++++- 7 files changed, 287 insertions(+), 5 deletions(-) create mode 100644 tests/integration/rest_sync/db/data/test_update.py diff --git a/pinecone/db_data/dataclasses/update_response.py b/pinecone/db_data/dataclasses/update_response.py index 582d4fbac..53bed447b 100644 --- a/pinecone/db_data/dataclasses/update_response.py +++ b/pinecone/db_data/dataclasses/update_response.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import cast +from typing import Optional, cast from .utils import DictLike from pinecone.utils.response_info import ResponseInfo @@ -10,9 +10,13 @@ class UpdateResponse(DictLike): """Response from an update operation. Attributes: + matched_records: The number of records that matched the filter (if a filter was provided). + updated_records: The number of records that were actually updated. _response_info: Response metadata including LSN headers. 
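
Note: a short sketch of how callers might read the new fields off this dataclass (idx and the filter are illustrative; both fields may be None when the server omits them, for example on id-based updates):

    response = idx.update(
        filter={"status": {"$eq": "draft"}},
        set_metadata={"reviewed": True},
        namespace="ns",
    )
    print(response.matched_records)     # records matching the filter, or None
    print(response.updated_records)     # records actually modified, or None
    # Since UpdateResponse mixes in DictLike, dict-style access should also work:
    print(response["matched_records"])
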
""" + matched_records: Optional[int] = None + updated_records: Optional[int] = None _response_info: ResponseInfo = field( default_factory=lambda: cast(ResponseInfo, {"raw_headers": {}}), repr=True, compare=False ) diff --git a/pinecone/db_data/index.py b/pinecone/db_data/index.py index 7e790a2b5..98f3fbd92 100644 --- a/pinecone/db_data/index.py +++ b/pinecone/db_data/index.py @@ -683,6 +683,8 @@ def update( ) # Extract response info from result if it's an OpenAPI model with _response_info response_info = None + matched_records = None + updated_records = None if hasattr(result, "_response_info"): response_info = result._response_info else: @@ -691,7 +693,30 @@ def update( response_info = extract_response_info({}) - return UpdateResponse(_response_info=response_info) + # Extract matched_records and updated_records from OpenAPI model + if hasattr(result, "matched_records"): + matched_records = result.matched_records + if hasattr(result, "updated_records"): + updated_records = result.updated_records + # Also check for camelCase in case it's in the raw response + if updated_records is None and hasattr(result, "updatedRecords"): + updated_records = result.updatedRecords + # Check _data_store for fields not in the OpenAPI spec + if hasattr(result, "_data_store"): + if updated_records is None: + updated_records = result._data_store.get( + "updatedRecords" + ) or result._data_store.get("updated_records") + if matched_records is None: + matched_records = result._data_store.get( + "matchedRecords" + ) or result._data_store.get("matched_records") + + return UpdateResponse( + matched_records=matched_records, + updated_records=updated_records, + _response_info=response_info, + ) @validate_and_convert_errors def describe_index_stats( diff --git a/pinecone/db_data/index_asyncio.py b/pinecone/db_data/index_asyncio.py index bcfa57641..26a51a52d 100644 --- a/pinecone/db_data/index_asyncio.py +++ b/pinecone/db_data/index_asyncio.py @@ -654,6 +654,8 @@ async def update( ) # Extract response info from result if it's an OpenAPI model with _response_info response_info = None + matched_records = None + updated_records = None if hasattr(result, "_response_info"): response_info = result._response_info else: @@ -662,7 +664,30 @@ async def update( response_info = extract_response_info({}) - return UpdateResponse(_response_info=response_info) + # Extract matched_records and updated_records from OpenAPI model + if hasattr(result, "matched_records"): + matched_records = result.matched_records + if hasattr(result, "updated_records"): + updated_records = result.updated_records + # Also check for camelCase in case it's in the raw response + if updated_records is None and hasattr(result, "updatedRecords"): + updated_records = result.updatedRecords + # Check _data_store for fields not in the OpenAPI spec + if hasattr(result, "_data_store"): + if updated_records is None: + updated_records = result._data_store.get( + "updatedRecords" + ) or result._data_store.get("updated_records") + if matched_records is None: + matched_records = result._data_store.get( + "matchedRecords" + ) or result._data_store.get("matched_records") + + return UpdateResponse( + matched_records=matched_records, + updated_records=updated_records, + _response_info=response_info, + ) @validate_and_convert_errors async def describe_index_stats( diff --git a/pinecone/grpc/utils.py b/pinecone/grpc/utils.py index 688f247da..4be241308 100644 --- a/pinecone/grpc/utils.py +++ b/pinecone/grpc/utils.py @@ -152,12 +152,34 @@ def parse_update_response( ): from 
pinecone.db_data.dataclasses import UpdateResponse from pinecone.utils.response_info import extract_response_info + from google.protobuf import json_format # Extract response info from initial metadata metadata = initial_metadata or {} response_info = extract_response_info(metadata) - return UpdateResponse(_response_info=response_info) + # Extract matched_records and updated_records from response + matched_records = None + updated_records = None + if isinstance(response, Message): + # GRPC response - convert to dict to extract matched_records and updated_records + json_response = json_format.MessageToDict(response) + matched_records = json_response.get("matchedRecords") or json_response.get( + "matched_records" + ) + updated_records = json_response.get("updatedRecords") or json_response.get( + "updated_records" + ) + elif isinstance(response, dict): + # Dict response - extract directly + matched_records = response.get("matchedRecords") or response.get("matched_records") + updated_records = response.get("updatedRecords") or response.get("updated_records") + + return UpdateResponse( + matched_records=matched_records, + updated_records=updated_records, + _response_info=response_info, + ) def parse_delete_response( diff --git a/tests/integration/rest_asyncio/db/data/test_update.py b/tests/integration/rest_asyncio/db/data/test_update.py index 98b805e8a..04be845cb 100644 --- a/tests/integration/rest_asyncio/db/data/test_update.py +++ b/tests/integration/rest_asyncio/db/data/test_update.py @@ -69,3 +69,75 @@ async def test_update_metadata(self, index_host, dimension, target_namespace): fetched_vec = await asyncio_idx.fetch(ids=["2"], namespace=target_namespace) assert fetched_vec.vectors["2"].metadata == {"genre": "comedy"} await asyncio_idx.close() + + async def test_update_with_filter_and_dry_run(self, index_host, dimension, target_namespace): + """Test update with filter and dry_run=True to verify matched_records and updated_records are returned.""" + asyncio_idx = build_asyncioindex_client(index_host) + + # Upsert vectors with different genres + upsert1 = await asyncio_idx.upsert( + vectors=[ + Vector( + id=str(i), + values=embedding_values(dimension), + metadata={"genre": "comedy" if i % 2 == 0 else "drama", "status": "active"}, + ) + for i in range(10) + ], + namespace=target_namespace, + batch_size=10, + show_progress=False, + ) + + await poll_until_lsn_reconciled_async( + asyncio_idx, upsert1._response_info, namespace=target_namespace + ) + + # Test dry_run=True - should return matched_records without updating + dry_run_response = await asyncio_idx.update( + filter={"genre": {"$eq": "comedy"}}, + set_metadata={"status": "updated"}, + dry_run=True, + namespace=target_namespace, + ) + + # Verify matched_records is returned and correct (5 comedy vectors) + assert dry_run_response.matched_records is not None + assert dry_run_response.matched_records == 5 + # In dry run, updated_records should be 0 or None since no records are actually updated + assert dry_run_response.updated_records is None or dry_run_response.updated_records == 0 + + # Verify the vectors were NOT actually updated (dry run) + fetched_before = await asyncio_idx.fetch( + ids=["0", "2", "4", "6", "8"], namespace=target_namespace + ) + for vec_id in ["0", "2", "4", "6", "8"]: + assert fetched_before.vectors[vec_id].metadata.get("status") == "active" + + # Now do the actual update + update_response = await asyncio_idx.update( + filter={"genre": {"$eq": "comedy"}}, + set_metadata={"status": "updated"}, + 
namespace=target_namespace, + ) + + # Verify matched_records and updated_records are returned + assert update_response.matched_records is not None + assert update_response.matched_records == 5 + # updated_records should match the number of records actually updated (if returned by API) + if update_response.updated_records is not None: + assert update_response.updated_records == 5 + + await poll_until_lsn_reconciled_async( + asyncio_idx, update_response._response_info, namespace=target_namespace + ) + + # Verify the vectors were actually updated + fetched_after = await asyncio_idx.fetch( + ids=["0", "2", "4", "6", "8"], namespace=target_namespace + ) + for vec_id in ["0", "2", "4", "6", "8"]: + assert fetched_after.vectors[vec_id].metadata.get("status") == "updated" + assert fetched_after.vectors[vec_id].metadata.get("genre") == "comedy" + + await asyncio_idx.close() diff --git a/tests/integration/rest_sync/db/data/test_update.py b/tests/integration/rest_sync/db/data/test_update.py new file mode 100644 index 000000000..ffcc88c55 --- /dev/null +++ b/tests/integration/rest_sync/db/data/test_update.py @@ -0,0 +1,70 @@ +import pytest +from pinecone import Vector +from tests.integration.helpers import poll_until_lsn_reconciled, embedding_values, random_string + + +@pytest.fixture(scope="session") +def update_namespace(): + return random_string(10) + + +class TestUpdate: + def test_update_with_filter_and_dry_run(self, idx, update_namespace): + """Test update with filter and dry_run=True to verify matched_records and updated_records are returned.""" + target_namespace = update_namespace + + # Upsert vectors with different genres + upsert1 = idx.upsert( + vectors=[ + Vector( + id=str(i), + values=embedding_values(), + metadata={"genre": "comedy" if i % 2 == 0 else "drama", "status": "active"}, + ) + for i in range(10) + ], + namespace=target_namespace, + ) + + poll_until_lsn_reconciled(idx, upsert1._response_info, namespace=target_namespace) + + # Test dry_run=True - should return matched_records without updating + dry_run_response = idx.update( + filter={"genre": {"$eq": "comedy"}}, + set_metadata={"status": "updated"}, + dry_run=True, + namespace=target_namespace, + ) + + # Verify matched_records is returned and correct (5 comedy vectors) + assert dry_run_response.matched_records is not None + assert dry_run_response.matched_records == 5 + # In dry run, updated_records should be 0 or None since no records are actually updated + assert dry_run_response.updated_records is None or dry_run_response.updated_records == 0 + + # Verify the vectors were NOT actually updated (dry run) + fetched_before = idx.fetch(ids=["0", "2", "4", "6", "8"], namespace=target_namespace) + for vec_id in ["0", "2", "4", "6", "8"]: + assert fetched_before.vectors[vec_id].metadata.get("status") == "active" + + # Now do the actual update + update_response = idx.update( + filter={"genre": {"$eq": "comedy"}}, + set_metadata={"status": "updated"}, + namespace=target_namespace, + ) + + # Verify matched_records and updated_records are returned + assert update_response.matched_records is not None + assert update_response.matched_records == 5 + # updated_records should match the number of records actually updated (if returned by API) + if update_response.updated_records is not None: + assert update_response.updated_records == 5 + + poll_until_lsn_reconciled(idx, update_response._response_info, namespace=target_namespace) + + # Verify the vectors were actually updated + fetched_after = idx.fetch(ids=["0", "2", "4", "6", "8"], 
namespace=target_namespace) + for vec_id in ["0", "2", "4", "6", "8"]: + assert fetched_after.vectors[vec_id].metadata.get("status") == "updated" + assert fetched_after.vectors[vec_id].metadata.get("genre") == "comedy" diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 0080d06d4..81292757a 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -1,7 +1,7 @@ import pandas as pd import pytest -from pinecone.db_data import _Index +from pinecone.db_data import _Index, _IndexAsyncio import pinecone.core.openapi.db_data.models as oai from pinecone import QueryResponse, UpsertResponse, Vector @@ -631,6 +631,70 @@ def test_update_withDryRunAndAllParams_updateWithDryRunAndAllParams(self, mocker # endregion + # region: asyncio update tests + + @pytest.mark.asyncio + async def test_asyncio_update_withDryRun_updateWithDryRun(self, mocker): + """Test asyncio update with dry_run parameter.""" + asyncio_index = _IndexAsyncio(api_key="asdf", host="https://test.pinecone.io") + mocker.patch.object(asyncio_index._vector_api, "update_vector", autospec=True) + await asyncio_index.update(filter=self.filter1, dry_run=True, namespace="ns") + asyncio_index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(filter=self.filter1, dry_run=True, namespace="ns") + ) + + @pytest.mark.asyncio + async def test_asyncio_update_withDryRunAndSetMetadata_updateWithDryRunAndSetMetadata( + self, mocker + ): + """Test asyncio update with dry_run and set_metadata.""" + asyncio_index = _IndexAsyncio(api_key="asdf", host="https://test.pinecone.io") + mocker.patch.object(asyncio_index._vector_api, "update_vector", autospec=True) + await asyncio_index.update( + set_metadata=self.md1, filter=self.filter1, dry_run=True, namespace="ns" + ) + asyncio_index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest( + set_metadata=self.md1, filter=self.filter1, dry_run=True, namespace="ns" + ) + ) + + @pytest.mark.asyncio + async def test_asyncio_update_withDryRunFalse_updateWithDryRunFalse(self, mocker): + """Test asyncio update with dry_run=False.""" + asyncio_index = _IndexAsyncio(api_key="asdf", host="https://test.pinecone.io") + mocker.patch.object(asyncio_index._vector_api, "update_vector", autospec=True) + await asyncio_index.update(filter=self.filter1, dry_run=False, namespace="ns") + asyncio_index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest(filter=self.filter1, dry_run=False, namespace="ns") + ) + + @pytest.mark.asyncio + async def test_asyncio_update_withDryRunAndAllParams_updateWithDryRunAndAllParams(self, mocker): + """Test asyncio update with dry_run and all parameters.""" + asyncio_index = _IndexAsyncio(api_key="asdf", host="https://test.pinecone.io") + mocker.patch.object(asyncio_index._vector_api, "update_vector", autospec=True) + await asyncio_index.update( + values=self.vals1, + set_metadata=self.md1, + sparse_values=self.sv1, + filter=self.filter1, + dry_run=True, + namespace="ns", + ) + asyncio_index._vector_api.update_vector.assert_called_once_with( + oai.UpdateRequest( + values=self.vals1, + set_metadata=self.md1, + sparse_values=oai.SparseValues(indices=self.svi1, values=self.svv1), + filter=self.filter1, + dry_run=True, + namespace="ns", + ) + ) + + # endregion + # region: describe index tests def test_describeIndexStats_callWithoutFilter_CalledWithoutFilter(self, mocker): From e591c12d825b033cc83b8f0f91cf8356ce4d269e Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Sat, 15 Nov 2025 17:18:06 -0500 Subject: [PATCH 5/6] Fix 
failing unit tests --- tests/unit/data/test_request_factory.py | 5 ++-- tests/unit/test_index.py | 32 +++++++++++++++++++++---- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/tests/unit/data/test_request_factory.py b/tests/unit/data/test_request_factory.py index bd5620394..999a331b9 100644 --- a/tests/unit/data/test_request_factory.py +++ b/tests/unit/data/test_request_factory.py @@ -631,7 +631,8 @@ def test_update_request_without_dry_run_not_included(self): ) assert request.filter == {"genre": {"$eq": "action"}} assert request.set_metadata == {"status": "active"} - # dry_run should not be set when not provided (defaults to False in OpenAPI) - assert not hasattr(request, "dry_run") or request.dry_run is False + # dry_run should not be set when not provided + # Since parse_non_empty_args filters out None values, dry_run won't be in _data_store + assert "dry_run" not in request._data_store # endregion diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 81292757a..6117d444a 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -637,7 +637,13 @@ def test_update_withDryRunAndAllParams_updateWithDryRunAndAllParams(self, mocker async def test_asyncio_update_withDryRun_updateWithDryRun(self, mocker): """Test asyncio update with dry_run parameter.""" asyncio_index = _IndexAsyncio(api_key="asdf", host="https://test.pinecone.io") - mocker.patch.object(asyncio_index._vector_api, "update_vector", autospec=True) + mock_response = oai.UpdateResponse(matched_records=5, _check_type=False) + mocker.patch.object( + asyncio_index._vector_api, + "update_vector", + return_value=mock_response, + new_callable=mocker.AsyncMock, + ) await asyncio_index.update(filter=self.filter1, dry_run=True, namespace="ns") asyncio_index._vector_api.update_vector.assert_called_once_with( oai.UpdateRequest(filter=self.filter1, dry_run=True, namespace="ns") @@ -649,7 +655,13 @@ async def test_asyncio_update_withDryRunAndSetMetadata_updateWithDryRunAndSetMet ): """Test asyncio update with dry_run and set_metadata.""" asyncio_index = _IndexAsyncio(api_key="asdf", host="https://test.pinecone.io") - mocker.patch.object(asyncio_index._vector_api, "update_vector", autospec=True) + mock_response = oai.UpdateResponse(matched_records=5, _check_type=False) + mocker.patch.object( + asyncio_index._vector_api, + "update_vector", + return_value=mock_response, + new_callable=mocker.AsyncMock, + ) await asyncio_index.update( set_metadata=self.md1, filter=self.filter1, dry_run=True, namespace="ns" ) @@ -663,7 +675,13 @@ async def test_asyncio_update_withDryRunAndSetMetadata_updateWithDryRunAndSetMet async def test_asyncio_update_withDryRunFalse_updateWithDryRunFalse(self, mocker): """Test asyncio update with dry_run=False.""" asyncio_index = _IndexAsyncio(api_key="asdf", host="https://test.pinecone.io") - mocker.patch.object(asyncio_index._vector_api, "update_vector", autospec=True) + mock_response = oai.UpdateResponse(matched_records=5, _check_type=False) + mocker.patch.object( + asyncio_index._vector_api, + "update_vector", + return_value=mock_response, + new_callable=mocker.AsyncMock, + ) await asyncio_index.update(filter=self.filter1, dry_run=False, namespace="ns") asyncio_index._vector_api.update_vector.assert_called_once_with( oai.UpdateRequest(filter=self.filter1, dry_run=False, namespace="ns") @@ -673,7 +691,13 @@ async def test_asyncio_update_withDryRunFalse_updateWithDryRunFalse(self, mocker async def test_asyncio_update_withDryRunAndAllParams_updateWithDryRunAndAllParams(self, mocker): 
"""Test asyncio update with dry_run and all parameters.""" asyncio_index = _IndexAsyncio(api_key="asdf", host="https://test.pinecone.io") - mocker.patch.object(asyncio_index._vector_api, "update_vector", autospec=True) + mock_response = oai.UpdateResponse(matched_records=5, _check_type=False) + mocker.patch.object( + asyncio_index._vector_api, + "update_vector", + return_value=mock_response, + new_callable=mocker.AsyncMock, + ) await asyncio_index.update( values=self.vals1, set_metadata=self.md1, From 570ca389a3ce9f7c9279d481b52afaa1d48848c6 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Sat, 15 Nov 2025 17:38:23 -0500 Subject: [PATCH 6/6] Iterate --- .../db_data/dataclasses/update_response.py | 2 -- pinecone/db_data/index.py | 18 ++---------------- pinecone/db_data/index_asyncio.py | 18 ++---------------- pinecone/db_data/index_asyncio_interface.py | 10 ++++++++-- pinecone/db_data/interfaces.py | 10 ++++++++-- pinecone/grpc/index_grpc.py | 10 ++++++++-- pinecone/grpc/utils.py | 15 +++------------ .../rest_asyncio/db/data/test_update.py | 9 ++------- .../rest_sync/db/data/test_update.py | 9 ++------- 9 files changed, 35 insertions(+), 66 deletions(-) diff --git a/pinecone/db_data/dataclasses/update_response.py b/pinecone/db_data/dataclasses/update_response.py index 53bed447b..d07e258e7 100644 --- a/pinecone/db_data/dataclasses/update_response.py +++ b/pinecone/db_data/dataclasses/update_response.py @@ -11,12 +11,10 @@ class UpdateResponse(DictLike): Attributes: matched_records: The number of records that matched the filter (if a filter was provided). - updated_records: The number of records that were actually updated. _response_info: Response metadata including LSN headers. """ matched_records: Optional[int] = None - updated_records: Optional[int] = None _response_info: ResponseInfo = field( default_factory=lambda: cast(ResponseInfo, {"raw_headers": {}}), repr=True, compare=False ) diff --git a/pinecone/db_data/index.py b/pinecone/db_data/index.py index 98f3fbd92..f98c6f173 100644 --- a/pinecone/db_data/index.py +++ b/pinecone/db_data/index.py @@ -684,7 +684,6 @@ def update( # Extract response info from result if it's an OpenAPI model with _response_info response_info = None matched_records = None - updated_records = None if hasattr(result, "_response_info"): response_info = result._response_info else: @@ -693,30 +692,17 @@ def update( response_info = extract_response_info({}) - # Extract matched_records and updated_records from OpenAPI model + # Extract matched_records from OpenAPI model if hasattr(result, "matched_records"): matched_records = result.matched_records - if hasattr(result, "updated_records"): - updated_records = result.updated_records - # Also check for camelCase in case it's in the raw response - if updated_records is None and hasattr(result, "updatedRecords"): - updated_records = result.updatedRecords # Check _data_store for fields not in the OpenAPI spec if hasattr(result, "_data_store"): - if updated_records is None: - updated_records = result._data_store.get( - "updatedRecords" - ) or result._data_store.get("updated_records") if matched_records is None: matched_records = result._data_store.get( "matchedRecords" ) or result._data_store.get("matched_records") - return UpdateResponse( - matched_records=matched_records, - updated_records=updated_records, - _response_info=response_info, - ) + return UpdateResponse(matched_records=matched_records, _response_info=response_info) @validate_and_convert_errors def describe_index_stats( diff --git 
a/pinecone/db_data/index_asyncio.py b/pinecone/db_data/index_asyncio.py index 26a51a52d..f4046fc2d 100644 --- a/pinecone/db_data/index_asyncio.py +++ b/pinecone/db_data/index_asyncio.py @@ -655,7 +655,6 @@ async def update( # Extract response info from result if it's an OpenAPI model with _response_info response_info = None matched_records = None - updated_records = None if hasattr(result, "_response_info"): response_info = result._response_info else: @@ -664,30 +663,17 @@ async def update( response_info = extract_response_info({}) - # Extract matched_records and updated_records from OpenAPI model + # Extract matched_records from OpenAPI model if hasattr(result, "matched_records"): matched_records = result.matched_records - if hasattr(result, "updated_records"): - updated_records = result.updated_records - # Also check for camelCase in case it's in the raw response - if updated_records is None and hasattr(result, "updatedRecords"): - updated_records = result.updatedRecords # Check _data_store for fields not in the OpenAPI spec if hasattr(result, "_data_store"): - if updated_records is None: - updated_records = result._data_store.get( - "updatedRecords" - ) or result._data_store.get("updated_records") if matched_records is None: matched_records = result._data_store.get( "matchedRecords" ) or result._data_store.get("matched_records") - return UpdateResponse( - matched_records=matched_records, - updated_records=updated_records, - _response_info=response_info, - ) + return UpdateResponse(matched_records=matched_records, _response_info=response_info) @validate_and_convert_errors async def describe_index_stats( diff --git a/pinecone/db_data/index_asyncio_interface.py b/pinecone/db_data/index_asyncio_interface.py index a838bbfea..8996f6a81 100644 --- a/pinecone/db_data/index_asyncio_interface.py +++ b/pinecone/db_data/index_asyncio_interface.py @@ -542,11 +542,16 @@ async def update( 1. **Single vector update by ID**: Provide `id` to update a specific vector. - Updates the vector with the given ID - If `values` is included, it will overwrite the previous vector values - - If `set_metadata` is included, the values of the fields specified will be added or overwrite the previous metadata + - If `set_metadata` is included, the metadata will be merged with existing metadata on the vector. + Fields specified in `set_metadata` will overwrite existing fields with the same key, while + fields not in `set_metadata` will remain unchanged. 2. **Bulk update by metadata filter**: Provide `filter` to update all vectors matching the filter criteria. - Updates all vectors in the namespace that match the filter expression - Useful for updating metadata across multiple vectors at once + - If `set_metadata` is included, the metadata will be merged with existing metadata on each vector. + Fields specified in `set_metadata` will overwrite existing fields with the same key, while + fields not in `set_metadata` will remain unchanged. - The response includes `matched_records` indicating how many vectors were updated Either `id` or `filter` must be provided (but not both in the same call). @@ -617,7 +622,8 @@ async def main(): id (str): Vector's unique id. Required for single vector updates. Must not be provided when using filter. [optional] values (List[float]): Vector values to set. [optional] set_metadata (Dict[str, Union[str, float, int, bool, List[int], List[float], List[str]]]]): - Metadata to set for the vector(s). [optional] + Metadata to merge with existing metadata on the vector(s). 
Fields specified will overwrite + existing fields with the same key, while fields not specified will remain unchanged. [optional] namespace (str): Namespace name where to update the vector(s). [optional] sparse_values: (Dict[str, Union[List[float], List[int]]]): Sparse values to update for the vector. Expected to be either a SparseValues object or a dict of the form: diff --git a/pinecone/db_data/interfaces.py b/pinecone/db_data/interfaces.py index c7ecbf560..0974694b2 100644 --- a/pinecone/db_data/interfaces.py +++ b/pinecone/db_data/interfaces.py @@ -727,11 +727,16 @@ def update( 1. **Single vector update by ID**: Provide `id` to update a specific vector. - Updates the vector with the given ID - If `values` is included, it will overwrite the previous vector values - - If `set_metadata` is included, the values of the fields specified will be added or overwrite the previous metadata + - If `set_metadata` is included, the metadata will be merged with existing metadata on the vector. + Fields specified in `set_metadata` will overwrite existing fields with the same key, while + fields not in `set_metadata` will remain unchanged. 2. **Bulk update by metadata filter**: Provide `filter` to update all vectors matching the filter criteria. - Updates all vectors in the namespace that match the filter expression - Useful for updating metadata across multiple vectors at once + - If `set_metadata` is included, the metadata will be merged with existing metadata on each vector. + Fields specified in `set_metadata` will overwrite existing fields with the same key, while + fields not in `set_metadata` will remain unchanged. - The response includes `matched_records` indicating how many vectors were updated Either `id` or `filter` must be provided (but not both in the same call). @@ -769,7 +774,8 @@ def update( id (str): Vector's unique id. Required for single vector updates. Must not be provided when using filter. [optional] values (List[float]): Vector values to set. [optional] set_metadata (Dict[str, Union[str, float, int, bool, List[int], List[float], List[str]]]]): - Metadata to set for the vector(s). [optional] + Metadata to merge with existing metadata on the vector(s). Fields specified will overwrite + existing fields with the same key, while fields not specified will remain unchanged. [optional] namespace (str): Namespace name where to update the vector(s). [optional] sparse_values: (Dict[str, Union[List[float], List[int]]]): Sparse values to update for the vector. Expected to be either a SparseValues object or a dict of the form: diff --git a/pinecone/grpc/index_grpc.py b/pinecone/grpc/index_grpc.py index 4dd0f9749..1b2be170b 100644 --- a/pinecone/grpc/index_grpc.py +++ b/pinecone/grpc/index_grpc.py @@ -699,11 +699,16 @@ def update( 1. **Single vector update by ID**: Provide `id` to update a specific vector. - Updates the vector with the given ID - If `values` is included, it will overwrite the previous vector values - - If `set_metadata` is included, the values of the fields specified will be added or overwrite the previous metadata + - If `set_metadata` is included, the metadata will be merged with existing metadata on the vector. + Fields specified in `set_metadata` will overwrite existing fields with the same key, while + fields not in `set_metadata` will remain unchanged. 2. **Bulk update by metadata filter**: Provide `filter` to update all vectors matching the filter criteria. 
- Updates all vectors in the namespace that match the filter expression - Useful for updating metadata across multiple vectors at once + - If `set_metadata` is included, the metadata will be merged with existing metadata on each vector. + Fields specified in `set_metadata` will overwrite existing fields with the same key, while + fields not in `set_metadata` will remain unchanged. - The response includes `matched_records` indicating how many vectors were updated Either `id` or `filter` must be provided (but not both in the same call). @@ -743,7 +748,8 @@ def update( Defaults to False. [optional] values (List[float]): Vector values to set. [optional] set_metadata (Dict[str, Union[str, float, int, bool, List[int], List[float], List[str]]]]): - Metadata to set for the vector(s). [optional] + Metadata to merge with existing metadata on the vector(s). Fields specified will overwrite + existing fields with the same key, while fields not specified will remain unchanged. [optional] namespace (str): Namespace name where to update the vector(s). [optional] sparse_values: (Dict[str, Union[List[float], List[int]]]): Sparse values to update for the vector. Expected to be either a GRPCSparseValues object or a dict of the form: diff --git a/pinecone/grpc/utils.py b/pinecone/grpc/utils.py index 4be241308..fcb2d70b1 100644 --- a/pinecone/grpc/utils.py +++ b/pinecone/grpc/utils.py @@ -158,28 +158,19 @@ def parse_update_response( metadata = initial_metadata or {} response_info = extract_response_info(metadata) - # Extract matched_records and updated_records from response + # Extract matched_records from response matched_records = None - updated_records = None if isinstance(response, Message): - # GRPC response - convert to dict to extract matched_records and updated_records + # GRPC response - convert to dict to extract matched_records json_response = json_format.MessageToDict(response) matched_records = json_response.get("matchedRecords") or json_response.get( "matched_records" ) - updated_records = json_response.get("updatedRecords") or json_response.get( - "updated_records" - ) elif isinstance(response, dict): # Dict response - extract directly matched_records = response.get("matchedRecords") or response.get("matched_records") - updated_records = response.get("updatedRecords") or response.get("updated_records") - return UpdateResponse( - matched_records=matched_records, - updated_records=updated_records, - _response_info=response_info, - ) + return UpdateResponse(matched_records=matched_records, _response_info=response_info) def parse_delete_response( diff --git a/tests/integration/rest_asyncio/db/data/test_update.py b/tests/integration/rest_asyncio/db/data/test_update.py index 04be845cb..5f66c033d 100644 --- a/tests/integration/rest_asyncio/db/data/test_update.py +++ b/tests/integration/rest_asyncio/db/data/test_update.py @@ -71,7 +71,7 @@ async def test_update_metadata(self, index_host, dimension, target_namespace): await asyncio_idx.close() async def test_update_with_filter_and_dry_run(self, index_host, dimension, target_namespace): - """Test update with filter and dry_run=True to verify matched_records and updated_records are returned.""" + """Test update with filter and dry_run=True to verify matched_records is returned.""" asyncio_idx = build_asyncioindex_client(index_host) # Upsert vectors with different genres @@ -104,8 +104,6 @@ async def test_update_with_filter_and_dry_run(self, index_host, dimension, targe # Verify matched_records is returned and correct (5 comedy vectors) assert 
dry_run_response.matched_records is not None assert dry_run_response.matched_records == 5 - # In dry run, updated_records should be 0 or None since no records are actually updated - assert dry_run_response.updated_records is None or dry_run_response.updated_records == 0 # Verify the vectors were NOT actually updated (dry run) fetched_before = await asyncio_idx.fetch( @@ -121,12 +119,9 @@ async def test_update_with_filter_and_dry_run(self, index_host, dimension, targe namespace=target_namespace, ) - # Verify matched_records and updated_records are returned + # Verify matched_records is returned assert update_response.matched_records is not None assert update_response.matched_records == 5 - # updated_records should match the number of records actually updated (if returned by API) - if update_response.updated_records is not None: - assert update_response.updated_records == 5 await poll_until_lsn_reconciled_async( asyncio_idx, update_response._response_info, namespace=target_namespace diff --git a/tests/integration/rest_sync/db/data/test_update.py b/tests/integration/rest_sync/db/data/test_update.py index ffcc88c55..d8ea51f3f 100644 --- a/tests/integration/rest_sync/db/data/test_update.py +++ b/tests/integration/rest_sync/db/data/test_update.py @@ -10,7 +10,7 @@ def update_namespace(): class TestUpdate: def test_update_with_filter_and_dry_run(self, idx, update_namespace): - """Test update with filter and dry_run=True to verify matched_records and updated_records are returned.""" + """Test update with filter and dry_run=True to verify matched_records is returned.""" target_namespace = update_namespace # Upsert vectors with different genres @@ -39,8 +39,6 @@ def test_update_with_filter_and_dry_run(self, idx, update_namespace): # Verify matched_records is returned and correct (5 comedy vectors) assert dry_run_response.matched_records is not None assert dry_run_response.matched_records == 5 - # In dry run, updated_records should be 0 or None since no records are actually updated - assert dry_run_response.updated_records is None or dry_run_response.updated_records == 0 # Verify the vectors were NOT actually updated (dry run) fetched_before = idx.fetch(ids=["0", "2", "4", "6", "8"], namespace=target_namespace) @@ -54,12 +52,9 @@ def test_update_with_filter_and_dry_run(self, idx, update_namespace): namespace=target_namespace, ) - # Verify matched_records and updated_records are returned + # Verify matched_records is returned assert update_response.matched_records is not None assert update_response.matched_records == 5 - # updated_records should match the number of records actually updated (if returned by API) - if update_response.updated_records is not None: - assert update_response.updated_records == 5 poll_until_lsn_reconciled(idx, update_response._response_info, namespace=target_namespace)
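
Note for reviewers: a minimal usage sketch of the behavior this series adds, mirroring the integration tests above. The client construction is illustrative (the api_key and host are placeholders), but update(), filter, set_metadata, dry_run, and matched_records are the parameters and fields introduced in these patches:

    from pinecone import Pinecone

    pc = Pinecone(api_key="YOUR_API_KEY")  # placeholder credentials
    idx = pc.Index(host="https://example-index.svc.pinecone.io")  # placeholder host

    # Preview a bulk metadata update without writing anything:
    # dry_run=True returns matched_records but leaves records untouched.
    preview = idx.update(
        filter={"genre": {"$eq": "comedy"}},
        set_metadata={"status": "archived"},
        dry_run=True,
        namespace="my_namespace",
    )
    print(preview.matched_records)

    # Apply the update. set_metadata is merged into each matching record's
    # existing metadata: keys listed here are overwritten, all other keys
    # are left unchanged.
    response = idx.update(
        filter={"genre": {"$eq": "comedy"}},
        set_metadata={"status": "archived"},
        namespace="my_namespace",
    )
    print(response.matched_records)

As in the tests, exactly one of id or filter is expected per call; passing filter selects the bulk path, and after patch 6/6 the response carries only matched_records plus _response_info.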