From 51d481797ad9f2e437310282e7a1f5db06cef14a Mon Sep 17 00:00:00 2001 From: Eleonore Vonck Date: Wed, 6 Aug 2025 11:05:28 -0700 Subject: [PATCH 1/4] Sentinel Documentation --- README.md | 14 ++++++---- src/sentinel/embeddings/sbert.py | 40 ---------------------------- src/sentinel/sentinel_local_index.py | 10 ++----- 3 files changed, 11 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 16fd318..f076299 100644 --- a/README.md +++ b/README.md @@ -109,11 +109,6 @@ saved_config = index.save( aws_access_key_id="YOUR_ACCESS_KEY_ID", # Optional if using environment credentials aws_secret_access_key="YOUR_SECRET_ACCESS_KEY" # Optional if using environment credentials ) - -# If you need to extract the model name from an existing SentenceTransformer instance (best effort): -from sentinel.embeddings.sbert import extract_model_name_from_sentence_transformer -model_name = extract_model_name_from_sentence_transformer(model) -# Returns the model name if it can be determined, or None otherwise ``` ## How It Works @@ -186,6 +181,15 @@ poetry run sphinx-build -b html source build/html Then open `docs/build/html/index.html` in your browser. +## Examples +To run the notebook examples +```bash +# Install with examples dependencies +poetry install --with examples +poetry install --extras=sbert +poetry run jupyter notebook +``` + ## License Apache License 2.0 diff --git a/src/sentinel/embeddings/sbert.py b/src/sentinel/embeddings/sbert.py index 278efad..6a776f7 100644 --- a/src/sentinel/embeddings/sbert.py +++ b/src/sentinel/embeddings/sbert.py @@ -46,46 +46,6 @@ def get_sentence_transformer_and_scaling_fn( return model, None - -def extract_model_name_from_sentence_transformer( - model: SentenceTransformer, -) -> Optional[str]: - """ - Attempt to extract the model name from a SentenceTransformer instance. - - This is a best-effort function as SentenceTransformer doesn't directly store - the original model name. It tries to infer it from the model's configuration. 
- - Args: - model: A SentenceTransformer model instance - - Returns: - A string representing the best guess at the model name, or - None if it cannot be determined - """ - # Try to get the name from the model's modules - if hasattr(model, "modules") and model.modules: - # Most SentenceTransformer models use a Transformer as the first module - if hasattr(model.modules[0], "auto_model") and hasattr( - model.modules[0].auto_model, "config" - ): - # Try to get the name from the config - if hasattr(model.modules[0].auto_model.config, "name_or_path"): - return model.modules[0].auto_model.config.name_or_path - - # Try to get from the model's save directory name - if hasattr(model, "get_sentence_embedding_dimension") and hasattr( - model, "get_config_dict" - ): - config = model.get_config_dict() - if "__path__" in config and config["__path__"]: - # Extract the final path component as the model name - return os.path.basename(os.path.normpath(config["__path__"])) - - # Return None if we can't determine the name - return None - - def e5_scaling_function(score: float) -> float: """ Scale the similarity score for E5 embeddings. diff --git a/src/sentinel/sentinel_local_index.py b/src/sentinel/sentinel_local_index.py index 1e8113d..8730bfb 100644 --- a/src/sentinel/sentinel_local_index.py +++ b/src/sentinel/sentinel_local_index.py @@ -93,10 +93,7 @@ def __init__( `model, scale_fn = get_sentence_transformer_and_scaling_fn(encoder_model_name_or_path)` When saving the index, you must provide the exact encoder_model_name_or_path - as SentenceTransformer doesn't store the original model name. If needed, you can use - `sentinel.embeddings.sbert.extract_model_name_from_sentence_transformer(model)` - to attempt extracting the model name, but this is a best-effort function that returns - None if it cannot determine the model name. + as SentenceTransformer doesn't store the original model name. Use the class method `load` to load an index from S3 or local storage. 
""" @@ -139,10 +136,7 @@ def save( path: Path to save the index to (local directory or S3 URI). encoder_model_name_or_path: Name or path of the sentence transformer encoder model used. This must be the exact name used to create the SentenceTransformer as it cannot be - reliably extracted from the model instance. If you need to try extracting the model name - from an existing instance, use - `sentinel.embeddings.sbert.extract_model_name_from_sentence_transformer(model)`, - which will return the model name if it can be determined or None otherwise. + reliably extracted from the model instance. aws_access_key_id: Optional AWS access key ID for S3 access. aws_secret_access_key: Optional AWS secret access key for S3 access. From 2d19f0ed9dcc2c0dcea25bfab70503772dc57642 Mon Sep 17 00:00:00 2001 From: Eleonore Vonck Date: Wed, 6 Aug 2025 11:34:59 -0700 Subject: [PATCH 2/4] Update documentation --- docs/source/sentinel.embeddings.rst | 20 +++++++++++++ docs/source/sentinel.io.rst | 28 ++++++++++++++++++ docs/source/sentinel.rst | 44 +++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+) create mode 100644 docs/source/sentinel.embeddings.rst create mode 100644 docs/source/sentinel.io.rst create mode 100644 docs/source/sentinel.rst diff --git a/docs/source/sentinel.embeddings.rst b/docs/source/sentinel.embeddings.rst new file mode 100644 index 0000000..837f145 --- /dev/null +++ b/docs/source/sentinel.embeddings.rst @@ -0,0 +1,20 @@ +sentinel.embeddings +=================== + +.. automodule:: sentinel.embeddings + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + + +sentinel.embeddings.sbert +------------------------- + +.. automodule:: sentinel.embeddings.sbert + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/sentinel.io.rst b/docs/source/sentinel.io.rst new file mode 100644 index 0000000..0d4da4b --- /dev/null +++ b/docs/source/sentinel.io.rst @@ -0,0 +1,28 @@ +sentinel.io +=========== + +.. 
automodule:: sentinel.io + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + + +sentinel.io.index_io +-------------------- + +.. automodule:: sentinel.io.index_io + :members: + :undoc-members: + :show-inheritance: + +sentinel.io.saved_index_config +------------------------------ + +.. automodule:: sentinel.io.saved_index_config + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/sentinel.rst b/docs/source/sentinel.rst new file mode 100644 index 0000000..b0f78d7 --- /dev/null +++ b/docs/source/sentinel.rst @@ -0,0 +1,44 @@ +sentinel +======== + +.. automodule:: sentinel + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + + +sentinel.score_formulae +----------------------- + +.. automodule:: sentinel.score_formulae + :members: + :undoc-members: + :show-inheritance: + +sentinel.score_types +-------------------- + +.. automodule:: sentinel.score_types + :members: + :undoc-members: + :show-inheritance: + +sentinel.sentinel_local_index +----------------------------- + +.. automodule:: sentinel.sentinel_local_index + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. 
toctree:: + :maxdepth: 4 + + sentinel.embeddings + sentinel.io From 317ec95c4b8aa48b52acdd54b2c29c078bc6242f Mon Sep 17 00:00:00 2001 From: Eleonore Vonck Date: Wed, 6 Aug 2025 11:55:02 -0700 Subject: [PATCH 3/4] Update documentation --- docs/source/sriracha.embeddings.rst | 20 ------------- docs/source/sriracha.io.rst | 28 ------------------ docs/source/sriracha.rst | 44 ----------------------------- 3 files changed, 92 deletions(-) delete mode 100644 docs/source/sriracha.embeddings.rst delete mode 100644 docs/source/sriracha.io.rst delete mode 100644 docs/source/sriracha.rst diff --git a/docs/source/sriracha.embeddings.rst b/docs/source/sriracha.embeddings.rst deleted file mode 100644 index 837f145..0000000 --- a/docs/source/sriracha.embeddings.rst +++ /dev/null @@ -1,20 +0,0 @@ -sentinel.embeddings -=================== - -.. automodule:: sentinel.embeddings - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - - -sentinel.embeddings.sbert -------------------------- - -.. automodule:: sentinel.embeddings.sbert - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/source/sriracha.io.rst b/docs/source/sriracha.io.rst deleted file mode 100644 index 0d4da4b..0000000 --- a/docs/source/sriracha.io.rst +++ /dev/null @@ -1,28 +0,0 @@ -sentinel.io -=========== - -.. automodule:: sentinel.io - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - - -sentinel.io.index_io --------------------- - -.. automodule:: sentinel.io.index_io - :members: - :undoc-members: - :show-inheritance: - -sentinel.io.saved_index_config ------------------------------- - -.. automodule:: sentinel.io.saved_index_config - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/source/sriracha.rst b/docs/source/sriracha.rst deleted file mode 100644 index 4d3bdaa..0000000 --- a/docs/source/sriracha.rst +++ /dev/null @@ -1,44 +0,0 @@ -sentinel -======== - -.. 
automodule:: sentinel - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - - -sentinel.score_formulae ------------------------ - -.. automodule:: sentinel.score_formulae - :members: - :undoc-members: - :show-inheritance: - -sentinel.score_types --------------------- - -.. automodule:: sentinel.score_types - :members: - :undoc-members: - :show-inheritance: - -sentinel.sentinel ------------------------------ - -.. automodule:: sentinel.sentinel - :members: - :undoc-members: - :show-inheritance: - -Subpackages ----------- - -.. toctree:: - :maxdepth: 4 - - sentinel.embeddings - sentinel.io From 21b2df61f4923ff0c371dedf3494b69c6b8042b5 Mon Sep 17 00:00:00 2001 From: Eleonore Vonck Date: Wed, 6 Aug 2025 12:06:50 -0700 Subject: [PATCH 4/4] Remove Pages github action --- .github/workflows/docs.yml | 55 -------------------------------------- 1 file changed, 55 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index ce5bb5f..3ae388d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -73,58 +73,3 @@ jobs: body: '⚠️ **Documentation is out of sync with the code!**\n\nPlease run `python docs/generate_docs.py` and commit the updated documentation files.' 
}) - build: - runs-on: ubuntu-latest - if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - needs: [check-docs] - # Make this job informational only for PRs - don't block PR - continue-on-error: ${{ github.event_name == 'pull_request' }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - cache: 'pip' - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python3 - - echo "$HOME/.local/bin" >> $GITHUB_PATH - - - name: Install dependencies - run: | - poetry install --with docs - - - name: Generate RST files - run: | - cd ${{ github.workspace }} - poetry run python docs/generate_docs.py - - - name: Build documentation - run: | - cd ${{ github.workspace }}/docs - poetry run sphinx-build -b html source build/html - - - name: Setup Pages - uses: actions/configure-pages@v4 - - - name: Upload artifact - uses: actions/upload-pages-artifact@v3 - with: - path: ${{ github.workspace }}/docs/build/html - - deploy: - # Only deploy on push to main or workflow_dispatch, skip on PRs - if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch' - needs: build - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4