Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 0 additions & 55 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,58 +73,3 @@ jobs:
body: '⚠️ **Documentation is out of sync with the code!**\n\nPlease run `python docs/generate_docs.py` and commit the updated documentation files.'
})

build:
runs-on: ubuntu-latest
if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
needs: [check-docs]
# Make this job informational only for PRs - don't block PR
continue-on-error: ${{ github.event_name == 'pull_request' }}
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install Poetry
run: |
curl -sSL https://install.python-poetry.org | python3 -
echo "$HOME/.local/bin" >> $GITHUB_PATH

- name: Install dependencies
run: |
poetry install --with docs

- name: Generate RST files
run: |
cd ${{ github.workspace }}
poetry run python docs/generate_docs.py

- name: Build documentation
run: |
cd ${{ github.workspace }}/docs
poetry run sphinx-build -b html source build/html

- name: Setup Pages
uses: actions/configure-pages@v4

- name: Upload artifact
uses: actions/upload-pages-artifact@v3
with:
path: ${{ github.workspace }}/docs/build/html

deploy:
# Only deploy on push to main or workflow_dispatch, skip on PRs
if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch'
needs: build
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
runs-on: ubuntu-latest
steps:
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,6 @@ saved_config = index.save(
aws_access_key_id="YOUR_ACCESS_KEY_ID", # Optional if using environment credentials
aws_secret_access_key="YOUR_SECRET_ACCESS_KEY" # Optional if using environment credentials
)

# If you need to extract the model name from an existing SentenceTransformer instance (best effort):
from sentinel.embeddings.sbert import extract_model_name_from_sentence_transformer
model_name = extract_model_name_from_sentence_transformer(model)
# Returns the model name if it can be determined, or None otherwise
```

## How It Works
Expand Down Expand Up @@ -186,6 +181,15 @@ poetry run sphinx-build -b html source build/html

Then open `docs/build/html/index.html` in your browser.

## Examples
To run the notebook examples:
```bash
# Install with examples dependencies
poetry install --with examples
poetry install --extras=sbert
poetry run jupyter notebook
```

## License

Apache License 2.0
File renamed without changes.
4 changes: 2 additions & 2 deletions docs/source/sriracha.rst → docs/source/sentinel.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ sentinel.score_types
:undoc-members:
:show-inheritance:

sentinel.sentinel
sentinel.sentinel_local_index
-----------------------------

.. automodule:: sentinel.sentinel
.. automodule:: sentinel.sentinel_local_index
:members:
:undoc-members:
:show-inheritance:
Expand Down
40 changes: 0 additions & 40 deletions src/sentinel/embeddings/sbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,46 +46,6 @@ def get_sentence_transformer_and_scaling_fn(

return model, None


def extract_model_name_from_sentence_transformer(
    model: "SentenceTransformer",
) -> Optional[str]:
    """
    Attempt to extract the model name from a SentenceTransformer instance.

    This is a best-effort function as SentenceTransformer doesn't directly store
    the original model name. Two sources are tried, in order:

    1. ``auto_model.config.name_or_path`` of the model's first submodule.
    2. The final path component of the ``"__path__"`` entry returned by
       ``model.get_config_dict()``.

    Args:
        model: A SentenceTransformer model instance (any object exposing the
            same attributes is accepted).

    Returns:
        A string representing the best guess at the model name, or
        None if it cannot be determined. An empty ``name_or_path`` is treated
        as "not determined" rather than returned.
    """
    # Locate the first submodule. On torch.nn.Module subclasses `modules` is a
    # *method* (an iterator factory), so `model.modules[0]` would raise
    # TypeError; container-style models are indexed directly instead.
    # A plain list-like `modules` attribute is still supported.
    submodules = getattr(model, "modules", None)
    first_module = None
    try:
        if callable(submodules):
            first_module = model[0]
        elif submodules:
            first_module = submodules[0]
    except (TypeError, IndexError, KeyError):
        # Not indexable or empty: fall through to the path-based lookup.
        first_module = None

    # Most SentenceTransformer models use a Transformer as the first module,
    # whose underlying model config records the name it was loaded from.
    auto_model = getattr(first_module, "auto_model", None)
    model_config = getattr(auto_model, "config", None)
    name_or_path = getattr(model_config, "name_or_path", None)
    if name_or_path:
        return name_or_path

    # Fall back to the directory name the model was loaded from, if recorded.
    if hasattr(model, "get_sentence_embedding_dimension") and hasattr(
        model, "get_config_dict"
    ):
        config = model.get_config_dict()
        if "__path__" in config and config["__path__"]:
            # Extract the final path component as the model name
            return os.path.basename(os.path.normpath(config["__path__"]))

    # Return None if we can't determine the name
    return None


def e5_scaling_function(score: float) -> float:
"""
Scale the similarity score for E5 embeddings.
Expand Down
10 changes: 2 additions & 8 deletions src/sentinel/sentinel_local_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,7 @@ def __init__(
`model, scale_fn = get_sentence_transformer_and_scaling_fn(encoder_model_name_or_path)`

When saving the index, you must provide the exact encoder_model_name_or_path
as SentenceTransformer doesn't store the original model name. If needed, you can use
`sentinel.embeddings.sbert.extract_model_name_from_sentence_transformer(model)`
to attempt extracting the model name, but this is a best-effort function that returns
None if it cannot determine the model name.
as SentenceTransformer doesn't store the original model name.

Use the class method `load` to load an index from S3 or local storage.
"""
Expand Down Expand Up @@ -139,10 +136,7 @@ def save(
path: Path to save the index to (local directory or S3 URI).
encoder_model_name_or_path: Name or path of the sentence transformer encoder model used.
This must be the exact name used to create the SentenceTransformer as it cannot be
reliably extracted from the model instance. If you need to try extracting the model name
from an existing instance, use
`sentinel.embeddings.sbert.extract_model_name_from_sentence_transformer(model)`,
which will return the model name if it can be determined or None otherwise.
reliably extracted from the model instance.
aws_access_key_id: Optional AWS access key ID for S3 access.
aws_secret_access_key: Optional AWS secret access key for S3 access.

Expand Down