diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index ce5bb5f..3ae388d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -73,58 +73,3 @@ jobs: body: '⚠️ **Documentation is out of sync with the code!**\n\nPlease run `python docs/generate_docs.py` and commit the updated documentation files.' }) - build: - runs-on: ubuntu-latest - if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - needs: [check-docs] - # Make this job informational only for PRs - don't block PR - continue-on-error: ${{ github.event_name == 'pull_request' }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - cache: 'pip' - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python3 - - echo "$HOME/.local/bin" >> $GITHUB_PATH - - - name: Install dependencies - run: | - poetry install --with docs - - - name: Generate RST files - run: | - cd ${{ github.workspace }} - poetry run python docs/generate_docs.py - - - name: Build documentation - run: | - cd ${{ github.workspace }}/docs - poetry run sphinx-build -b html source build/html - - - name: Setup Pages - uses: actions/configure-pages@v4 - - - name: Upload artifact - uses: actions/upload-pages-artifact@v3 - with: - path: ${{ github.workspace }}/docs/build/html - - deploy: - # Only deploy on push to main or workflow_dispatch, skip on PRs - if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch' - needs: build - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 diff --git a/README.md b/README.md index 16fd318..f076299 100644 --- a/README.md +++ b/README.md @@ -109,11 +109,6 @@ saved_config = index.save( 
aws_access_key_id="YOUR_ACCESS_KEY_ID", # Optional if using environment credentials aws_secret_access_key="YOUR_SECRET_ACCESS_KEY" # Optional if using environment credentials ) - -# If you need to extract the model name from an existing SentenceTransformer instance (best effort): -from sentinel.embeddings.sbert import extract_model_name_from_sentence_transformer -model_name = extract_model_name_from_sentence_transformer(model) -# Returns the model name if it can be determined, or None otherwise ``` ## How It Works @@ -186,6 +181,15 @@ poetry run sphinx-build -b html source build/html Then open `docs/build/html/index.html` in your browser. +## Examples +To run the notebook examples: +```bash +# Install with examples dependencies +poetry install --with examples +poetry install --extras=sbert +poetry run jupyter notebook +``` + ## License Apache License 2.0 diff --git a/docs/source/sriracha.embeddings.rst b/docs/source/sentinel.embeddings.rst similarity index 100% rename from docs/source/sriracha.embeddings.rst rename to docs/source/sentinel.embeddings.rst diff --git a/docs/source/sriracha.io.rst b/docs/source/sentinel.io.rst similarity index 100% rename from docs/source/sriracha.io.rst rename to docs/source/sentinel.io.rst diff --git a/docs/source/sriracha.rst b/docs/source/sentinel.rst similarity index 88% rename from docs/source/sriracha.rst rename to docs/source/sentinel.rst index 4d3bdaa..b0f78d7 100644 --- a/docs/source/sriracha.rst +++ b/docs/source/sentinel.rst @@ -26,10 +26,10 @@ sentinel.score_types :undoc-members: :show-inheritance: -sentinel.sentinel +sentinel.sentinel_local_index ----------------------------- -.. 
automodule:: sentinel.sentinel_local_index :members: :undoc-members: :show-inheritance: diff --git a/src/sentinel/embeddings/sbert.py b/src/sentinel/embeddings/sbert.py index 278efad..6a776f7 100644 --- a/src/sentinel/embeddings/sbert.py +++ b/src/sentinel/embeddings/sbert.py @@ -46,46 +46,6 @@ def get_sentence_transformer_and_scaling_fn( return model, None - -def extract_model_name_from_sentence_transformer( - model: SentenceTransformer, -) -> Optional[str]: - """ - Attempt to extract the model name from a SentenceTransformer instance. - - This is a best-effort function as SentenceTransformer doesn't directly store - the original model name. It tries to infer it from the model's configuration. - - Args: - model: A SentenceTransformer model instance - - Returns: - A string representing the best guess at the model name, or - None if it cannot be determined - """ - # Try to get the name from the model's modules - if hasattr(model, "modules") and model.modules: - # Most SentenceTransformer models use a Transformer as the first module - if hasattr(model.modules[0], "auto_model") and hasattr( - model.modules[0].auto_model, "config" - ): - # Try to get the name from the config - if hasattr(model.modules[0].auto_model.config, "name_or_path"): - return model.modules[0].auto_model.config.name_or_path - - # Try to get from the model's save directory name - if hasattr(model, "get_sentence_embedding_dimension") and hasattr( - model, "get_config_dict" - ): - config = model.get_config_dict() - if "__path__" in config and config["__path__"]: - # Extract the final path component as the model name - return os.path.basename(os.path.normpath(config["__path__"])) - - # Return None if we can't determine the name - return None - - def e5_scaling_function(score: float) -> float: """ Scale the similarity score for E5 embeddings. 
diff --git a/src/sentinel/sentinel_local_index.py b/src/sentinel/sentinel_local_index.py index 1e8113d..8730bfb 100644 --- a/src/sentinel/sentinel_local_index.py +++ b/src/sentinel/sentinel_local_index.py @@ -93,10 +93,7 @@ def __init__( `model, scale_fn = get_sentence_transformer_and_scaling_fn(encoder_model_name_or_path)` When saving the index, you must provide the exact encoder_model_name_or_path - as SentenceTransformer doesn't store the original model name. If needed, you can use - `sentinel.embeddings.sbert.extract_model_name_from_sentence_transformer(model)` - to attempt extracting the model name, but this is a best-effort function that returns - None if it cannot determine the model name. + as SentenceTransformer doesn't store the original model name. Use the class method `load` to load an index from S3 or local storage. """ @@ -139,10 +136,7 @@ def save( path: Path to save the index to (local directory or S3 URI). encoder_model_name_or_path: Name or path of the sentence transformer encoder model used. This must be the exact name used to create the SentenceTransformer as it cannot be - reliably extracted from the model instance. If you need to try extracting the model name - from an existing instance, use - `sentinel.embeddings.sbert.extract_model_name_from_sentence_transformer(model)`, - which will return the model name if it can be determined or None otherwise. + reliably extracted from the model instance. aws_access_key_id: Optional AWS access key ID for S3 access. aws_secret_access_key: Optional AWS secret access key for S3 access.