diff --git a/src/assistants/utils/README.md b/src/assistants/utils/README.md new file mode 100644 index 0000000..2f926d2 --- /dev/null +++ b/src/assistants/utils/README.md @@ -0,0 +1,20 @@ +# assistants.utils + +## Módulos usados em runtime (ativos) + +- `assistants.utils.data_store`: construção/carregamento de `FAISS` para contexto local/web. +- `assistants.utils.manager_tools`: decoradores utilitários para debug/retry. +- `assistants.utils.paths`: resolução centralizada de caminhos de dados no repositório. + +## Módulos experimentais/legados (arquivados) + +- `assistants.utils.archive.vector_store` +- `assistants.utils.archive.data_store_from_web_scraper` + +Esses módulos foram movidos para `archive/` e **não** devem ser usados em runtime padrão. +Os arquivos shim em `assistants.utils.vector_store` e +`assistants.utils.data_store_from_web_scraper` emitem aviso e levantam erro para evitar uso acidental. + +## Caminhos de dados padronizados + +Os caminhos de conhecimento agora são resolvidos por `assistants.utils.paths`, evitando strings hardcoded como `assistant/knowledge/...`. diff --git a/src/assistants/utils/__init__.py b/src/assistants/utils/__init__.py index e69de29..6e07924 100644 --- a/src/assistants/utils/__init__.py +++ b/src/assistants/utils/__init__.py @@ -0,0 +1,11 @@ +"""Utility modules for assistants runtime. + +Active runtime modules: +- manager_tools +- data_store +- paths + +Legacy/experimental modules were moved to `assistants.utils.archive` and are disabled by default. +""" + +__all__ = ["manager_tools", "data_store", "paths"] diff --git a/src/assistants/utils/archive/__init__.py b/src/assistants/utils/archive/__init__.py new file mode 100644 index 0000000..b063053 --- /dev/null +++ b/src/assistants/utils/archive/__init__.py @@ -0,0 +1,4 @@ +"""Archived legacy/experimental assistant utility modules. + +These modules are not part of the active runtime path. +""" diff --git a/src/assistants/utils/archive/data_store_from_web_scraper.py b/src/assistants/utils/archive/data_store_from_web_scraper.py new file mode 100644 index 0000000..679ce81 --- /dev/null +++ b/src/assistants/utils/archive/data_store_from_web_scraper.py @@ -0,0 +1,136 @@ +import os +import json +from assistants.utils.paths import WEB_SCRAPER_JSON_DIR +from langchain_openai import OpenAIEmbeddings +from typing import List +from langchain_community.vectorstores.faiss import VectorStore, Document, Embeddings, FAISS +from dotenv import load_dotenv +load_dotenv() + + +class VectorStoreBuilder: + """ + A class to build a vector store from a folder containing JSON documents. + """ + + def __init__(self, json_folder: str, embedding: Embeddings): + """ + Initialize the VectorStoreBuilder. + + Parameters: + - json_folder (str): The path to the folder containing JSON documents. + - embedding (Embeddings): The embedding model to use for vectorization. + """ + self.json_folder = json_folder + self.embedding = embedding + + def load_documents_with_no_metadatas(self) -> List[Document]: + """ + Load documents from JSON files in the specified folder. + + Returns: + - List[Document]: A list of Document objects representing the loaded documents. + """ + try: + documents = [] + for filename in os.listdir(self.json_folder): + if filename.endswith('.json'): + filepath = os.path.join(self.json_folder, filename) + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + title = data.get('title', '') + content = data.get('content', '') + metadata = data.get('metadata', {}) + documents.append(Document(title=title, page_content=content, metadata=metadata)) + + # Print examples of loaded documents for verification + # print("Loaded documents[0]:\n", documents[0], documents[8], documents[15]) + print("\nLoaded documents[15]:\n", documents[15]) + return documents + + except Exception as e: + print(f"Error loading documents: {e}") + return [] + + def load_documents(self) -> List[Document]: + """ + Load documents from JSON files in the specified folder. + + Returns: + - List[Document]: A list of Document objects representing the loaded documents. + """ + try: + documents = [] + for filename in os.listdir(self.json_folder): + if filename.endswith('.json'): + filepath = os.path.join(self.json_folder, filename) + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + title = data.get('title', '') + content = data.get('content', '') + metadata = data.get('metadata', {}) + # Add metadata to the content + content_with_metadata = f"Title: {title}\nMetadata: {metadata}\n Page Content: {content}" + documents.append(Document(title=title, page_content=content_with_metadata, metadata=metadata)) + + # Print examples of loaded documents for verification + # print("Loaded documents[0]:\n", documents[0], documents[8], documents[15]) + print("\nLoaded documents[15]:\n", documents[15]) + return documents + + except Exception as e: + print(f"Error loading documents: {e}") + return [] + + def build_vector_store(self) -> VectorStore: + """ + Build a vector store from loaded documents using FAISS. + + Returns: + - VectorStore: The built vector store. + """ + try: + documents = self.load_documents() + metadatas = [d.metadata for d in documents] + vector_store = FAISS.from_documents(documents, self.embedding) + # vector_store = FAISS.from_documents(documents, embedding=self.embedding, metadatas) + return vector_store + except Exception as e: + print(f"Error building vector store: {e}") + return None + + +def get_vector_store( + json_folder=str(WEB_SCRAPER_JSON_DIR)) -> VectorStore: + """ + Main function to get the vector store. + + Parameters: + - json_folder (str, optional): The folder containing JSON files to be used for building the vector store. + Defaults to the repository path `src/assistants/data/knowledge/flet/__extracted_code`. + + Returns: + - vector_store: The constructed vector store object or None if an error occurred. + """ + try: + # Initialize the embedding model + embedding = OpenAIEmbeddings() + + # Build the vector store + builder = VectorStoreBuilder(json_folder, embedding) + vector_store = builder.build_vector_store() + + # Check if the vector store was built successfully + if vector_store is not None: + print("Vector store built successfully!") + else: + print("Error building the vector store.") + except Exception as e: + print(f"Error during program execution: {e}") + finally: + print("Program completed.") + return vector_store + + +if __name__ == "__main__": + get_vector_store() diff --git a/src/assistants/utils/archive/vector_store.py b/src/assistants/utils/archive/vector_store.py new file mode 100644 index 0000000..210e9d2 --- /dev/null +++ b/src/assistants/utils/archive/vector_store.py @@ -0,0 +1,136 @@ +import os +import json +from assistants.utils.paths import WEB_SCRAPER_JSON_DIR +from langchain_openai import OpenAIEmbeddings +from typing import List +from langchain_community.vectorstores.faiss import VectorStore, Document, Embeddings, FAISS +from dotenv import load_dotenv +load_dotenv("env") + + +class VectorStoreBuilder: + """ + A class to build a vector store from a folder containing JSON documents. + """ + + def __init__(self, json_folder: str, embedding: Embeddings): + """ + Initialize the VectorStoreBuilder. + + Parameters: + - json_folder (str): The path to the folder containing JSON documents. + - embedding (Embeddings): The embedding model to use for vectorization. + """ + self.json_folder = json_folder + self.embedding = embedding + + def load_documents_with_no_metadatas(self) -> List[Document]: + """ + Load documents from JSON files in the specified folder. + + Returns: + - List[Document]: A list of Document objects representing the loaded documents. + """ + try: + documents = [] + for filename in os.listdir(self.json_folder): + if filename.endswith('.json'): + filepath = os.path.join(self.json_folder, filename) + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + title = data.get('title', '') + content = data.get('content', '') + metadata = data.get('metadata', {}) + documents.append(Document(title=title, page_content=content, metadata=metadata)) + + # Print examples of loaded documents for verification + # print("Loaded documents[0]:\n", documents[0], documents[8], documents[15]) + print("\nLoaded documents[15]:\n", documents[15]) + return documents + + except Exception as e: + print(f"Error loading documents: {e}") + return [] + + def load_documents(self) -> List[Document]: + """ + Load documents from JSON files in the specified folder. + + Returns: + - List[Document]: A list of Document objects representing the loaded documents. + """ + try: + documents = [] + for filename in os.listdir(self.json_folder): + if filename.endswith('.json'): + filepath = os.path.join(self.json_folder, filename) + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + title = data.get('title', '') + content = data.get('content', '') + metadata = data.get('metadata', {}) + # Add metadata to the content + content_with_metadata = f"Title: {title}\nMetadata: {metadata}\n Page Content: {content}" + documents.append(Document(title=title, page_content=content_with_metadata, metadata=metadata)) + + # Print examples of loaded documents for verification + # print("Loaded documents[0]:\n", documents[0], documents[8], documents[15]) + print("\nLoaded documents[15]:\n", documents[15]) + return documents + + except Exception as e: + print(f"Error loading documents: {e}") + return [] + + def build_vector_store(self) -> VectorStore: + """ + Build a vector store from loaded documents using FAISS. + + Returns: + - VectorStore: The built vector store. + """ + try: + documents = self.load_documents() + metadatas = [d.metadata for d in documents] + vector_store = FAISS.from_documents(documents, self.embedding) + # vector_store = FAISS.from_documents(documents, embedding=self.embedding, metadatas) + return vector_store + except Exception as e: + print(f"Error building vector store: {e}") + return None + + +def get_vector_store( + json_folder=str(WEB_SCRAPER_JSON_DIR)) -> VectorStore: + """ + Main function to get the vector store. + + Parameters: + - json_folder (str, optional): The folder containing JSON files to be used for building the vector store. + Defaults to the repository path `src/assistants/data/knowledge/flet/__extracted_code`. + + Returns: + - vector_store: The constructed vector store object or None if an error occurred. + """ + try: + # Initialize the embedding model + embedding = OpenAIEmbeddings() + + # Build the vector store + builder = VectorStoreBuilder(json_folder, embedding) + vector_store = builder.build_vector_store() + + # Check if the vector store was built successfully + if vector_store is not None: + print("Vector store built successfully!") + else: + print("Error building the vector store.") + except Exception as e: + print(f"Error during program execution: {e}") + finally: + print("Program completed.") + return vector_store + + +if __name__ == "__main__": + get_vector_store() diff --git a/src/assistants/utils/data_store.py b/src/assistants/utils/data_store.py index c92b1b0..db415d2 100644 --- a/src/assistants/utils/data_store.py +++ b/src/assistants/utils/data_store.py @@ -3,7 +3,12 @@ from langchain_community.document_loaders import DocusaurusLoader from langchain_openai import OpenAIEmbeddings from langchain_community.vectorstores.faiss import FAISS -from utils.manager_tools import ManagerTools as mt +from assistants.utils.manager_tools import ManagerTools as mt +from assistants.utils.paths import ( + DOCS_PICKLE_FILE, + SITE_DATA_FILE, + SITE_DATA_LIGHT_FILE +) import pickle import os @@ -27,11 +32,11 @@ class DataStore: PICKLE = "pickel" SITE_URL = "https://kobu.agency/" - site_datas_light = 'assistant/knowledge/data_store_files/default/site_datas_light.txt' - site_datas = 'assistant/knowledge/data_store_files/default/site_datas.txt' + site_datas_light = str(SITE_DATA_LIGHT_FILE) + site_datas = str(SITE_DATA_FILE) LOCAL_PATH = site_datas - docs_pickle_path = "assistant/knowledge/data_store_files/default/docs.pickle" + docs_pickle_path = str(DOCS_PICKLE_FILE) origin = LOCAL @classmethod diff --git a/src/assistants/utils/data_store_from_web_scraper.py b/src/assistants/utils/data_store_from_web_scraper.py index b4f15d2..6b7ce42 100644 --- a/src/assistants/utils/data_store_from_web_scraper.py +++ b/src/assistants/utils/data_store_from_web_scraper.py @@ -1,135 +1,20 @@ -import os -import json -from langchain_openai import OpenAIEmbeddings -from typing import List -from langchain_community.vectorstores.faiss import VectorStore, Document, Embeddings, FAISS -from dotenv import load_dotenv -load_dotenv() +"""Legacy module retained as a disabled shim. +Use `assistants.utils.archive.data_store_from_web_scraper` only for historical reference. +""" -class VectorStoreBuilder: - """ - A class to build a vector store from a folder containing JSON documents. - """ +import warnings - def __init__(self, json_folder: str, embedding: Embeddings): - """ - Initialize the VectorStoreBuilder. +warnings.warn( + "assistants.utils.data_store_from_web_scraper is legacy/experimental and disabled. " + "Use active runtime modules documented in assistants.utils/README.md.", + DeprecationWarning, + stacklevel=2, +) - Parameters: - - json_folder (str): The path to the folder containing JSON documents. - - embedding (Embeddings): The embedding model to use for vectorization. - """ - self.json_folder = json_folder - self.embedding = embedding - def load_documents_with_no_metadatas(self) -> List[Document]: - """ - Load documents from JSON files in the specified folder. - - Returns: - - List[Document]: A list of Document objects representing the loaded documents. - """ - try: - documents = [] - for filename in os.listdir(self.json_folder): - if filename.endswith('.json'): - filepath = os.path.join(self.json_folder, filename) - with open(filepath, 'r', encoding='utf-8') as f: - data = json.load(f) - title = data.get('title', '') - content = data.get('content', '') - metadata = data.get('metadata', {}) - documents.append(Document(title=title, page_content=content, metadata=metadata)) - - # Print examples of loaded documents for verification - # print("Loaded documents[0]:\n", documents[0], documents[8], documents[15]) - print("\nLoaded documents[15]:\n", documents[15]) - return documents - - except Exception as e: - print(f"Error loading documents: {e}") - return [] - - def load_documents(self) -> List[Document]: - """ - Load documents from JSON files in the specified folder. - - Returns: - - List[Document]: A list of Document objects representing the loaded documents. - """ - try: - documents = [] - for filename in os.listdir(self.json_folder): - if filename.endswith('.json'): - filepath = os.path.join(self.json_folder, filename) - with open(filepath, 'r', encoding='utf-8') as f: - data = json.load(f) - title = data.get('title', '') - content = data.get('content', '') - metadata = data.get('metadata', {}) - # Add metadata to the content - content_with_metadata = f"Title: {title}\nMetadata: {metadata}\n Page Content: {content}" - documents.append(Document(title=title, page_content=content_with_metadata, metadata=metadata)) - - # Print examples of loaded documents for verification - # print("Loaded documents[0]:\n", documents[0], documents[8], documents[15]) - print("\nLoaded documents[15]:\n", documents[15]) - return documents - - except Exception as e: - print(f"Error loading documents: {e}") - return [] - - def build_vector_store(self) -> VectorStore: - """ - Build a vector store from loaded documents using FAISS. - - Returns: - - VectorStore: The built vector store. - """ - try: - documents = self.load_documents() - metadatas = [d.metadata for d in documents] - vector_store = FAISS.from_documents(documents, self.embedding) - # vector_store = FAISS.from_documents(documents, embedding=self.embedding, metadatas) - return vector_store - except Exception as e: - print(f"Error building vector store: {e}") - return None - - -def get_vector_store( - json_folder='assistant/knowledge/web_scraper_files') -> VectorStore: - """ - Main function to get the vector store. - - Parameters: - - json_folder (str, optional): The folder containing JSON files to be used for building the vector store. - Defaults to 'assistant/knowledge/data_store_files/web_scraper_files'. - - Returns: - - vector_store: The constructed vector store object or None if an error occurred. - """ - try: - # Initialize the embedding model - embedding = OpenAIEmbeddings() - - # Build the vector store - builder = VectorStoreBuilder(json_folder, embedding) - vector_store = builder.build_vector_store() - - # Check if the vector store was built successfully - if vector_store is not None: - print("Vector store built successfully!") - else: - print("Error building the vector store.") - except Exception as e: - print(f"Error during program execution: {e}") - finally: - print("Program completed.") - return vector_store - - -if __name__ == "__main__": - get_vector_store() +def get_vector_store(*args, **kwargs): + raise RuntimeError( + "assistants.utils.data_store_from_web_scraper has been disabled and moved to " + "assistants.utils.archive.data_store_from_web_scraper" + ) diff --git a/src/assistants/utils/paths.py b/src/assistants/utils/paths.py new file mode 100644 index 0000000..c4d25be --- /dev/null +++ b/src/assistants/utils/paths.py @@ -0,0 +1,12 @@ +from pathlib import Path + +# Repository-aware paths for assistant knowledge assets. +SRC_ROOT = Path(__file__).resolve().parents[2] +KNOWLEDGE_ROOT = SRC_ROOT / "assistants" / "data" / "knowledge" +FLET_KNOWLEDGE_DIR = KNOWLEDGE_ROOT / "flet" +WEB_SCRAPER_JSON_DIR = FLET_KNOWLEDGE_DIR / "__extracted_code" + +# Existing corpus files used by the local TextLoader mode. +SITE_DATA_LIGHT_FILE = WEB_SCRAPER_JSON_DIR / "docs_controls_image.json" +SITE_DATA_FILE = WEB_SCRAPER_JSON_DIR / "docs_getting-started_flet-controls.json" +DOCS_PICKLE_FILE = FLET_KNOWLEDGE_DIR / "docs.pickle" diff --git a/src/assistants/utils/vector_store.py b/src/assistants/utils/vector_store.py index 61defd8..8b24ee4 100644 --- a/src/assistants/utils/vector_store.py +++ b/src/assistants/utils/vector_store.py @@ -1,135 +1,20 @@ -import os -import json -from langchain_openai import OpenAIEmbeddings -from typing import List -from langchain_community.vectorstores.faiss import VectorStore, Document, Embeddings, FAISS -from dotenv import load_dotenv -load_dotenv("env") +"""Legacy module retained as a disabled shim. +Use `assistants.utils.archive.vector_store` only for historical reference. +""" -class VectorStoreBuilder: - """ - A class to build a vector store from a folder containing JSON documents. - """ +import warnings - def __init__(self, json_folder: str, embedding: Embeddings): - """ - Initialize the VectorStoreBuilder. +warnings.warn( + "assistants.utils.vector_store is legacy/experimental and disabled. " + "Use active runtime modules documented in assistants.utils/README.md.", + DeprecationWarning, + stacklevel=2, +) - Parameters: - - json_folder (str): The path to the folder containing JSON documents. - - embedding (Embeddings): The embedding model to use for vectorization. - """ - self.json_folder = json_folder - self.embedding = embedding - def load_documents_with_no_metadatas(self) -> List[Document]: - """ - Load documents from JSON files in the specified folder. - - Returns: - - List[Document]: A list of Document objects representing the loaded documents. - """ - try: - documents = [] - for filename in os.listdir(self.json_folder): - if filename.endswith('.json'): - filepath = os.path.join(self.json_folder, filename) - with open(filepath, 'r', encoding='utf-8') as f: - data = json.load(f) - title = data.get('title', '') - content = data.get('content', '') - metadata = data.get('metadata', {}) - documents.append(Document(title=title, page_content=content, metadata=metadata)) - - # Print examples of loaded documents for verification - # print("Loaded documents[0]:\n", documents[0], documents[8], documents[15]) - print("\nLoaded documents[15]:\n", documents[15]) - return documents - - except Exception as e: - print(f"Error loading documents: {e}") - return [] - - def load_documents(self) -> List[Document]: - """ - Load documents from JSON files in the specified folder. - - Returns: - - List[Document]: A list of Document objects representing the loaded documents. - """ - try: - documents = [] - for filename in os.listdir(self.json_folder): - if filename.endswith('.json'): - filepath = os.path.join(self.json_folder, filename) - with open(filepath, 'r', encoding='utf-8') as f: - data = json.load(f) - title = data.get('title', '') - content = data.get('content', '') - metadata = data.get('metadata', {}) - # Add metadata to the content - content_with_metadata = f"Title: {title}\nMetadata: {metadata}\n Page Content: {content}" - documents.append(Document(title=title, page_content=content_with_metadata, metadata=metadata)) - - # Print examples of loaded documents for verification - # print("Loaded documents[0]:\n", documents[0], documents[8], documents[15]) - print("\nLoaded documents[15]:\n", documents[15]) - return documents - - except Exception as e: - print(f"Error loading documents: {e}") - return [] - - def build_vector_store(self) -> VectorStore: - """ - Build a vector store from loaded documents using FAISS. - - Returns: - - VectorStore: The built vector store. - """ - try: - documents = self.load_documents() - metadatas = [d.metadata for d in documents] - vector_store = FAISS.from_documents(documents, self.embedding) - # vector_store = FAISS.from_documents(documents, embedding=self.embedding, metadatas) - return vector_store - except Exception as e: - print(f"Error building vector store: {e}") - return None - - -def get_vector_store( - json_folder='assistant/knowledge/web_scraper_files') -> VectorStore: - """ - Main function to get the vector store. - - Parameters: - - json_folder (str, optional): The folder containing JSON files to be used for building the vector store. - Defaults to 'assistant/knowledge/data_store_files/web_scraper_files'. - - Returns: - - vector_store: The constructed vector store object or None if an error occurred. - """ - try: - # Initialize the embedding model - embedding = OpenAIEmbeddings() - - # Build the vector store - builder = VectorStoreBuilder(json_folder, embedding) - vector_store = builder.build_vector_store() - - # Check if the vector store was built successfully - if vector_store is not None: - print("Vector store built successfully!") - else: - print("Error building the vector store.") - except Exception as e: - print(f"Error during program execution: {e}") - finally: - print("Program completed.") - return vector_store - - -if __name__ == "__main__": - get_vector_store() +def get_vector_store(*args, **kwargs): + raise RuntimeError( + "assistants.utils.vector_store has been disabled and moved to " + "assistants.utils.archive.vector_store" + )