41 changes: 29 additions & 12 deletions README.md
@@ -97,26 +97,43 @@ npm install

```


---

## Database Setup

### Google Cloud Setup

1. **Install Google Cloud CLI and Authenticate**:
#### 1. Install Google Cloud CLI and Authenticate

```bash
# Install Google Cloud CLI
curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz
tar -xf google-cloud-cli-linux-x86_64.tar.gz
./google-cloud-sdk/install.sh

# Initialize and authenticate
gcloud init
gcloud auth application-default login
```

---

### BigQuery and Vertex AI Configuration

Configuration details for BigQuery and Vertex AI services are provided in the `.env.template` file.
The backend requires specific environment variables to connect to **Google Cloud services**, including **BigQuery** and **Vertex AI**. Configure the following variables in your `.env` file:

| Variable | Description | How to Get It |
| ------------------- | ------------------------------------ | ------------------------------------------ |
| `GOOGLE_API_KEY` | API key for Gemini models | Generate from Google AI Studio |
| `GEMINI_USE_VERTEX` | Toggle for Vertex AI vs standard API | Set to `false` for local development |
| `GCP_PROJECT_ID` | Google Cloud Project ID | Required for Vertex AI and BigQuery |
| `BQ_DATASET_ID` | BigQuery dataset ID | Dataset containing KnowledgeSpace metadata |
| `INDEX_ENDPOINT_ID` | Vertex AI Vector Search endpoint | ID of deployed vector index for RAG |
| `ELASTIC_BASE_URL` | Elasticsearch base URL | URL of the text search engine |
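
For reference, a minimal local-development `.env` might look like the sketch below; every value is a placeholder, not a working credential or resource ID:

```bash
# Example .env for local development -- all values are placeholders
GOOGLE_API_KEY=your-google-ai-studio-key
GEMINI_USE_VERTEX=false
GCP_PROJECT_ID=my-gcp-project
BQ_DATASET_ID=knowledgespace_metadata
INDEX_ENDPOINT_ID=1234567890123456789
ELASTIC_BASE_URL=http://localhost:9200
```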

---


## Running the Application

108 changes: 85 additions & 23 deletions backend/agents.py
@@ -226,7 +226,43 @@ async def call_gemini_detect_intents(query: str, history: List[str]) -> List[str
intents = [i for i in out.get("intents", []) if i in allowed]
return list(dict.fromkeys(intents or [QueryIntent.DATA_DISCOVERY.value]))[:6]


async def call_gemini_extract_filters(query: str) -> Dict[str, str]:
"""
Extract structured filters like species, modality, or specific datasources.
"""
_require_llm_creds()
client = _get_genai_client()

# Common filters derived from your datasources_config.json
valid_keys = [
"species", "brain_region", "license", "authors",
"formats", "technique", "sex", "age"
]

prompt = (
f"Analyze this neuroscience query: '{query}'\n"
f"Extract structured filters if explicitly mentioned. Valid keys: {valid_keys}.\n"
"Return STRICT JSON only: { \"filters\": { \"key\": \"value\" } }\n"
"Rules:\n"
"- If the user asks for 'human' data, set \"species\": \"Homo sapiens\".\n"
"- If the user asks for 'mouse' or 'rat', set \"species\": \"Mus musculus\" or \"Rattus norvegicus\".\n"
"- Map 'fMRI', 'EEG' to \"technique\" or \"formats\" appropriately.\n"
"- If no specific filters are found, return { \"filters\": {} }.\n"
)

cfg = genai_types.GenerateContentConfig(
temperature=0.0,
max_output_tokens=256,
response_mime_type="application/json",
)

try:
resp = client.models.generate_content(model=FLASH_LITE_MODEL, contents=[prompt], config=cfg)
out = json.loads(resp.text or "{}")
return out.get("filters", {})
except Exception as e:
print(f"Filter extraction failed: {e}")
return {}
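
As an illustration of the intended behaviour (the exact model output varies from run to run), calling the helper on a concrete query would ideally yield a small filter dict, e.g.:

```python
# Hypothetical usage sketch; assumes Gemini credentials are configured as described above.
import asyncio

filters = asyncio.run(call_gemini_extract_filters("open-license human fMRI datasets"))
print(filters)
# Expected shape (illustrative only):
# {"species": "Homo sapiens", "technique": "fMRI", "license": "open"}
```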


async def call_gemini_for_final_synthesis(
@@ -320,35 +356,44 @@ class AgentState(TypedDict):
keywords: List[str]
effective_query: str
intents: List[str]
filters: Dict[str, str]
ks_results: List[dict]
vector_results: List[dict]
final_results: List[dict]
all_results: List[dict]
final_response: str


class KSSearchAgent:
async def run(self, query: str, keywords: List[str], want: int = 45) -> dict:
async def run(self, query: str, keywords: List[str], filters: Dict[str, str] = None, want: int = 45) -> dict:

general = []
fuzzy = []

enhanced_keywords = keywords.copy() if keywords else []

if filters:
print(f" -> Applying extracted filters: {filters}")
for key, val in filters.items():
if val and val not in enhanced_keywords:
enhanced_keywords.append(val)

try:
print(" -> Using parallel enrichment in KS search")
general = await general_search_async(query, top_k=min(want, 50), enrich_details=True)
general = general.get("combined_results", [])
general_data = await general_search_async(query, top_k=min(want, 50), enrich_details=True)
general = general_data.get("combined_results", [])
except Exception as e:
print(f"Async general search error, falling back to sync: {e}")
try:
general = general_search(query, top_k=min(want, 50), enrich_details=True).get("combined_results", [])
except Exception as e2:
print(f"Sync general search error: {e2}")
general = []
print(f"Async general search error: {e}")
general = []
try:
print(f" -> Running fuzzy search with keywords: {keywords}")
fuzzy = global_fuzzy_keyword_search(keywords, top_k=min(want, 50))
print(f" -> Running fuzzy search with keywords: {enhanced_keywords}")

fuzzy = global_fuzzy_keyword_search(enhanced_keywords, top_k=min(want, 50))
print(f" -> Fuzzy search returned {len(fuzzy)} results")
except Exception as e:
print(f"Fuzzy config search error: {e}")
print(f"Fuzzy search error: {e}")
fuzzy = []
return {"combined_results": (general + fuzzy)[: max(want, 15)]}

return {"combined_results": (general + fuzzy)[: max(want, 15)]}

class VectorSearchAgent:
def __init__(self):
@@ -381,14 +426,26 @@ async def extract_keywords_and_rewrite(state: AgentState) -> AgentState:
return {**state, "effective_query": state["query"], "keywords": [], "intents": intents0}

effective = await call_gemini_rewrite_with_history(state["query"], state.get("history", []))
keywords = await call_gemini_for_keywords(effective)
# Re-evaluate intents after rewrite (usually drops greeting if mixed)
intents = await call_gemini_detect_intents(effective, state.get("history", []))

# Run keyword, intent, and filter extraction in parallel for speed
keywords_task = call_gemini_for_keywords(effective)
intents_task = call_gemini_detect_intents(effective, state.get("history", []))
filters_task = call_gemini_extract_filters(effective) # <--- NEW TASK

keywords, intents, filters = await asyncio.gather(keywords_task, intents_task, filters_task)

print(f" -> Effective query: {effective}")
print(f" -> Keywords: {keywords}")
print(f" -> Intents: {intents}")
return {**state, "effective_query": effective, "keywords": keywords, "intents": intents}

print(f" -> Filters: {filters}") # <--- Log it

# Add filters to the returned state
return {
**state,
"effective_query": effective,
"keywords": keywords,
"intents": intents,
"filters": filters
}
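
For a query such as "human fMRI atlases", the state returned by this node would look roughly like the sketch below (illustrative values only; the concrete keywords, intents, and filters depend on the model response):

```python
# Illustrative shape of the enriched AgentState fields (values are assumptions).
example_update = {
    "effective_query": "human fMRI atlases",
    "keywords": ["fMRI", "atlas", "human brain"],
    "intents": [QueryIntent.DATA_DISCOVERY.value],
    "filters": {"species": "Homo sapiens", "technique": "fMRI"},
}
```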

# Global vector agent instance - initialized once per process
_global_vector_agent = None
@@ -410,9 +467,14 @@ async def execute_search(state: AgentState) -> Dict[str, Any]:
ks_agent = KSSearchAgent()
vec_agent = get_vector_agent() # Reuse the same instance


ks_task = asyncio.create_task(
ks_agent.run(state["effective_query"], state.get("keywords", []), want=want_pool)
)
ks_agent.run(
state["effective_query"],
state.get("keywords", []),
filters=state.get("filters", {}), # <--- Pass filters here
want=want_pool
))
vec_task = asyncio.create_task(
vec_agent.run(query=state["effective_query"], want=want_pool, context={"raw": True})
)
46 changes: 36 additions & 10 deletions backend/ks_search_tool.py
@@ -8,6 +8,7 @@
import re
from urllib.parse import urlparse
from difflib import SequenceMatcher
from rapidfuzz import fuzz, process


def tool(args_schema):
@@ -48,22 +49,28 @@ def __init__(self, description="", default_factory=None):

DATASOURCE_ID_TO_NAME = {v: k for k, v in DATASOURCE_NAME_TO_ID.items()}


def fuzzy_match(query: str, target: str, threshold: float = 0.8) -> bool:
if not query or not target:
return False
similarity = SequenceMatcher(None, query.lower(), target.lower()).ratio()
return similarity >= threshold
# token_sort_ratio handles "brain human" == "human brain" automatically
score = fuzz.token_sort_ratio(query.lower(), target.lower())
return score >= (threshold * 100)


def find_best_matches(query: str, candidates: List[str], threshold: float = 0.8, max_matches: int = 5) -> List[str]:
matches = []
for candidate in candidates:
if fuzzy_match(query, candidate, threshold):
similarity = SequenceMatcher(None, query.lower(), candidate.lower()).ratio()
matches.append((candidate, similarity))
matches.sort(key=lambda x: x[1], reverse=True)
return [match[0] for match in matches[:max_matches]]
if not query or not candidates:
return []

# RapidFuzz's process.extract is highly optimized
results = process.extract(
query.lower(),
candidates,
scorer=fuzz.token_sort_ratio,
limit=max_matches,
score_cutoff=threshold * 100
)
# results format is [(match, score, index), ...]
return [res[0] for res in results]
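
To see why `token_sort_ratio` fits this use case, a quick check of the rapidfuzz scorers (scores are floats on a 0-100 scale; the numbers below are approximate) behaves roughly as follows:

```python
from rapidfuzz import fuzz, process

# Tokens are sorted before comparison, so word order does not matter.
fuzz.token_sort_ratio("brain human", "human brain")   # 100.0
# Small typos still clear a 0.8 * 100 cutoff.
fuzz.token_sort_ratio("hippocampus", "hipocampus")    # ~95

# process.extract returns (match, score, index) tuples above score_cutoff,
# best match first.
process.extract(
    "mouse brain",
    ["human brain", "brain of the mouse", "liver proteomics"],
    scorer=fuzz.token_sort_ratio,
    limit=2,
    score_cutoff=60,
)
```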


def search_across_all_fields(query: str, all_configs: dict, threshold: float = 0.8) -> List[dict]:
@@ -463,3 +470,22 @@ def smart_knowledge_search(
results = _perform_search(target_id, q, dict(filters), all_configs)
return {"combined_results": results[:top_k]}
return general_search(q, top_k, enrich_details=True)

def fuzzy_match(query: str, target: str, threshold: float = 0.8) -> bool:
if not query or not target:
return False
# token_sort_ratio handles "brain human" == "human brain"
score = fuzz.token_sort_ratio(query.lower(), target.lower())
return score >= (threshold * 100)

def find_best_matches(query: str, candidates: List[str], threshold: float = 0.8, max_matches: int = 5) -> List[str]:
# process.extract is significantly optimized for list searching
results = process.extract(
query.lower(),
candidates,
scorer=fuzz.token_sort_ratio,
limit=max_matches,
score_cutoff=threshold * 100
)
# results format: [(match, score, index), ...]
return [res[0] for res in results]
1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
"langgraph>=0.6.4",
"matplotlib>=3.10.3",
"pandas>=2.3.1",
"rapidfuzz>=3.0.0",
"requests>=2.32.4",
"scikit-learn>=1.7.0",
"sentence-transformers>=3.0.0",