41 changes: 29 additions & 12 deletions README.md
@@ -97,26 +97,43 @@ npm install

```


---

## Database Setup

### Google Cloud Setup

1. **Install Google Cloud CLI and Authenticate**:
#### 1. Install Google Cloud CLI and Authenticate

```bash
# Install Google Cloud CLI
curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz
tar -xf google-cloud-cli-linux-x86_64.tar.gz
./google-cloud-sdk/install.sh

# Initialize and authenticate
gcloud init
gcloud auth application-default login
```

---

### BigQuery and Vertex AI Configuration

Configuration details for BigQuery and Vertex AI services are provided in the `.env.template` file.
The backend requires specific environment variables to connect to **Google Cloud services**, including **BigQuery** and **Vertex AI**. Configure the following variables in your `.env` file:

| Variable | Description | How to Get It |
| ------------------- | ------------------------------------ | ------------------------------------------ |
| `GOOGLE_API_KEY` | API key for Gemini models | Generate from Google AI Studio |
| `GEMINI_USE_VERTEX` | Toggle for Vertex AI vs standard API | Set to `false` for local development |
| `GCP_PROJECT_ID` | Google Cloud Project ID | Required for Vertex AI and BigQuery |
| `BQ_DATASET_ID` | BigQuery dataset ID | Dataset containing KnowledgeSpace metadata |
| `INDEX_ENDPOINT_ID` | Vertex AI Vector Search endpoint | ID of deployed vector index for RAG |
| `ELASTIC_BASE_URL` | Elasticsearch base URL | URL of the text search engine |
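
For reference, a minimal local-development `.env` might look like the sketch below; every value is a placeholder, not a working credential or resource ID:

```bash
# Example .env for local development -- all values are placeholders
GOOGLE_API_KEY=your-google-ai-studio-key
GEMINI_USE_VERTEX=false
GCP_PROJECT_ID=my-gcp-project
BQ_DATASET_ID=knowledgespace_metadata
INDEX_ENDPOINT_ID=1234567890123456789
ELASTIC_BASE_URL=http://localhost:9200
```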

---


## Running the Application

108 changes: 85 additions & 23 deletions backend/agents.py
@@ -226,7 +226,43 @@ async def call_gemini_detect_intents(query: str, history: List[str]) -> List[str
intents = [i for i in out.get("intents", []) if i in allowed]
return list(dict.fromkeys(intents or [QueryIntent.DATA_DISCOVERY.value]))[:6]


async def call_gemini_extract_filters(query: str) -> Dict[str, str]:
"""
Extract structured filters like species, modality, or specific datasources.
"""
_require_llm_creds()
client = _get_genai_client()

# Common filters derived from your datasources_config.json
valid_keys = [
"species", "brain_region", "license", "authors",
"formats", "technique", "sex", "age"
]

prompt = (
f"Analyze this neuroscience query: '{query}'\n"
f"Extract structured filters if explicitly mentioned. Valid keys: {valid_keys}.\n"
"Return STRICT JSON only: { \"filters\": { \"key\": \"value\" } }\n"
"Rules:\n"
"- If the user asks for 'human' data, set \"species\": \"Homo sapiens\".\n"
"- If the user asks for 'mouse' or 'rat', set \"species\": \"Mus musculus\" or \"Rattus norvegicus\".\n"
"- Map 'fMRI', 'EEG' to \"technique\" or \"formats\" appropriately.\n"
"- If no specific filters are found, return { \"filters\": {} }.\n"
)

cfg = genai_types.GenerateContentConfig(
temperature=0.0,
max_output_tokens=256,
response_mime_type="application/json",
)

try:
resp = client.models.generate_content(model=FLASH_LITE_MODEL, contents=[prompt], config=cfg)
out = json.loads(resp.text or "{}")
return out.get("filters", {})
except Exception as e:
print(f"Filter extraction failed: {e}")
return {}
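
As an illustration of the intended behaviour (the exact model output varies from run to run), calling the helper on a concrete query would ideally yield a small filter dict, e.g.:

```python
# Hypothetical usage sketch; assumes Gemini credentials are configured as described above.
import asyncio

filters = asyncio.run(call_gemini_extract_filters("open-license human fMRI datasets"))
print(filters)
# Expected shape (illustrative only):
# {"species": "Homo sapiens", "technique": "fMRI", "license": "open"}
```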


async def call_gemini_for_final_synthesis(
@@ -320,35 +356,44 @@ class AgentState(TypedDict):
keywords: List[str]
effective_query: str
intents: List[str]
filters: Dict[str, str]
ks_results: List[dict]
vector_results: List[dict]
final_results: List[dict]
all_results: List[dict]
final_response: str


class KSSearchAgent:
async def run(self, query: str, keywords: List[str], want: int = 45) -> dict:
async def run(self, query: str, keywords: List[str], filters: Dict[str, str] = None, want: int = 45) -> dict:

general = []
fuzzy = []

enhanced_keywords = keywords.copy() if keywords else []

if filters:
print(f" -> Applying extracted filters: {filters}")
for key, val in filters.items():
if val and val not in enhanced_keywords:
enhanced_keywords.append(val)

try:
print(" -> Using parallel enrichment in KS search")
general = await general_search_async(query, top_k=min(want, 50), enrich_details=True)
general = general.get("combined_results", [])
general_data = await general_search_async(query, top_k=min(want, 50), enrich_details=True)
general = general_data.get("combined_results", [])
except Exception as e:
print(f"Async general search error, falling back to sync: {e}")
try:
general = general_search(query, top_k=min(want, 50), enrich_details=True).get("combined_results", [])
except Exception as e2:
print(f"Sync general search error: {e2}")
general = []
print(f"Async general search error: {e}")
general = []
try:
print(f" -> Running fuzzy search with keywords: {keywords}")
fuzzy = global_fuzzy_keyword_search(keywords, top_k=min(want, 50))
print(f" -> Running fuzzy search with keywords: {enhanced_keywords}")

fuzzy = global_fuzzy_keyword_search(enhanced_keywords, top_k=min(want, 50))
print(f" -> Fuzzy search returned {len(fuzzy)} results")
except Exception as e:
print(f"Fuzzy config search error: {e}")
print(f"Fuzzy search error: {e}")
fuzzy = []
return {"combined_results": (general + fuzzy)[: max(want, 15)]}

return {"combined_results": (general + fuzzy)[: max(want, 15)]}

class VectorSearchAgent:
def __init__(self):
@@ -381,14 +426,26 @@ async def extract_keywords_and_rewrite(state: AgentState) -> AgentState:
return {**state, "effective_query": state["query"], "keywords": [], "intents": intents0}

effective = await call_gemini_rewrite_with_history(state["query"], state.get("history", []))
keywords = await call_gemini_for_keywords(effective)
# Re-evaluate intents after rewrite (usually drops greeting if mixed)
intents = await call_gemini_detect_intents(effective, state.get("history", []))

# Run keyword, intent, and filter extraction in parallel for speed
keywords_task = call_gemini_for_keywords(effective)
intents_task = call_gemini_detect_intents(effective, state.get("history", []))
filters_task = call_gemini_extract_filters(effective) # <--- NEW TASK

keywords, intents, filters = await asyncio.gather(keywords_task, intents_task, filters_task)

print(f" -> Effective query: {effective}")
print(f" -> Keywords: {keywords}")
print(f" -> Intents: {intents}")
return {**state, "effective_query": effective, "keywords": keywords, "intents": intents}

print(f" -> Filters: {filters}") # <--- Log it

# Add filters to the returned state
return {
**state,
"effective_query": effective,
"keywords": keywords,
"intents": intents,
"filters": filters
}
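
For a query such as "human fMRI atlases", the state returned by this node would look roughly like the sketch below (illustrative values only; the concrete keywords, intents, and filters depend on the model response):

```python
# Illustrative shape of the enriched AgentState fields (values are assumptions).
example_update = {
    "effective_query": "human fMRI atlases",
    "keywords": ["fMRI", "atlas", "human brain"],
    "intents": [QueryIntent.DATA_DISCOVERY.value],
    "filters": {"species": "Homo sapiens", "technique": "fMRI"},
}
```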

# Global vector agent instance - initialized once per process
_global_vector_agent = None
@@ -410,9 +467,14 @@ async def execute_search(state: AgentState) -> Dict[str, Any]:
ks_agent = KSSearchAgent()
vec_agent = get_vector_agent() # Reuse the same instance


ks_task = asyncio.create_task(
ks_agent.run(state["effective_query"], state.get("keywords", []), want=want_pool)
)
ks_agent.run(
state["effective_query"],
state.get("keywords", []),
filters=state.get("filters", {}), # <--- Pass filters here
want=want_pool
))
vec_task = asyncio.create_task(
vec_agent.run(query=state["effective_query"], want=want_pool, context={"raw": True})
)
46 changes: 36 additions & 10 deletions backend/ks_search_tool.py
@@ -8,6 +8,7 @@
import re
from urllib.parse import urlparse
from difflib import SequenceMatcher
from rapidfuzz import fuzz, process


def tool(args_schema):
@@ -48,22 +49,28 @@ def __init__(self, description="", default_factory=None):

DATASOURCE_ID_TO_NAME = {v: k for k, v in DATASOURCE_NAME_TO_ID.items()}


def fuzzy_match(query: str, target: str, threshold: float = 0.8) -> bool:
if not query or not target:
return False
similarity = SequenceMatcher(None, query.lower(), target.lower()).ratio()
return similarity >= threshold
# token_sort_ratio handles "brain human" == "human brain" automatically
score = fuzz.token_sort_ratio(query.lower(), target.lower())
return score >= (threshold * 100)


def find_best_matches(query: str, candidates: List[str], threshold: float = 0.8, max_matches: int = 5) -> List[str]:
matches = []
for candidate in candidates:
if fuzzy_match(query, candidate, threshold):
similarity = SequenceMatcher(None, query.lower(), candidate.lower()).ratio()
matches.append((candidate, similarity))
matches.sort(key=lambda x: x[1], reverse=True)
return [match[0] for match in matches[:max_matches]]
if not query or not candidates:
return []

# RapidFuzz's process.extract is highly optimized
results = process.extract(
query.lower(),
candidates,
scorer=fuzz.token_sort_ratio,
limit=max_matches,
score_cutoff=threshold * 100
)
# results format is [(match, score, index), ...]
return [res[0] for res in results]
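
To see why `token_sort_ratio` fits this use case, a quick check of the rapidfuzz scorers (scores are floats on a 0-100 scale; the numbers below are approximate) behaves roughly as follows:

```python
from rapidfuzz import fuzz, process

# Tokens are sorted before comparison, so word order does not matter.
fuzz.token_sort_ratio("brain human", "human brain")   # 100.0
# Small typos still clear a 0.8 * 100 cutoff.
fuzz.token_sort_ratio("hippocampus", "hipocampus")    # ~95

# process.extract returns (match, score, index) tuples above score_cutoff,
# best match first.
process.extract(
    "mouse brain",
    ["human brain", "brain of the mouse", "liver proteomics"],
    scorer=fuzz.token_sort_ratio,
    limit=2,
    score_cutoff=60,
)
```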


def search_across_all_fields(query: str, all_configs: dict, threshold: float = 0.8) -> List[dict]:
@@ -463,3 +470,22 @@ def smart_knowledge_search(
results = _perform_search(target_id, q, dict(filters), all_configs)
return {"combined_results": results[:top_k]}
return general_search(q, top_k, enrich_details=True)

def fuzzy_match(query: str, target: str, threshold: float = 0.8) -> bool:
if not query or not target:
return False
# token_sort_ratio handles "brain human" == "human brain"
score = fuzz.token_sort_ratio(query.lower(), target.lower())
return score >= (threshold * 100)

def find_best_matches(query: str, candidates: List[str], threshold: float = 0.8, max_matches: int = 5) -> List[str]:
# process.extract is significantly optimized for list searching
results = process.extract(
query.lower(),
candidates,
scorer=fuzz.token_sort_ratio,
limit=max_matches,
score_cutoff=threshold * 100
)
# results format: [(match, score, index), ...]
return [res[0] for res in results]
1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
"langgraph>=0.6.4",
"matplotlib>=3.10.3",
"pandas>=2.3.1",
"rapidfuzz>=3.0.0",
"requests>=2.32.4",
"scikit-learn>=1.7.0",
"sentence-transformers>=3.0.0",