HUFS-Capstone-Project · 1000hyehyang · May 15, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
diff --git a/app/infra/kakao/client.py b/app/infra/kakao/client.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from difflib import SequenceMatcher
 from typing import Any
 
 import httpx
@@ -116,7 +117,7 @@ def _to_places(
         return places
 
     @staticmethod
-    def _score_place(
+    def _score_place_v1(
         keyword: str,
         place_name: str,
         rank: int,
@@ -146,11 +147,60 @@ def _score_place(
 
         return max(0.0, min(0.99, score))
 
+    @staticmethod
+    def _score_place(
+        keyword: str,
+        place_name: str,
+        rank: int,
+        doc: dict[str, Any],
+        location_hints: list[str],
+    ) -> float:
+        """Score one search result against the keyword; returns 0.0-0.99.
+
+        Starts at 0.25, adds up to 0.45 for name similarity, a bonus for
+        result ranks 0-2, and an address bonus (0.18 when the exact-hint
+        helper matches, else 0.07 for a substring hit on the first two hints).
+        """
+        similarity = _name_similarity(keyword, place_name)
+        total = 0.25 + (min(similarity, 1.0) * 0.45)
+        total += {0: 0.12, 1: 0.06, 2: 0.03}.get(rank, 0.0)
+
+        # Both address fields of the result, space-joined; missing -> "".
+        address_keys = ("address_name", "road_address_name")
+        address_blob = " ".join(str(doc.get(key) or "") for key in address_keys)
+        if _has_exact_address_hint(address_blob, location_hints):
+            total += 0.18
+        else:
+            haystack = address_blob.lower()
+            if any(hint.lower() in haystack for hint in location_hints[:2]):
+                total += 0.07
+
+        # v2 guardrail: with a weak name match (< 0.45), cap the score at
+        # 0.69 so address/rank bonuses alone cannot promote an unrelated POI.
+        if similarity < 0.45:
+            total = min(total, 0.69)
+
+        return max(0.0, min(0.99, total))
+
 
 def _normalize_place_text(value: str) -> str:
     return "".join((value or "").lower().split())
 
 
+def _normalize_name_text(value: str) -> str:
+    """Keep only the alphanumeric characters of ``value``, each lowercased."""
+    return "".join(map(str.lower, filter(str.isalnum, value or "")))
+
+
+def _name_similarity(left: str, right: str) -> float:
+    """Return a 0.0-1.0 similarity between two names after normalization."""
+    a, b = _normalize_name_text(left), _normalize_name_text(right)
+    if not (a and b):
+        return 0.0  # nothing left to compare on at least one side
+    # Containment either way is a full match; otherwise use difflib's ratio.
+    return 1.0 if a in b or b in a else SequenceMatcher(None, a, b).ratio()
+
+
 def _normalize_address_text(value: str) -> str:
     return "".join(
         char.lower()

diff --git a/app/infra/llm/client.py b/app/infra/llm/client.py
@@ -12,7 +12,7 @@
 from app.domain.job import ExtractionResult
 from app.schemas.extraction import ExtractionLLMResponse
 
-EXTRACTION_SYSTEM_PROMPT_TEMPLATE = (
+EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V1 = (
     "You extract place/store information from Korean social media captions. "
     "Return only one JSON object with these exact top-level keys: store_name, "
     "address, store_name_evidence, address_evidence, certainty, places. "
@@ -45,6 +45,45 @@
     "explanations, Markdown, or any text outside the JSON object."
 )
 
+EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V2 = (  # str.format template; sole field: {max_candidates}
+    "You extract visitable place/store information from Korean social media captions. "
+    "Return only one JSON object with these exact top-level keys: store_name, address, "
+    "store_name_evidence, address_evidence, certainty, places. places must be an array "
+    "of objects with the same exact keys except places. Extract every distinct "
+    "visitable local place/store/brand, up to {max_candidates} places, preserving "
+    "caption order. "
+    "Priority rules: "  # rules 1-4 below rank the cues for choosing store_name
+    "1. Prefer explicit place markers such as 📍, 📌위치, 위치, 상호명, 매장명, 가게, "
+    "or 장소. Text after the marker is usually the store_name. If a marker line is "
+    "followed by an address line, pair them. If a line has 'PLACE_NAME (ADDRESS)', "
+    "extract PLACE_NAME as store_name and ADDRESS as address. "
+    "2. First inspect hashtags before choosing descriptive category phrases. A "
+    "specific proper-noun hashtag near an address, hours, menu, phone number, or "
+    "place marker may be the store_name; prioritize it when it appears on the same "
+    "line as a map-pin. Prefer specific proper-noun hashtags over generic hashtags, "
+    "and remove the leading # when using one as store_name. "
+    "3. Preserve full Korean place names exactly as written, including branch or "
+    "store suffixes such as 본점, 직영점, 성수점, 강남점, and 용산 아이파크몰점. Do not "
+    "translate, romanize, shorten, or normalize names. "
+    "4. Never extract menu names, categories, title phrases, region tags, prizes, "
+    "account handles, or promotional phrases as store_name when a real place marker "
+    "or proper noun exists. "
+    "Examples: '📌위치 : 진담옥 감자탕 (서울 강남구 선릉로86길 12 2층)' -> "
+    "store_name '진담옥 감자탕', address '서울 강남구 선릉로86길 12 2층'. "
+    "'이게 카피바라야... 📍 썸머러너' -> store_name '썸머러너', not '카피바라'. "
+    "Address lines often start with map-pin markers, address/location labels, or "
+    "Korean address units such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or "
+    "gil. Do not invent missing values. Use null when unknown. Evidence values must "
+    "be exact substrings copied from the input caption. certainty must be one of "
+    "high, medium, or low. The top-level legacy fields store_name, address, "
+    "store_name_evidence, address_evidence, and certainty must mirror the first item "
+    "in places, or null/low when places is empty. If no place is found, return "
+    "places as an empty array. Do not include explanations, Markdown, or any text "
+    "outside the JSON object."
+)
+
+EXTRACTION_SYSTEM_PROMPT_TEMPLATE = EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V2  # V2 is the active prompt
+
 
 def build_extraction_system_prompt(max_candidates: int) -> str:
     return EXTRACTION_SYSTEM_PROMPT_TEMPLATE.format(