diff --git a/app/infra/kakao/client.py b/app/infra/kakao/client.py
index ce9a2c8..21329c9 100644
--- a/app/infra/kakao/client.py
+++ b/app/infra/kakao/client.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from difflib import SequenceMatcher
 from typing import Any
 
 import httpx
@@ -116,7 +117,7 @@ def _to_places(
         return places
 
     @staticmethod
-    def _score_place(
+    def _score_place_v1(
         keyword: str,
         place_name: str,
         rank: int,
@@ -146,11 +147,60 @@ def _score_place(
 
         return max(0.0, min(0.99, score))
 
+    @staticmethod
+    def _score_place(
+        keyword: str,
+        place_name: str,
+        rank: int,
+        doc: dict[str, Any],
+        location_hints: list[str],
+    ) -> float:
+        name_similarity = _name_similarity(keyword, place_name)
+        score = 0.25 + (min(name_similarity, 1.0) * 0.45)
+        if rank == 0:
+            score += 0.12
+        elif rank == 1:
+            score += 0.06
+        elif rank == 2:
+            score += 0.03
+
+        address_blob = " ".join(
+            [
+                str(doc.get("address_name") or ""),
+                str(doc.get("road_address_name") or ""),
+            ]
+        )
+        exact_address_hint = _has_exact_address_hint(address_blob, location_hints)
+        if exact_address_hint:
+            score += 0.18
+        elif any(hint.lower() in address_blob.lower() for hint in location_hints[:2]):
+            score += 0.07
+
+        # v2 guardrail: address/rank alone must not select an unrelated POI.
+        if name_similarity < 0.45:
+            score = min(score, 0.69)
+
+        return max(0.0, min(0.99, score))
+
 
 def _normalize_place_text(value: str) -> str:
     return "".join((value or "").lower().split())
 
 
+def _normalize_name_text(value: str) -> str:
+    return "".join(char.lower() for char in value or "" if char.isalnum())
+
+
+def _name_similarity(left: str, right: str) -> float:
+    left_norm = _normalize_name_text(left)
+    right_norm = _normalize_name_text(right)
+    if not left_norm or not right_norm:
+        return 0.0
+    if left_norm in right_norm or right_norm in left_norm:
+        return 1.0
+    return SequenceMatcher(None, left_norm, right_norm).ratio()
+
+
 def _normalize_address_text(value: str) -> str:
     return "".join(
         char.lower()
diff --git a/app/infra/llm/client.py b/app/infra/llm/client.py
index 71ffe1c..53fc27a 100644
--- a/app/infra/llm/client.py
+++ b/app/infra/llm/client.py
@@ -12,7 +12,7 @@ from app.domain.job import ExtractionResult
 from app.schemas.extraction import ExtractionLLMResponse
 
 
-EXTRACTION_SYSTEM_PROMPT_TEMPLATE = (
+EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V1 = (
     "You extract place/store information from Korean social media captions. "
     "Return only one JSON object with these exact top-level keys: store_name, "
     "address, store_name_evidence, address_evidence, certainty, places. "
@@ -45,6 +45,45 @@
     "explanations, Markdown, or any text outside the JSON object."
 )
 
+EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V2 = (
+    "You extract visitable place/store information from Korean social media captions. "
+    "Return only one JSON object with these exact top-level keys: store_name, address, "
+    "store_name_evidence, address_evidence, certainty, places. places must be an array "
+    "of objects with the same exact keys except places. Extract every distinct "
+    "visitable local place/store/brand, up to {max_candidates} places, preserving "
+    "caption order. "
+    "Priority rules: "
+    "1. Prefer explicit place markers such as 📍, 📌위치, 위치, 상호명, 매장명, 가게, "
+    "or 장소. Text after the marker is usually the store_name. If a marker line is "
+    "followed by an address line, pair them. If a line has 'PLACE_NAME (ADDRESS)', "
+    "extract PLACE_NAME as store_name and ADDRESS as address. "
+    "2. First inspect hashtags before choosing descriptive category phrases. A "
+    "specific proper-noun hashtag near an address, hours, menu, phone number, or "
+    "place marker may be the store_name; prioritize it when it appears on the same "
+    "line as a map-pin. Prefer specific proper-noun hashtags over generic hashtags, "
+    "and remove the leading # when using one as store_name. "
+    "3. Preserve full Korean place names exactly as written, including branch or "
+    "store suffixes such as 본점, 직영점, 성수점, 강남점, and 용산 아이파크몰점. Do not "
+    "translate, romanize, shorten, or normalize names. "
+    "4. Never extract menu names, categories, title phrases, region tags, prizes, "
+    "account handles, or promotional phrases as store_name when a real place marker "
+    "or proper noun exists. "
+    "Examples: '📌위치 : 진담옥 감자탕 (서울 강남구 선릉로86길 12 2층)' -> "
+    "store_name '진담옥 감자탕', address '서울 강남구 선릉로86길 12 2층'. "
+    "'이게 카피바라야... 📍 썸머러너' -> store_name '썸머러너', not '카피바라'. "
+    "Address lines often start with map-pin markers, address/location labels, or "
+    "Korean address units such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or "
+    "gil. Do not invent missing values. Use null when unknown. Evidence values must "
+    "be exact substrings copied from the input caption. certainty must be one of "
+    "high, medium, or low. The top-level legacy fields store_name, address, "
+    "store_name_evidence, address_evidence, and certainty must mirror the first item "
+    "in places, or null/low when places is empty. If no place is found, return "
+    "places as an empty array. Do not include explanations, Markdown, or any text "
+    "outside the JSON object."
+)
+
+EXTRACTION_SYSTEM_PROMPT_TEMPLATE = EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V2
+
 
 def build_extraction_system_prompt(max_candidates: int) -> str:
     return EXTRACTION_SYSTEM_PROMPT_TEMPLATE.format(