Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion app/infra/kakao/client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass
from difflib import SequenceMatcher
from typing import Any

import httpx
Expand Down Expand Up @@ -116,7 +117,7 @@ def _to_places(
return places

@staticmethod
def _score_place(
def _score_place_v1(
keyword: str,
place_name: str,
rank: int,
Expand Down Expand Up @@ -146,11 +147,60 @@ def _score_place(

return max(0.0, min(0.99, score))

@staticmethod
def _score_place(
    keyword: str,
    place_name: str,
    rank: int,
    doc: dict[str, Any],
    location_hints: list[str],
) -> float:
    """Return a v2 confidence score in [0.0, 0.99] for one candidate place.

    The score is built up from three parts, in this order:

    * a base of 0.25 plus up to 0.45 scaled by the name similarity between
      ``keyword`` and ``place_name``;
    * a bonus for ``rank`` 0, 1 or 2 (0.12 / 0.06 / 0.03), nothing otherwise;
    * an address bonus: 0.18 when ``_has_exact_address_hint`` accepts the
      address text, else 0.07 when one of the first two location hints is a
      case-insensitive substring of it.

    Args:
        keyword: Name being searched for.
        place_name: Name of the candidate being scored.
        rank: Candidate position; presumably the index in the search
            results (verify against the caller).
        doc: Candidate record; only ``address_name`` and
            ``road_address_name`` are read here.
        location_hints: Location strings to look for in the address text.

    Returns:
        The clamped score. Candidates whose name similarity is below 0.45
        are additionally capped at 0.69.
    """
    similarity = _name_similarity(keyword, place_name)
    total = 0.25 + (min(similarity, 1.0) * 0.45)

    # Only the first three ranks earn a bonus.
    rank_bonus = {0: 0.12, 1: 0.06, 2: 0.03}
    total += rank_bonus.get(rank, 0.0)

    address_name = str(doc.get("address_name") or "")
    road_address_name = str(doc.get("road_address_name") or "")
    address_text = f"{address_name} {road_address_name}"

    if _has_exact_address_hint(address_text, location_hints):
        total += 0.18
    else:
        # Fallback: loose substring match, limited to the first two hints.
        lowered_address = address_text.lower()
        for hint in location_hints[:2]:
            if hint.lower() in lowered_address:
                total += 0.07
                break

    # v2 guardrail: address/rank alone must not select an unrelated POI.
    if similarity < 0.45:
        total = min(total, 0.69)

    upper_bounded = min(0.99, total)
    return max(0.0, upper_bounded)


def _normalize_place_text(value: str) -> str:
return "".join((value or "").lower().split())


def _normalize_name_text(value: str) -> str:
return "".join(char.lower() for char in value or "" if char.isalnum())


def _name_similarity(left: str, right: str) -> float:
    """Return a similarity in [0.0, 1.0] between two names.

    Both names are first passed through ``_normalize_name_text``. The result
    is 0.0 when either normalized name is empty, 1.0 when one normalized name
    contains the other, and the ``SequenceMatcher`` ratio otherwise.
    """
    first = _normalize_name_text(left)
    second = _normalize_name_text(right)
    if not (first and second):
        return 0.0

    # Containment in either direction reduces to "shorter inside longer".
    shorter, longer = sorted((first, second), key=len)
    if shorter in longer:
        return 1.0

    matcher = SequenceMatcher(isjunk=None, a=first, b=second)
    return matcher.ratio()


def _normalize_address_text(value: str) -> str:
return "".join(
char.lower()
Expand Down
41 changes: 40 additions & 1 deletion app/infra/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from app.domain.job import ExtractionResult
from app.schemas.extraction import ExtractionLLMResponse

EXTRACTION_SYSTEM_PROMPT_TEMPLATE = (
EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V1 = (
"You extract place/store information from Korean social media captions. "
"Return only one JSON object with these exact top-level keys: store_name, "
"address, store_name_evidence, address_evidence, certainty, places. "
Expand Down Expand Up @@ -45,6 +45,45 @@
"explanations, Markdown, or any text outside the JSON object."
)

# Version 2 of the extraction system prompt, built from adjacent string
# literals. The text is a str.format template whose only placeholder is
# {max_candidates}; any literal brace added to it later must be doubled.
EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V2 = (
"You extract visitable place/store information from Korean social media captions. "
"Return only one JSON object with these exact top-level keys: store_name, address, "
"store_name_evidence, address_evidence, certainty, places. places must be an array "
"of objects with the same exact keys except places. Extract every distinct "
"visitable local place/store/brand, up to {max_candidates} places, preserving "
"caption order. "
"Priority rules: "
"1. Prefer explicit place markers such as 📍, 📌위치, 위치, 상호명, 매장명, 가게, "
"or 장소. Text after the marker is usually the store_name. If a marker line is "
"followed by an address line, pair them. If a line has 'PLACE_NAME (ADDRESS)', "
"extract PLACE_NAME as store_name and ADDRESS as address. "
"2. First inspect hashtags before choosing descriptive category phrases. A "
"specific proper-noun hashtag near an address, hours, menu, phone number, or "
"place marker may be the store_name; prioritize it when it appears on the same "
"line as a map-pin. Prefer specific proper-noun hashtags over generic hashtags, "
"and remove the leading # when using one as store_name. "
"3. Preserve full Korean place names exactly as written, including branch or "
"store suffixes such as 본점, 직영점, 성수점, 강남점, and 용산 아이파크몰점. Do not "
"translate, romanize, shorten, or normalize names. "
"4. Never extract menu names, categories, title phrases, region tags, prizes, "
"account handles, or promotional phrases as store_name when a real place marker "
"or proper noun exists. "
"Examples: '📌위치 : 진담옥 감자탕 (서울 강남구 선릉로86길 12 2층)' -> "
"store_name '진담옥 감자탕', address '서울 강남구 선릉로86길 12 2층'. "
"'이게 카피바라야... 📍 썸머러너' -> store_name '썸머러너', not '카피바라'. "
"Address lines often start with map-pin markers, address/location labels, or "
"Korean address units such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or "
"gil. Do not invent missing values. Use null when unknown. Evidence values must "
"be exact substrings copied from the input caption. certainty must be one of "
"high, medium, or low. The top-level legacy fields store_name, address, "
"store_name_evidence, address_evidence, and certainty must mirror the first item "
"in places, or null/low when places is empty. If no place is found, return "
"places as an empty array. Do not include explanations, Markdown, or any text "
"outside the JSON object."
)

# Unversioned name that build_extraction_system_prompt formats; it currently
# points at the V2 template.
EXTRACTION_SYSTEM_PROMPT_TEMPLATE = EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V2


def build_extraction_system_prompt(max_candidates: int) -> str:
return EXTRACTION_SYSTEM_PROMPT_TEMPLATE.format(
Expand Down
Loading