From f3b2c8f056556241cbf7e8141af7533aebeb5919 Mon Sep 17 00:00:00 2001 From: KyungminPark-steck Date: Fri, 15 May 2026 18:18:11 +0900 Subject: [PATCH 1/3] feat: improve HF extraction prompt and Kakao fallback --- app/infra/kakao/client.py | 52 ++++++++++++++++++++++++++++++++++++++- app/infra/llm/client.py | 48 +++++++++++++++++++++++++++++++++++- 2 files changed, 98 insertions(+), 2 deletions(-) diff --git a/app/infra/kakao/client.py b/app/infra/kakao/client.py index d915eca..1be11b1 100644 --- a/app/infra/kakao/client.py +++ b/app/infra/kakao/client.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import dataclass +from difflib import SequenceMatcher from typing import Any import httpx @@ -116,7 +117,7 @@ def _to_places( return places @staticmethod - def _score_place( + def _score_place_v1( keyword: str, place_name: str, rank: int, @@ -146,11 +147,60 @@ def _score_place( return max(0.0, min(0.99, score)) + @staticmethod + def _score_place( + keyword: str, + place_name: str, + rank: int, + doc: dict[str, Any], + location_hints: list[str], + ) -> float: + name_similarity = _name_similarity(keyword, place_name) + score = 0.25 + (min(name_similarity, 1.0) * 0.45) + if rank == 0: + score += 0.12 + elif rank == 1: + score += 0.06 + elif rank == 2: + score += 0.03 + + address_blob = " ".join( + [ + str(doc.get("address_name") or ""), + str(doc.get("road_address_name") or ""), + ] + ) + exact_address_hint = _has_exact_address_hint(address_blob, location_hints) + if exact_address_hint: + score += 0.18 + elif any(hint.lower() in address_blob.lower() for hint in location_hints[:2]): + score += 0.07 + + # v2 guardrail: address/rank alone must not select an unrelated POI. + if name_similarity < 0.45: + score = min(score, 0.69) + + return max(0.0, min(0.99, score)) + def _normalize_place_text(value: str) -> str: return "".join((value or "").lower().split()) +def _normalize_name_text(value: str) -> str: + return "".join(char.lower() for char in value or "" if char.isalnum()) + + +def _name_similarity(left: str, right: str) -> float: + left_norm = _normalize_name_text(left) + right_norm = _normalize_name_text(right) + if not left_norm or not right_norm: + return 0.0 + if left_norm in right_norm or right_norm in left_norm: + return 1.0 + return SequenceMatcher(None, left_norm, right_norm).ratio() + + def _normalize_address_text(value: str) -> str: return "".join( char.lower() diff --git a/app/infra/llm/client.py b/app/infra/llm/client.py index eb88cdf..4601860 100644 --- a/app/infra/llm/client.py +++ b/app/infra/llm/client.py @@ -11,7 +11,7 @@ from app.domain.job import ExtractionResult from app.schemas.extraction import ExtractionLLMResponse -EXTRACTION_SYSTEM_PROMPT_TEMPLATE = ( +EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V1 = ( "You extract place/store information from Korean social media captions. " "Return only one JSON object with these exact top-level keys: store_name, " "address, store_name_evidence, address_evidence, certainty, places. " @@ -44,6 +44,52 @@ "explanations, Markdown, or any text outside the JSON object." ) +EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V2 = ( + "You extract visitable place/store information from Korean social media captions. " + "Return only one JSON object with these exact top-level keys: store_name, address, " + "store_name_evidence, address_evidence, certainty, places. places must be an array " + "of objects. Each place object must have these exact keys: store_name, address, " + "store_name_evidence, address_evidence, certainty. Extract every distinct " + "place/store/brand that appears to be a visitable local business, up to " + "{max_candidates} places, preserving caption order. " + "Extraction priority: " + "1. Highest priority: explicit place markers. Treat the text immediately after " + "markers such as '๐Ÿ“', '๐Ÿ“Œ์œ„์น˜ :', '์œ„์น˜ :', '์ƒํ˜ธ๋ช… :', '๋งค์žฅ๋ช… :', '๊ฐ€๊ฒŒ :', " + "or '์žฅ์†Œ :' as the place name. If the line has 'PLACE_NAME (ADDRESS)', extract " + "PLACE_NAME as store_name and ADDRESS as address. " + "2. If a place marker line is followed by an address line, pair them together. " + "3. If a hashtag is near an address, hours, menu list, phone number, or place " + "marker, it may be the store name. Prefer specific proper-noun hashtags, but do " + "not use generic hashtags. " + "4. Preserve full branch/store names exactly as written. Keep suffixes such as " + "๋ณธ์ , ์ง์˜์ , ์„ฑ์ˆ˜์ , ์—ฐ์‹ ๋‚ด์ , ๊ฐ•๋‚จ์ , ์šฉ์‚ฐ ์•„์ดํŒŒํฌ๋ชฐ์ , ๋งˆ๊ณก๋ณธ์ . " + "5. Do not translate, romanize, shorten, or normalize Korean place names. Copy " + "the exact Korean text from the caption whenever possible. " + "Never extract these as store_name: menu names such as ์šฐ๋™, ๋ฉด๋ฐœ, ๋ถ์–ด๋ฐฑ์งฌ๋ฝ•, " + "์งํ™”๋งˆ๋ผํƒ•, ๊ฐ์žํƒ•, ์น˜์ฆˆ์ผ€์ดํฌ, ์นดํ”ผ๋ฐ”๋ผํ‘ธ๋”ฉ; category or title phrases such " + "as ๋…ธํฌ, ์ด๋ชจ์นด์„ธ, ์ˆ ์ง‘, ๋ง›์ง‘, ์นดํŽ˜, ๋””์ €ํŠธ๋ง›์ง‘, ์ฐœ์งˆ๋ฐฉ, ์•ผ์žฅ๋ง›์ง‘; " + "region/category phrases such as ์„ฑ์ˆ˜๋ง›์ง‘, ์‚ผ๊ฐ์ง€๋ง›์ง‘, ๋ฐฉ์ด๋™, ์‹ ์šฉ์‚ฐ ํ•ด์‚ฐ๋ฌผ์ง‘; " + "campaign/prize/event text; account handles. If both a descriptive title and an " + "explicit place marker exist, always choose the explicit place marker. " + "Examples: '๋ฉด๋ฐœ ํ•˜๋‚˜๋กœ... ๐Ÿ“์šฐ๋™ํ‚ค๋…ธ์•ผ ์‹ ์šฉ์‚ฐ๋ณธ์  ์„œ์šธ ์šฉ์‚ฐ๊ตฌ...' -> " + "์šฐ๋™ํ‚ค๋…ธ์•ผ ์‹ ์šฉ์‚ฐ๋ณธ์ ; '์ด๊ฒŒ ์นดํ”ผ๋ฐ”๋ผ์•ผ... ๐Ÿ“ ์ธ๋จธ๋Ÿฌ๋„ˆ' -> ์ธ๋จธ๋Ÿฌ๋„ˆ, not " + "์นดํ”ผ๋ฐ”๋ผ; '๐Ÿ“Œ์œ„์น˜ : ์ค‘ํ™”๊ฐ์ž”์ˆ˜ ๊ฐ•๋‚จ์  (์„œ์šธ ๊ฐ•๋‚จ๊ตฌ ๊ฐ•๋‚จ๋Œ€๋กœ66๊ธธ 11)' -> " + "์ค‘ํ™”๊ฐ์ž”์ˆ˜ ๊ฐ•๋‚จ์ , not ๋ถ์–ด๋ฐฑ์งฌ๋ฝ•; '๐Ÿ“ ๋‚˜์นจ๋ฐ˜ ์—ฐ์‹ ๋‚ด์ ' -> ๋‚˜์นจ๋ฐ˜ ์—ฐ์‹ ๋‚ด์ , " + "not ๋‚˜์นจ๋ฐ˜; '๐Ÿ“ ํ† ๋ผ์ • ์šฉ์‚ฐ ์•„์ดํŒŒํฌ๋ชฐ์ ' -> ํ† ๋ผ์ • ์šฉ์‚ฐ ์•„์ดํŒŒํฌ๋ชฐ์ , " + "not ํ† ๋ผ์ •; '๐Ÿ“Œ์ƒํ˜ธ๋ช… : ํ˜ผ์‹ ๊ผฌ์น˜ ๋ณธ์ ' -> ํ˜ผ์‹ ๊ผฌ์น˜ ๋ณธ์ . " + "Address lines often start with map-pin markers, address/location labels, or " + "Korean address units such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or " + "gil. Do not invent missing values. Use null when unknown. Evidence values must " + "be exact substrings copied from the input caption. certainty must be one of " + "high, medium, or low. The top-level legacy fields store_name, address, " + "store_name_evidence, address_evidence, and certainty must mirror the first item " + "in places, or null/low when places is empty. If no place is found, return " + "places as an empty array. Do not include explanations, Markdown, or any text " + "outside the JSON object." +) + +EXTRACTION_SYSTEM_PROMPT_TEMPLATE = EXTRACTION_SYSTEM_PROMPT_TEMPLATE_V2 + def build_extraction_system_prompt(max_candidates: int) -> str: return EXTRACTION_SYSTEM_PROMPT_TEMPLATE.format( From d135d4312255529047450167bd81606899072414 Mon Sep 17 00:00:00 2001 From: KyungminPark-steck Date: Fri, 15 May 2026 18:45:11 +0900 Subject: [PATCH 2/3] fix: preserve hashtag prompt guidance after dev merge --- app/infra/llm/client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/app/infra/llm/client.py b/app/infra/llm/client.py index dcb8a72..c131caf 100644 --- a/app/infra/llm/client.py +++ b/app/infra/llm/client.py @@ -59,9 +59,11 @@ "or '์žฅ์†Œ :' as the place name. If the line has 'PLACE_NAME (ADDRESS)', extract " "PLACE_NAME as store_name and ADDRESS as address. " "2. If a place marker line is followed by an address line, pair them together. " - "3. If a hashtag is near an address, hours, menu list, phone number, or place " - "marker, it may be the store name. Prefer specific proper-noun hashtags, but do " - "not use generic hashtags. " + "3. First inspect hashtags before choosing descriptive category phrases. " + "If a hashtag is near an address, hours, menu list, phone number, or place " + "marker, it may be the store name; prioritize it when it appears on the same " + "line as a map-pin. Prefer specific proper-noun hashtags, but do not use " + "generic hashtags. If store_name is taken from a hashtag, remove the leading #. " "4. Preserve full branch/store names exactly as written. Keep suffixes such as " "๋ณธ์ , ์ง์˜์ , ์„ฑ์ˆ˜์ , ์—ฐ์‹ ๋‚ด์ , ๊ฐ•๋‚จ์ , ์šฉ์‚ฐ ์•„์ดํŒŒํฌ๋ชฐ์ , ๋งˆ๊ณก๋ณธ์ . " "5. Do not translate, romanize, shorten, or normalize Korean place names. Copy " From b18e2267c12a750787935a4509b6ae87c7d67ed0 Mon Sep 17 00:00:00 2001 From: KyungminPark-steck Date: Sat, 16 May 2026 00:22:00 +0900 Subject: [PATCH 3/3] refactor: slim HF extraction prompt v2 --- app/infra/llm/client.py | 53 +++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/app/infra/llm/client.py b/app/infra/llm/client.py index c131caf..53fc27a 100644 --- a/app/infra/llm/client.py +++ b/app/infra/llm/client.py @@ -49,37 +49,28 @@ "You extract visitable place/store information from Korean social media captions. " "Return only one JSON object with these exact top-level keys: store_name, address, " "store_name_evidence, address_evidence, certainty, places. places must be an array " - "of objects. Each place object must have these exact keys: store_name, address, " - "store_name_evidence, address_evidence, certainty. Extract every distinct " - "place/store/brand that appears to be a visitable local business, up to " - "{max_candidates} places, preserving caption order. " - "Extraction priority: " - "1. Highest priority: explicit place markers. Treat the text immediately after " - "markers such as '๐Ÿ“', '๐Ÿ“Œ์œ„์น˜ :', '์œ„์น˜ :', '์ƒํ˜ธ๋ช… :', '๋งค์žฅ๋ช… :', '๊ฐ€๊ฒŒ :', " - "or '์žฅ์†Œ :' as the place name. If the line has 'PLACE_NAME (ADDRESS)', extract " - "PLACE_NAME as store_name and ADDRESS as address. " - "2. If a place marker line is followed by an address line, pair them together. " - "3. First inspect hashtags before choosing descriptive category phrases. " - "If a hashtag is near an address, hours, menu list, phone number, or place " - "marker, it may be the store name; prioritize it when it appears on the same " - "line as a map-pin. Prefer specific proper-noun hashtags, but do not use " - "generic hashtags. If store_name is taken from a hashtag, remove the leading #. " - "4. Preserve full branch/store names exactly as written. Keep suffixes such as " - "๋ณธ์ , ์ง์˜์ , ์„ฑ์ˆ˜์ , ์—ฐ์‹ ๋‚ด์ , ๊ฐ•๋‚จ์ , ์šฉ์‚ฐ ์•„์ดํŒŒํฌ๋ชฐ์ , ๋งˆ๊ณก๋ณธ์ . " - "5. Do not translate, romanize, shorten, or normalize Korean place names. Copy " - "the exact Korean text from the caption whenever possible. " - "Never extract these as store_name: menu names such as ์šฐ๋™, ๋ฉด๋ฐœ, ๋ถ์–ด๋ฐฑ์งฌ๋ฝ•, " - "์งํ™”๋งˆ๋ผํƒ•, ๊ฐ์žํƒ•, ์น˜์ฆˆ์ผ€์ดํฌ, ์นดํ”ผ๋ฐ”๋ผํ‘ธ๋”ฉ; category or title phrases such " - "as ๋…ธํฌ, ์ด๋ชจ์นด์„ธ, ์ˆ ์ง‘, ๋ง›์ง‘, ์นดํŽ˜, ๋””์ €ํŠธ๋ง›์ง‘, ์ฐœ์งˆ๋ฐฉ, ์•ผ์žฅ๋ง›์ง‘; " - "region/category phrases such as ์„ฑ์ˆ˜๋ง›์ง‘, ์‚ผ๊ฐ์ง€๋ง›์ง‘, ๋ฐฉ์ด๋™, ์‹ ์šฉ์‚ฐ ํ•ด์‚ฐ๋ฌผ์ง‘; " - "campaign/prize/event text; account handles. If both a descriptive title and an " - "explicit place marker exist, always choose the explicit place marker. " - "Examples: '๋ฉด๋ฐœ ํ•˜๋‚˜๋กœ... ๐Ÿ“์šฐ๋™ํ‚ค๋…ธ์•ผ ์‹ ์šฉ์‚ฐ๋ณธ์  ์„œ์šธ ์šฉ์‚ฐ๊ตฌ...' -> " - "์šฐ๋™ํ‚ค๋…ธ์•ผ ์‹ ์šฉ์‚ฐ๋ณธ์ ; '์ด๊ฒŒ ์นดํ”ผ๋ฐ”๋ผ์•ผ... ๐Ÿ“ ์ธ๋จธ๋Ÿฌ๋„ˆ' -> ์ธ๋จธ๋Ÿฌ๋„ˆ, not " - "์นดํ”ผ๋ฐ”๋ผ; '๐Ÿ“Œ์œ„์น˜ : ์ค‘ํ™”๊ฐ์ž”์ˆ˜ ๊ฐ•๋‚จ์  (์„œ์šธ ๊ฐ•๋‚จ๊ตฌ ๊ฐ•๋‚จ๋Œ€๋กœ66๊ธธ 11)' -> " - "์ค‘ํ™”๊ฐ์ž”์ˆ˜ ๊ฐ•๋‚จ์ , not ๋ถ์–ด๋ฐฑ์งฌ๋ฝ•; '๐Ÿ“ ๋‚˜์นจ๋ฐ˜ ์—ฐ์‹ ๋‚ด์ ' -> ๋‚˜์นจ๋ฐ˜ ์—ฐ์‹ ๋‚ด์ , " - "not ๋‚˜์นจ๋ฐ˜; '๐Ÿ“ ํ† ๋ผ์ • ์šฉ์‚ฐ ์•„์ดํŒŒํฌ๋ชฐ์ ' -> ํ† ๋ผ์ • ์šฉ์‚ฐ ์•„์ดํŒŒํฌ๋ชฐ์ , " - "not ํ† ๋ผ์ •; '๐Ÿ“Œ์ƒํ˜ธ๋ช… : ํ˜ผ์‹ ๊ผฌ์น˜ ๋ณธ์ ' -> ํ˜ผ์‹ ๊ผฌ์น˜ ๋ณธ์ . " + "of objects with the same exact keys except places. Extract every distinct " + "visitable local place/store/brand, up to {max_candidates} places, preserving " + "caption order. " + "Priority rules: " + "1. Prefer explicit place markers such as ๐Ÿ“, ๐Ÿ“Œ์œ„์น˜, ์œ„์น˜, ์ƒํ˜ธ๋ช…, ๋งค์žฅ๋ช…, ๊ฐ€๊ฒŒ, " + "or ์žฅ์†Œ. Text after the marker is usually the store_name. If a marker line is " + "followed by an address line, pair them. If a line has 'PLACE_NAME (ADDRESS)', " + "extract PLACE_NAME as store_name and ADDRESS as address. " + "2. First inspect hashtags before choosing descriptive category phrases. A " + "specific proper-noun hashtag near an address, hours, menu, phone number, or " + "place marker may be the store_name; prioritize it when it appears on the same " + "line as a map-pin. Prefer specific proper-noun hashtags over generic hashtags, " + "and remove the leading # when using one as store_name. " + "3. Preserve full Korean place names exactly as written, including branch or " + "store suffixes such as ๋ณธ์ , ์ง์˜์ , ์„ฑ์ˆ˜์ , ๊ฐ•๋‚จ์ , and ์šฉ์‚ฐ ์•„์ดํŒŒํฌ๋ชฐ์ . Do not " + "translate, romanize, shorten, or normalize names. " + "4. Never extract menu names, categories, title phrases, region tags, prizes, " + "account handles, or promotional phrases as store_name when a real place marker " + "or proper noun exists. " + "Examples: '๐Ÿ“Œ์œ„์น˜ : ์ง„๋‹ด์˜ฅ ๊ฐ์žํƒ• (์„œ์šธ ๊ฐ•๋‚จ๊ตฌ ์„ ๋ฆ‰๋กœ86๊ธธ 12 2์ธต)' -> " + "store_name '์ง„๋‹ด์˜ฅ ๊ฐ์žํƒ•', address '์„œ์šธ ๊ฐ•๋‚จ๊ตฌ ์„ ๋ฆ‰๋กœ86๊ธธ 12 2์ธต'. " + "'์ด๊ฒŒ ์นดํ”ผ๋ฐ”๋ผ์•ผ... ๐Ÿ“ ์ธ๋จธ๋Ÿฌ๋„ˆ' -> store_name '์ธ๋จธ๋Ÿฌ๋„ˆ', not '์นดํ”ผ๋ฐ”๋ผ'. " "Address lines often start with map-pin markers, address/location labels, or " "Korean address units such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or " "gil. Do not invent missing values. Use null when unknown. Evidence values must "