Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class Settings(BaseSettings):

hf_extraction_endpoint_url: str = ""
hf_extraction_api_token: str = ""
hf_extraction_model_name: str = "Qwen/Qwen2.5-3B-Instruct"
hf_extraction_model_name: str = "Qwen/Qwen2.5-Coder-32B-Instruct"
hf_extraction_timeout_seconds: int = 20
hf_extraction_max_new_tokens: int = 2048

Expand Down
28 changes: 27 additions & 1 deletion app/infra/kakao/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def _build_query(self, keyword: str, location_hints: list[str]) -> str:
if not location_hints:
return keyword
top_hint = location_hints[0]
if _normalize_address_text(keyword) == _normalize_address_text(top_hint):
return top_hint
return f"{top_hint} {keyword}".strip()

def _to_places(
Expand Down Expand Up @@ -137,11 +139,35 @@ def _score_place(
str(doc.get("road_address_name") or ""),
]
)
if any(hint.lower() in address_blob.lower() for hint in location_hints[:2]):
if _has_exact_address_hint(address_blob, location_hints):
score += 0.25
elif any(hint.lower() in address_blob.lower() for hint in location_hints[:2]):
score += 0.1

return max(0.0, min(0.99, score))


def _normalize_place_text(value: str) -> str:
return "".join((value or "").lower().split())


def _normalize_address_text(value: str) -> str:
return "".join(
char.lower()
for char in value or ""
if char.isalnum()
)


def _has_exact_address_hint(address_blob: str, location_hints: list[str]) -> bool:
    """Report whether one of the first two hints matches *address_blob*.

    Both sides are compared after ``_normalize_address_text``. A hint is
    considered only when its normalized form is non-empty and contains at
    least one digit; it matches when either normalized string contains the
    other. An address that normalizes to the empty string never matches.
    """
    address_key = _normalize_address_text(address_blob)
    if not address_key:
        return False

    def _matches(hint: str) -> bool:
        hint_key = _normalize_address_text(hint)
        if not hint_key:
            return False
        # Hints without any digit are skipped.
        if not any(char.isdigit() for char in hint_key):
            return False
        return hint_key in address_key or address_key in hint_key

    return any(_matches(hint) for hint in location_hints[:2])
12 changes: 9 additions & 3 deletions app/infra/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,15 @@
"brand information, store information, or place information. When a place name "
"line is followed by an address line, pair them together. Address lines often "
"start with map-pin markers, address/location labels, or Korean address units "
"such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or gil. A hashtag can be "
"a real store name, for example #StoreName; consider it when it names a "
"specific local business. Do not extract generic regional/category/promotional "
"such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or gil. First inspect "
"hashtags before choosing descriptive category phrases. A hashtag can be a real "
"store name, for example #StoreName; prioritize it when it appears on the same "
"line as a map-pin/location marker or near an address, hours, menu, or phone "
"number. For captions like '📍Guri Gyomun-dong #JukdongSikdang' followed by an "
"address, extract 'JukdongSikdang' as store_name and pair it with that address. "
"Prefer specific proper-noun hashtags over generic descriptive phrases such as "
"old restaurant, pork cutlet restaurant, cafe, dessert shop, hot place, or good "
"restaurant. Do not extract generic regional/category/promotional "
"hashtags such as Seoul cafe, Yeonnam cafe, dessert, hot place, date course, "
"travel, recommendation, or account handles as store names. If a store name is "
"taken from a hashtag, remove the leading # in store_name but keep the original "
Expand Down
18 changes: 17 additions & 1 deletion app/worker/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,23 @@ async def _search_places_by_hints(
return qualified

places = await self._search_places(candidate, [])
return self._qualified_places(places)
qualified = self._qualified_places(places)
if qualified:
return qualified

for hint in location_hints:
address_candidate = ExtractedCandidate(
keyword=hint,
source_keyword=candidate.source_keyword,
source_sentence=candidate.source_sentence,
raw_candidate=candidate.raw_candidate,
)
places = await self._search_places(address_candidate, [hint])
qualified = self._qualified_places(places)
if qualified:
return qualified

return []

def _qualified_places(self, places: list[PlaceCandidate]) -> list[PlaceCandidate]:
return [
Expand Down
7 changes: 7 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,10 @@ def test_production_private_api_requires_internal_key() -> None:
)
with pytest.raises(ValueError, match="INTERNAL_API_KEY"):
validate_production_internal_api_key(s)


def test_default_hf_extraction_model_uses_qwen_coder_32b() -> None:
    """The declared default for ``hf_extraction_model_name`` is the Coder 32B model."""
    declared_default = Settings.model_fields["hf_extraction_model_name"].default
    assert declared_default == "Qwen/Qwen2.5-Coder-32B-Instruct"
3 changes: 3 additions & 0 deletions tests/test_hf_extraction_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,9 @@ def test_build_extraction_system_prompt_mentions_hashtag_store_names() -> None:
prompt = build_extraction_system_prompt(6)

assert "hashtag" in prompt
assert "First inspect hashtags" in prompt
assert "prioritize it when it appears on the same line as a map-pin" in prompt
assert "Prefer specific proper-noun hashtags" in prompt
assert "remove the leading #" in prompt
assert "up to 6 places" in prompt

Expand Down
73 changes: 73 additions & 0 deletions tests/test_kakao_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,76 @@ def test_kakao_local_client_requires_api_key() -> None:

with pytest.raises(KakaoNonRetryableError):
_run(client.search_places(_candidate(), []))


@pytest.mark.skipif(not EVENT_LOOP_AVAILABLE, reason="Event loop creation is blocked in this environment")
def test_kakao_local_client_boosts_exact_address_match_above_threshold() -> None:
    """A document whose road address equals the hint gets confidence >= 0.7."""
    road_address = "경북 경주시 금성로 295"
    document = {
        "id": "456",
        "place_name": "오복닭집",
        "category_name": "음식점 > 치킨",
        "category_group_code": "FD6",
        "category_group_name": "음식점",
        "phone": "054-000-0000",
        "address_name": "경북 경주시 성건동 339-2",
        "road_address_name": "경북 경주시 금성로 295",
        "x": "129.0",
        "y": "35.0",
        "place_url": "https://place.map.kakao.com/456",
    }
    payload = {"documents": [document], "meta": {"total_count": 1}}

    async def handler(request: httpx.Request) -> httpx.Response:
        # Every request receives the same single-document response.
        return httpx.Response(200, json=payload)

    candidate = ExtractedCandidate(
        keyword="중앙시장 오복닭집",
        source_keyword="중앙시장 오복닭집",
        source_sentence="(2) 중앙시장 오복닭집",
        raw_candidate="중앙시장 오복닭집",
    )
    transport = httpx.MockTransport(handler)
    client = KakaoLocalClient(_settings(), transport=transport)

    search = client.search_places(candidate, location_hints=[road_address])
    result = _run(search)

    top_place = result.places[0]
    assert top_place.place_name == "오복닭집"
    assert top_place.confidence >= 0.7


@pytest.mark.skipif(not EVENT_LOOP_AVAILABLE, reason="Event loop creation is blocked in this environment")
def test_kakao_local_client_deduplicates_address_only_query() -> None:
    """When the keyword is the same address as the hint, the query is that address alone."""
    address = "경북 경주시 내남면 포석로 110-32"
    captured: list[httpx.Request] = []

    async def handler(request: httpx.Request) -> httpx.Response:
        # Record the outgoing request, then answer with an empty result set.
        captured.append(request)
        empty_body = {"documents": [], "meta": {"total_count": 0}}
        return httpx.Response(200, json=empty_body)

    candidate = ExtractedCandidate(
        keyword=address,
        source_keyword="수뢰뫼",
        source_sentence="(4) 수뢰뫼",
        raw_candidate="수뢰뫼",
    )
    transport = httpx.MockTransport(handler)
    client = KakaoLocalClient(_settings(), transport=transport)

    _run(client.search_places(candidate, location_hints=[address]))

    first_request = captured[0]
    assert first_request.url.params["query"] == address
87 changes: 87 additions & 0 deletions tests/test_worker_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,27 @@ async def search_places(self, candidate, location_hints: list[str]) -> FakePlace
return FakePlaceSearchResult(self.places_by_keyword.get(candidate.keyword, []))


class KeywordAndHintAwarePlaceSearchClient:
    """Fake place-search client keyed by ``(keyword, tuple(location_hints))``.

    Each ``search_places`` call is appended to ``calls`` and answered with
    the places registered for that key, or an empty list when none exist.
    """

    def __init__(
        self,
        places_by_call: dict[tuple[str, tuple[str, ...]], list[PlaceCandidate]],
    ) -> None:
        self.calls: list[dict[str, object]] = []
        self.places_by_call = places_by_call

    async def search_places(self, candidate, location_hints: list[str]) -> FakePlaceSearchResult:
        """Record the call and return the canned places for its key."""
        record = {
            "keyword": candidate.keyword,
            "source_keyword": candidate.source_keyword,
            "location_hints": location_hints,
        }
        self.calls.append(record)

        # Lists are unhashable, so the hints become a tuple for the lookup.
        lookup_key = (candidate.keyword, tuple(location_hints))
        canned_places = self.places_by_call.get(lookup_key, [])
        return FakePlaceSearchResult(canned_places)


class FailingPlaceSearchClient:
    """Fake place-search client whose every search raises ``RuntimeError``."""

    async def search_places(self, candidate, location_hints: list[str]) -> FakePlaceSearchResult:
        """Always fail with ``RuntimeError("kakao unavailable")``."""
        failure = RuntimeError("kakao unavailable")
        raise failure
Expand Down Expand Up @@ -340,6 +361,72 @@ async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact:
assert repo.saved_result["selected_places"][0]["confidence"] == 0.95


@pytest.mark.skipif(not EVENT_LOOP_AVAILABLE, reason="Event loop creation is blocked in this environment")
def test_processor_falls_back_to_address_only_search(monkeypatch) -> None:
    """The processor retries with the address as keyword when name searches find nothing.

    The fake search client only answers the ``(address, (address,))`` call,
    so the expected call sequence is: store name with hint, store name
    without hints, then the address itself with the address as hint.
    """
    job = _new_job()
    repo = FakeRepository(job)
    settings = Settings(kakao_min_place_confidence=0.7)
    address = "경북 경주시 내남면 포석로 110-32"

    extraction = ExtractionResult(
        store_name="수뢰뫼",
        address=address,
        store_name_evidence="수뢰뫼",
        address_evidence=f"📍위치 : {address}",
        certainty=ExtractionCertainty.HIGH,
    )
    extractor = FakeExtractionClient(extraction)

    place = _place_candidate(
        confidence=0.8,
        kakao_place_id="456",
        place_name="수뢰뫼",
        source_keyword="수뢰뫼",
        address_name="경북 경주시 내남면 용장리 114-3",
        road_address_name=address,
    )
    canned = {(address, (address,)): [place]}
    place_search = KeywordAndHintAwarePlaceSearchClient(canned)

    async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact:
        # Stand-in for the real crawler: a reel whose text and caption match.
        return CrawlArtifact(
            url=url,
            html=None,
            text=f"수뢰뫼\n📍위치 : {address}",
            media_type="reel",
            caption=f"수뢰뫼\n📍위치 : {address}",
            instagram_meta=None,
        )

    monkeypatch.setattr("app.worker.processor.crawl_and_parse", fake_crawl)

    processor = JobProcessor(
        repository=repo,
        settings=settings,
        extraction_client=extractor,
        place_search_client=place_search,
    )

    _run(processor.process_job(job.job_id))

    observed_calls = [
        (call["keyword"], call["location_hints"])
        for call in place_search.calls
    ]
    expected_calls = [
        ("수뢰뫼", [address]),
        ("수뢰뫼", []),
        (address, [address]),
    ]
    assert observed_calls == expected_calls

    assert repo.saved_result is not None
    selected = repo.saved_result["selected_places"][0]
    assert selected["place_name"] == "수뢰뫼"
    assert selected["source_keyword"] == "수뢰뫼"


def test_build_location_hints_from_korean_address() -> None:
assert JobProcessor._build_location_hints("서울 서초구 방배로 23길 31-6") == [
"서울 서초구 방배로 23길 31-6",
Expand Down
Loading