From 957930633cfe7bf4d6987a7e153d5951fbb098b6 Mon Sep 17 00:00:00 2001 From: KyungminPark-steck Date: Sun, 3 May 2026 10:10:26 +0900 Subject: [PATCH 1/3] refactor: improve HF extraction model and hashtag prompt --- app/core/config.py | 2 +- app/infra/llm/client.py | 12 +++++++++--- tests/test_config.py | 7 +++++++ tests/test_hf_extraction_client.py | 3 +++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/app/core/config.py b/app/core/config.py index fc1f7b8..9084f92 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -94,7 +94,7 @@ class Settings(BaseSettings): hf_extraction_endpoint_url: str = "" hf_extraction_api_token: str = "" - hf_extraction_model_name: str = "Qwen/Qwen2.5-3B-Instruct" + hf_extraction_model_name: str = "Qwen/Qwen2.5-Coder-32B-Instruct" hf_extraction_timeout_seconds: int = 20 hf_extraction_max_new_tokens: int = 2048 diff --git a/app/infra/llm/client.py b/app/infra/llm/client.py index b0c2b92..eb88cdf 100644 --- a/app/infra/llm/client.py +++ b/app/infra/llm/client.py @@ -23,9 +23,15 @@ "brand information, store information, or place information. When a place name " "line is followed by an address line, pair them together. Address lines often " "start with map-pin markers, address/location labels, or Korean address units " - "such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or gil. A hashtag can be " - "a real store name, for example #StoreName; consider it when it names a " - "specific local business. Do not extract generic regional/category/promotional " + "such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or gil. First inspect " + "hashtags before choosing descriptive category phrases. A hashtag can be a real " + "store name, for example #StoreName; prioritize it when it appears on the same " + "line as a map-pin/location marker or near an address, hours, menu, or phone " + "number. For captions like '📍Guri Gyomun-dong #JukdongSikdang' followed by an " + "address, extract 'JukdongSikdang' as store_name and pair it with that address. " + "Prefer specific proper-noun hashtags over generic descriptive phrases such as " + "old restaurant, pork cutlet restaurant, cafe, dessert shop, hot place, or good " + "restaurant. Do not extract generic regional/category/promotional " "hashtags such as Seoul cafe, Yeonnam cafe, dessert, hot place, date course, " "travel, recommendation, or account handles as store names. If a store name is " "taken from a hashtag, remove the leading # in store_name but keep the original " diff --git a/tests/test_config.py b/tests/test_config.py index 11e10d4..188f08c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -46,3 +46,10 @@ def test_production_private_api_requires_internal_key() -> None: ) with pytest.raises(ValueError, match="INTERNAL_API_KEY"): validate_production_internal_api_key(s) + + +def test_default_hf_extraction_model_uses_qwen_coder_32b() -> None: + assert ( + Settings.model_fields["hf_extraction_model_name"].default + == "Qwen/Qwen2.5-Coder-32B-Instruct" + ) diff --git a/tests/test_hf_extraction_client.py b/tests/test_hf_extraction_client.py index f785982..7f5447b 100644 --- a/tests/test_hf_extraction_client.py +++ b/tests/test_hf_extraction_client.py @@ -212,6 +212,9 @@ def test_build_extraction_system_prompt_mentions_hashtag_store_names() -> None: prompt = build_extraction_system_prompt(6) assert "hashtag" in prompt + assert "First inspect hashtags" in prompt + assert "prioritize it when it appears on the same line as a map-pin" in prompt + assert "Prefer specific proper-noun hashtags" in prompt assert "remove the leading #" in prompt assert "up to 6 places" in prompt From bea156ff5fcd12fa87390087bdc43872b08739f1 Mon Sep 17 00:00:00 2001 From: KyungminPark-steck Date: Sun, 3 May 2026 10:36:54 +0900 Subject: [PATCH 2/3] fix: boost Kakao matches with exact addresses --- app/infra/kakao/client.py | 26 ++++++++++++++++++++- tests/test_kakao_client.py | 48 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/app/infra/kakao/client.py b/app/infra/kakao/client.py index 3bec02f..cd862ff 100644 --- a/app/infra/kakao/client.py +++ b/app/infra/kakao/client.py @@ -137,7 +137,9 @@ def _score_place( str(doc.get("road_address_name") or ""), ] ) - if any(hint.lower() in address_blob.lower() for hint in location_hints[:2]): + if _has_exact_address_hint(address_blob, location_hints): + score += 0.25 + elif any(hint.lower() in address_blob.lower() for hint in location_hints[:2]): score += 0.1 return max(0.0, min(0.99, score)) @@ -145,3 +147,25 @@ def _score_place( def _normalize_place_text(value: str) -> str: return "".join((value or "").lower().split()) + + +def _normalize_address_text(value: str) -> str: + return "".join( + char.lower() + for char in value or "" + if char.isalnum() + ) + + +def _has_exact_address_hint(address_blob: str, location_hints: list[str]) -> bool: + normalized_address = _normalize_address_text(address_blob) + if not normalized_address: + return False + + for hint in location_hints[:2]: + normalized_hint = _normalize_address_text(hint) + if not normalized_hint or not any(char.isdigit() for char in normalized_hint): + continue + if normalized_hint in normalized_address or normalized_address in normalized_hint: + return True + return False diff --git a/tests/test_kakao_client.py b/tests/test_kakao_client.py index ff49538..745f250 100644 --- a/tests/test_kakao_client.py +++ b/tests/test_kakao_client.py @@ -112,3 +112,51 @@ def test_kakao_local_client_requires_api_key() -> None: with pytest.raises(KakaoNonRetryableError): _run(client.search_places(_candidate(), [])) + + +@pytest.mark.skipif(not EVENT_LOOP_AVAILABLE, reason="Event loop creation is blocked in this environment") +def test_kakao_local_client_boosts_exact_address_match_above_threshold() -> None: + candidate = ExtractedCandidate( + keyword="중앙시장 오복닭집", + source_keyword="중앙시장 오복닭집", + source_sentence="(2) 중앙시장 오복닭집", + raw_candidate="중앙시장 오복닭집", + ) + + async def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, + json={ + "documents": [ + { + "id": "456", + "place_name": "오복닭집", + "category_name": "음식점 > 치킨", + "category_group_code": "FD6", + "category_group_name": "음식점", + "phone": "054-000-0000", + "address_name": "경북 경주시 성건동 339-2", + "road_address_name": "경북 경주시 금성로 295", + "x": "129.0", + "y": "35.0", + "place_url": "https://place.map.kakao.com/456", + } + ], + "meta": {"total_count": 1}, + }, + ) + + client = KakaoLocalClient( + _settings(), + transport=httpx.MockTransport(handler), + ) + + result = _run( + client.search_places( + candidate, + location_hints=["경북 경주시 금성로 295"], + ) + ) + + assert result.places[0].place_name == "오복닭집" + assert result.places[0].confidence >= 0.7 From 423a75f424a0777b0ce3a5443be6b9ca62480526 Mon Sep 17 00:00:00 2001 From: KyungminPark-steck Date: Sun, 3 May 2026 10:45:58 +0900 Subject: [PATCH 3/3] fix: add Kakao address-only fallback --- app/infra/kakao/client.py | 2 + app/worker/processor.py | 18 ++++++- tests/test_kakao_client.py | 25 ++++++++++ tests/test_worker_processor.py | 87 ++++++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 1 deletion(-) diff --git a/app/infra/kakao/client.py b/app/infra/kakao/client.py index cd862ff..d915eca 100644 --- a/app/infra/kakao/client.py +++ b/app/infra/kakao/client.py @@ -78,6 +78,8 @@ def _build_query(self, keyword: str, location_hints: list[str]) -> str: if not location_hints: return keyword top_hint = location_hints[0] + if _normalize_address_text(keyword) == _normalize_address_text(top_hint): + return top_hint return f"{top_hint} {keyword}".strip() def _to_places( diff --git a/app/worker/processor.py b/app/worker/processor.py index f0441c3..7dd42e8 100644 --- a/app/worker/processor.py +++ b/app/worker/processor.py @@ -260,7 +260,23 @@ async def _search_places_by_hints( return qualified places = await self._search_places(candidate, []) - return self._qualified_places(places) + qualified = self._qualified_places(places) + if qualified: + return qualified + + for hint in location_hints: + address_candidate = ExtractedCandidate( + keyword=hint, + source_keyword=candidate.source_keyword, + source_sentence=candidate.source_sentence, + raw_candidate=candidate.raw_candidate, + ) + places = await self._search_places(address_candidate, [hint]) + qualified = self._qualified_places(places) + if qualified: + return qualified + + return [] def _qualified_places(self, places: list[PlaceCandidate]) -> list[PlaceCandidate]: return [ diff --git a/tests/test_kakao_client.py b/tests/test_kakao_client.py index 745f250..547441b 100644 --- a/tests/test_kakao_client.py +++ b/tests/test_kakao_client.py @@ -160,3 +160,28 @@ async def handler(request: httpx.Request) -> httpx.Response: assert result.places[0].place_name == "오복닭집" assert result.places[0].confidence >= 0.7 + + +@pytest.mark.skipif(not EVENT_LOOP_AVAILABLE, reason="Event loop creation is blocked in this environment") +def test_kakao_local_client_deduplicates_address_only_query() -> None: + seen_requests: list[httpx.Request] = [] + address = "경북 경주시 내남면 포석로 110-32" + candidate = ExtractedCandidate( + keyword=address, + source_keyword="수뢰뫼", + source_sentence="(4) 수뢰뫼", + raw_candidate="수뢰뫼", + ) + + async def handler(request: httpx.Request) -> httpx.Response: + seen_requests.append(request) + return httpx.Response(200, json={"documents": [], "meta": {"total_count": 0}}) + + client = KakaoLocalClient( + _settings(), + transport=httpx.MockTransport(handler), + ) + + _run(client.search_places(candidate, location_hints=[address])) + + assert seen_requests[0].url.params["query"] == address diff --git a/tests/test_worker_processor.py b/tests/test_worker_processor.py index f8869cb..ce33c62 100644 --- a/tests/test_worker_processor.py +++ b/tests/test_worker_processor.py @@ -147,6 +147,27 @@ async def search_places(self, candidate, location_hints: list[str]) -> FakePlace return FakePlaceSearchResult(self.places_by_keyword.get(candidate.keyword, [])) +class KeywordAndHintAwarePlaceSearchClient: + def __init__( + self, + places_by_call: dict[tuple[str, tuple[str, ...]], list[PlaceCandidate]], + ) -> None: + self.places_by_call = places_by_call + self.calls: list[dict[str, object]] = [] + + async def search_places(self, candidate, location_hints: list[str]) -> FakePlaceSearchResult: + self.calls.append( + { + "keyword": candidate.keyword, + "source_keyword": candidate.source_keyword, + "location_hints": location_hints, + } + ) + return FakePlaceSearchResult( + self.places_by_call.get((candidate.keyword, tuple(location_hints)), []) + ) + + class FailingPlaceSearchClient: async def search_places(self, candidate, location_hints: list[str]) -> FakePlaceSearchResult: raise RuntimeError("kakao unavailable") @@ -340,6 +361,72 @@ async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact: assert repo.saved_result["selected_places"][0]["confidence"] == 0.95 +@pytest.mark.skipif(not EVENT_LOOP_AVAILABLE, reason="Event loop creation is blocked in this environment") +def test_processor_falls_back_to_address_only_search(monkeypatch) -> None: + job = _new_job() + repo = FakeRepository(job) + settings = Settings(kakao_min_place_confidence=0.7) + address = "경북 경주시 내남면 포석로 110-32" + extractor = FakeExtractionClient( + ExtractionResult( + store_name="수뢰뫼", + address=address, + store_name_evidence="수뢰뫼", + address_evidence=f"📍위치 : {address}", + certainty=ExtractionCertainty.HIGH, + ) + ) + place = _place_candidate( + confidence=0.8, + kakao_place_id="456", + place_name="수뢰뫼", + source_keyword="수뢰뫼", + address_name="경북 경주시 내남면 용장리 114-3", + road_address_name=address, + ) + place_search = KeywordAndHintAwarePlaceSearchClient( + { + (address, (address,)): [place], + } + ) + + async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact: + return CrawlArtifact( + url=url, + html=None, + text=f"수뢰뫼\n📍위치 : {address}", + media_type="reel", + caption=f"수뢰뫼\n📍위치 : {address}", + instagram_meta=None, + ) + + monkeypatch.setattr("app.worker.processor.crawl_and_parse", fake_crawl) + + processor = JobProcessor( + repository=repo, + settings=settings, + extraction_client=extractor, + place_search_client=place_search, + ) + + _run(processor.process_job(job.job_id)) + + assert [ + { + "keyword": call["keyword"], + "location_hints": call["location_hints"], + } + for call in place_search.calls + ] == [ + {"keyword": "수뢰뫼", "location_hints": [address]}, + {"keyword": "수뢰뫼", "location_hints": []}, + {"keyword": address, "location_hints": [address]}, + ] + assert repo.saved_result is not None + assert repo.saved_result["selected_places"][0]["place_name"] == "수뢰뫼" + assert repo.saved_result["selected_places"][0]["source_keyword"] == "수뢰뫼" + + def test_build_location_hints_from_korean_address() -> None: assert JobProcessor._build_location_hints("서울 서초구 방배로 23길 31-6") == [ "서울 서초구 방배로 23길 31-6",