From 82b2e3b05d1cd9024b4030d46e5e645220674fa7 Mon Sep 17 00:00:00 2001 From: KyungminPark-steck Date: Sat, 2 May 2026 16:45:23 +0900 Subject: [PATCH] feat: support multi-place extraction and selection --- app/api/v1/endpoints/jobs.py | 1 + app/core/config.py | 2 +- app/domain/job/__init__.py | 6 + app/domain/job/model.py | 50 ++- app/infra/db/repository.py | 16 +- app/infra/llm/client.py | 65 +++- app/schemas/extraction.py | 137 ++++++- app/schemas/jobs.py | 10 + app/worker/processor.py | 101 +++-- ...003_add_selected_places_to_job_results.sql | 7 + tests/test_extraction_schema.py | 34 ++ tests/test_hf_extraction_client.py | 63 +++- tests/test_hf_kakao_pipeline_live.py | 350 ++++++++++++++++++ tests/test_job_repository.py | 16 + tests/test_job_result_schema.py | 13 + tests/test_jobs_api_result.py | 11 + tests/test_worker_processor.py | 145 +++++++- 17 files changed, 954 insertions(+), 73 deletions(-) create mode 100644 migrations/003_add_selected_places_to_job_results.sql create mode 100644 tests/test_hf_kakao_pipeline_live.py diff --git a/app/api/v1/endpoints/jobs.py b/app/api/v1/endpoints/jobs.py index e0f1f78..2480978 100644 --- a/app/api/v1/endpoints/jobs.py +++ b/app/api/v1/endpoints/jobs.py @@ -133,6 +133,7 @@ async def get_job_result( extraction_result=result.extraction_result if result else None, place_candidates=result.place_candidates if result else [], selected_place=result.selected_place if result else None, + selected_places=result.selected_places if result else [], error_message=job.error_message, updated_at=job.updated_at, ) diff --git a/app/core/config.py b/app/core/config.py index 2cdd569..fc1f7b8 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -96,7 +96,7 @@ class Settings(BaseSettings): hf_extraction_api_token: str = "" hf_extraction_model_name: str = "Qwen/Qwen2.5-3B-Instruct" hf_extraction_timeout_seconds: int = 20 - hf_extraction_max_new_tokens: int = 512 + hf_extraction_max_new_tokens: int = 2048 @field_validator("processing_schema") @classmethod diff --git a/app/domain/job/__init__.py b/app/domain/job/__init__.py index 3c31519..db4301f 100644 --- a/app/domain/job/__init__.py +++ b/app/domain/job/__init__.py @@ -2,14 +2,17 @@ CrawlArtifact, ExtractionCertainty, ExtractionResult, + ExtractedPlace, ExtractedCandidate, JobRecord, JobResultRecord, JobStatus, PlaceCandidate, as_candidate_dict, + as_extracted_place_dict, as_extraction_result_dict, as_place_dict, + extracted_places_from_result, ) from app.domain.job.service import CreateJobCommand, InvalidJobRequest, JobService @@ -17,14 +20,17 @@ "CrawlArtifact", "ExtractionCertainty", "ExtractionResult", + "ExtractedPlace", "ExtractedCandidate", "JobRecord", "JobResultRecord", "JobStatus", "PlaceCandidate", "as_candidate_dict", + "as_extracted_place_dict", "as_extraction_result_dict", "as_place_dict", + "extracted_places_from_result", "CreateJobCommand", "InvalidJobRequest", "JobService", diff --git a/app/domain/job/model.py b/app/domain/job/model.py index 8b43186..4196d2e 100644 --- a/app/domain/job/model.py +++ b/app/domain/job/model.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime from enum import Enum from typing import Any @@ -20,6 +20,15 @@ class ExtractionCertainty(str, Enum): LOW = "low" +@dataclass(slots=True) +class ExtractedPlace: + store_name: str | None + address: str | None + store_name_evidence: str | None + address_evidence: str | None + certainty: ExtractionCertainty + + @dataclass(slots=True) class ExtractionResult: store_name: str | None @@ -27,6 +36,7 @@ class ExtractionResult: store_name_evidence: str | None address_evidence: str | None certainty: ExtractionCertainty + places: list[ExtractedPlace] = field(default_factory=list) @dataclass(slots=True) @@ -48,6 +58,7 @@ class JobResultRecord: extraction_result: dict[str, Any] | None place_candidates: list[dict[str, Any]] selected_place: dict[str, Any] | None + selected_places: list[dict[str, Any]] created_at: datetime updated_at: datetime @@ -118,6 +129,39 @@ def as_candidate_dict(candidate: ExtractedCandidate) -> dict[str, Any]: } +def as_extracted_place_dict(place: ExtractedPlace) -> dict[str, Any]: + return { + "store_name": place.store_name, + "address": place.address, + "store_name_evidence": place.store_name_evidence, + "address_evidence": place.address_evidence, + "certainty": place.certainty.value, + } + + +def extracted_places_from_result(result: ExtractionResult) -> list[ExtractedPlace]: + if result.places: + return result.places + if not any( + ( + result.store_name, + result.address, + result.store_name_evidence, + result.address_evidence, + ) + ): + return [] + return [ + ExtractedPlace( + store_name=result.store_name, + address=result.address, + store_name_evidence=result.store_name_evidence, + address_evidence=result.address_evidence, + certainty=result.certainty, + ) + ] + + def as_extraction_result_dict(result: ExtractionResult) -> dict[str, Any]: return { "store_name": result.store_name, @@ -125,4 +169,8 @@ def as_extraction_result_dict(result: ExtractionResult) -> dict[str, Any]: "store_name_evidence": result.store_name_evidence, "address_evidence": result.address_evidence, "certainty": result.certainty.value, + "places": [ + as_extracted_place_dict(place) + for place in extracted_places_from_result(result) + ], } diff --git a/app/infra/db/repository.py b/app/infra/db/repository.py index 51fce43..94245d9 100644 --- a/app/infra/db/repository.py +++ b/app/infra/db/repository.py @@ -108,12 +108,21 @@ async def upsert_job_result( extraction_result: dict[str, Any] | None = None, place_candidates: list[dict[str, Any]] | None = None, selected_place: dict[str, Any] | None = None, + selected_places: list[dict[str, Any]] | None = None, ) -> JobResultRecord: sql = f""" INSERT INTO {self._results_table} - (job_id, caption, instagram_meta, extraction_result, place_candidates, selected_place) + ( + job_id, + caption, + instagram_meta, + extraction_result, + place_candidates, + selected_place, + selected_places + ) VALUES - ($1, $2, $3::jsonb, $4::jsonb, $5::jsonb, $6::jsonb) + ($1, $2, $3::jsonb, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb) ON CONFLICT (job_id) DO UPDATE SET caption = EXCLUDED.caption, @@ -121,6 +130,7 @@ async def upsert_job_result( extraction_result = EXCLUDED.extraction_result, place_candidates = EXCLUDED.place_candidates, selected_place = EXCLUDED.selected_place, + selected_places = EXCLUDED.selected_places, updated_at = NOW() RETURNING * """ @@ -132,6 +142,7 @@ async def upsert_job_result( json.dumps(extraction_result) if extraction_result is not None else None, json.dumps(place_candidates or []), json.dumps(selected_place) if selected_place is not None else None, + json.dumps(selected_places or []), ) if row is None: raise RuntimeError("Failed to upsert job result") @@ -156,6 +167,7 @@ def _to_job_result_record(self, row: asyncpg.Record) -> JobResultRecord: extraction_result=self._json_to_dict(row["extraction_result"]), place_candidates=self._json_to_list(row["place_candidates"]), selected_place=self._json_to_dict(row["selected_place"]), + selected_places=self._json_to_list(row["selected_places"]), created_at=row["created_at"], updated_at=row["updated_at"], ) diff --git a/app/infra/llm/client.py b/app/infra/llm/client.py index 85e3c78..b0c2b92 100644 --- a/app/infra/llm/client.py +++ b/app/infra/llm/client.py @@ -11,16 +11,43 @@ from app.domain.job import ExtractionResult from app.schemas.extraction import ExtractionLLMResponse -EXTRACTION_SYSTEM_PROMPT = ( - "You extract store information from Korean restaurant social media captions. " - "Return only one JSON object with these exact keys: store_name, address, " - "store_name_evidence, address_evidence, certainty. Use null when a value is " - "unknown. Evidence values must be substrings copied from the input caption. " - "certainty must be one of high, medium, or low. Do not include explanations, " - "Markdown, or any text outside the JSON object." +EXTRACTION_SYSTEM_PROMPT_TEMPLATE = ( + "You extract place/store information from Korean social media captions. " + "Return only one JSON object with these exact top-level keys: store_name, " + "address, store_name_evidence, address_evidence, certainty, places. " + "places must be an array of objects. Each place object must have these exact " + "keys: store_name, address, store_name_evidence, address_evidence, certainty. " + "Extract every distinct place/store/brand that appears to be a visitable local " + "business, up to {max_candidates} places, preserving caption order. Captions " + "may contain numbered lists such as 1, 2, circled numbers, or sections such as " + "brand information, store information, or place information. When a place name " + "line is followed by an address line, pair them together. Address lines often " + "start with map-pin markers, address/location labels, or Korean address units " + "such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or gil. A hashtag can be " + "a real store name, for example #StoreName; consider it when it names a " + "specific local business. Do not extract generic regional/category/promotional " + "hashtags such as Seoul cafe, Yeonnam cafe, dessert, hot place, date course, " + "travel, recommendation, or account handles as store names. If a store name is " + "taken from a hashtag, remove the leading # in store_name but keep the original " + "hashtag substring in store_name_evidence. Do not invent missing values. Use " + "null when unknown. Evidence values must be exact substrings copied from the " + "input caption. certainty must be one of high, medium, or low. The top-level " + "legacy fields store_name, address, store_name_evidence, address_evidence, and " + "certainty must mirror the first item in places, or null/low when places is " + "empty. If no place is found, return places as an empty array. Do not include " + "explanations, Markdown, or any text outside the JSON object." ) +def build_extraction_system_prompt(max_candidates: int) -> str: + return EXTRACTION_SYSTEM_PROMPT_TEMPLATE.format( + max_candidates=max(1, max_candidates), + ) + + +EXTRACTION_SYSTEM_PROMPT = build_extraction_system_prompt(12) + + class HFExtractionError(Exception): pass @@ -85,9 +112,10 @@ async def extract( generated_json = extract_json_object(generated_text) try: - return ExtractionLLMResponse.model_validate(generated_json).to_domain() + result = ExtractionLLMResponse.model_validate(generated_json).to_domain() except ValidationError as exc: raise HFExtractionError("HF response failed schema validation") from exc + return self._limit_places(result) def _build_payload( self, @@ -100,13 +128,32 @@ def _build_payload( return { "model": self._settings.hf_extraction_model_name, "messages": [ - {"role": "system", "content": EXTRACTION_SYSTEM_PROMPT}, + { + "role": "system", + "content": build_extraction_system_prompt( + self._settings.extraction_max_candidates, + ), + }, {"role": "user", "content": text}, ], "temperature": 0.0, "max_tokens": self._settings.hf_extraction_max_new_tokens, } + def _limit_places(self, result: ExtractionResult) -> ExtractionResult: + max_places = max(1, self._settings.extraction_max_candidates) + if len(result.places) <= max_places: + return result + + result.places = result.places[:max_places] + first_place = result.places[0] + result.store_name = first_place.store_name + result.address = first_place.address + result.store_name_evidence = first_place.store_name_evidence + result.address_evidence = first_place.address_evidence + result.certainty = first_place.certainty + return result + def extract_text_from_hf_payload(payload: Any) -> str: if isinstance(payload, str): diff --git a/app/schemas/extraction.py b/app/schemas/extraction.py index 5b5c763..05fcac7 100644 --- a/app/schemas/extraction.py +++ b/app/schemas/extraction.py @@ -2,12 +2,32 @@ from typing import Literal -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator -from app.domain.job.model import ExtractionCertainty, ExtractionResult +from app.domain.job.model import ExtractedPlace, ExtractionCertainty, ExtractionResult -class ExtractionLLMResponse(BaseModel): +def _normalize_optional_string(value: object, *, strip_hash: bool = False) -> object: + if value is None: + return None + if isinstance(value, str): + stripped = value.strip() + if strip_hash: + stripped = stripped.lstrip("#").strip() + return stripped or None + return value + + +def _normalize_certainty(value: object) -> object: + if value is None: + return None + if isinstance(value, str): + stripped = value.strip().lower() + return stripped or None + return value + + +class ExtractedPlaceLLMResponse(BaseModel): model_config = ConfigDict(extra="ignore") store_name: str | None = None @@ -16,8 +36,12 @@ class ExtractionLLMResponse(BaseModel): address_evidence: str | None = None certainty: Literal["high", "medium", "low"] | None = None + @field_validator("store_name", mode="before") + @classmethod + def normalize_store_name(cls, value: object) -> object: + return _normalize_optional_string(value, strip_hash=True) + @field_validator( - "store_name", "address", "store_name_evidence", "address_evidence", @@ -25,28 +49,105 @@ class ExtractionLLMResponse(BaseModel): ) @classmethod def normalize_optional_string(cls, value: object) -> object: - if value is None: - return None - if isinstance(value, str): - stripped = value.strip() - return stripped or None - return value + return _normalize_optional_string(value) @field_validator("certainty", mode="before") @classmethod def normalize_certainty(cls, value: object) -> object: - if value is None: - return None - if isinstance(value, str): - stripped = value.strip().lower() - return stripped or None - return value + return _normalize_certainty(value) - def to_domain(self) -> ExtractionResult: - return ExtractionResult( + def has_content(self) -> bool: + return any( + ( + self.store_name, + self.address, + self.store_name_evidence, + self.address_evidence, + ) + ) + + def to_domain(self) -> ExtractedPlace: + return ExtractedPlace( store_name=self.store_name, address=self.address, store_name_evidence=self.store_name_evidence, address_evidence=self.address_evidence, certainty=ExtractionCertainty(self.certainty or "low"), ) + + +class ExtractionLLMResponse(BaseModel): + model_config = ConfigDict(extra="ignore") + + store_name: str | None = None + address: str | None = None + store_name_evidence: str | None = None + address_evidence: str | None = None + certainty: Literal["high", "medium", "low"] | None = None + places: list[ExtractedPlaceLLMResponse] = Field(default_factory=list) + + @field_validator("store_name", mode="before") + @classmethod + def normalize_store_name(cls, value: object) -> object: + return _normalize_optional_string(value, strip_hash=True) + + @field_validator( + "address", + "store_name_evidence", + "address_evidence", + mode="before", + ) + @classmethod + def normalize_optional_string(cls, value: object) -> object: + return _normalize_optional_string(value) + + @field_validator("certainty", mode="before") + @classmethod + def normalize_certainty(cls, value: object) -> object: + return _normalize_certainty(value) + + @field_validator("places", mode="before") + @classmethod + def normalize_places(cls, value: object) -> object: + if value is None: + return [] + return value + + def to_domain(self) -> ExtractionResult: + places = [place.to_domain() for place in self.places if place.has_content()] + if not places and self._has_legacy_content(): + places = [ + ExtractedPlace( + store_name=self.store_name, + address=self.address, + store_name_evidence=self.store_name_evidence, + address_evidence=self.address_evidence, + certainty=ExtractionCertainty(self.certainty or "low"), + ) + ] + + first_place = places[0] if places else None + return ExtractionResult( + store_name=first_place.store_name if first_place else self.store_name, + address=first_place.address if first_place else self.address, + store_name_evidence=( + first_place.store_name_evidence if first_place else self.store_name_evidence + ), + address_evidence=first_place.address_evidence if first_place else self.address_evidence, + certainty=( + first_place.certainty + if first_place + else ExtractionCertainty(self.certainty or "low") + ), + places=places, + ) + + def _has_legacy_content(self) -> bool: + return any( + ( + self.store_name, + self.address, + self.store_name_evidence, + self.address_evidence, + ) + ) diff --git a/app/schemas/jobs.py b/app/schemas/jobs.py index e989443..2d9c74e 100644 --- a/app/schemas/jobs.py +++ b/app/schemas/jobs.py @@ -9,12 +9,21 @@ from app.domain.job.model import JobStatus +class ExtractedPlaceResponse(BaseModel): + store_name: str | None + address: str | None + store_name_evidence: str | None + address_evidence: str | None + certainty: Literal["high", "medium", "low"] + + class ExtractionResultResponse(BaseModel): store_name: str | None address: str | None store_name_evidence: str | None address_evidence: str | None certainty: Literal["high", "medium", "low"] + places: list[ExtractedPlaceResponse] = Field(default_factory=list) class PlaceCandidateResponse(BaseModel): @@ -69,6 +78,7 @@ class JobResultResponse(BaseModel): extraction_result: ExtractionResultResponse | None = None place_candidates: list[PlaceCandidateResponse] = Field(default_factory=list) selected_place: PlaceCandidateResponse | None = None + selected_places: list[PlaceCandidateResponse] = Field(default_factory=list) error_message: str | None updated_at: datetime diff --git a/app/worker/processor.py b/app/worker/processor.py index 119e8ab..1d6387f 100644 --- a/app/worker/processor.py +++ b/app/worker/processor.py @@ -13,11 +13,13 @@ from app.domain.job import ( CrawlArtifact, ExtractedCandidate, + ExtractedPlace, ExtractionResult, JobRecord, PlaceCandidate, as_extraction_result_dict, as_place_dict, + extracted_places_from_result, ) from app.infra.kakao import KakaoNonRetryableError @@ -94,15 +96,16 @@ async def process_job(self, job_id: UUID) -> JobProcessOutcome: try: crawl_artifact = await crawl_and_parse(job.source_url, self._settings) extraction_result = await self._extract_result(job.source_url, crawl_artifact) - place_candidates, selected_place = await self._enrich_place( + place_candidates, selected_place, selected_places = await self._enrich_place( extraction_result, crawl_artifact, ) logger.info( - "job crawl completed job_id=%s caption_len=%s place_candidates=%s", + "job crawl completed job_id=%s caption_len=%s place_candidates=%s selected_places=%s", job.job_id, len(crawl_artifact.caption or ""), len(place_candidates), + len(selected_places), ) await self._repository.upsert_job_result( @@ -114,6 +117,7 @@ async def process_job(self, job_id: UUID) -> JobProcessOutcome: ), place_candidates=place_candidates, selected_place=selected_place, + selected_places=selected_places, ) await self._repository.mark_succeeded(job.job_id) elapsed_ms = int((time.monotonic() - started) * 1000) @@ -160,45 +164,82 @@ async def _enrich_place( self, extraction_result: ExtractionResult | None, crawl_artifact: CrawlArtifact, - ) -> tuple[list[dict[str, object]], dict[str, object] | None]: + ) -> tuple[list[dict[str, object]], dict[str, object] | None, list[dict[str, object]]]: if not self._place_search_client or not extraction_result: - return [], None - - store_name = (extraction_result.store_name or "").strip() + return [], None, [] + + extracted_places = extracted_places_from_result(extraction_result) + if not extracted_places: + return [], None, [] + + all_places: list[PlaceCandidate] = [] + selected_places: list[dict[str, object]] = [] + seen_candidate_keys: set[str] = set() + seen_selected_keys: set[str] = set() + + max_places = max(1, self._settings.extraction_max_candidates) + for extracted_place in extracted_places[:max_places]: + candidate = self._build_extracted_candidate(extracted_place, crawl_artifact) + if not candidate: + continue + location_hints = self._build_location_hints(extracted_place.address) + + try: + places = await self._search_places_by_hints(candidate, location_hints) + except KakaoNonRetryableError: + logger.error("kakao enrichment non-retryable failure", exc_info=True) + return [], None, [] + except Exception: + logger.exception( + "kakao enrichment failed source_keyword=%s", + candidate.source_keyword, + ) + continue + + places = sorted(places, key=lambda place: place.confidence, reverse=True) + if places: + selected_place = as_place_dict(places[0]) + selected_key = self._place_dedupe_key(places[0]) + if selected_key not in seen_selected_keys: + selected_places.append(selected_place) + seen_selected_keys.add(selected_key) + + for place in places: + candidate_key = self._place_dedupe_key(place) + if candidate_key in seen_candidate_keys: + continue + all_places.append(place) + seen_candidate_keys.add(candidate_key) + + place_candidates = [as_place_dict(place) for place in all_places] + selected_place = selected_places[0] if selected_places else None + return place_candidates, selected_place, selected_places + + def _build_extracted_candidate( + self, + extracted_place: ExtractedPlace, + crawl_artifact: CrawlArtifact, + ) -> ExtractedCandidate | None: + store_name = (extracted_place.store_name or "").strip() if not store_name: - return [], None - - candidate = ExtractedCandidate( + return None + return ExtractedCandidate( keyword=store_name, source_keyword=store_name, source_sentence=( - extraction_result.store_name_evidence - or extraction_result.address_evidence + extracted_place.store_name_evidence + or extracted_place.address_evidence or crawl_artifact.caption or "" ), raw_candidate=store_name, ) - location_hints = self._build_location_hints(extraction_result.address) - try: - places = await self._search_places_by_hints(candidate, location_hints) - except KakaoNonRetryableError: - logger.error("kakao enrichment non-retryable failure", exc_info=True) - return [], None - except Exception: - logger.exception("kakao enrichment failed") - return [], None - - places = sorted(places, key=lambda place: place.confidence, reverse=True) - places = [ - place - for place in places - if place.confidence >= self._settings.kakao_min_place_confidence - ] - place_candidates = [as_place_dict(place) for place in places] - selected_place = place_candidates[0] if place_candidates else None - return place_candidates, selected_place + @staticmethod + def _place_dedupe_key(place: PlaceCandidate) -> str: + if place.kakao_place_id: + return f"id:{place.kakao_place_id}" + return f"name:{place.place_name}|{place.address_name}|{place.road_address_name}" async def _search_places( self, diff --git a/migrations/003_add_selected_places_to_job_results.sql b/migrations/003_add_selected_places_to_job_results.sql new file mode 100644 index 0000000..5188dad --- /dev/null +++ b/migrations/003_add_selected_places_to_job_results.sql @@ -0,0 +1,7 @@ +ALTER TABLE processing.job_results +ADD COLUMN IF NOT EXISTS selected_places JSONB NOT NULL DEFAULT '[]'::jsonb; + +UPDATE processing.job_results +SET selected_places = jsonb_build_array(selected_place) +WHERE selected_place IS NOT NULL + AND selected_places = '[]'::jsonb; diff --git a/tests/test_extraction_schema.py b/tests/test_extraction_schema.py index 03207e0..6194c2a 100644 --- a/tests/test_extraction_schema.py +++ b/tests/test_extraction_schema.py @@ -25,6 +25,8 @@ def test_llm_response_normalizes_missing_fields_and_certainty() -> None: assert domain.store_name == "커먼맨션" assert domain.certainty is ExtractionCertainty.HIGH + assert len(domain.places) == 1 + assert domain.places[0].store_name == "커먼맨션" def test_llm_response_defaults_missing_certainty_to_low_in_domain() -> None: @@ -40,8 +42,40 @@ def test_llm_response_defaults_missing_certainty_to_low_in_domain() -> None: assert response.address == "서울 종로구 신문로2가 1-102" assert response.certainty is None assert domain.certainty is ExtractionCertainty.LOW + assert len(domain.places) == 1 + assert domain.places[0].address == "서울 종로구 신문로2가 1-102" def test_llm_response_rejects_unknown_certainty() -> None: with pytest.raises(ValidationError): ExtractionLLMResponse.model_validate({"certainty": "certain"}) + + +def test_llm_response_accepts_multiple_places_and_hashtag_store_name() -> None: + response = ExtractionLLMResponse.model_validate( + { + "places": [ + { + "store_name": "#플루밍", + "address": "서울 마포구 연남로13길 9 1층 101호", + "store_name_evidence": "#플루밍", + "address_evidence": "서울 마포구 연남로13길 9 1층 101호", + "certainty": "high", + }, + { + "store_name": "누크녹", + "address": "서울 마포구 성미산로 190-31 2층", + "store_name_evidence": "❷ 누크녹", + "address_evidence": "서울 마포구 성미산로 190-31 2층", + "certainty": "high", + }, + ] + } + ) + + domain = response.to_domain() + + assert domain.store_name == "플루밍" + assert domain.address == "서울 마포구 연남로13길 9 1층 101호" + assert [place.store_name for place in domain.places] == ["플루밍", "누크녹"] + assert domain.places[0].store_name_evidence == "#플루밍" diff --git a/tests/test_hf_extraction_client.py b/tests/test_hf_extraction_client.py index 55fc290..f785982 100644 --- a/tests/test_hf_extraction_client.py +++ b/tests/test_hf_extraction_client.py @@ -14,6 +14,7 @@ extract_json_object, extract_text_from_hf_payload, ) +from app.infra.llm.client import build_extraction_system_prompt def _run(coro): @@ -27,6 +28,7 @@ def _settings() -> Settings: return Settings( hf_extraction_endpoint_url="https://example.test/hf", hf_extraction_api_token="test-token", + hf_extraction_max_new_tokens=1024, ) @@ -84,11 +86,13 @@ async def handler(request: httpx.Request) -> httpx.Response: assert result is not None assert result.store_name == "Common Mansion" assert result.certainty is ExtractionCertainty.HIGH + assert len(result.places) == 1 + assert result.places[0].store_name == "Common Mansion" assert seen_requests[0]["messages"][1]["content"] == ( "Common Mansion 1-102 Sinmunro 2-ga, Jongno-gu, Seoul" ) assert seen_requests[0]["temperature"] == 0.0 - assert seen_requests[0]["max_tokens"] == 512 + assert seen_requests[0]["max_tokens"] == 1024 def test_hf_extraction_client_accepts_long_realistic_caption() -> None: @@ -152,9 +156,66 @@ async def handler(request: httpx.Request) -> httpx.Response: assert result.store_name == "커먼맨션" assert result.address == "서울 종로구 신문로2가 1-102" assert result.certainty is ExtractionCertainty.HIGH + assert [place.store_name for place in result.places] == ["커먼맨션"] assert seen_requests[0]["messages"][1]["content"] == long_caption +def test_hf_extraction_client_returns_multiple_domain_places() -> None: + async def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, + json={ + "generated_text": json.dumps( + { + "places": [ + { + "store_name": "#플루밍", + "address": "서울 마포구 연남로13길 9 1층 101호", + "store_name_evidence": "#플루밍", + "address_evidence": "서울 마포구 연남로13길 9 1층 101호", + "certainty": "high", + }, + { + "store_name": "누크녹", + "address": "서울 마포구 성미산로 190-31 2층", + "store_name_evidence": "❷ 누크녹", + "address_evidence": "서울 마포구 성미산로 190-31 2층", + "certainty": "high", + }, + ] + }, + ensure_ascii=False, + ) + }, + ) + + extractor = HFExtractionClient( + _settings(), + transport=httpx.MockTransport(handler), + ) + + result = _run( + extractor.extract( + text="#플루밍\n서울 마포구 연남로13길 9 1층 101호\n❷ 누크녹", + source_url="https://www.instagram.com/reel/example/", + media_type="reel", + ) + ) + + assert result is not None + assert result.store_name == "플루밍" + assert result.address == "서울 마포구 연남로13길 9 1층 101호" + assert [place.store_name for place in result.places] == ["플루밍", "누크녹"] + + +def test_build_extraction_system_prompt_mentions_hashtag_store_names() -> None: + prompt = build_extraction_system_prompt(6) + + assert "hashtag" in prompt + assert "remove the leading #" in prompt + assert "up to 6 places" in prompt + + def test_hf_extraction_client_raises_on_http_error() -> None: async def handler(request: httpx.Request) -> httpx.Response: return httpx.Response(500, json={"error": "temporary failure"}) diff --git a/tests/test_hf_kakao_pipeline_live.py b/tests/test_hf_kakao_pipeline_live.py new file mode 100644 index 0000000..ca4f8a7 --- /dev/null +++ b/tests/test_hf_kakao_pipeline_live.py @@ -0,0 +1,350 @@ +from __future__ import annotations + +import asyncio +import json +import os +from dataclasses import dataclass +from pathlib import Path +from time import perf_counter + +import pytest + +from app.core.config import Settings +from app.domain.job import ( + CrawlArtifact, + ExtractedCandidate, + as_extraction_result_dict, + as_place_dict, +) +from app.infra.kakao import KakaoLocalClient +from app.infra.llm import HFExtractionClient +from app.worker.processor import JobProcessor + +if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"): + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + + +RUN_LIVE_TESTS = os.getenv("RUN_LIVE_HF_KAKAO_TESTS") == "1" +ARTIFACT_PATH = Path("artifacts") / "hf_kakao_pipeline_live_results.json" + + +MULTI_PLACE_CAPTION = """🍰먹기 전에 한 번 더 고민하게 되는 순간 +서울에서 만나는 비주얼 디저트 카페들🍓 + +디저트는 맛도 중요하지만, +요즘은 눈부터 만족시켜주는 게 먼저인 느낌 + +접시에 담긴 색감, 디테일 하나하나 +사진부터 찍게 되는 디저트들📸 + +보기만 해도 기분 좋아지는 디저트들로 +식후를 더 길게 만들 카페들 모아봤어요🍰 + +🗒️ 브랜드 정보 +❶ 플루밍 +📍서울 마포구 연남로13길 9 1층 101호 +🍰토~수 12:00 ~ 19:30 +🍰목~금 12:00 ~ 21:00 +🍰매주 월, 화 정기휴무 + +❷ 누크녹 +📍서울 마포구 성미산로 190-31 2층 +🍰12:30 ~ 20:30 +🍰매주 월 정기휴무 + +❸ 예챠 +📍서울 마포구 망원로7길 31-18 1층 102호 +🍰12:00 ~ 19:00 +🍰매주 월 정기휴무 + +❹ 라뚜셩트 +📍서울 서초구 방배로25길 50 1층 +🍰월~목 08:00 ~ 19:00 +🍰금~일 08:00 ~ 20:00 + +❺ 코이크 +📍서울 마포구 동교로39길 8 1-2층 +🍰월~목 12:00 ~ 20:30 +🍰금~일 12:00 ~ 21:30 + +❻ 카페토요 +📍서울 영등포구 도림로 436-7 1층 +🍰12:00 ~ 20:00 +🍰매주 월 정기휴무 + +이미지 | 각 브랜드 채널 + +요즘 감성 핫플 한눈에 보고 싶다면? @eateat.mag +데이트·여행 등 전국 ‘핫한 정보’ 필요하면? @eateat.mag +놓치면 후회할 핫플 리스트 @eateat.mag + +#서울디저트 #서울카페 #연남카페 #망원카페""" + + +SINGLE_PLACE_CAPTION = """실제 광화문 직장인 지인이 여기가 최고라고 소개해줘서 알게 된 집 + +실내 분위기 너무 좋았던 브런치 맛집 커먼맨션 입니다 + +샌드위치에 샐러드 파스타 이렇게 3종류로 크게 나눌 수 있는데 샌드위치 먹고 있으면 샌드위치 전문점인 거 같고 + +샐러드 먹으면 샐러드 파스타면 파스타 + +모든 메뉴가 전문점 수준으로 너무 맛있어서 정말 대만족했던 집 입니다 + +광화문 직장인 상권이다보니 점심시간에 가면 웨이팅이 심해서 못 먹고 올 수 있으니까 + +방문 예정이시면 꼭 예약을 미리 하고 가시는 걸 추천 드릴게요 + +실제 근처 직장인이시라면 점심 혹은 미팅 잡기도 정말 좋은 곳 일 + +거 같아요 + +• 커먼맨션 + +서울 종로구 신문로2가 1-102 +10:00 - 21:00 +20:00 라스트오더""" + + +@dataclass(frozen=True) +class LivePipelineCase: + case_id: str + caption: str + expected_place_names: list[str] + + +class RecordingKakaoLocalClient: + def __init__(self, client: KakaoLocalClient, settings: Settings) -> None: + self._client = client + self._settings = settings + self.calls: list[dict[str, object]] = [] + + async def search_places( + self, + candidate: ExtractedCandidate, + location_hints: list[str], + ): + started = perf_counter() + result = await self._client.search_places(candidate, location_hints) + elapsed_ms = int((perf_counter() - started) * 1000) + qualified = [ + place + for place in result.places + if place.confidence >= self._settings.kakao_min_place_confidence + ] + self.calls.append( + { + "keyword": candidate.keyword, + "location_hints": location_hints, + "elapsed_ms": elapsed_ms, + "returned_count": len(result.places), + "qualified_count": len(qualified), + "returned_places": [as_place_dict(place) for place in result.places], + "qualified_places": [as_place_dict(place) for place in qualified], + } + ) + return result + + +def _run(coro): + try: + return asyncio.run(coro) + except OSError as exc: + pytest.skip(f"Event loop creation is blocked in this environment: {exc}") + + +def _settings_for_live_test() -> Settings: + configured = Settings() + settings = Settings( + hf_extraction_timeout_seconds=60, + hf_extraction_max_new_tokens=max(2048, configured.hf_extraction_max_new_tokens), + kakao_timeout_seconds=10, + ) + missing = [] + if not settings.hf_extraction_endpoint_url: + missing.append("HF_EXTRACTION_ENDPOINT_URL") + if not settings.hf_extraction_api_token: + missing.append("HF_EXTRACTION_API_TOKEN") + if not settings.kakao_rest_api_key: + missing.append("KAKAO_REST_API_KEY") + if missing: + pytest.skip(f"Live HF/Kakao test credentials are missing: {', '.join(missing)}") + return settings + + +def _normalize_place_name(value: str | None) -> str: + return "".join((value or "").lower().split()) + + +def _contains_place_name(actual_names: list[str | None], expected_name: str) -> bool: + expected = _normalize_place_name(expected_name) + return any(expected in _normalize_place_name(actual) for actual in actual_names) + + +async def _run_live_pipeline_case( + case: LivePipelineCase, + settings: Settings, +) -> dict[str, object]: + extractor = HFExtractionClient(settings) + kakao = RecordingKakaoLocalClient(KakaoLocalClient(settings), settings) + processor = JobProcessor( + repository=None, # type: ignore[arg-type] + settings=settings, + place_search_client=kakao, + ) + + started = perf_counter() + extraction_result = await extractor.extract( + text=case.caption, + source_url=f"https://www.instagram.com/reel/live-{case.case_id}/", + media_type="reel", + ) + extraction_elapsed_ms = int((perf_counter() - started) * 1000) + + crawl_artifact = CrawlArtifact( + url=f"https://www.instagram.com/reel/live-{case.case_id}/", + html=None, + text=case.caption, + media_type="reel", + caption=case.caption, + instagram_meta={"caption": case.caption}, + ) + place_candidates, selected_place, selected_places = await processor._enrich_place( + extraction_result, + crawl_artifact, + ) + + extraction_dict = as_extraction_result_dict(extraction_result) if extraction_result else None + extracted_names = [ + place.get("store_name") + for place in (extraction_dict or {}).get("places", []) + if isinstance(place, dict) + ] + selected_names = [place.get("place_name") for place in selected_places] + extraction_matches = { + name: _contains_place_name(extracted_names, name) + for name in case.expected_place_names + } + selected_matches = { + name: _contains_place_name(selected_names, name) + for name in case.expected_place_names + } + + return { + "case_id": case.case_id, + "caption": case.caption, + "expected_place_names": case.expected_place_names, + "extraction_elapsed_ms": extraction_elapsed_ms, + "extraction_result": extraction_dict, + "extracted_names": extracted_names, + "extraction_matches": extraction_matches, + "kakao_calls": kakao.calls, + "place_candidates": place_candidates, + "place_candidate_count": len(place_candidates), + "selected_place": selected_place, + "selected_places": selected_places, + "selected_place_count": len(selected_places), + "selected_matches": selected_matches, + } + + +@pytest.mark.skipif( + not RUN_LIVE_TESTS, + reason="Set RUN_LIVE_HF_KAKAO_TESTS=1 to call live HF and Kakao APIs.", +) +def test_live_hf_extraction_to_kakao_search_writes_artifact() -> None: + settings = _settings_for_live_test() + cases = [ + LivePipelineCase( + case_id="dessert_cafes_multi_place", + caption=MULTI_PLACE_CAPTION, + expected_place_names=["플루밍", "누크녹", "예챠", "라뚜셩트", "코이크", "카페토요"], + ), + LivePipelineCase( + case_id="common_mansion_single_place", + caption=SINGLE_PLACE_CAPTION, + expected_place_names=["커먼맨션"], + ), + ] + + results = _run(_run_all_live_pipeline_cases(cases, settings)) + ARTIFACT_PATH.parent.mkdir(parents=True, exist_ok=True) + ARTIFACT_PATH.write_text( + json.dumps( + { + "settings": { + "hf_extraction_endpoint_url_configured": bool( + settings.hf_extraction_endpoint_url + ), + "hf_extraction_model_name": settings.hf_extraction_model_name, + "hf_extraction_max_new_tokens": settings.hf_extraction_max_new_tokens, + "kakao_base_url": settings.kakao_base_url, + "kakao_max_places_per_candidate": settings.kakao_max_places_per_candidate, + "kakao_min_place_confidence": settings.kakao_min_place_confidence, + }, + "results": results, + }, + ensure_ascii=False, + indent=2, + ) + + "\n", + encoding="utf-8", + ) + + failures: list[str] = [] + for result in results: + case_id = result["case_id"] + error = result.get("error") + if error: + failures.append(f"{case_id} error: {error}") + continue + missing_extractions = [ + name + for name, matched in result["extraction_matches"].items() + if not matched + ] + missing_selections = [ + name + for name, matched in result["selected_matches"].items() + if not matched + ] + if missing_extractions: + failures.append(f"{case_id} missing extraction: {missing_extractions}") + if missing_selections: + failures.append(f"{case_id} missing selected Kakao match: {missing_selections}") + + assert not failures, f"Live HF/Kakao pipeline mismatches. See {ARTIFACT_PATH}: {failures}" + + +async def _run_all_live_pipeline_cases( + cases: list[LivePipelineCase], + settings: Settings, +) -> list[dict[str, object]]: + results: list[dict[str, object]] = [] + for case in cases: + try: + results.append(await _run_live_pipeline_case(case, settings)) + except Exception as exc: # noqa: BLE001 + results.append( + { + "case_id": case.case_id, + "caption": case.caption, + "expected_place_names": case.expected_place_names, + "error": f"{type(exc).__name__}: {exc}", + "extraction_result": None, + "extracted_names": [], + "extraction_matches": { + name: False for name in case.expected_place_names + }, + "kakao_calls": [], + "place_candidates": [], + "place_candidate_count": 0, + "selected_place": None, + "selected_places": [], + "selected_place_count": 0, + "selected_matches": { + name: False for name in case.expected_place_names + }, + } + ) + return results diff --git a/tests/test_job_repository.py b/tests/test_job_repository.py index c8b8e4a..671efc1 100644 --- a/tests/test_job_repository.py +++ b/tests/test_job_repository.py @@ -52,6 +52,7 @@ async def fetchrow(self, sql: str, *args): "extraction_result": args[3], "place_candidates": args[4], "selected_place": args[5], + "selected_places": args[6], "created_at": now, "updated_at": now, } @@ -68,6 +69,15 @@ def test_upsert_job_result_persists_extraction_result() -> None: "store_name_evidence": "Common Mansion", "address_evidence": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", "certainty": "high", + "places": [ + { + "store_name": "Common Mansion", + "address": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", + "store_name_evidence": "Common Mansion", + "address_evidence": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", + "certainty": "high", + } + ], } selected_place = { "kakao_place_id": "123", @@ -95,6 +105,7 @@ def test_upsert_job_result_persists_extraction_result() -> None: extraction_result=extraction_result, place_candidates=[selected_place], selected_place=selected_place, + selected_places=[selected_place], ) ) @@ -108,10 +119,12 @@ def test_upsert_job_result_persists_extraction_result() -> None: json.dumps(extraction_result), json.dumps([selected_place]), json.dumps(selected_place), + json.dumps([selected_place]), ) assert record.extraction_result == extraction_result assert record.place_candidates == [selected_place] assert record.selected_place == selected_place + assert record.selected_places == [selected_place] @pytest.mark.skipif(not EVENT_LOOP_AVAILABLE, reason="Event loop creation is blocked in this environment") @@ -124,6 +137,7 @@ def test_get_job_result_maps_extraction_result() -> None: "store_name_evidence": None, "address_evidence": None, "certainty": "low", + "places": [], } pool = FakePool( { @@ -133,6 +147,7 @@ def test_get_job_result_maps_extraction_result() -> None: "extraction_result": json.dumps(extraction_result), "place_candidates": json.dumps([]), "selected_place": None, + "selected_places": json.dumps([]), "created_at": now, "updated_at": now, } @@ -145,3 +160,4 @@ def test_get_job_result_maps_extraction_result() -> None: assert record.extraction_result == extraction_result assert record.place_candidates == [] assert record.selected_place is None + assert record.selected_places == [] diff --git a/tests/test_job_result_schema.py b/tests/test_job_result_schema.py index 66da15c..4d32b7c 100644 --- a/tests/test_job_result_schema.py +++ b/tests/test_job_result_schema.py @@ -21,6 +21,15 @@ def test_job_result_response_accepts_extraction_result() -> None: "store_name_evidence": "• 커먼맨션", "address_evidence": "서울 종로구 신문로2가 1-102", "certainty": "high", + "places": [ + { + "store_name": "커먼맨션", + "address": "서울 종로구 신문로2가 1-102", + "store_name_evidence": "• 커먼맨션", + "address_evidence": "서울 종로구 신문로2가 1-102", + "certainty": "high", + } + ], }, error_message=None, updated_at=datetime.now(timezone.utc), @@ -30,6 +39,7 @@ def test_job_result_response_accepts_extraction_result() -> None: assert dumped["extraction_result"]["store_name"] == "커먼맨션" assert dumped["extraction_result"]["certainty"] == "high" + assert dumped["extraction_result"]["places"][0]["store_name"] == "커먼맨션" def test_job_result_response_allows_missing_extraction_result() -> None: @@ -47,6 +57,7 @@ def test_job_result_response_allows_missing_extraction_result() -> None: assert response.extraction_result is None assert response.place_candidates == [] assert response.selected_place is None + assert response.selected_places == [] def test_job_result_response_accepts_kakao_place_result() -> None: @@ -78,6 +89,7 @@ def test_job_result_response_accepts_kakao_place_result() -> None: extraction_result=None, place_candidates=[selected_place], selected_place=selected_place, + selected_places=[selected_place], error_message=None, updated_at=datetime.now(timezone.utc), ) @@ -85,5 +97,6 @@ def test_job_result_response_accepts_kakao_place_result() -> None: dumped = response.model_dump() assert dumped["selected_place"]["place_name"] == "커먼맨션" + assert dumped["selected_places"][0]["place_name"] == "커먼맨션" assert dumped["selected_place"]["category_group_code"] == "CE7" assert dumped["place_candidates"][0]["road_address_name"] == "서울 종로구 새문안로 1" diff --git a/tests/test_jobs_api_result.py b/tests/test_jobs_api_result.py index c271f16..595e4dc 100644 --- a/tests/test_jobs_api_result.py +++ b/tests/test_jobs_api_result.py @@ -39,6 +39,15 @@ def test_get_job_result_returns_extraction_result() -> None: "store_name_evidence": "Common Mansion", "address_evidence": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", "certainty": "high", + "places": [ + { + "store_name": "Common Mansion", + "address": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", + "store_name_evidence": "Common Mansion", + "address_evidence": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", + "certainty": "high", + } + ], } selected_place = { "kakao_place_id": "123", @@ -78,6 +87,7 @@ def test_get_job_result_returns_extraction_result() -> None: extraction_result=extraction_result, place_candidates=[selected_place], selected_place=selected_place, + selected_places=[selected_place], created_at=now, updated_at=now, ) @@ -93,3 +103,4 @@ def test_get_job_result_returns_extraction_result() -> None: assert response.json()["extraction_result"] == extraction_result assert response.json()["place_candidates"] == [selected_place] assert response.json()["selected_place"] == selected_place + assert response.json()["selected_places"] == [selected_place] diff --git a/tests/test_worker_processor.py b/tests/test_worker_processor.py index 7c9308f..11215ac 100644 --- a/tests/test_worker_processor.py +++ b/tests/test_worker_processor.py @@ -10,6 +10,7 @@ from app.core.config import Settings from app.domain.job import ( CrawlArtifact, + ExtractedPlace, ExtractionCertainty, ExtractionResult, JobRecord, @@ -130,6 +131,22 @@ async def search_places(self, candidate, location_hints: list[str]) -> FakePlace return FakePlaceSearchResult(self.places_by_hint.get(tuple(location_hints), [])) +class KeywordAwarePlaceSearchClient: + def __init__(self, places_by_keyword: dict[str, list[PlaceCandidate]]) -> None: + self.places_by_keyword = places_by_keyword + self.calls: list[dict[str, object]] = [] + + async def search_places(self, candidate, location_hints: list[str]) -> FakePlaceSearchResult: + self.calls.append( + { + "keyword": candidate.keyword, + "source_keyword": candidate.source_keyword, + "location_hints": location_hints, + } + ) + return FakePlaceSearchResult(self.places_by_keyword.get(candidate.keyword, [])) + + class FailingPlaceSearchClient: async def search_places(self, candidate, location_hints: list[str]) -> FakePlaceSearchResult: raise RuntimeError("kakao unavailable") @@ -148,23 +165,31 @@ def _new_job() -> JobRecord: ) -def _place_candidate(*, confidence: float = 0.95) -> PlaceCandidate: +def _place_candidate( + *, + confidence: float = 0.95, + kakao_place_id: str = "123", + place_name: str = "Common Mansion", + source_keyword: str = "Common Mansion", + address_name: str = "Seoul Jongno-gu Sinmunro 2-ga 1-102", + road_address_name: str = "Seoul Jongno-gu Saemunan-ro 1", +) -> PlaceCandidate: return PlaceCandidate( - kakao_place_id="123", - place_name="Common Mansion", + kakao_place_id=kakao_place_id, + place_name=place_name, category_name="Food > Cafe", category_group_code="CE7", category_group_name="Cafe", phone="02-0000-0000", - address_name="Seoul Jongno-gu Sinmunro 2-ga 1-102", - road_address_name="Seoul Jongno-gu Saemunan-ro 1", + address_name=address_name, + road_address_name=road_address_name, x="126.970000", y="37.570000", - place_url="https://place.map.kakao.com/123", + place_url=f"https://place.map.kakao.com/{kakao_place_id}", confidence=confidence, - source_keyword="Common Mansion", - source_sentence="Common Mansion 1-102 Sinmunro 2-ga", - raw_candidate="Common Mansion", + source_keyword=source_keyword, + source_sentence=f"{source_keyword} 1-102 Sinmunro 2-ga", + raw_candidate=source_keyword, ) @@ -250,6 +275,15 @@ async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact: "store_name_evidence": "Common Mansion", "address_evidence": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", "certainty": "high", + "places": [ + { + "store_name": "Common Mansion", + "address": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", + "store_name_evidence": "Common Mansion", + "address_evidence": "1-102 Sinmunro 2-ga, Jongno-gu, Seoul", + "certainty": "high", + } + ], } assert repo.failed is None @@ -303,6 +337,7 @@ async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact: ] assert repo.saved_result is not None assert repo.saved_result["selected_place"]["confidence"] == 0.95 + assert repo.saved_result["selected_places"][0]["confidence"] == 0.95 def test_build_location_hints_from_korean_address() -> None: @@ -334,8 +369,8 @@ def test_processor_enriches_place_from_extraction_result(monkeypatch) -> None: ) place_search = FakePlaceSearchClient( [ - _place_candidate(confidence=0.75), - _place_candidate(confidence=0.95), + _place_candidate(confidence=0.75, kakao_place_id="122"), + _place_candidate(confidence=0.95, kakao_place_id="123"), ] ) @@ -372,6 +407,92 @@ async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact: assert len(repo.saved_result["place_candidates"]) == 2 assert repo.saved_result["selected_place"]["confidence"] == 0.95 assert repo.saved_result["selected_place"]["kakao_place_id"] == "123" + assert repo.saved_result["selected_places"][0]["confidence"] == 0.95 + assert repo.failed is None + + +@pytest.mark.skipif(not EVENT_LOOP_AVAILABLE, reason="Event loop creation is blocked in this environment") +def test_processor_enriches_multiple_places_from_extraction_result(monkeypatch) -> None: + job = _new_job() + repo = FakeRepository(job) + settings = Settings() + extracted_places = [ + ("플루밍", "서울 마포구 연남로13길 9 1층 101호"), + ("누크녹", "서울 마포구 성미산로 190-31 2층"), + ("예챠", "서울 마포구 망원로7길 31-18 1층 102호"), + ("라뚜셩트", "서울 서초구 방배로25길 50 1층"), + ("코이크", "서울 마포구 동교로39길 8 1-2층"), + ("카페토요", "서울 영등포구 도림로 436-7 1층"), + ] + extractor = FakeExtractionClient( + ExtractionResult( + store_name="플루밍", + address="서울 마포구 연남로13길 9 1층 101호", + store_name_evidence="❶ 플루밍", + address_evidence="📍서울 마포구 연남로13길 9 1층 101호", + certainty=ExtractionCertainty.HIGH, + places=[ + ExtractedPlace( + store_name=name, + address=address, + store_name_evidence=name, + address_evidence=address, + certainty=ExtractionCertainty.HIGH, + ) + for name, address in extracted_places + ], + ) + ) + place_search = KeywordAwarePlaceSearchClient( + { + name: [ + _place_candidate( + kakao_place_id=str(index), + place_name=name, + source_keyword=name, + address_name=address, + road_address_name=address, + ) + ] + for index, (name, address) in enumerate(extracted_places, start=1) + } + ) + + async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact: + return CrawlArtifact( + url=url, + html=None, + text="서울에서 만나는 비주얼 디저트 카페들", + media_type="reel", + caption="서울에서 만나는 비주얼 디저트 카페들", + instagram_meta=None, + ) + + monkeypatch.setattr("app.worker.processor.crawl_and_parse", fake_crawl) + + processor = JobProcessor( + repository=repo, + settings=settings, + extraction_client=extractor, + place_search_client=place_search, + ) + + _run(processor.process_job(job.job_id)) + + assert repo.succeeded is True + assert repo.saved_result is not None + assert [call["keyword"] for call in place_search.calls] == [ + name for name, _ in extracted_places + ] + assert len(repo.saved_result["place_candidates"]) == 6 + assert [place["place_name"] for place in repo.saved_result["selected_places"]] == [ + name for name, _ in extracted_places + ] + assert repo.saved_result["selected_place"]["place_name"] == "플루밍" + assert [ + place["store_name"] + for place in repo.saved_result["extraction_result"]["places"] + ] == [name for name, _ in extracted_places] assert repo.failed is None @@ -415,6 +536,7 @@ async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact: assert repo.saved_result is not None assert repo.saved_result["place_candidates"] == [] assert repo.saved_result["selected_place"] is None + assert repo.saved_result["selected_places"] == [] assert repo.failed is None @@ -458,6 +580,7 @@ async def fake_crawl(url: str, _settings: Settings) -> CrawlArtifact: assert repo.saved_result is not None assert repo.saved_result["place_candidates"] == [] assert repo.saved_result["selected_place"] is None + assert repo.saved_result["selected_places"] == [] assert repo.failed is None