Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/api/v1/endpoints/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ async def get_job_result(
extraction_result=result.extraction_result if result else None,
place_candidates=result.place_candidates if result else [],
selected_place=result.selected_place if result else None,
selected_places=result.selected_places if result else [],
error_message=job.error_message,
updated_at=job.updated_at,
)
2 changes: 1 addition & 1 deletion app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class Settings(BaseSettings):
hf_extraction_api_token: str = ""
hf_extraction_model_name: str = "Qwen/Qwen2.5-3B-Instruct"
hf_extraction_timeout_seconds: int = 20
hf_extraction_max_new_tokens: int = 512
hf_extraction_max_new_tokens: int = 2048

@field_validator("processing_schema")
@classmethod
Expand Down
6 changes: 6 additions & 0 deletions app/domain/job/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,35 @@
CrawlArtifact,
ExtractionCertainty,
ExtractionResult,
ExtractedPlace,
ExtractedCandidate,
JobRecord,
JobResultRecord,
JobStatus,
PlaceCandidate,
as_candidate_dict,
as_extracted_place_dict,
as_extraction_result_dict,
as_place_dict,
extracted_places_from_result,
)
from app.domain.job.service import CreateJobCommand, InvalidJobRequest, JobService

__all__ = [
"CrawlArtifact",
"ExtractionCertainty",
"ExtractionResult",
"ExtractedPlace",
"ExtractedCandidate",
"JobRecord",
"JobResultRecord",
"JobStatus",
"PlaceCandidate",
"as_candidate_dict",
"as_extracted_place_dict",
"as_extraction_result_dict",
"as_place_dict",
"extracted_places_from_result",
"CreateJobCommand",
"InvalidJobRequest",
"JobService",
Expand Down
50 changes: 49 additions & 1 deletion app/domain/job/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from dataclasses import dataclass
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any
Expand All @@ -20,13 +20,23 @@ class ExtractionCertainty(str, Enum):
LOW = "low"


@dataclass(slots=True)
class ExtractedPlace:
store_name: str | None
address: str | None
store_name_evidence: str | None
address_evidence: str | None
certainty: ExtractionCertainty


@dataclass(slots=True)
class ExtractionResult:
store_name: str | None
address: str | None
store_name_evidence: str | None
address_evidence: str | None
certainty: ExtractionCertainty
places: list[ExtractedPlace] = field(default_factory=list)


@dataclass(slots=True)
Expand All @@ -48,6 +58,7 @@ class JobResultRecord:
extraction_result: dict[str, Any] | None
place_candidates: list[dict[str, Any]]
selected_place: dict[str, Any] | None
selected_places: list[dict[str, Any]]
created_at: datetime
updated_at: datetime

Expand Down Expand Up @@ -118,11 +129,48 @@ def as_candidate_dict(candidate: ExtractedCandidate) -> dict[str, Any]:
}


def as_extracted_place_dict(place: ExtractedPlace) -> dict[str, Any]:
return {
"store_name": place.store_name,
"address": place.address,
"store_name_evidence": place.store_name_evidence,
"address_evidence": place.address_evidence,
"certainty": place.certainty.value,
}


def extracted_places_from_result(result: ExtractionResult) -> list[ExtractedPlace]:
if result.places:
return result.places
if not any(
(
result.store_name,
result.address,
result.store_name_evidence,
result.address_evidence,
)
):
return []
return [
ExtractedPlace(
store_name=result.store_name,
address=result.address,
store_name_evidence=result.store_name_evidence,
address_evidence=result.address_evidence,
certainty=result.certainty,
)
]


def as_extraction_result_dict(result: ExtractionResult) -> dict[str, Any]:
return {
"store_name": result.store_name,
"address": result.address,
"store_name_evidence": result.store_name_evidence,
"address_evidence": result.address_evidence,
"certainty": result.certainty.value,
"places": [
as_extracted_place_dict(place)
for place in extracted_places_from_result(result)
],
}
16 changes: 14 additions & 2 deletions app/infra/db/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,19 +108,29 @@ async def upsert_job_result(
extraction_result: dict[str, Any] | None = None,
place_candidates: list[dict[str, Any]] | None = None,
selected_place: dict[str, Any] | None = None,
selected_places: list[dict[str, Any]] | None = None,
) -> JobResultRecord:
sql = f"""
INSERT INTO {self._results_table}
(job_id, caption, instagram_meta, extraction_result, place_candidates, selected_place)
(
job_id,
caption,
instagram_meta,
extraction_result,
place_candidates,
selected_place,
selected_places
)
VALUES
($1, $2, $3::jsonb, $4::jsonb, $5::jsonb, $6::jsonb)
($1, $2, $3::jsonb, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb)
ON CONFLICT (job_id)
DO UPDATE SET
caption = EXCLUDED.caption,
instagram_meta = EXCLUDED.instagram_meta,
extraction_result = EXCLUDED.extraction_result,
place_candidates = EXCLUDED.place_candidates,
selected_place = EXCLUDED.selected_place,
selected_places = EXCLUDED.selected_places,
updated_at = NOW()
RETURNING *
"""
Expand All @@ -132,6 +142,7 @@ async def upsert_job_result(
json.dumps(extraction_result) if extraction_result is not None else None,
json.dumps(place_candidates or []),
json.dumps(selected_place) if selected_place is not None else None,
json.dumps(selected_places or []),
)
if row is None:
raise RuntimeError("Failed to upsert job result")
Expand All @@ -156,6 +167,7 @@ def _to_job_result_record(self, row: asyncpg.Record) -> JobResultRecord:
extraction_result=self._json_to_dict(row["extraction_result"]),
place_candidates=self._json_to_list(row["place_candidates"]),
selected_place=self._json_to_dict(row["selected_place"]),
selected_places=self._json_to_list(row["selected_places"]),
created_at=row["created_at"],
updated_at=row["updated_at"],
)
Expand Down
65 changes: 56 additions & 9 deletions app/infra/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,43 @@
from app.domain.job import ExtractionResult
from app.schemas.extraction import ExtractionLLMResponse

EXTRACTION_SYSTEM_PROMPT = (
"You extract store information from Korean restaurant social media captions. "
"Return only one JSON object with these exact keys: store_name, address, "
"store_name_evidence, address_evidence, certainty. Use null when a value is "
"unknown. Evidence values must be substrings copied from the input caption. "
"certainty must be one of high, medium, or low. Do not include explanations, "
"Markdown, or any text outside the JSON object."
EXTRACTION_SYSTEM_PROMPT_TEMPLATE = (
"You extract place/store information from Korean social media captions. "
"Return only one JSON object with these exact top-level keys: store_name, "
"address, store_name_evidence, address_evidence, certainty, places. "
"places must be an array of objects. Each place object must have these exact "
"keys: store_name, address, store_name_evidence, address_evidence, certainty. "
"Extract every distinct place/store/brand that appears to be a visitable local "
"business, up to {max_candidates} places, preserving caption order. Captions "
"may contain numbered lists such as 1, 2, circled numbers, or sections such as "
"brand information, store information, or place information. When a place name "
"line is followed by an address line, pair them together. Address lines often "
"start with map-pin markers, address/location labels, or Korean address units "
"such as city, gu, gun, dong, eup, myeon, ri, ga, ro, or gil. A hashtag can be "
"a real store name, for example #StoreName; consider it when it names a "
"specific local business. Do not extract generic regional/category/promotional "
"hashtags such as Seoul cafe, Yeonnam cafe, dessert, hot place, date course, "
"travel, recommendation, or account handles as store names. If a store name is "
"taken from a hashtag, remove the leading # in store_name but keep the original "
"hashtag substring in store_name_evidence. Do not invent missing values. Use "
"null when unknown. Evidence values must be exact substrings copied from the "
"input caption. certainty must be one of high, medium, or low. The top-level "
"legacy fields store_name, address, store_name_evidence, address_evidence, and "
"certainty must mirror the first item in places, or null/low when places is "
"empty. If no place is found, return places as an empty array. Do not include "
"explanations, Markdown, or any text outside the JSON object."
)


def build_extraction_system_prompt(max_candidates: int) -> str:
return EXTRACTION_SYSTEM_PROMPT_TEMPLATE.format(
max_candidates=max(1, max_candidates),
)


EXTRACTION_SYSTEM_PROMPT = build_extraction_system_prompt(12)


class HFExtractionError(Exception):
pass

Expand Down Expand Up @@ -85,9 +112,10 @@ async def extract(
generated_json = extract_json_object(generated_text)

try:
return ExtractionLLMResponse.model_validate(generated_json).to_domain()
result = ExtractionLLMResponse.model_validate(generated_json).to_domain()
except ValidationError as exc:
raise HFExtractionError("HF response failed schema validation") from exc
return self._limit_places(result)

def _build_payload(
self,
Expand All @@ -100,13 +128,32 @@ def _build_payload(
return {
"model": self._settings.hf_extraction_model_name,
"messages": [
{"role": "system", "content": EXTRACTION_SYSTEM_PROMPT},
{
"role": "system",
"content": build_extraction_system_prompt(
self._settings.extraction_max_candidates,
),
},
{"role": "user", "content": text},
],
"temperature": 0.0,
"max_tokens": self._settings.hf_extraction_max_new_tokens,
}

def _limit_places(self, result: ExtractionResult) -> ExtractionResult:
max_places = max(1, self._settings.extraction_max_candidates)
if len(result.places) <= max_places:
return result

result.places = result.places[:max_places]
first_place = result.places[0]
result.store_name = first_place.store_name
result.address = first_place.address
result.store_name_evidence = first_place.store_name_evidence
result.address_evidence = first_place.address_evidence
result.certainty = first_place.certainty
return result


def extract_text_from_hf_payload(payload: Any) -> str:
if isinstance(payload, str):
Expand Down
Loading
Loading