diff --git a/build.gradle b/build.gradle index fe060a4..384becd 100644 --- a/build.gradle +++ b/build.gradle @@ -44,7 +44,7 @@ dependencies { testImplementation 'org.jetbrains.kotlin:kotlin-test-junit5' testImplementation("org.mockito.kotlin:mockito-kotlin:5.3.1") testRuntimeOnly 'org.junit.platform:junit-platform-launcher' - runtimeOnly 'org.postgresql:postgresql' + implementation 'org.postgresql:postgresql' // ← 변경 testRuntimeOnly("com.h2database:h2") // swagger @@ -53,6 +53,9 @@ dependencies { // s3 implementation(platform("software.amazon.awssdk:bom:2.25.70")) implementation("software.amazon.awssdk:s3") + + // pgvector + implementation("com.pgvector:pgvector:0.1.6") } dependencyManagement { diff --git a/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt b/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt index 127770b..464f227 100644 --- a/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt +++ b/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt @@ -1,8 +1,8 @@ package simplerag.ragback.domain.index.entity +import com.pgvector.PGvector import jakarta.persistence.* import simplerag.ragback.global.entity.BaseEntity -import simplerag.ragback.global.util.FloatArrayToPgVectorStringConverter // 임베딩 크기를 서비스단에서 검증을 해줘야함 @Entity @@ -13,9 +13,8 @@ class ChunkEmbedding( @Lob val content: String, - @Convert(converter = FloatArrayToPgVectorStringConverter::class) - @Column(name = "embedding", nullable = false) - private var _embedding: FloatArray, + @Column(name = "embedding", columnDefinition = "vector") + var embedding: PGvector, @Column(name = "embedding_dim", nullable = false) val embeddingDim: Int, @@ -27,16 +26,4 @@ class ChunkEmbedding( @Id @GeneratedValue(strategy = GenerationType.IDENTITY) @Column(name = "chunk_embeddings_id") val id: Long? = null, -) : BaseEntity() { - - @get:Transient - val embedding: FloatArray get() = _embedding.copyOf() - - fun updateEmbedding(newVec: FloatArray) { - require(newVec.size == embeddingDim) { - "Embedding dimension mismatch: expected=$embeddingDim, got=${newVec.size}" - } - _embedding = newVec.copyOf() - } - -} \ No newline at end of file +) : BaseEntity() \ No newline at end of file diff --git a/src/main/kotlin/simplerag/ragback/domain/index/entity/enums/EmbeddingModel.kt b/src/main/kotlin/simplerag/ragback/domain/index/entity/enums/EmbeddingModel.kt index 6729034..a7bcda4 100644 --- a/src/main/kotlin/simplerag/ragback/domain/index/entity/enums/EmbeddingModel.kt +++ b/src/main/kotlin/simplerag/ragback/domain/index/entity/enums/EmbeddingModel.kt @@ -6,36 +6,21 @@ enum class EmbeddingModel( ) { // OpenAI TEXT_EMBEDDING_3_SMALL(1536, "text-embedding-3-small"), - TEXT_EMBEDDING_3_LARGE(3072, "text-embedding-3-large"), // SBERT / HuggingFace ALL_MINILM_L6_V2(384, "sentence-transformers/all-MiniLM-L6-v2"), ALL_MP_NET_BASE_V2(768, "sentence-transformers/all-mpnet-base-v2"), - MULTI_QA_MP_NET_BASE_DOT_V1(768, "sentence-transformers/multi-qa-mpnet-base-dot-v1"), DISTILUSE_BASE_MULTILINGUAL_CASED_V2(512, "sentence-transformers/distiluse-base-multilingual-cased-v2"), - PARAPHRASE_MULTILINGUAL_MINILM_L12_V2(384, "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"), - KO_SBERT_V1(768, "jhgan/ko-sbert-v1"), - KOR_SROBERTA(768, "jhgan/ko-sroberta-medium-nli"), - - // Korean specific - BM_KO_SMALL(512, "bespin-global/klue-sroberta-base-continue-learning-by-mnr"), - // Instructor / Mistral - INSTRUCTOR_BASE(768, "hkunlp/instructor-base"), - INSTRUCTOR_XL(1024, "hkunlp/instructor-xl"), - MISTRAL_EMBED(1024, "mistral-embed"), + // Korean + KO_SBERT_V1(768, "jhgan/ko-sbert-v1"), - // BGE / E5 etc - BGE_SMALL_EN(384, "BAAI/bge-small-en-v1.5"), + // BGE BGE_BASE_EN(768, "BAAI/bge-base-en-v1.5"), - BGE_LARGE_EN(1024, "BAAI/bge-large-en-v1.5"), BGE_M3(1024, "BAAI/bge-m3"), - E5_SMALL(384, "intfloat/e5-small-v2"), - E5_BASE(768, "intfloat/e5-base-v2"), - E5_LARGE(1024, "intfloat/e5-large-v2"), - // Old word vectors - FASTTEXT_KO(300, "fasttext-ko-300d"); + // E5 + E5_BASE(768, "intfloat/e5-base-v2"); companion object { fun findByModelId(modelId: String): EmbeddingModel? { diff --git a/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorStringConverter.kt b/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorStringConverter.kt deleted file mode 100644 index 04e1d23..0000000 --- a/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorStringConverter.kt +++ /dev/null @@ -1,34 +0,0 @@ -package simplerag.ragback.global.util - -import jakarta.persistence.AttributeConverter -import jakarta.persistence.Converter - -@Converter(autoApply = false) -class FloatArrayToPgVectorStringConverter : AttributeConverter { - override fun convertToDatabaseColumn(attribute: FloatArray?): String { - requireNotNull(attribute) { "Embedding (FloatArray) must not be null" } - require(attribute.isNotEmpty()) { "Embedding must not be empty; expected fixed dimension (e.g., 1536)" } - require(attribute.all { !it.isNaN() && !it.isInfinite() }) { - "Embedding must not contain NaN/Infinity" - } - return attribute.joinToString(prefix = "[", postfix = "]", separator = ",") { it.toString() } - } - - override fun convertToEntityAttribute(dbData: String?): FloatArray { - if (dbData.isNullOrBlank()) return floatArrayOf() - val body = dbData.trim().removePrefix("[").removeSuffix("]").trim() - if (body.isBlank()) return floatArrayOf() - return try { - body.split(',') - .map { it.trim().toFloat() } - .toFloatArray() - .also { arr -> - require(arr.all { it.isFinite() }) { - "Embedding must not contain NaN/Infinity (db → entity)" - } - } - } catch (e: NumberFormatException) { - throw IllegalArgumentException("Invalid vector literal for pgvector: '$dbData'", e) - } - } -} \ No newline at end of file