From f0fd20e9cd34f15be4646dd90b40647ba494d8e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=A4=80=ED=99=98?= Date: Tue, 19 Aug 2025 21:08:22 +0900 Subject: [PATCH 1/2] =?UTF-8?q?:rocket:=20chore:=20entity=20=EC=88=98?= =?UTF-8?q?=EC=A0=95=20=EB=B0=8F=20=EB=AA=A8=EB=8D=B8=20=EC=88=98=20?= =?UTF-8?q?=EC=A4=84=EC=9D=B4=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.gradle | 2 +- .../domain/index/entity/ChunkEmbedding.kt | 9 +++-- .../index/entity/enums/EmbeddingModel.kt | 25 +++----------- .../util/FloatArrayToPgVectorConverter.kt | 29 ++++++++++++++++ .../FloatArrayToPgVectorStringConverter.kt | 34 ------------------- 5 files changed, 41 insertions(+), 58 deletions(-) create mode 100644 src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorConverter.kt delete mode 100644 src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorStringConverter.kt diff --git a/build.gradle b/build.gradle index fe060a4..944949c 100644 --- a/build.gradle +++ b/build.gradle @@ -44,7 +44,7 @@ dependencies { testImplementation 'org.jetbrains.kotlin:kotlin-test-junit5' testImplementation("org.mockito.kotlin:mockito-kotlin:5.3.1") testRuntimeOnly 'org.junit.platform:junit-platform-launcher' - runtimeOnly 'org.postgresql:postgresql' + implementation 'org.postgresql:postgresql' // ← 변경 testRuntimeOnly("com.h2database:h2") // swagger diff --git a/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt b/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt index 127770b..5f387df 100644 --- a/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt +++ b/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt @@ -1,8 +1,10 @@ package simplerag.ragback.domain.index.entity import jakarta.persistence.* +import org.hibernate.annotations.JdbcTypeCode +import org.hibernate.type.SqlTypes import simplerag.ragback.global.entity.BaseEntity -import simplerag.ragback.global.util.FloatArrayToPgVectorStringConverter +import simplerag.ragback.global.util.FloatArrayToPgVectorConverter // 임베딩 크기를 서비스단에서 검증을 해줘야함 @Entity @@ -13,8 +15,9 @@ class ChunkEmbedding( @Lob val content: String, - @Convert(converter = FloatArrayToPgVectorStringConverter::class) - @Column(name = "embedding", nullable = false) + @Convert(converter = FloatArrayToPgVectorConverter::class) + @JdbcTypeCode(SqlTypes.OTHER) + @Column(name = "embedding", nullable = false, columnDefinition = "vector(3072)") private var _embedding: FloatArray, @Column(name = "embedding_dim", nullable = false) diff --git a/src/main/kotlin/simplerag/ragback/domain/index/entity/enums/EmbeddingModel.kt b/src/main/kotlin/simplerag/ragback/domain/index/entity/enums/EmbeddingModel.kt index 6729034..a7bcda4 100644 --- a/src/main/kotlin/simplerag/ragback/domain/index/entity/enums/EmbeddingModel.kt +++ b/src/main/kotlin/simplerag/ragback/domain/index/entity/enums/EmbeddingModel.kt @@ -6,36 +6,21 @@ enum class EmbeddingModel( ) { // OpenAI TEXT_EMBEDDING_3_SMALL(1536, "text-embedding-3-small"), - TEXT_EMBEDDING_3_LARGE(3072, "text-embedding-3-large"), // SBERT / HuggingFace ALL_MINILM_L6_V2(384, "sentence-transformers/all-MiniLM-L6-v2"), ALL_MP_NET_BASE_V2(768, "sentence-transformers/all-mpnet-base-v2"), - MULTI_QA_MP_NET_BASE_DOT_V1(768, "sentence-transformers/multi-qa-mpnet-base-dot-v1"), DISTILUSE_BASE_MULTILINGUAL_CASED_V2(512, "sentence-transformers/distiluse-base-multilingual-cased-v2"), - PARAPHRASE_MULTILINGUAL_MINILM_L12_V2(384, "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"), - KO_SBERT_V1(768, "jhgan/ko-sbert-v1"), - KOR_SROBERTA(768, "jhgan/ko-sroberta-medium-nli"), - - // Korean specific - BM_KO_SMALL(512, "bespin-global/klue-sroberta-base-continue-learning-by-mnr"), - // Instructor / Mistral - INSTRUCTOR_BASE(768, "hkunlp/instructor-base"), - INSTRUCTOR_XL(1024, "hkunlp/instructor-xl"), - MISTRAL_EMBED(1024, "mistral-embed"), + // Korean + KO_SBERT_V1(768, "jhgan/ko-sbert-v1"), - // BGE / E5 etc - BGE_SMALL_EN(384, "BAAI/bge-small-en-v1.5"), + // BGE BGE_BASE_EN(768, "BAAI/bge-base-en-v1.5"), - BGE_LARGE_EN(1024, "BAAI/bge-large-en-v1.5"), BGE_M3(1024, "BAAI/bge-m3"), - E5_SMALL(384, "intfloat/e5-small-v2"), - E5_BASE(768, "intfloat/e5-base-v2"), - E5_LARGE(1024, "intfloat/e5-large-v2"), - // Old word vectors - FASTTEXT_KO(300, "fasttext-ko-300d"); + // E5 + E5_BASE(768, "intfloat/e5-base-v2"); companion object { fun findByModelId(modelId: String): EmbeddingModel? { diff --git a/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorConverter.kt b/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorConverter.kt new file mode 100644 index 0000000..806f63b --- /dev/null +++ b/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorConverter.kt @@ -0,0 +1,29 @@ +package simplerag.ragback.global.util + +import jakarta.persistence.AttributeConverter +import jakarta.persistence.Converter +import org.postgresql.util.PGobject + +@Converter(autoApply = false) +class FloatArrayToPgVectorConverter : AttributeConverter { + override fun convertToDatabaseColumn(attribute: FloatArray?): PGobject { + requireNotNull(attribute) { "embedding must not be null" } + require(attribute.isNotEmpty()) { "embedding must not be empty" } + require(attribute.all { it.isFinite() }) { "NaN/Infinity not allowed" } + + val sb = StringBuilder(attribute.size * 8 + 2).append('[') + attribute.forEachIndexed { i, v -> if (i > 0) sb.append(','); sb.append(v) } + sb.append(']') + + return PGobject().apply { + type = "vector" + value = sb.toString() + } + } + + override fun convertToEntityAttribute(dbData: PGobject?): FloatArray { + requireNotNull(dbData) { "db vector is null" } + val body = dbData.value?.trim()?.removePrefix("[")?.removeSuffix("]") ?: error("empty vector") + return body.split(',').map { it.trim().toFloat() }.toFloatArray() + } +} diff --git a/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorStringConverter.kt b/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorStringConverter.kt deleted file mode 100644 index 04e1d23..0000000 --- a/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorStringConverter.kt +++ /dev/null @@ -1,34 +0,0 @@ -package simplerag.ragback.global.util - -import jakarta.persistence.AttributeConverter -import jakarta.persistence.Converter - -@Converter(autoApply = false) -class FloatArrayToPgVectorStringConverter : AttributeConverter { - override fun convertToDatabaseColumn(attribute: FloatArray?): String { - requireNotNull(attribute) { "Embedding (FloatArray) must not be null" } - require(attribute.isNotEmpty()) { "Embedding must not be empty; expected fixed dimension (e.g., 1536)" } - require(attribute.all { !it.isNaN() && !it.isInfinite() }) { - "Embedding must not contain NaN/Infinity" - } - return attribute.joinToString(prefix = "[", postfix = "]", separator = ",") { it.toString() } - } - - override fun convertToEntityAttribute(dbData: String?): FloatArray { - if (dbData.isNullOrBlank()) return floatArrayOf() - val body = dbData.trim().removePrefix("[").removeSuffix("]").trim() - if (body.isBlank()) return floatArrayOf() - return try { - body.split(',') - .map { it.trim().toFloat() } - .toFloatArray() - .also { arr -> - require(arr.all { it.isFinite() }) { - "Embedding must not contain NaN/Infinity (db → entity)" - } - } - } catch (e: NumberFormatException) { - throw IllegalArgumentException("Invalid vector literal for pgvector: '$dbData'", e) - } - } -} \ No newline at end of file From a307fcd267f6f163b288bc16b13c4e4255f9d1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=A4=80=ED=99=98?= Date: Thu, 21 Aug 2025 16:52:48 +0900 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=9A=80=20Chore:=20pgvector=20library?= =?UTF-8?q?=20apply?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.gradle | 3 ++ .../domain/index/entity/ChunkEmbedding.kt | 24 +++------------ .../util/FloatArrayToPgVectorConverter.kt | 29 ------------------- 3 files changed, 7 insertions(+), 49 deletions(-) delete mode 100644 src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorConverter.kt diff --git a/build.gradle b/build.gradle index 944949c..384becd 100644 --- a/build.gradle +++ b/build.gradle @@ -53,6 +53,9 @@ dependencies { // s3 implementation(platform("software.amazon.awssdk:bom:2.25.70")) implementation("software.amazon.awssdk:s3") + + // pgvector + implementation("com.pgvector:pgvector:0.1.6") } dependencyManagement { diff --git a/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt b/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt index 5f387df..464f227 100644 --- a/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt +++ b/src/main/kotlin/simplerag/ragback/domain/index/entity/ChunkEmbedding.kt @@ -1,10 +1,8 @@ package simplerag.ragback.domain.index.entity +import com.pgvector.PGvector import jakarta.persistence.* -import org.hibernate.annotations.JdbcTypeCode -import org.hibernate.type.SqlTypes import simplerag.ragback.global.entity.BaseEntity -import simplerag.ragback.global.util.FloatArrayToPgVectorConverter // 임베딩 크기를 서비스단에서 검증을 해줘야함 @Entity @@ -15,10 +13,8 @@ class ChunkEmbedding( @Lob val content: String, - @Convert(converter = FloatArrayToPgVectorConverter::class) - @JdbcTypeCode(SqlTypes.OTHER) - @Column(name = "embedding", nullable = false, columnDefinition = "vector(3072)") - private var _embedding: FloatArray, + @Column(name = "embedding", columnDefinition = "vector") + var embedding: PGvector, @Column(name = "embedding_dim", nullable = false) val embeddingDim: Int, @@ -30,16 +26,4 @@ class ChunkEmbedding( @Id @GeneratedValue(strategy = GenerationType.IDENTITY) @Column(name = "chunk_embeddings_id") val id: Long? = null, -) : BaseEntity() { - - @get:Transient - val embedding: FloatArray get() = _embedding.copyOf() - - fun updateEmbedding(newVec: FloatArray) { - require(newVec.size == embeddingDim) { - "Embedding dimension mismatch: expected=$embeddingDim, got=${newVec.size}" - } - _embedding = newVec.copyOf() - } - -} \ No newline at end of file +) : BaseEntity() \ No newline at end of file diff --git a/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorConverter.kt b/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorConverter.kt deleted file mode 100644 index 806f63b..0000000 --- a/src/main/kotlin/simplerag/ragback/global/util/FloatArrayToPgVectorConverter.kt +++ /dev/null @@ -1,29 +0,0 @@ -package simplerag.ragback.global.util - -import jakarta.persistence.AttributeConverter -import jakarta.persistence.Converter -import org.postgresql.util.PGobject - -@Converter(autoApply = false) -class FloatArrayToPgVectorConverter : AttributeConverter { - override fun convertToDatabaseColumn(attribute: FloatArray?): PGobject { - requireNotNull(attribute) { "embedding must not be null" } - require(attribute.isNotEmpty()) { "embedding must not be empty" } - require(attribute.all { it.isFinite() }) { "NaN/Infinity not allowed" } - - val sb = StringBuilder(attribute.size * 8 + 2).append('[') - attribute.forEachIndexed { i, v -> if (i > 0) sb.append(','); sb.append(v) } - sb.append(']') - - return PGobject().apply { - type = "vector" - value = sb.toString() - } - } - - override fun convertToEntityAttribute(dbData: PGobject?): FloatArray { - requireNotNull(dbData) { "db vector is null" } - val body = dbData.value?.trim()?.removePrefix("[")?.removeSuffix("]") ?: error("empty vector") - return body.split(',').map { it.trim().toFloat() }.toFloatArray() - } -}