diff --git a/README.md b/README.md new file mode 100644 index 0000000..2a49a2e --- /dev/null +++ b/README.md @@ -0,0 +1,32 @@ +This is a fork of Edward Raff's [LZJD](https://github.com/EdwardRaff/LZJD). The added feature is a plugin for [Postgres](https://www.postgresql.org/) to allow similarity matching in SQL queries. + +### Requirements: +* GCC +* CMake +* Boost libraries +* Postgres requirements: + * Server development package, e.g. `postgresql-server-dev-12`. + +### Compile instructions: +* Check out the code +* `cd src` +* `mkdir build && cd build` +* `cmake ..` +* `make` + +This creates: +* `liblzjd.shared.so`: Shared library for linking against. +* `liblzjd.static.a`: Static library for building against. +* `lzjd`: Command line application. +* If the Postgres server development package was installed, you should also have: + * `lzjd_psql.so`: PostgreSQL plugin + + +### Installation instructions: +* Copy `lzjd` to `/usr/local/bin/` or similar, if desired. +* Copy `liblzjd.shared.so` to `/usr/local/lib/liblzjd.so`, or similar, if desired. Note the file name change. +* Copy `liblzjd.static.a` to `/usr/local/lib/liblzjd.a`, or similar, if desired. Note the file name change here too. +* For the Postgres plugin: + * Run the command `pg_config --pkglibdir`; its output is the installation directory. + * Copy `lzjd_psql.so` to the directory shown by the above command. This is typically `/usr/lib/postgresql/XX/lib/`, where XX is the version number. + * As the administrative user for your Postgres environment, run this SQL command to load the plugin: `CREATE OR REPLACE FUNCTION lzjd_compare(TEXT, TEXT) RETURNS INTEGER AS 'lzjd_psql.so', 'pg_lzjd_compare' LANGUAGE 'c';`. No restart is required; the new function `lzjd_compare()` is available immediately. 
\ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7076fbb..fd4620a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -project(lzjd CXX) +project(lzjd) cmake_minimum_required(VERSION 2.8) FIND_PACKAGE( Boost 1.50 COMPONENTS program_options system filesystem REQUIRED ) INCLUDE_DIRECTORIES( ${Boost_INCLUDE_DIR} ) @@ -13,5 +13,21 @@ target_link_libraries(lzjd.shared) add_library(lzjd.static STATIC ../src/LZJD.cpp ../src/MurmurHash3.cpp) +# Optional Postgres plugin: built only when pg_config runs successfully. Its output ends with a newline, so strip it before using it as an include path. +execute_process(COMMAND pg_config --includedir-server RESULT_VARIABLE PG_INCLUDESRV_RESULT OUTPUT_VARIABLE PG_INCLUDESRV_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) +execute_process(COMMAND pg_config --includedir RESULT_VARIABLE PG_INCLUDE_RESULT OUTPUT_VARIABLE PG_INCLUDE_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(PG_INCLUDESRV_RESULT EQUAL 0) + MESSAGE( STATUS "PG Server Include: ${PG_INCLUDESRV_DIR}" ) + MESSAGE( STATUS "PG Include: ${PG_INCLUDE_DIR}" ) + + add_library(lzjd_psql SHARED ../src/pglzjd.c ../src/pg_lzjd_helper.cpp ../src/LZJD.cpp ../src/MurmurHash3.cpp) + target_link_libraries(lzjd_psql ${Boost_LIBRARIES}) + target_include_directories(lzjd_psql PRIVATE ${PG_INCLUDESRV_DIR}) + #target_include_directories(lzjd_psql.so PRIVATE ${PG_INCLUDE_DIR}) + set_target_properties(lzjd_psql PROPERTIES PREFIX "") + set_target_properties(lzjd_psql PROPERTIES OUTPUT_NAME "lzjd_psql") +endif() + set(CMAKE_BUILD_TYPE Release) diff --git a/src/LZJD.cpp b/src/LZJD.cpp index b3e6940..27bda43 100644 --- a/src/LZJD.cpp +++ b/src/LZJD.cpp @@ -4,7 +4,7 @@ #include #include #include // std::call_once, std::once_flag - +#include // round() #include @@ -17,6 +17,10 @@ using namespace std; +#ifdef __cplusplus +extern "C" { +#endif + LZJD::LZJD() { } @@ -131,3 +135,7 @@ int32_t similarity(const std::vector& x_minset, const std::vector digest(uint64_t k, std::vector& bytes); int32_t similarity(const std::vector& x_minset, const std::vector& y_minset); +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/MurmurHash3.cpp b/src/MurmurHash3.cpp index 
7852b7f..4dd7dfc 100644 --- a/src/MurmurHash3.cpp +++ b/src/MurmurHash3.cpp @@ -5,6 +5,10 @@ #include "MurmurHash3.h" using namespace std; +#ifdef __cplusplus +extern "C" { +#endif + void MurmurHash3::reset() { _len = 0; _h1 = _seed; @@ -65,3 +69,6 @@ MurmurHash3::MurmurHash3(int32_t _seed) { this->reset(); } +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/src/MurmurHash3.h b/src/MurmurHash3.h index f40395c..d5109d1 100644 --- a/src/MurmurHash3.h +++ b/src/MurmurHash3.h @@ -5,7 +5,9 @@ #include #include - +#ifdef __cplusplus +extern "C" { +#endif #define FORCE_INLINE inline __attribute__((always_inline)) @@ -68,4 +70,8 @@ class MurmurHash3 }; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/pg_lzjd_helper.cpp b/src/pg_lzjd_helper.cpp new file mode 100644 index 0000000..c98186d --- /dev/null +++ b/src/pg_lzjd_helper.cpp @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "LZJD.h" + +using namespace std; +namespace bi = boost::archive::iterators; + +extern "C" { + +// Takes the text after the second ':' in `hash`, drops trailing '=' padding, base64-decodes it to bytes and packs each group of 4 bytes (big-endian) into one 32-bit value. +vector cstring_to_lzjd(char* hash) { + string line = hash; + auto first_colon = line.find(":", 0); + auto second_colon = line.find(":", first_colon + 1); + string path = line.substr(first_colon + 1, second_colon - first_colon - 1); + string base64ints = line.substr(second_colon + 1, line.size() - second_colon); + auto size = base64ints.size(); + while (size > 0 && base64ints[size - 1] == '=') + size--; + base64ints = base64ints.substr(0, size); + + + //TODO this is not 100% kosher, but C++ is a pain. 
+ + typedef + bi::transform_width< + bi::binary_from_base64>, + 8, 6 + > + base64_dec; + + vector int_parts; + + copy( + base64_dec(base64ints.cbegin()), + base64_dec(base64ints.cend()), + std::back_inserter(int_parts) + ); + + vector decoded_ints(int_parts.size() / 4); + for (size_t i = 0; i + 3 < int_parts.size(); i += 4) { + // Big-endian reassembly; a trailing partial group (fewer than 4 bytes) is skipped so no read or write goes out of bounds. + // Each element is masked to 0..255 in unsigned arithmetic so a negative value cannot sign-extend into the higher bytes. + const uint32_t word = ((static_cast<uint32_t>(int_parts[i + 0]) & 0xFFu) << 24) | ((static_cast<uint32_t>(int_parts[i + 1]) & 0xFFu) << 16) | ((static_cast<uint32_t>(int_parts[i + 2]) & 0xFFu) << 8) | (static_cast<uint32_t>(int_parts[i + 3]) & 0xFFu); + decoded_ints[i / 4] = static_cast<int32_t>(word); + } + return decoded_ints; +} + +int32_t lzjd_similarity(char *hash1, char *hash2) { + try { + vector l1 = cstring_to_lzjd(hash1); + vector l2 = cstring_to_lzjd(hash2); + return similarity(l1, l2); + } catch(...) { + return 0; + } + return 0; +} + +} // End Extern C diff --git a/src/pg_lzjd_helper.h b/src/pg_lzjd_helper.h new file mode 100644 index 0000000..1a1d006 --- /dev/null +++ b/src/pg_lzjd_helper.h @@ -0,0 +1,14 @@ +#ifndef PGFIX_H +#define PGFIX_H + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t lzjd_similarity(char *hash1, char *hash2); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/pglzjd.c b/src/pglzjd.c new file mode 100644 index 0000000..2ecf2dc --- /dev/null +++ b/src/pglzjd.c @@ -0,0 +1,32 @@ +// PostgreSQL includes +#include +#include // in postgresql server includes, for text_to_cstring() + +// Project includes +#include "pg_lzjd_helper.h" + +PG_MODULE_MAGIC; + +// +// CREATE OR REPLACE FUNCTION lzjd_compare(TEXT, TEXT) RETURNS INTEGER AS 'lzjd_psql.so', 'pg_lzjd_compare' LANGUAGE 'c'; +// + +PG_FUNCTION_INFO_V1(pg_lzjd_compare); +Datum pg_lzjd_compare(PG_FUNCTION_ARGS); + +Datum pg_lzjd_compare(PG_FUNCTION_ARGS) { + if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) { + PG_RETURN_INT32(0); + } + text *arg1 = PG_GETARG_TEXT_P(0); + text *arg2 = PG_GETARG_TEXT_P(1); + char* hash1 = text_to_cstring(arg1); + char* hash2 = text_to_cstring(arg2); + + int32 score = lzjd_similarity(hash1, hash2); + + 
pfree(hash1); + pfree(hash2); + + PG_RETURN_INT32(score); +} diff --git a/src/pglzjd.sql b/src/pglzjd.sql new file mode 100644 index 0000000..4a78644 --- /dev/null +++ b/src/pglzjd.sql @@ -0,0 +1 @@ +CREATE OR REPLACE FUNCTION lzjd_compare(TEXT, TEXT) RETURNS INTEGER AS 'lzjd_psql.so', 'pg_lzjd_compare' LANGUAGE 'c';