diff --git a/.travis.yml b/.travis.yml index daeabbf..52b57ab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ before_install: - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh - chmod 755 ./travis-tool.sh - ./travis-tool.sh bootstrap + - ./travis-tool.sh install_github qinwf/jiebaR - ./travis-tool.sh install_github jimhester/robustr - ./travis-tool.sh install_github jimhester/covr diff --git a/DESCRIPTION b/DESCRIPTION index 6e60c59..ccaa974 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,7 @@ Imports: digest(>= 0.6.8), magrittr (>= 1.5) LinkingTo: Rcpp, digest(>= 0.6.8), BH -Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown +Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown, jiebaR(>= 0.5.1) RcppModules: callback, split_callback SystemRequirements: C++11 BugReports: https://github.com/wush978/FeatureHashing/issues diff --git a/NAMESPACE b/NAMESPACE index c323192..5b870ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,7 +7,9 @@ export(hash.size) export(hashed.interaction.value) export(hashed.model.matrix) export(hashed.value) +export(init_jiebaR_callback) export(intToRaw) +export(ls_special) export(register_callback) export(test_callback) import(digest) @@ -16,6 +18,7 @@ importFrom(Matrix,Diagonal) importFrom(Matrix,colSums) importFrom(Rcpp,cpp_object_initializer) importFrom(Rcpp,loadModule) +importFrom(Rcpp,sourceCpp) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(methods,as) diff --git a/R/callback.R b/R/callback.R index 498bc07..f7c147d 100644 --- a/R/callback.R +++ b/R/callback.R @@ -3,13 +3,24 @@ #'@title Register Special Function for Formula Interface #'@param special string. The name which will be used in formula interface. #'@param callback_generator function which will create a callback. Please see the details. -#'@examples +#'@details The callback_generator is a function whose first argument is the +#'input data and the other arguments could be used to initialize the callback +#'function properly. The result should be a Rcpp module which derives the +#'`CallbackFunctor` class. Please see the vignette for details. #'register_callback("split", generate_split_callback) register_callback <- function(special, callback_generator) { - .callback[[special]] <- callback + .callback[[special]] <- callback_generator invisible(NULL) } +#'@title List the Registered Specials +#'@return character vector. The specials which could be used in the +#'formula interface. +#'@export +ls_special <- function() { + ls(.callback) +} + #'@title Generate callback of split #'@param input character vector. The input of split #'@param delim string. \code{delim} will be used as delimiter for splitting @@ -23,3 +34,32 @@ generate_split_callback <- function(input, delim = ",", type = c("existence", "c .callback <- new.env() .callback[["split"]] <- generate_split_callback + +#'@title Initialize and register jiebaR to the formula interface +#'@details This function will register the callback of word segmentation +#'function provided by jiebaR to the formula interface. +#'For example, `~ jiebaR(...)` will use the feature of word segmentation +#'provided by jiebaR to segment a given column of the data. +#'The first argument of the jiebaR is a character which will be segmented. +#'The left arguments are the same as \code{\link[jiebaR]{worker}}. These +#'arguments will be used to initialize a jiebaR worker which will segment +#'the input data. +#' +#'@examples +#'\dontrun{ +#'library(FeatureHashing) +#'init_jiebaR_callback() +#'m <- hashed.model.matrix(~ jiebaR(title, type = "mix", df)) +#'# the column `df$title` will be feed into `worker <- worker(type = "mix")` +#'# the result of `worker <= df$title` will be hashed into the sparse matrix +#'# the result is `m` +#'} +#'@export +#'@importFrom Rcpp sourceCpp +init_jiebaR_callback <- function() { + if (!requireNamespace("jiebaR", character.only = TRUE)) stop("Please install the package jiebaR first") + tryCatch({ + sourceCpp(system.file("callback/jiebaR_callback.cpp", package = "FeatureHashing")) + }, finally = { + }) +} diff --git a/appveyor.yml b/appveyor.yml index 086cdaf..9862d36 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,6 +13,7 @@ install: build_script: - travis-tool.sh install_deps + - travis-tool.sh install_github qinwf/jiebaR test_script: - travis-tool.sh run_tests diff --git a/inst/callback/jiebaR_callback.cpp b/inst/callback/jiebaR_callback.cpp new file mode 100644 index 0000000..7d77864 --- /dev/null +++ b/inst/callback/jiebaR_callback.cpp @@ -0,0 +1,112 @@ +// [[Rcpp::depends(jiebaR)]] +// [[Rcpp::depends(FeatureHashing)]] + +#include "jiebaRAPI.h" +#include +#include + +using namespace Rcpp; + +struct jiebaRCallbackFunctor : public CallbackFunctor { + + enum Type { + MIX, + MP, + HMM, + QUERY, + KEY + }; + + Type type; + Environment cutter; + SEXP cutter_pointer; + + typedef SEXP (*Cut)(SEXP, SEXP); + + Cut cut; + + void set_type(std::string _type) { + if (_type.compare("mix") == 0) { + type = MIX; + } else if (_type.compare("mp") == 0) { + type = MP; + } else if (_type.compare("hmm") == 0) { + type = HMM; + } else if (_type.compare("query") == 0) { + type = QUERY; + } else if (_type.compare("key") == 0) { + type = KEY; + } else { + throw std::invalid_argument("Unknown type"); + } + } + + std::string get_type() { + switch (type) { + case MIX: + return "mix"; + case MP: + return "mp"; + case HMM: + return "hmm"; + case QUERY: + return "query"; + case KEY: + return "key"; + } + } + + void set_cut() { + std::string fname("jiebaR_"); + fname.append(get_type()); + fname.append("_cut"); + cut = reinterpret_cast(::R_GetCCallable("jiebaR", fname.c_str())); + } + + explicit jiebaRCallbackFunctor( + SEXP _src, + std::string _type, + SEXP _cutter + ) + : type(MIX), + cutter(_cutter), + cutter_pointer(NULL), + cut(NULL), + CallbackFunctor(_src) + { + set_type(_type); + set_cut(); + cutter_pointer = wrap(cutter["worker"]); + } + + virtual ~jiebaRCallbackFunctor() { } + + virtual const std::vector operator()(const char* input) const { + return as >((*cut)(wrap(input), cutter_pointer)); + } + +}; + +RCPP_MODULE(jiebaR_callback) { + + class_("callback") + ; + + class_("jiebaR_callback") + .derives("callback") + .constructor() + .property("type", &jiebaRCallbackFunctor::get_type, &jiebaRCallbackFunctor::set_type) + .field("cutter", &jiebaRCallbackFunctor::cutter) + ; + +} + +/***R +generate_jiebaR_callback <- function(input, type = "mix", ...) { + worker <- jiebaR::worker(type = type, ...) + callback <- new(jiebaR_callback, input, type, worker) + callback +} + +FeatureHashing::register_callback("jiebaR", generate_jiebaR_callback) +*/ diff --git a/inst/include/callback.h b/inst/include/callback.h index 5c24b2b..4e217a2 100644 --- a/inst/include/callback.h +++ b/inst/include/callback.h @@ -21,7 +21,6 @@ #include #include -#include "vector_converter.h" #include class CallbackFunctor { diff --git a/inst/include/hash_function.h b/inst/include/hash_function.h index 84c5523..43c17d3 100644 --- a/inst/include/hash_function.h +++ b/inst/include/hash_function.h @@ -22,7 +22,6 @@ #include #include #include -#include class HashFunction { @@ -32,40 +31,4 @@ class HashFunction { }; -class NullHashFunction : public HashFunction { - - public: - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); - -}; - -class MurmurHash3HashFunction : public HashFunction { - - uint32_t seed; - -public : - - MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { } - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); - -}; - -class MurmurHash3LogHashFunction : public HashFunction { - - uint32_t seed; - Rcpp::Environment e; - std::map inverse_mapping; - -public: - - MurmurHash3LogHashFunction(SEXP _e, uint32_t _seed) - : HashFunction(), seed(_seed), e(_e) - { } - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); - -}; - #endif \ No newline at end of file diff --git a/man/init_jiebaR_callback.Rd b/man/init_jiebaR_callback.Rd new file mode 100644 index 0000000..9f0d0e4 --- /dev/null +++ b/man/init_jiebaR_callback.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{init_jiebaR_callback} +\alias{init_jiebaR_callback} +\title{Initialize and register jiebaR to the formula interface} +\usage{ +init_jiebaR_callback() +} +\description{ +Initialize and register jiebaR to the formula interface +} +\details{ +This function will register the callback of word segmentation +function provided by jiebaR to the formula interface. +For example, `~ jiebaR(...)` will use the feature of word segmentation +provided by jiebaR to segment a given column of the data. +The first argument of the jiebaR is a character which will be segmented. +The left arguments are the same as \code{\link[jiebaR]{worker}}. These +arguments will be used to initialize a jiebaR worker which will segment +the input data. +} +\examples{ +\dontrun{ +library(FeatureHashing) +init_jiebaR_callback() +m <- hashed.model.matrix(~ jiebaR(title, type = "mix", df)) +# the column `df$title` will be feed into `worker <- worker(type = "mix")` +# the result of `worker <= df$title` will be hashed into the sparse matrix +# the result is `m` +} +} + diff --git a/man/ls_special.Rd b/man/ls_special.Rd new file mode 100644 index 0000000..c9a3d24 --- /dev/null +++ b/man/ls_special.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{ls_special} +\alias{ls_special} +\title{List the Registered Specials} +\usage{ +ls_special() +} +\value{ +character vector. The specials which could be used in the +formula interface. +} +\description{ +List the Registered Specials +} + diff --git a/man/register_callback.Rd b/man/register_callback.Rd index 179c425..8660d12 100644 --- a/man/register_callback.Rd +++ b/man/register_callback.Rd @@ -14,7 +14,11 @@ register_callback(special, callback_generator) \description{ Register Special Function for Formula Interface } -\examples{ +\details{ +The callback_generator is a function whose first argument is the +input data and the other arguments could be used to initialize the callback +function properly. The result should be a Rcpp module which derives the +`CallbackFunctor` class. Please see the vignette for details. register_callback("split", generate_split_callback) } diff --git a/src/hash_function_implementation.h b/src/hash_function_implementation.h new file mode 100644 index 0000000..8c71698 --- /dev/null +++ b/src/hash_function_implementation.h @@ -0,0 +1,43 @@ +#ifndef __HASH_FUNCTION_IMPLEMENTATION_HPP__ +#define __HASH_FUNCTION_IMPLEMENTATION_HPP__ + +#include +#include + +class NullHashFunction : public HashFunction { + + public: + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +class MurmurHash3HashFunction : public HashFunction { + + uint32_t seed; + +public : + + MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { } + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +class MurmurHash3LogHashFunction : public HashFunction { + + uint32_t seed; + Rcpp::Environment e; + std::map inverse_mapping; + +public: + + MurmurHash3LogHashFunction(SEXP _e, uint32_t _seed) + : HashFunction(), seed(_seed), e(_e) + { } + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +# endif // __HASH_FUNCTION_IMPLEMENTATION_HPP__ \ No newline at end of file diff --git a/src/hashed_model_matrix.h b/src/hashed_model_matrix.h index b3ab8c2..905fe1c 100644 --- a/src/hashed_model_matrix.h +++ b/src/hashed_model_matrix.h @@ -25,7 +25,7 @@ #include #include #include "callback.h" -#include "hash_function.h" +#include "hash_function_implementation.h" #include "vector_converter.h" #include "converters.h" diff --git a/src/split_callback.cpp b/src/split_callback.cpp index b879394..30e5e15 100644 --- a/src/split_callback.cpp +++ b/src/split_callback.cpp @@ -36,6 +36,7 @@ struct SplitCallbackFunctor : public CallbackFunctor { case SplitType::Existence: return "existence"; } + throw std::logic_error("Invalid SplitType"); } virtual const std::vector operator()(const char* input) const { @@ -53,6 +54,7 @@ struct SplitCallbackFunctor : public CallbackFunctor { return tmp; } } + throw std::logic_error("Invalid SplitType"); } }; @@ -60,10 +62,10 @@ struct SplitCallbackFunctor : public CallbackFunctor { using namespace Rcpp; RCPP_MODULE(split_callback) { - + class_("callback") ; - + class_("split_callback") .derives("callback") .constructor() diff --git a/inst/include/vector_converter.h b/src/vector_converter.h similarity index 98% rename from inst/include/vector_converter.h rename to src/vector_converter.h index 33bd28c..d78e4dc 100644 --- a/inst/include/vector_converter.h +++ b/src/vector_converter.h @@ -21,7 +21,9 @@ #include "callback.h" #include "hash_function.h" -#include +#ifdef NOISY_DEBUG +#include +#endif struct VectorConverterParam; class VectorConverter; diff --git a/tests/test-jiebaR.R b/tests/test-jiebaR.R new file mode 100644 index 0000000..ab1f419 --- /dev/null +++ b/tests/test-jiebaR.R @@ -0,0 +1,49 @@ +if (require(RUnit) & Sys.getenv("TEST_JIEBAR") == "TRUE") { + library(FeatureHashing) + df <- data.frame(title = c( + "貶值取代降息? 台幣貶破33元", + "優生 培寶4款毒奶瓶下架", + " 秋節上國道 閃11塞車點", + "習近平訪美前…//中國戰機公海危險攔截美機", + "352億公開收購 日月光成矽品最大股東", + "驚 AT-3又出事 南投深山失聯 2飛官生死未卜", + "誰說該廢死的?怕死鄭捷首度道歉", + "歐習會前夕// 美國安顧問:反對片面改變台海現狀" + )) + init_jiebaR_callback() + m <- hashed.model.matrix(~ jiebaR(title), df, create.mapping = TRUE) + title_tokens <- names(hash.mapping(m)) + checkEquals(title_tokens, c("title4", "title股東", "title國道", "title中國", "title現狀", +"title…", "title閃", "title習近平", "title日", "title11", +"title驚", "title公開", "title億", "title又", "title:", +"title該", "title塞車", "title訪美", "title?", "title會", +"title公海", "title深山", "title片面", "title奶瓶", "title說", +"title成矽品", "title危險", "title台海", "title最大", +"title美國", "title貶值", "title上", "title下架", "title秋節", +"titleAT", "title352", "title生死未卜", "title收購", "title月光", +"title怕死", "title貶破", "title飛官", "title出事", "title取代", +"title道歉", "title歐習", "title33", "title ", "title款毒", +"title優生", "title顧問", "title前", "title前夕", "title廢死的", +"title反對", "title改變", "title點", "title培寶", "title台幣", +"title降息", "title美機", "title安", "title-", "title南投", +"title首度", "title戰機", "title鄭捷", "title/", "title元", +"title誰", "title攔截", "title2", "title失聯", "title3")) + m <- hashed.model.matrix(~ jiebaR(title, type = "hmm"), df, create.mapping = TRUE) + title_tokens <- names(hash.mapping(m)) + checkEquals(title_tokens, c("title4", "title改", "title鄭", "title股東", "title死", +"title…", "title海", "title上國道", "title閃", "title日", +"title11", "title現", "title首", "title驚", "title片", "title光成", +"title又", "title:", "title該", "title機公海", "title反", +"title習近", "title矽品", "title怕", "title生死", "title?", +"title捷", "title會", "title對", "title深山", "title奶瓶", +"title說", "title月", "title危險", "title最大", "title貶值", +"title下架", "title台", "title秋節", "titleAT", "title美前", +"title面", "title352", "title收購", "title狀", "title貶破", +"title飛官", "title歉", "title出事", "title取代", "title平訪", +"title歐習", "title億公開", "title33", "title未卜", "title中國戰", +"title ", "title款毒", "title優生", "title前夕", "title度", +"title美國安顧問", "title廢死的", "title變", "title塞車點", +"title培寶", "title台幣", "title降息", "title美機", "title-", +"title南投", "title道", "title/", "title元", "title誰", +"title攔截", "title2", "title失聯", "title3")) +} \ No newline at end of file diff --git a/vignettes/Callback.Rmd b/vignettes/Callback.Rmd new file mode 100644 index 0000000..97b47ce --- /dev/null +++ b/vignettes/Callback.Rmd @@ -0,0 +1,27 @@ +--- +title: "Register Callback for FeatureHashing" +author: "Wush Wu" +output: + rmarkdown::html_vignette: + css: vignette.css + number_sections: yes + toc: yes +date: "September 24, 2015" +vignette: > + %\VignetteIndexEntry{FeatureHashing} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +This is an introduction of registering callback for the formula interface of FeatureHashing. + +## Demo + +## Getting Started + +### Implement Rcpp Module + +### Implement Generator + +### Register the Generator to the Formula Interface +