diff --git a/hugegraph-core/pom.xml b/hugegraph-core/pom.xml
index f8b3f8a45e..764bb849b0 100644
--- a/hugegraph-core/pom.xml
+++ b/hugegraph-core/pom.xml
@@ -13,6 +13,21 @@
${basedir}/..
+ 1.3.11
+ 0.7.4
+ 1.8.0
+ 1.3.1
+ 1.10.0
+ 2.6.2
+ portable-1.8.3
+ 5.1.6
+ 8.11.2
+ 1.0.2
+ 2012_u6
+ 1.21
+ 11.1.0
+ 8.5.9
+ 0.11.5
@@ -53,7 +68,7 @@
com.alipay.sofa
jraft-core
- 1.3.9
+ ${jraft.version}
org.slf4j
@@ -85,7 +100,7 @@
org.caffinitas.ohc
ohc-core
- 0.7.0
+ ${ohc.version}
com.google.guava
@@ -97,98 +112,106 @@
org.apdplat
word
- 1.3
+ ${apdplat-word.version}
ch.qos.logback
logback-classic
+
+ org.apache.lucene
+ lucene-core
+
+
+ org.apache.lucene
+ lucene-analyzers-common
+
org.ansj
ansj_seg
- 5.1.6
+ ${ansj-seg.version}
com.hankcs
hanlp
- portable-1.5.0
+ ${hanlp.version}
org.apache.lucene
lucene-analyzers-smartcn
- 7.4.0
+ ${lucene.version}
org.apache.lucene
lucene-core
- 7.4.0
+ ${lucene.version}
com.huaban
jieba-analysis
- 1.0.2
+ ${jieba-analysis.version}
org.lionsoul
jcseg-core
- 2.2.0
+ ${jcseg.version}
com.chenlb.mmseg4j
mmseg4j-core
- 1.10.0
+ ${mmseg4j-core.version}
com.janeluo
ikanalyzer
- 2012_u6
+ ${ikanalyzer.version}
org.lz4
lz4-java
- 1.7.1
+ ${lz4.version}
org.apache.commons
commons-compress
- 1.21
+ ${commons-compress.version}
org.eclipse.collections
eclipse-collections-api
- 10.4.0
+ ${eclipse-collections.version}
org.eclipse.collections
eclipse-collections
- 10.4.0
+ ${eclipse-collections.version}
it.unimi.dsi
fastutil
- 8.1.0
+ ${fastutil.version}
io.jsonwebtoken
jjwt-api
- 0.11.2
+ ${jjwt.version}
io.jsonwebtoken
jjwt-impl
- 0.11.2
+ ${jjwt.version}
runtime
io.jsonwebtoken
jjwt-jackson
- 0.11.2
+ ${jjwt.version}
runtime
diff --git a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/JcsegAnalyzer.java b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/JcsegAnalyzer.java
index 211f384295..b4ccb7b701 100644
--- a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/JcsegAnalyzer.java
+++ b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/JcsegAnalyzer.java
@@ -23,12 +23,11 @@
import java.util.List;
import java.util.Set;
-import org.lionsoul.jcseg.tokenizer.core.ADictionary;
-import org.lionsoul.jcseg.tokenizer.core.DictionaryFactory;
-import org.lionsoul.jcseg.tokenizer.core.ISegment;
-import org.lionsoul.jcseg.tokenizer.core.IWord;
-import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig;
-import org.lionsoul.jcseg.tokenizer.core.SegmentFactory;
+import org.lionsoul.jcseg.ISegment;
+import org.lionsoul.jcseg.IWord;
+import org.lionsoul.jcseg.dic.ADictionary;
+import org.lionsoul.jcseg.dic.DictionaryFactory;
+import org.lionsoul.jcseg.segmenter.SegmenterConfig;
import com.baidu.hugegraph.HugeException;
import com.baidu.hugegraph.config.ConfigException;
@@ -45,11 +44,10 @@ public class JcsegAnalyzer implements Analyzer {
"Complex"
);
- private static final JcsegTaskConfig CONFIG = new JcsegTaskConfig();
- private static final ADictionary DIC =
- DictionaryFactory.createDefaultDictionary(new JcsegTaskConfig());
+ private static final SegmenterConfig CONFIG = new SegmenterConfig();
+ private static final ADictionary DIC = DictionaryFactory.createDefaultDictionary(CONFIG);
- private int segMode;
+ private final ISegment.Type type;
public JcsegAnalyzer(String mode) {
if (!SUPPORT_MODES.contains(mode)) {
@@ -57,17 +55,23 @@ public JcsegAnalyzer(String mode) {
"Unsupported segment mode '%s' for jcseg analyzer, " +
"the available values are %s", mode, SUPPORT_MODES);
}
- this.segMode = SUPPORT_MODES.indexOf(mode) + 1;
+
+ if ("Simple".equals(mode)) {
+ this.type = ISegment.SIMPLE;
+ } else {
+ this.type = ISegment.COMPLEX;
+ }
}
@Override
public Set segment(String text) {
Set result = InsertionOrderUtil.newSet();
try {
- Object[] args = new Object[]{new StringReader(text), CONFIG, DIC};
- ISegment seg = SegmentFactory.createJcseg(this.segMode, args);
- IWord word = null;
- while ((word = seg.next()) != null) {
+ ISegment segmentor = this.type.factory.create(CONFIG, DIC);
+ segmentor.reset(new StringReader(text));
+
+ IWord word;
+ while ((word = segmentor.next()) != null) {
result.add(word.getValue());
}
} catch (Exception e) {
diff --git a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/SmartCNAnalyzer.java b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/SmartCNAnalyzer.java
index 9b0dc699c8..ad8c3727b0 100644
--- a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/SmartCNAnalyzer.java
+++ b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/SmartCNAnalyzer.java
@@ -52,7 +52,7 @@ public Set segment(String text) {
Reader reader = new StringReader(text);
try (TokenStream tokenStream = ANALYZER.tokenStream("text", reader)) {
tokenStream.reset();
- CharTermAttribute term = null;
+ CharTermAttribute term;
while (tokenStream.incrementToken()) {
term = tokenStream.getAttribute(CharTermAttribute.class);
result.add(term.toString());
diff --git a/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/core/AnalyzerTest.java b/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/core/AnalyzerTest.java
index e72f3ce9bb..7a5c1a9bc8 100644
--- a/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/core/AnalyzerTest.java
+++ b/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/core/AnalyzerTest.java
@@ -99,13 +99,17 @@ public void testHanlpAnalyzer() {
"海淀区", "西北旺", "东路", "10", "号", "院"),
analyzer.segment(text2));
- // nlp mode
- analyzer = AnalyzerFactory.analyzer("hanlp", "nlp");
+ // Note: the latest hanlp portable version doesn't contain model data, see
+ // https://github.com/hankcs/HanLP/tree/portable#%E6%96%B9%E5%BC%8F%E4%B8%80maven
+ // So test the IndexTokenizer ("index" mode) instead of the "nlp" mode
+ analyzer = AnalyzerFactory.analyzer("hanlp", "index");
Assert.assertEquals(setOf("England", " ", "wins", "World", "Cup"),
analyzer.segment(text1));
- Assert.assertEquals(setOf("英格兰", "世界杯", "夺冠", ",", "中华人民共和国",
- "国歌", "百度", "科技园", "位于", "北京市",
- "海淀区", "西北旺", "东路10号院"),
+ Assert.assertEquals(setOf("英格兰", "英格", "格兰", "世界杯", "世界", "夺冠", ",",
+ "中华人民共和国", "中华", "华人", "人民", "共和国",
+ "共和","国歌", "百度", "科技园", "科技", "位于",
+ "北京市", "北京", "海淀区", "海淀", "淀区", "西北旺",
+ "西北", "东路", "10", "号", "院"),
analyzer.segment(text2));
}
@@ -152,7 +156,7 @@ public void testJcsegAnalyzer() {
analyzer.segment(text1));
Assert.assertEquals(setOf("英格兰", "世界杯", "夺冠", ",", "中华",
"人民共和国", "国歌", "百度", "科技", "园", "位于",
- "北京市", "海淀区", "西北", "旺", "东路", "10",
+ "北京市", "海淀区", "西北", "旺", "东路", "1", "0",
"号", "院"),
analyzer.segment(text2));
@@ -162,7 +166,7 @@ public void testJcsegAnalyzer() {
analyzer.segment(text1));
Assert.assertEquals(setOf("英格兰", "世界杯", "夺冠", ",", "中华",
"人民共和国", "国歌", "百度", "科技", "园", "位于",
- "北京市", "海淀区", "西北", "旺", "东路", "10",
+ "北京市", "海淀区", "西北", "旺", "东路", "1", "0",
"号", "院"),
analyzer.segment(text2));
}