diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 47c400d77c6..e3464aa350d 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -257,6 +257,9 @@ Dependency Upgrades Other Changes --------------------- +* SOLR-17960: Removed TikaLanguageIdentifierUpdateProcessor. Use LangDetectLanguageIdentifierUpdateProcessor + or OpenNLPLangDetectUpdateProcessor instead. (janhoy) + * SOLR-15730: SolrJ modules like SolrJ-Zookeeper are now opt-in from a Maven POM perspective. Previously, the modules would come transitively. (David Smiley) diff --git a/solr/modules/langid/README.md b/solr/modules/langid/README.md index 0dfc293c17b..b4eeba306cb 100644 --- a/solr/modules/langid/README.md +++ b/solr/modules/langid/README.md @@ -35,6 +35,5 @@ for more information. Dependencies ------------ -The Tika detector depends on Tika Core (which is part of the extraction module) The Langdetect detector depends on LangDetect library The OpenNLP detector depends on OpenNLP tools and requires a previously trained user-supplied model diff --git a/solr/modules/langid/build.gradle b/solr/modules/langid/build.gradle index dc63a9f9bbc..7e7bf0fadff 100644 --- a/solr/modules/langid/build.gradle +++ b/solr/modules/langid/build.gradle @@ -23,7 +23,6 @@ dependencies { implementation project(':solr:core') implementation project(':solr:solrj') - implementation(libs.apache.tika.core) { transitive = false } implementation libs.commonsio.commonsio implementation libs.cybozulabs.langdetect implementation libs.apache.opennlp.tools diff --git a/solr/modules/langid/gradle.lockfile b/solr/modules/langid/gradle.lockfile index 6a560903424..943907a4607 100644 --- a/solr/modules/langid/gradle.lockfile +++ b/solr/modules/langid/gradle.lockfile @@ -116,7 +116,6 @@ org.apache.lucene:lucene-spatial3d:10.3.1=jarValidation,runtimeClasspath,runtime org.apache.lucene:lucene-suggest:10.3.1=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.apache.lucene:lucene-test-framework:10.3.1=jarValidation,testCompileClasspath,testRuntimeClasspath org.apache.opennlp:opennlp-tools:2.5.6=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath -org.apache.tika:tika-core:1.28.5=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.apache.zookeeper:zookeeper-jute:3.9.4=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.apache.zookeeper:zookeeper:3.9.4=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.apiguardian:apiguardian-api:1.1.2=jarValidation,testRuntimeClasspath diff --git a/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java deleted file mode 100644 index bde2f62437c..00000000000 --- a/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.update.processor; - -import java.io.Reader; -import java.lang.invoke.MethodHandles; -import java.util.ArrayList; -import java.util.List; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.response.SolrQueryResponse; -import org.apache.tika.language.LanguageIdentifier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Identifies the language of a set of input fields using Tika's LanguageIdentifier. The - * tika-core-x.y.jar must be on the classpath - * - *

See https://solr.apache.org/guide/solr/latest/indexing-guide/language-detection.html#configuring-tika-language-detection - * - * @since 3.5 - * @deprecated Since 9.10, use {@link OpenNLPLangDetectUpdateProcessor} instead. - */ -@Deprecated(since = "9.10") -public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpdateProcessor { - - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - public TikaLanguageIdentifierUpdateProcessor( - SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { - super(req, rsp, next); - } - - @Override - protected List detectLanguage(Reader solrDocReader) { - String content = SolrInputDocumentReader.asString(solrDocReader); - List languages = new ArrayList<>(); - if (content.length() != 0) { - LanguageIdentifier identifier = new LanguageIdentifier(content); - // FIXME: Hack - we get the distance from toString and calculate our own certainty score - Double distance = - Double.parseDouble( - tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1")); - // This formula gives: 0.02 => 0.8, 0.1 => 0.5 which is a better sweetspot than - // isReasonablyCertain() - Double certainty = 1 - (5 * distance); - if (certainty < 0) certainty = 0d; - DetectedLanguage language = new DetectedLanguage(identifier.getLanguage(), certainty); - languages.add(language); - if (log.isDebugEnabled()) { - log.debug( - "Language detected as {} with a certainty of {} (Tika distance={})", - language, - language.getCertainty(), - identifier); - } - } else { - log.debug("No input text to detect language from, returning empty list"); - } - return languages; - } -} diff --git a/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java b/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java deleted file mode 100644 index 1728390c065..00000000000 --- a/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.update.processor; - -import org.apache.solr.common.params.SolrParams; -import org.apache.solr.common.util.NamedList; -import org.apache.solr.core.SolrCore; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.response.SolrQueryResponse; -import org.apache.solr.util.SolrPluginUtils; -import org.apache.solr.util.plugin.SolrCoreAware; - -/** - * Identifies the language of a set of input fields using Tika's LanguageIdentifier. The - * tika-core-x.y.jar must be on the classpath - * - *

The UpdateProcessorChain config entry can take a number of parameters which may also be passed - * as HTTP parameters on the update request and override the defaults. Here is the simplest - * processor config possible: - * - *

- * <processor class="org.apache.solr.update.processor.TikaLanguageIdentifierUpdateProcessorFactory">
- *   <str name="langid.fl">title,text</str>
- *   <str name="langid.langField">language_s</str>
- * </processor>
- * 
- * - * See https://solr.apache.org/guide/solr/latest/indexing-guide/language-detection.html#configuring-tika-language-detection - * - * @since 3.5 - * @deprecated Since 9.10, use {@link OpenNLPLangDetectUpdateProcessorFactory} instead. - */ -@Deprecated(since = "9.10") -public class TikaLanguageIdentifierUpdateProcessorFactory extends UpdateRequestProcessorFactory - implements SolrCoreAware, LangIdParams { - - protected SolrParams defaults; - protected SolrParams appends; - protected SolrParams invariants; - - @Override - public void inform(SolrCore core) {} - - /** - * The UpdateRequestProcessor may be initialized in solrconfig.xml similarly to a RequestHandler, - * with defaults, appends and invariants. - * - * @param args a NamedList with the configuration parameters - */ - @Override - public void init(NamedList args) { - if (args != null) { - Object o; - o = args.get("defaults"); - if (o instanceof NamedList) { - defaults = ((NamedList) o).toSolrParams(); - } else { - defaults = args.toSolrParams(); - } - o = args.get("appends"); - if (o instanceof NamedList) { - appends = ((NamedList) o).toSolrParams(); - } - o = args.get("invariants"); - if (o instanceof NamedList) { - invariants = ((NamedList) o).toSolrParams(); - } - } - } - - @Override - public UpdateRequestProcessor getInstance( - SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { - // Process defaults, appends and invariants if we got a request - if (req != null) { - SolrPluginUtils.setDefaults(req, defaults, appends, invariants); - } - return new TikaLanguageIdentifierUpdateProcessor(req, rsp, next); - } -} diff --git a/solr/modules/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml b/solr/modules/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml index a951ea4d80d..8edfa41cbdf 100644 --- a/solr/modules/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml +++ b/solr/modules/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml @@ -56,27 +56,10 @@ - lang_id_tika + lang_id_lang_detect - - - - - true - name,subject - true - language_s - language_sm - th:thai - 0.5 - fallback - - - - - diff --git a/solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java b/solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java index dc421084141..1e740806e6a 100644 --- a/solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java +++ b/solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java @@ -39,9 +39,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S public static void beforeClass() throws Exception { initCore("solrconfig-languageidentifier.xml", "schema.xml", getFile("langid/solr")); SolrCore core = h.getCore(); - UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("lang_id_tika"); - assertNotNull(chained); - chained = core.getUpdateProcessingChain("lang_id_lang_detect"); + UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("lang_id_lang_detect"); assertNotNull(chained); chained = core.getUpdateProcessingChain("lang_id_opennlp"); assertNotNull(chained); diff --git a/solr/modules/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java b/solr/modules/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java deleted file mode 100644 index 3c47c2a9de0..00000000000 --- a/solr/modules/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.update.processor; - -import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.common.params.ModifiableSolrParams; -import org.junit.Test; - -@SuppressWarnings("deprecation") -public class TikaLanguageIdentifierUpdateProcessorFactoryTest - extends LanguageIdentifierUpdateProcessorFactoryTestCase { - @Override - protected LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) - throws Exception { - return new TikaLanguageIdentifierUpdateProcessor( - _parser.buildRequestFrom(h.getCore(), parameters, null), resp, null); - } - - @Test - public void testMaxFieldValueChars() throws Exception { - SolrInputDocument doc = new SolrInputDocument(); - String valueF1 = - "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."; - String valueF2 = - "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download."; - doc.addField("foo_s", valueF1); - - ModifiableSolrParams parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - TikaLanguageIdentifierUpdateProcessor p = - (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1, p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxFieldValueChars", "6"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals("Apache", p.concatFields(doc).trim()); - - doc.addField("bar_s", valueF2); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxFieldValueChars", "6"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals("Apache" + " " + "An ope", p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxFieldValueChars", "100000"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim()); - } - - @Test - public void testMaxTotalChars() throws Exception { - SolrInputDocument doc = new SolrInputDocument(); - String valueF1 = - "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."; - String valueF2 = - "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download."; - doc.addField("foo_s", valueF1); - - ModifiableSolrParams parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - TikaLanguageIdentifierUpdateProcessor p = - (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1, p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxTotalChars", "6"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals("Apache", p.concatFields(doc).trim()); - - doc.addField("bar_s", valueF2); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxTotalChars", "6"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals("Apache", p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxTotalChars", "100000"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim()); - } - - @Test - public void testMaxFieldValueCharsAndMaxTotalChars() throws Exception { - SolrInputDocument doc = new SolrInputDocument(); - String valueF1 = - "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."; - String valueF2 = - "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download."; - doc.addField("foo_s", valueF1); - - ModifiableSolrParams parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - TikaLanguageIdentifierUpdateProcessor p = - (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1, p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxFieldValueChars", "8"); - parameters.add("langid.maxTotalChars", "6"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals("Apache", p.concatFields(doc).trim()); - - doc.addField("bar_s", valueF2); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxFieldValueChars", "3"); - parameters.add("langid.maxTotalChars", "8"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals("Apa An", p.concatFields(doc).trim()); - - parameters = new ModifiableSolrParams(); - parameters.add("langid.fl", "foo_s,bar_s"); - parameters.add("langid.langField", "language"); - parameters.add("langid.enforceSchema", "false"); - parameters.add("langid.maxFieldValueChars", "10000"); - parameters.add("langid.maxTotalChars", "100000"); - p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters); - assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim()); - } -} diff --git a/solr/server/solr/configsets/sample_techproducts_configs/conf/solrconfig.xml b/solr/server/solr/configsets/sample_techproducts_configs/conf/solrconfig.xml index 0efc7af9bb0..4192759f158 100644 --- a/solr/server/solr/configsets/sample_techproducts_configs/conf/solrconfig.xml +++ b/solr/server/solr/configsets/sample_techproducts_configs/conf/solrconfig.xml @@ -1133,7 +1133,7 @@ via parameters. The below configuration supports hl.method=original and fastVec -->