diff --git a/.github/workflows/gradle-extraction-check.yml b/.github/workflows/gradle-extraction-check.yml new file mode 100644 index 00000000000..8a040a077b0 --- /dev/null +++ b/.github/workflows/gradle-extraction-check.yml @@ -0,0 +1,28 @@ +# This test covers TikaServer tests for SolrCell, since crave does not support docker yet. +name: Extraction module tests with Docker + +on: + pull_request: + branches: + - '*' + paths: + - 'solr/modules/extraction/**' + +jobs: + test: + name: extraction module tests with docker + + runs-on: ubuntu-latest + timeout-minutes: 15 + + env: + DEVELOCITY_ACCESS_KEY: ${{ secrets.SOLR_DEVELOCITY_ACCESS_KEY }} + + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - uses: ./.github/actions/prepare-for-build + + - name: Run extraction module tests + run: ./gradlew --no-daemon solr:modules:extraction:check diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 14435a7242f..641de520d6b 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -140,6 +140,8 @@ jersey-containers = "2.39.1" # @keep for version alignment jetbrains-annotations = "26.0.2" # @keep for version alignment +jna = "5.13.0" +# @keep for version alignment joda-time = "2.14.0" junit = "4.13.2" junit-jupiter = "5.13.4" @@ -197,6 +199,7 @@ squareup-okhttp3-okhttp = "4.12.0" stephenc-jcip = "1.0-1" swagger3 = "2.2.22" tdunning-tdigest = "3.3" +testcontainers = "1.20.4" thetaphi-forbiddenapis = "3.10" thisptr-jacksonjq = "0.0.13" threeten-bp = "1.6.8" @@ -427,6 +430,8 @@ jersey-media-jsonjackson = { module = "org.glassfish.jersey.media:jersey-media-j # @keep transitive dependency for version alignment jetbrains-annotations = { module = "org.jetbrains:annotations", version.ref = "jetbrains-annotations" } # @keep transitive dependency for version alignment +jna = { module = "net.java.dev.jna:jna", version.ref = "jna" } +# @keep transitive dependency for version alignment jodatime-jodatime = { module = "joda-time:joda-time", 
version.ref = "joda-time" } junit-junit = { module = "junit:junit", version.ref = "junit" } junit-jupiter = { module = "org.junit.jupiter:junit-jupiter", version.ref = "junit-jupiter" } @@ -516,6 +521,7 @@ stephenc-jcip-annotations = { module = "com.github.stephenc.jcip:jcip-annotation swagger3-annotations-jakarta = { module = "io.swagger.core.v3:swagger-annotations-jakarta", version.ref = "swagger3" } swagger3-jaxrs2-jakarta = { module = "io.swagger.core.v3:swagger-jaxrs2-jakarta", version.ref = "swagger3" } tdunning-tdigest = { module = "com.tdunning:t-digest", version.ref = "tdunning-tdigest" } +testcontainers = { module = "org.testcontainers:testcontainers", version.ref = "testcontainers" } thisptr-jacksonjq = { module = "net.thisptr:jackson-jq", version.ref = "thisptr-jacksonjq" } threeten-bp = { module = "org.threeten:threetenbp", version.ref = "threeten-bp" } xerces-impl = { module = "xerces:xercesImpl", version.ref = "xerces" } diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index e87696e3e8f..dcb2d427577 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -333,6 +333,10 @@ New Features * SOLR-17923: Add fullOuterJoin stream function (Andy Webb) +* SOLR-7632: The Extraction Request Handler, aka Solr Cell, now supports delegating the parsing of rich documents to + an external Tika Server. This allows for a more stable Solr server, and easier to configure and scale parsing + independently. The local in-process Tika parser is now deprecated. (Jan Høydahl, Eric Pugh) + Improvements --------------------- * SOLR-17860: DocBasedVersionConstraintsProcessorFactory now supports PULL replicas. 
(Houston Putman) diff --git a/solr/licenses/docker-java-LICENSE-ASL.txt b/solr/licenses/docker-java-LICENSE-ASL.txt new file mode 100644 index 00000000000..38275f2f4fe --- /dev/null +++ b/solr/licenses/docker-java-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2013] [docker-java@googlegroups.com] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/solr/licenses/docker-java-NOTICE.txt b/solr/licenses/docker-java-NOTICE.txt new file mode 100644 index 00000000000..f54dcc4f15a --- /dev/null +++ b/solr/licenses/docker-java-NOTICE.txt @@ -0,0 +1,7 @@ +This product includes software developed by the docker-java project. + +Copyright (c) 2013, docker-java project contributors + +Project: https://github.com/docker-java/docker-java + +Licensed under the Apache License, Version 2.0. 
diff --git a/solr/licenses/docker-java-api-3.4.0.jar.sha1 b/solr/licenses/docker-java-api-3.4.0.jar.sha1 new file mode 100644 index 00000000000..bf5ca0d6db4 --- /dev/null +++ b/solr/licenses/docker-java-api-3.4.0.jar.sha1 @@ -0,0 +1 @@ +9ef23dcc93693f15e69b64632be096c38e31bc44 diff --git a/solr/licenses/docker-java-transport-3.4.0.jar.sha1 b/solr/licenses/docker-java-transport-3.4.0.jar.sha1 new file mode 100644 index 00000000000..c1232d24a6b --- /dev/null +++ b/solr/licenses/docker-java-transport-3.4.0.jar.sha1 @@ -0,0 +1 @@ +c058705684d782effc4b2edfdef1a87544ba4af8 diff --git a/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 b/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 new file mode 100644 index 00000000000..b658f8f0810 --- /dev/null +++ b/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 @@ -0,0 +1 @@ +c4ce6d8695cfdb0027872f99cc20f8f679f8a969 diff --git a/solr/licenses/duct-tape-1.0.8.jar.sha1 b/solr/licenses/duct-tape-1.0.8.jar.sha1 new file mode 100644 index 00000000000..8ccb86d64ea --- /dev/null +++ b/solr/licenses/duct-tape-1.0.8.jar.sha1 @@ -0,0 +1 @@ +92edc22a9ab2f3e17c9bf700aaee377d50e8b530 diff --git a/solr/licenses/duct-tape-LICENSE-MIT.txt b/solr/licenses/duct-tape-LICENSE-MIT.txt new file mode 100644 index 00000000000..2091a63f988 --- /dev/null +++ b/solr/licenses/duct-tape-LICENSE-MIT.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Richard North + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the 
Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/solr/licenses/jna-5.12.1.jar.sha1 b/solr/licenses/jna-5.12.1.jar.sha1 deleted file mode 100644 index 648c9d576db..00000000000 --- a/solr/licenses/jna-5.12.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -b1e93a735caea94f503e95e6fe79bf9cdc1e985d diff --git a/solr/licenses/jna-5.13.0.jar.sha1 b/solr/licenses/jna-5.13.0.jar.sha1 new file mode 100644 index 00000000000..93b456b9293 --- /dev/null +++ b/solr/licenses/jna-5.13.0.jar.sha1 @@ -0,0 +1 @@ +1200e7ebeedbe0d10062093f32925a912020e747 diff --git a/solr/licenses/testcontainers-1.20.4.jar.sha1 b/solr/licenses/testcontainers-1.20.4.jar.sha1 new file mode 100644 index 00000000000..29746a98e88 --- /dev/null +++ b/solr/licenses/testcontainers-1.20.4.jar.sha1 @@ -0,0 +1 @@ +ee2fe3afc9fa6cb2e6a43233998f3633f761692f diff --git a/solr/licenses/testcontainers-LICENSE-MIT.txt b/solr/licenses/testcontainers-LICENSE-MIT.txt new file mode 100644 index 00000000000..9c9e8bc5563 --- /dev/null +++ b/solr/licenses/testcontainers-LICENSE-MIT.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015-2019 Richard North + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + 
+The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/solr/modules/extraction/build.gradle b/solr/modules/extraction/build.gradle index 655fc0360f4..5c52c056587 100644 --- a/solr/modules/extraction/build.gradle +++ b/solr/modules/extraction/build.gradle @@ -19,6 +19,11 @@ apply plugin: 'java-library' description = 'Solr Integration with Tika for extracting content from binary file formats such as Microsoft Word and Adobe PDF' +ext { + // Disable security manager for extraction module tests + useSecurityManager = false +} + dependencies { implementation platform(project(':platform')) implementation project(':solr:core') @@ -27,17 +32,23 @@ dependencies { implementation libs.apache.lucene.core implementation libs.slf4j.api + // For 'local' Tika backend implementation libs.apache.tika.core implementation (libs.apache.tika.parsers, { exclude group: 'org.apache.cxf', module: 'cxf-rt-rs-client' exclude group: 'org.quartz-scheduler', module: 'quartz' exclude group: 'xml-apis', module: 'xml-apis' }) - implementation (libs.xerces.impl, { - exclude group: 'xml-apis', module: 'xml-apis' - }) + + // For 'tikaserver' backend + implementation libs.eclipse.jetty.client + permitUsedUndeclared libs.eclipse.jetty.http + permitUsedUndeclared libs.eclipse.jetty.util + permitUsedUndeclared libs.eclipse.jetty.io testImplementation project(':solr:test-framework') testImplementation libs.apache.lucene.testframework testImplementation 
libs.junit.junit + testImplementation libs.testcontainers + testImplementation libs.carrotsearch.randomizedtesting.runner } diff --git a/solr/modules/extraction/gradle.lockfile b/solr/modules/extraction/gradle.lockfile index 18895208387..fa2dc94f4cf 100644 --- a/solr/modules/extraction/gradle.lockfile +++ b/solr/modules/extraction/gradle.lockfile @@ -15,6 +15,9 @@ com.fasterxml.jackson.module:jackson-module-jakarta-xmlbind-annotations:2.20.0=j com.fasterxml.jackson:jackson-bom:2.20.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath com.fasterxml.woodstox:woodstox-core:7.0.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath com.github.ben-manes.caffeine:caffeine:3.2.2=annotationProcessor,errorprone,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testAnnotationProcessor,testRuntimeClasspath +com.github.docker-java:docker-java-api:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath +com.github.docker-java:docker-java-transport-zerodep:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath +com.github.docker-java:docker-java-transport:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath com.github.jai-imageio:jai-imageio-core:1.4.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath com.github.junrar:junrar:7.5.3=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath com.github.kevinstern:software-and-algorithms:1.0=annotationProcessor,errorprone,testAnnotationProcessor @@ -107,7 +110,7 @@ javax.inject:javax.inject:1=annotationProcessor,errorprone,testAnnotationProcess javax.measure:unit-api:1.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath 
joda-time:joda-time:2.14.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath junit:junit:4.13.2=jarValidation,testCompileClasspath,testRuntimeClasspath -net.java.dev.jna:jna:5.12.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +net.java.dev.jna:jna:5.13.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath net.sf.ehcache:ehcache-core:2.6.2=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.antlr:antlr4-runtime:4.13.2=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.apache.commons:commons-collections4:4.5.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath @@ -195,14 +198,14 @@ org.eclipse.jetty:jetty-alpn-client:12.0.27=compileClasspath,jarValidation,runti org.eclipse.jetty:jetty-alpn-java-client:12.0.27=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.eclipse.jetty:jetty-alpn-java-server:12.0.27=jarValidation,testRuntimeClasspath org.eclipse.jetty:jetty-alpn-server:12.0.27=jarValidation,testRuntimeClasspath -org.eclipse.jetty:jetty-client:12.0.27=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath -org.eclipse.jetty:jetty-http:12.0.27=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath -org.eclipse.jetty:jetty-io:12.0.27=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath +org.eclipse.jetty:jetty-client:12.0.27=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath 
+org.eclipse.jetty:jetty-http:12.0.27=compileClasspath,jarValidation,permitUsedUndeclared,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath +org.eclipse.jetty:jetty-io:12.0.27=compileClasspath,jarValidation,permitUsedUndeclared,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.eclipse.jetty:jetty-rewrite:12.0.27=jarValidation,testRuntimeClasspath org.eclipse.jetty:jetty-security:12.0.27=jarValidation,testCompileClasspath,testRuntimeClasspath org.eclipse.jetty:jetty-server:12.0.27=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.eclipse.jetty:jetty-session:12.0.27=jarValidation,testCompileClasspath,testRuntimeClasspath -org.eclipse.jetty:jetty-util:12.0.27=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath +org.eclipse.jetty:jetty-util:12.0.27=compileClasspath,jarValidation,permitUsedUndeclared,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.gagravarr:vorbis-java-core:0.8=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.gagravarr:vorbis-java-tika:0.8=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.glassfish.hk2.external:aopalliance-repackaged:3.1.1=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath @@ -223,6 +226,7 @@ org.hamcrest:hamcrest:3.0=jarValidation,testCompileClasspath,testRuntimeClasspat org.itadaki:bzip2:0.9.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.javassist:javassist:3.30.2-GA=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.jdom:jdom2:2.0.6.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath 
+org.jetbrains:annotations:26.0.2=jarValidation,testCompileClasspath,testRuntimeClasspath org.jspecify:jspecify:1.0.0=annotationProcessor,compileClasspath,errorprone,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testAnnotationProcessor,testCompileClasspath,testRuntimeClasspath org.junit.jupiter:junit-jupiter-api:5.6.2=jarValidation,testRuntimeClasspath org.junit.platform:junit-platform-commons:1.6.2=jarValidation,testRuntimeClasspath @@ -234,15 +238,17 @@ org.ow2.asm:asm-commons:9.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatf org.ow2.asm:asm-tree:9.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.ow2.asm:asm:9.8=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.pcollections:pcollections:4.0.1=annotationProcessor,errorprone,testAnnotationProcessor +org.rnorth.duct-tape:duct-tape:1.0.8=jarValidation,testCompileClasspath,testRuntimeClasspath org.semver4j:semver4j:6.0.0=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.slf4j:jcl-over-slf4j:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.slf4j:jul-to-slf4j:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath -org.slf4j:slf4j-api:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath +org.slf4j:slf4j-api:2.0.17=compileClasspath,jarValidation,permitUsedUndeclared,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.tallison.xmp:xmpcore-shaded:6.1.10=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.tallison:isoparser:1.9.41.7=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath 
org.tallison:jmatio:1.5=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.tallison:metadata-extractor:2.17.1.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +org.testcontainers:testcontainers:1.20.4=jarValidation,testCompileClasspath,testRuntimeClasspath org.tukaani:xz:1.9=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.xerial.snappy:snappy-java:1.1.10.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath xerces:xercesImpl:2.12.2=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath -empty=apiHelper,apiHelperTest,compileOnlyHelper,compileOnlyHelperTest,missingdoclet,packaging,permitAggregatorUse,permitTestAggregatorUse,permitTestUnusedDeclared,permitTestUsedUndeclared,permitUnusedDeclared,permitUsedUndeclared,signatures +empty=apiHelper,apiHelperTest,compileOnlyHelper,compileOnlyHelperTest,missingdoclet,packaging,permitAggregatorUse,permitTestAggregatorUse,permitTestUnusedDeclared,permitTestUsedUndeclared,permitUnusedDeclared,signatures diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 014d56caae4..ba9b72b1863 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -18,9 +18,10 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import java.lang.invoke.MethodHandles; -import java.util.Locale; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.regex.Pattern; import org.apache.solr.common.SolrException; import 
org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.UpdateParams; @@ -28,43 +29,21 @@ import org.apache.solr.common.util.ContentStreamBase; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.extraction.fromtika.ToTextContentHandler; +import org.apache.solr.handler.extraction.fromtika.ToXMLContentHandler; import org.apache.solr.handler.loader.ContentStreamLoader; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.UpdateRequestProcessor; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.HttpHeaders; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.PasswordProvider; -import org.apache.tika.parser.html.HtmlMapper; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.xpath.Matcher; -import org.apache.tika.sax.xpath.MatchingContentHandler; -import org.apache.tika.sax.xpath.XPathParser; -import org.apache.xml.serialize.BaseMarkupSerializer; -import org.apache.xml.serialize.OutputFormat; -import org.apache.xml.serialize.TextSerializer; -import org.apache.xml.serialize.XMLSerializer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** - * The class responsible for loading extracted content into Solr. - * - * @deprecated Will be replaced with something similar that calls out to a separate Tika Server - * process running in its own JVM. 
+ * The class responsible for loading extracted content into Solr. It will delegate parsing to a + * {@link ExtractionBackend} and then load the resulting SolrInputDocument into Solr. */ -@Deprecated(since = "9.10.0") public class ExtractingDocumentLoader extends ContentStreamLoader { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -75,40 +54,30 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { /** Extract Only supported format. Default */ public static final String XML_FORMAT = "xml"; - /** XHTML XPath parser. */ - private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML); - final SolrCore core; - final SolrParams params; final UpdateRequestProcessor processor; final boolean ignoreTikaException; - protected AutoDetectParser autoDetectParser; private final AddUpdateCommand templateAdd; - protected TikaConfig config; - protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; + protected ExtractionBackend backend; public ExtractingDocumentLoader( SolrQueryRequest req, UpdateRequestProcessor processor, - TikaConfig config, - ParseContextConfig parseContextConfig, - SolrContentHandlerFactory factory) { - this.params = req.getParams(); + SolrContentHandlerFactory factory, + ExtractionBackend backend) { + SolrParams params = req.getParams(); this.core = req.getCore(); - this.config = config; - this.parseContextConfig = parseContextConfig; this.processor = processor; templateAdd = new AddUpdateCommand(req); templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1); - // this is lightweight - autoDetectParser = new AutoDetectParser(config); this.factory = factory; + this.backend = backend; ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false); } @@ -131,169 +100,153 @@ public void load( ContentStream stream, 
UpdateRequestProcessor processor) throws Exception { - Parser parser = null; - String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); - if (streamType != null) { - // Cache? Parsers are lightweight to construct and thread-safe, so I'm told - MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); - parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt); - } else { - parser = autoDetectParser; - } - if (parser != null) { - Metadata metadata = new Metadata(); - - // If you specify the resource name (the filename, roughly) with this parameter, - // then Tika can make use of it in guessing the appropriate MIME type: - String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); - if (resourceName != null) { - metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); - } - // Provide stream's content type as hint for auto detection - if (stream.getContentType() != null) { - metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType()); - } - - try (InputStream inputStream = stream.getStream()) { - metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); - metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); - metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); - metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); - // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata - String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); - if (charset != null) { - metadata.add(HttpHeaders.CONTENT_ENCODING, charset); + SolrParams params = req.getParams(); + String streamType = params.get(ExtractingParams.STREAM_TYPE, null); + String resourceName = params.get(ExtractingParams.RESOURCE_NAME, null); + + try (InputStream inputStream = stream.getStream()) { + String charset = 
ContentStreamBase.getCharsetFromContentType(stream.getContentType()); + + String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); + boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); + // Experimental: recursive parsing of embedded documents via /rmeta (TikaServer backend only) + boolean tikaserverRecursive = params.getBool(ExtractingParams.TIKASERVER_RECURSIVE, false); + String extractFormat = + params.get(ExtractingParams.EXTRACT_FORMAT, extractOnly ? XML_FORMAT : TEXT_FORMAT); + + // Parse optional passwords file into a map + LinkedHashMap pwMap = null; + String passwordsFile = params.get(ExtractingParams.PASSWORD_MAP_FILE); + if (passwordsFile != null) { + try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) { + pwMap = RegexRulesPasswordProvider.parseRulesFile(is); } + } - String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); - boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); - SolrContentHandler handler = - factory.createSolrContentHandler(metadata, params, req.getSchema()); - ContentHandler parsingHandler = handler; + Integer tikaTimeoutSecs = params.getInt(ExtractingParams.TIKASERVER_TIMEOUT_SECS); + ExtractionRequest extractionRequest = + ExtractionRequest.builder() + .streamType(streamType) + .resourceName(resourceName) + .contentType(stream.getContentType()) + .charset(charset) + .streamName(stream.getName()) + .streamSourceInfo(stream.getSourceInfo()) + .streamSize(stream.getSize()) + .resourcePassword(params.get(ExtractingParams.RESOURCE_PASSWORD, null)) + .passwordsMap(pwMap) + .extractFormat(extractFormat) + .tikaServerRecursive(tikaserverRecursive) + .tikaServerTimeoutSeconds(tikaTimeoutSecs) + .tikaServerRequestHeaders(Collections.emptyMap()) + .build(); + + boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); + String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); + boolean needsSaxParsing = + extractOnly 
+ || xpathExpr != null + || captureAttr + || (captureElems != null && captureElems.length > 0) + || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null) + || (passwordsFile != null); + + if (extractOnly) { + try { + ExtractionMetadata md = backend.buildMetadataFromRequest(extractionRequest); + String content; + if (ExtractingDocumentLoader.TEXT_FORMAT.equals(extractionRequest.extractFormat) + || xpathExpr != null) { + content = + extractWithHandler( + inputStream, xpathExpr, extractionRequest, md, new ToTextContentHandler()); + } else { // XML format + content = + extractWithHandler( + inputStream, xpathExpr, extractionRequest, md, new ToXMLContentHandler()); + if (!content.startsWith("\n" + content; + } + } - StringWriter writer = null; - BaseMarkupSerializer serializer = null; - if (extractOnly == true) { - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); - writer = new StringWriter(); - if (extractFormat.equals(TEXT_FORMAT)) { - serializer = new TextSerializer(); - serializer.setOutputCharStream(writer); - serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true)); - } else { - serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true)); + rsp.add(stream.getName(), content); + NamedList metadataNL = new NamedList<>(); + for (String name : md.keySet()) { + metadataNL.add(name, md.get(name).toArray(new String[0])); } - if (xpathExpr != null) { - Matcher matcher = PARSER.parse(xpathExpr); - serializer - .startDocument(); // The MatchingContentHandler does not invoke startDocument. 
See - // https://lists.apache.org/thread.html/5ec63e104e564a2363e45f74d5aced6520b7d32b4b625762ef56cb86%401226775505%40%3Cdev.tika.apache.org%3E - parsingHandler = new MatchingContentHandler(serializer, matcher); - } else { - parsingHandler = serializer; + rsp.add(stream.getName() + "_metadata", metadataNL); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; } - } else if (xpathExpr != null) { - Matcher matcher = PARSER.parse(xpathExpr); - parsingHandler = new MatchingContentHandler(handler, matcher); - } // else leave it as is + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + return; + } + if (needsSaxParsing) { + ExtractionMetadata metadata = backend.buildMetadataFromRequest(extractionRequest); + SolrContentHandler handler = + factory.createSolrContentHandler(metadata, params, req.getSchema()); try { - // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler - // for getting the document. 
- ParseContext context = parseContextConfig.create(); - - context.set(Parser.class, parser); - context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); - - // Password handling - RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider(); - String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE); - if (pwMapFile != null && pwMapFile.length() > 0) { - InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile); - if (is != null) { - log.debug("Password file supplied: {}", pwMapFile); - epp.parse(is); - } - } - context.set(PasswordProvider.class, epp); - String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD); - if (resourcePassword != null) { - epp.setExplicitPassword(resourcePassword); - log.debug("Literal password supplied for file {}", resourceName); - } - parser.parse(inputStream, parsingHandler, metadata, context); - } catch (TikaException e) { + backend.extractWithSaxHandler(inputStream, extractionRequest, metadata, handler); + } catch (Exception e) { if (ignoreTikaException) { if (log.isWarnEnabled()) { - log.warn( - "skip extracting text due to {}. 
metadata={}", - e.getLocalizedMessage(), - metadata, - e); + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); } + return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - if (extractOnly == false) { - addDoc(handler); - } else { - // serializer is not null, so we need to call endDoc on it if using xpath - if (xpathExpr != null) { - serializer.endDocument(); - } - rsp.add(stream.getName(), writer.toString()); - writer.close(); - String[] names = metadata.names(); - NamedList metadataNL = new NamedList<>(); - for (int i = 0; i < names.length; i++) { - String[] vals = metadata.getValues(names[i]); - metadataNL.add(names[i], vals); - } - rsp.add(stream.getName() + "_metadata", metadataNL); + + addDoc(handler); + return; + } + + ExtractionResult result; + try { + result = backend.extract(inputStream, extractionRequest); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; } - } catch (SAXException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - } else { - throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, - "Stream type of " - + streamType - + " didn't match any known parsers. Please supply the " - + ExtractingParams.STREAM_TYPE - + " parameter."); - } - } - public static class MostlyPassthroughHtmlMapper implements HtmlMapper { - public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper(); + ExtractionMetadata metadata = result.getMetadata(); - /** - * Keep all elements and their content. - * - *

Apparently <SCRIPT> and <STYLE> elements are blocked elsewhere - */ - @Override - public boolean isDiscardElement(String name) { - return false; - } - - /** Lowercases the attribute name */ - @Override - public String mapSafeAttribute(String elementName, String attributeName) { - return attributeName.toLowerCase(Locale.ENGLISH); + SolrContentHandler handler = + factory.createSolrContentHandler(metadata, params, req.getSchema()); + handler.appendToContent(result.getContent()); + addDoc(handler); } + } - /** - * Lowercases the element name, but returns null for <BR>, which suppresses the - * start-element event for lt;BR> tags. This also suppresses the <BODY> tags because - * those are handled internally by Tika's XHTMLContentHandler. - */ - @Override - public String mapSafeElement(String name) { - String lowerName = name.toLowerCase(Locale.ROOT); - return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName; + /* + * Extracts content from the given input stream using an optional XPath expression + * and a SAX content handler. The extraction process may filter content based on + * the XPath expression, if provided. 
+ */ + private String extractWithHandler( + InputStream inputStream, + String xpathExpr, + ExtractionRequest extractionRequest, + ExtractionMetadata md, + DefaultHandler ch) + throws Exception { + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(ch, matcher); } + backend.extractWithSaxHandler(inputStream, extractionRequest, md, ch); + return ch.toString(); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java index 0a72edca0ec..cecbfdb048b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java @@ -22,4 +22,7 @@ public interface ExtractingMetadataConstants { String STREAM_SOURCE_INFO = "stream_source_info"; String STREAM_SIZE = "stream_size"; String STREAM_CONTENT_TYPE = "stream_content_type"; + String HTTP_HEADER_CONTENT_TYPE = "Content-Type"; + String HTTP_HEADER_CONTENT_ENCODING = "Content-Encoding"; + String RESOURCE_NAME_KEY = "resourceName"; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java index a7d159678f1..ddd31d30f77 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java @@ -136,4 +136,26 @@ public interface ExtractingParams { * .*=<defaultmypassword> at the end */ public static final 
String PASSWORD_MAP_FILE = "passwordsFile"; + + /** Backend selection, either `local` or `tikaserver`. */ + public static final String EXTRACTION_BACKEND = "extraction.backend"; + + /** Preferred: Fix/normalize metadata naming for Tika Server compatibility */ + public static final String TIKASERVER_METADATA_COMPATIBILITY = + "tikaserver.metadata.compatibility"; + + /** URL of Tika Server instance. */ + public static final String TIKASERVER_URL = "tikaserver.url"; + + /** Max characters allowed in parsed content */ + public static final String TIKASERVER_MAX_CHARS = "tikaserver.maxChars"; + + /** + * Enable recursive parsing of embedded documents when using TikaServer. This is experimental, + * uses /rmeta endpoint, uses more RAM and is disabled by default. + */ + public static final String TIKASERVER_RECURSIVE = "tikaserver.recursive"; + + /** Default or per-request timeout in seconds for TikaServer HTTP calls. */ + public static final String TIKASERVER_TIMEOUT_SECS = "tikaserver.timeoutSeconds"; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index c9a319bc0bb..a64f7eea819 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -16,8 +16,8 @@ */ package org.apache.solr.handler.extraction; -import java.io.InputStream; -import java.nio.file.Path; +import java.io.IOException; +import java.lang.invoke.MethodHandles; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.core.SolrCore; @@ -28,26 +28,29 @@ import org.apache.solr.security.PermissionNameProvider; import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.util.plugin.SolrCoreAware; -import 
org.apache.tika.config.TikaConfig; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Handler for rich documents like PDF or Word or any other file format that Tika handles that need * the text to be extracted first from the document. - * - * @deprecated Will be replaced with something similar that calls out to a separate Tika Server - * process running in its own JVM. */ -@Deprecated(since = "9.10.0") +@SuppressWarnings("removal") public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware, PermissionNameProvider { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + public static final String PARSE_CONTEXT_CONFIG = "parseContext.config"; public static final String CONFIG_LOCATION = "tika.config"; - protected TikaConfig config; + protected String tikaConfigLoc; protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; + protected String defaultBackendName; + protected LocalTikaExtractionBackend localBackend; + protected TikaServerExtractionBackend tikaServerBackend; // may be null if not configured @Override public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) { @@ -57,22 +60,8 @@ public PermissionNameProvider.Name getPermissionName(AuthorizationContext reques @Override public void inform(SolrCore core) { try { - String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); - if (tikaConfigLoc == null) { // default - ClassLoader classLoader = core.getResourceLoader().getClassLoader(); - try (InputStream is = classLoader.getResourceAsStream("solr-default-tika-config.xml")) { - config = new TikaConfig(is); - } - } else { - Path configFile = Path.of(tikaConfigLoc); - if (configFile.isAbsolute()) { - config = new TikaConfig(configFile); - } else { // in conf/ - try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) { - config = new TikaConfig(is); - } - } - } + // Store tika config 
location (backend-specific) + this.tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG); if (parseContextConfigLoc == null) { // default: @@ -81,20 +70,109 @@ public void inform(SolrCore core) { parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc); } + + // Always create local backend + this.localBackend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + + // Optionally create Tika Server backend if URL configured + String tikaServerUrl = (String) initArgs.get(ExtractingParams.TIKASERVER_URL); + if (tikaServerUrl != null && !tikaServerUrl.trim().isEmpty()) { + int timeoutSecs = 0; + Object initTimeout = initArgs.get(ExtractingParams.TIKASERVER_TIMEOUT_SECS); + if (initTimeout != null) { + try { + timeoutSecs = Integer.parseInt(String.valueOf(initTimeout)); + } catch (NumberFormatException nfe) { + throw new SolrException( + ErrorCode.SERVER_ERROR, + "Invalid value for '" + + ExtractingParams.TIKASERVER_TIMEOUT_SECS + + "': " + + initTimeout, + nfe); + } + } + Object maxCharsObj = initArgs.get(ExtractingParams.TIKASERVER_MAX_CHARS); + long maxCharsLimit = TikaServerExtractionBackend.DEFAULT_MAXCHARS_LIMIT; + if (maxCharsObj != null) { + try { + maxCharsLimit = Long.parseLong(String.valueOf(maxCharsObj)); + } catch (NumberFormatException nfe) { + throw new SolrException( + ErrorCode.SERVER_ERROR, + "Invalid value for '" + + ExtractingParams.TIKASERVER_MAX_CHARS + + "': " + + maxCharsObj, nfe); + } + } + this.tikaServerBackend = + new TikaServerExtractionBackend(tikaServerUrl, timeoutSecs, initArgs, maxCharsLimit); + } + + // Choose default backend name + String backendName = (String) initArgs.get(ExtractingParams.EXTRACTION_BACKEND); + this.defaultBackendName = + (backendName == null || backendName.trim().isEmpty()) + ? 
LocalTikaExtractionBackend.NAME + : backendName; + + // Validate backend and check configuration + switch (this.defaultBackendName) { + case LocalTikaExtractionBackend.NAME: + break; + case TikaServerExtractionBackend.NAME: + // Tika Server backend requires URL to be configured + if (this.tikaServerBackend == null) { + throw new SolrException( + ErrorCode.INVALID_STATE, "Tika Server backend requested but no URL configured"); + } + break; + default: + throw new SolrException( + ErrorCode.BAD_REQUEST, + "Invalid extraction backend: '" + + this.defaultBackendName + + "'. Must be one of: '" + + LocalTikaExtractionBackend.NAME + + "', '" + + TikaServerExtractionBackend.NAME + + "'"); + } } catch (Exception e) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to load Tika Config", e); + throw new SolrException( + ErrorCode.SERVER_ERROR, "Unable to initialize ExtractingRequestHandler", e); } - factory = createFactory(); - } - - protected SolrContentHandlerFactory createFactory() { - return new SolrContentHandlerFactory(); + factory = new SolrContentHandlerFactory(); } @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { - return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory); + // Allow per-request override of backend via request param + String backendParam = req.getParams().get(ExtractingParams.EXTRACTION_BACKEND); + String nameToUse = + (backendParam != null && !backendParam.trim().isEmpty()) + ? 
backendParam + : defaultBackendName; + + ExtractionBackend extractionBackend; + if (LocalTikaExtractionBackend.NAME.equals(nameToUse)) { + extractionBackend = localBackend; + } else if (TikaServerExtractionBackend.NAME.equals(nameToUse)) { + if (tikaServerBackend == null) { + throw new SolrException( + ErrorCode.BAD_REQUEST, + "Tika Server backend requested but '" + + ExtractingParams.TIKASERVER_URL + + "' is not configured"); + } + extractionBackend = tikaServerBackend; + } else { + throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown extraction backend: " + nameToUse); + } + + return new ExtractingDocumentLoader(req, processor, factory, extractionBackend); } // ////////////////////// SolrInfoMBeans methods ////////////////////// @@ -102,4 +180,22 @@ protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProce public String getDescription() { return "Add/Update Rich document"; } + + @Override + public void close() throws IOException { + // Close our backends to release any shared resources (e.g., Jetty HttpClient) + try { + if (tikaServerBackend != null) { + tikaServerBackend.close(); + } + } finally { + try { + if (localBackend != null) { + localBackend.close(); + } + } finally { + super.close(); + } + } + } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java new file mode 100644 index 00000000000..4550b1f8617 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Content extraction backends must implement this interface. + * + *

Implementations must be thread-safe as a single instance may be shared across multiple + * concurrent requests. + */ +public interface ExtractionBackend extends Closeable { + /** + * Extract plain text and metadata from the inputStream. Implementations should not close the + * inputStream. + */ + ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception; + + /** + * Perform extraction of text from inputStream with SAX handler. Examples of SAX handlers are + * SolrContentHandler, ToTextContentHandler, ToXMLContentHandler and MatchingContentHandler. + */ + void extractWithSaxHandler( + InputStream inputStream, + ExtractionRequest request, + ExtractionMetadata md, + DefaultHandler saxContentHandler) + throws Exception; + + /** Build ExtractionMetadata from the request context */ + default ExtractionMetadata buildMetadataFromRequest(ExtractionRequest request) { + ExtractionMetadata md = new ExtractionMetadata(); + md.add(ExtractingMetadataConstants.RESOURCE_NAME_KEY, request.resourceName); + md.add(ExtractingMetadataConstants.HTTP_HEADER_CONTENT_TYPE, request.contentType); + md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); + md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); + md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); + md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); + md.add(ExtractingMetadataConstants.HTTP_HEADER_CONTENT_ENCODING, request.charset); + return md; + } + + /** A short name for debugging/config, e.g., "local" or "tikaserver". 
*/ + String name(); + + @Override + default void close() throws IOException { + // default no-op; specific backends may override to release shared resources + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java new file mode 100644 index 00000000000..764a0a9d152 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; + +/** A map of metadata name/value pairs. */ +public class ExtractionMetadata extends LinkedHashMap> { + /** + * Add a metadata value. If the name already exists, the value will be appended to the existing + * list. + */ + public void add(String name, String value) { + if (name == null || value == null) return; + computeIfAbsent(name, k -> new ArrayList<>()).add(value); + } + + /** Add multiple metadata values. 
*/ + public void add(String name, Collection values) { + if (name == null || values == null || values.isEmpty()) return; + computeIfAbsent(name, k -> new ArrayList<>()).addAll(values); + } + + /** Gets all metadata values for the given name. */ + public List get(String name) { + List vals = super.get(name); + return (vals == null) ? Collections.emptyList() : vals; + } + + /** Gets the first metadata value for the given name or null if not set. */ + public String getFirst(String name) { + List vals = super.get(name); + if (vals == null || vals.isEmpty()) return null; + return vals.getFirst(); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionPasswordProvider.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionPasswordProvider.java new file mode 100644 index 00000000000..6dbee85b988 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionPasswordProvider.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.extraction; + +/** Generic password provider without Tika dependency */ +public interface ExtractionPasswordProvider { + /** Given some metadata, return a password to use for the given document. */ + String getPassword(ExtractionMetadata metadata); +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java new file mode 100644 index 00000000000..50c6f0c7b7b --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.regex.Pattern; + +/** Immutable request info needed by extraction backends. 
*/ +public class ExtractionRequest { + public final String streamType; + public final String resourceName; + public final String contentType; + public final String charset; + public final String streamName; + public final String streamSourceInfo; + public final Long streamSize; + public final String resourcePassword; + public final java.util.LinkedHashMap passwordsMap; + public final String extractFormat; + + // Below variables are only used by TikaServerExtractionBackend + public final boolean tikaServerRecursive; + public final Integer tikaServerTimeoutSeconds; // optional per-request override + public final Map tikaServerRequestHeaders = new HashMap<>(); + + /** + * Constructs an ExtractionRequest object containing metadata and configurations for extraction + * backends. This constructor is private - use {@link #builder()} to create instances. + * + * @param streamType the explicit MIME type of the document (optional) + * @param resourceName the name of the resource, typically a filename hint + * @param contentType the HTTP content-type header value + * @param charset the derived character set of the stream if available + * @param streamName the name of the content stream + * @param streamSourceInfo additional information about the stream source + * @param streamSize the size of the stream in bytes + * @param resourcePassword an optional password used for encrypted documents + * @param passwordsMap an optional map of regex patterns to passwords for encrypted content + * @param extractFormat the desired format for extraction output + * @param tikaServerRecursive a flag indicating whether extraction should be recursive. TikaServer + * only + * @param tikaServerTimeoutSeconds optional per-request timeout override in seconds (TikaServer + * only). If null or ≤ 0, the default timeout will be used + * @param tikaServerRequestHeaders optional headers to be included in requests to the extraction + * service. 
TikaServer only + */ + private ExtractionRequest( + String streamType, + String resourceName, + String contentType, + String charset, + String streamName, + String streamSourceInfo, + Long streamSize, + String resourcePassword, + java.util.LinkedHashMap passwordsMap, + String extractFormat, + boolean tikaServerRecursive, + Integer tikaServerTimeoutSeconds, + Map tikaServerRequestHeaders) { + this.streamType = streamType; + this.resourceName = resourceName; + this.contentType = contentType; + this.charset = charset; + this.streamName = streamName; + this.streamSourceInfo = streamSourceInfo; + this.streamSize = streamSize; + this.resourcePassword = resourcePassword; + this.passwordsMap = passwordsMap; + this.extractFormat = extractFormat; + this.tikaServerRecursive = tikaServerRecursive; + this.tikaServerTimeoutSeconds = tikaServerTimeoutSeconds; + if (tikaServerRequestHeaders != null) { + this.tikaServerRequestHeaders.putAll(tikaServerRequestHeaders); + } + } + + /** Creates a new Builder for constructing ExtractionRequest instances. */ + public static Builder builder() { + return new Builder(); + } + + /** Builder for creating ExtractionRequest instances with improved readability and safety. 
*/ + public static class Builder { + private String streamType; + private String resourceName; + private String contentType; + private String charset; + private String streamName; + private String streamSourceInfo; + private Long streamSize; + private String resourcePassword; + private LinkedHashMap passwordsMap; + private String extractFormat; + private boolean tikaServerRecursive = false; + private Integer tikaServerTimeoutSeconds; + private Map tikaServerRequestHeaders; + + private Builder() {} + + public Builder streamType(String streamType) { + this.streamType = streamType; + return this; + } + + public Builder resourceName(String resourceName) { + this.resourceName = resourceName; + return this; + } + + public Builder contentType(String contentType) { + this.contentType = contentType; + return this; + } + + public Builder charset(String charset) { + this.charset = charset; + return this; + } + + public Builder streamName(String streamName) { + this.streamName = streamName; + return this; + } + + public Builder streamSourceInfo(String streamSourceInfo) { + this.streamSourceInfo = streamSourceInfo; + return this; + } + + public Builder streamSize(Long streamSize) { + this.streamSize = streamSize; + return this; + } + + public Builder resourcePassword(String resourcePassword) { + this.resourcePassword = resourcePassword; + return this; + } + + public Builder passwordsMap(LinkedHashMap passwordsMap) { + this.passwordsMap = passwordsMap; + return this; + } + + public Builder extractFormat(String extractFormat) { + this.extractFormat = extractFormat; + return this; + } + + public Builder tikaServerRecursive(boolean tikaServerRecursive) { + this.tikaServerRecursive = tikaServerRecursive; + return this; + } + + public Builder tikaServerTimeoutSeconds(Integer tikaServerTimeoutSeconds) { + this.tikaServerTimeoutSeconds = tikaServerTimeoutSeconds; + return this; + } + + public Builder tikaServerRequestHeaders(Map tikaServerRequestHeaders) { + 
this.tikaServerRequestHeaders = tikaServerRequestHeaders; + return this; + } + + public ExtractionRequest build() { + return new ExtractionRequest( + streamType, + resourceName, + contentType, + charset, + streamName, + streamSourceInfo, + streamSize, + resourcePassword, + passwordsMap, + extractFormat, + tikaServerRecursive, + tikaServerTimeoutSeconds, + tikaServerRequestHeaders); + } + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java new file mode 100644 index 00000000000..97767d15367 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +/** Immutable extraction result with plain text content and neutral metadata. */ +public final class ExtractionResult { + private final String content; + private final ExtractionMetadata metadata; + + public ExtractionResult(String content, ExtractionMetadata metadata) { + this.content = content == null ? 
"" : content; + this.metadata = metadata; + } + + /** Extracted textual content (plain text). */ + public String getContent() { + return content; + } + + /** Extracted metadata in neutral, backend-agnostic form. */ + public ExtractionMetadata getMetadata() { + return metadata; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java new file mode 100644 index 00000000000..1ca7268a31b --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.solr.handler.extraction;

import java.io.InputStream;
import java.nio.file.Path;
import java.util.List;
import java.util.Locale;
import org.apache.solr.core.SolrCore;
import org.apache.solr.logging.DeprecationLog;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Extraction backend using local in-process Apache Tika. This encapsulates the previous direct
 * usage of Tika from the loader: parser selection, parse-context setup (HTML mapping, password
 * provider) and conversion between Tika's {@link Metadata} and the backend-neutral
 * {@link ExtractionMetadata}.
 *
 * @deprecated Will be removed soon, please use the 'tikaserver' extraction backend instead.
 */
@Deprecated(since = "9.10.0", forRemoval = true)
public class LocalTikaExtractionBackend implements ExtractionBackend {
  private final TikaConfig tikaConfig;
  private final ParseContextConfig parseContextConfig;
  // Fallback parser used when the request carries no explicit stream type
  private final AutoDetectParser autoDetectParser;

  // Local HtmlMapper moved from ExtractingDocumentLoader.
  // Passes almost all HTML through: discards nothing, lowercases attribute/element names, and
  // drops only <br> and <body> element names (mapped to null).
  private static class MostlyPassthroughHtmlMapper implements HtmlMapper {
    static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();

    @Override
    public boolean isDiscardElement(String name) {
      return false;
    }

    @Override
    public String mapSafeAttribute(String elementName, String attributeName) {
      return attributeName.toLowerCase(java.util.Locale.ENGLISH);
    }

    @Override
    public String mapSafeElement(String name) {
      String lowerName = name.toLowerCase(java.util.Locale.ROOT);
      return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName;
    }
  }

  /** Construct backend from an already-loaded Tika configuration. */
  public LocalTikaExtractionBackend(TikaConfig config, ParseContextConfig parseContextConfig) {
    this.tikaConfig = config;
    this.parseContextConfig = parseContextConfig;
    this.autoDetectParser = new AutoDetectParser(config);
  }

  /**
   * Construct backend by loading TikaConfig based on handler/core configuration without exposing
   * Tika types to the handler.
   *
   * @param core the SolrCore, used for resource loading and path-allowlist checks
   * @param tikaConfigLoc location of the Tika config; null for the bundled default, an absolute
   *     path, or a path resolved relative to the core's conf/ directory
   * @param parseContextConfig factory for per-parse ParseContext instances
   * @throws Exception if the Tika configuration cannot be loaded
   */
  public LocalTikaExtractionBackend(
      SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig) throws Exception {
    TikaConfig cfg;
    if (tikaConfigLoc == null) { // default
      ClassLoader classLoader = core.getResourceLoader().getClassLoader();
      try (InputStream is = classLoader.getResourceAsStream("solr-default-tika-config.xml")) {
        cfg = new TikaConfig(is);
      }
    } else {
      Path configFile = Path.of(tikaConfigLoc);
      // Reject locations outside the configured allow-list before reading anything
      core.getCoreContainer().assertPathAllowed(configFile);
      if (configFile.isAbsolute()) {
        cfg = new TikaConfig(configFile);
      } else { // in conf/
        try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) {
          cfg = new TikaConfig(is);
        }
      }
    }
    this.tikaConfig = cfg;
    this.parseContextConfig = parseContextConfig;
    this.autoDetectParser = new AutoDetectParser(cfg);
    DeprecationLog.log("Local Tika", "The 'local' extraction backend is deprecated");
  }

  public static final String NAME = "local";

  @Override
  public String name() {
    return NAME;
  }

  /**
   * Pick the parser for this request: an explicit stream type selects a concrete parser from the
   * media-type registry (may yield null for unknown types), otherwise auto-detection is used.
   */
  private Parser selectParser(ExtractionRequest request) {
    if (request.streamType != null) {
      MediaType mt = MediaType.parse(request.streamType.trim().toLowerCase(Locale.ROOT));
      return new DefaultParser(tikaConfig.getMediaTypeRegistry()).getParsers().get(mt);
    }
    return autoDetectParser;
  }

  /** Convert the request's neutral metadata into Tika {@link Metadata} for parsing. */
  private Metadata buildMetadata(ExtractionRequest request) {
    ExtractionMetadata extractionMetadata = buildMetadataFromRequest(request);
    Metadata md = new Metadata();
    for (String name : extractionMetadata.keySet()) {
      List vals = extractionMetadata.get(name);
      if (vals != null) for (String v : vals) md.add(name, v);
    }
    return md;
  }

  /**
   * Build the ParseContext: register the parser (so embedded documents are parsed recursively),
   * the permissive HTML mapper, and a password provider derived from the request's explicit
   * password and/or regex password map.
   */
  private ParseContext buildContext(Parser parser, ExtractionRequest request) {
    ParseContext context = parseContextConfig.create();
    context.set(Parser.class, parser);
    context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
    RegexRulesPasswordProvider pwd = new RegexRulesPasswordProvider();
    if (request.resourcePassword != null) {
      pwd.setExplicitPassword(request.resourcePassword);
    }
    if (request.passwordsMap != null) {
      pwd.setPasswordMap(request.passwordsMap);
    }
    // Adapter bridges the Tika-free provider to Tika's PasswordProvider interface
    context.set(PasswordProvider.class, new PasswordProviderAdapter(pwd));
    return context;
  }

  /** Copy all name/value pairs from Tika metadata into a neutral ExtractionMetadata. */
  private static ExtractionMetadata tikaMetadataToExtractionMetadata(Metadata md) {
    ExtractionMetadata out = new ExtractionMetadata();
    for (String name : md.names()) {
      String[] vals = md.getValues(name);
      if (vals != null) for (String v : vals) out.add(name, v);
    }
    return out;
  }

  /**
   * Parse the stream with Tika and return the plain-text body plus metadata.
   *
   * @throws IllegalArgumentException if an explicit stream type has no matching Tika parser
   */
  @Override
  public ExtractionResult extract(InputStream inputStream, ExtractionRequest request)
      throws Exception {
    Parser parser = selectParser(request);
    if (parser == null) {
      throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType);
    }
    ParseContext context = buildContext(parser, request);
    Metadata md = buildMetadata(request);
    // -1 disables BodyContentHandler's write limit
    BodyContentHandler textHandler = new BodyContentHandler(-1);
    parser.parse(inputStream, textHandler, md, context);
    return new ExtractionResult(textHandler.toString(), tikaMetadataToExtractionMetadata(md));
  }

  /**
   * Parse the stream, feeding SAX events to the caller's handler; metadata produced by the parse
   * is appended into {@code md} after parsing completes.
   *
   * @throws IllegalArgumentException if an explicit stream type has no matching Tika parser
   */
  @Override
  public void extractWithSaxHandler(
      InputStream inputStream,
      ExtractionRequest request,
      ExtractionMetadata md,
      DefaultHandler saxContentHandler)
      throws Exception {
    Parser parser = selectParser(request);
    if (parser == null) {
      throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType);
    }
    ParseContext context = buildContext(parser, request);
    Metadata tikaMetadata = buildMetadata(request);
    parser.parse(inputStream, saxContentHandler, tikaMetadata, context);
    // Merge what Tika discovered back into the caller-supplied neutral metadata
    for (String name : tikaMetadata.names()) {
      String[] vals = tikaMetadata.getValues(name);
      if (vals != null) for (String v : vals) md.add(name, v);
    }
  }

  /** Adapts the Tika-independent password provider to Tika's {@link PasswordProvider}. */
  private static class PasswordProviderAdapter implements PasswordProvider {
    private final ExtractionPasswordProvider delegate;

    public PasswordProviderAdapter(ExtractionPasswordProvider delegate) {
      this.delegate = delegate;
    }

    @Override
    public String getPassword(Metadata metadata) {
      return delegate.getPassword(tikaMetadataToExtractionMetadata(metadata));
    }
  }
}
*/ -public class RegexRulesPasswordProvider implements PasswordProvider { +public class RegexRulesPasswordProvider implements ExtractionPasswordProvider { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private LinkedHashMap passwordMap = new LinkedHashMap<>(); private String explicitPassword; @Override - public String getPassword(Metadata meta) { + public String getPassword(ExtractionMetadata extractionMetadata) { if (getExplicitPassword() != null) { return getExplicitPassword(); } if (passwordMap.size() > 0) - return lookupPasswordFromMap(meta.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); + return lookupPasswordFromMap( + extractionMetadata.getFirst(ExtractingMetadataConstants.RESOURCE_NAME_KEY)); return null; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java index 9edba0e925e..f9d84167127 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; @@ -30,8 +31,6 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.Attributes; @@ -57,7 +56,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara protected final SolrInputDocument document; - protected final Metadata metadata; + protected final ExtractionMetadata metadata; protected final SolrParams params; protected 
final StringBuilder catchAllBuilder = new StringBuilder(2048); protected final IndexSchema schema; @@ -74,7 +73,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara private Set literalFieldNames = null; - public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { + public SolrContentHandler(ExtractionMetadata metadata, SolrParams params, IndexSchema schema) { this.document = new SolrInputDocument(); this.metadata = metadata; this.params = params; @@ -152,6 +151,13 @@ protected void addContent() { addField(contentFieldName, catchAllBuilder.toString(), null); } + /** Append pre-extracted plain text content to the catch-all builder. */ + public void appendToContent(String text) { + if (text != null && !text.isEmpty()) { + catchAllBuilder.append(text); + } + } + /** * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}. */ @@ -170,10 +176,10 @@ protected void addLiterals() { /** Add in any metadata using {@link #metadata} as the source. */ protected void addMetadata() { - for (String name : metadata.names()) { + for (String name : metadata.keySet()) { if (literalsOverride && literalFieldNames.contains(name)) continue; - String[] vals = metadata.getValues(name); - addField(name, null, vals); + List vals = metadata.get(name); + addField(name, null, vals.toArray(new String[0])); } } @@ -200,7 +206,7 @@ protected void addField(String fname, String fval, String[] vals) { sf = schema.getFieldOrNull(name); } else if (sf == null && defaultField.length() > 0 - && name.equals(TikaMetadataKeys.RESOURCE_NAME_KEY) + && name.equals(ExtractingMetadataConstants.RESOURCE_NAME_KEY) == false /*let the fall through below handle this*/) { name = defaultField; sf = schema.getFieldOrNull(name); @@ -213,7 +219,7 @@ protected void addField(String fname, String fval, String[] vals) { // you? 
if (sf == null && unknownFieldPrefix.length() == 0 - && Objects.equals(name, TikaMetadataKeys.RESOURCE_NAME_KEY)) { + && Objects.equals(name, ExtractingMetadataConstants.RESOURCE_NAME_KEY)) { return; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java index 1070e744d84..b4fe031a068 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java @@ -18,7 +18,6 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; -import org.apache.tika.metadata.Metadata; /** */ public class SolrContentHandlerFactory { @@ -26,7 +25,7 @@ public class SolrContentHandlerFactory { public SolrContentHandlerFactory() {} public SolrContentHandler createSolrContentHandler( - Metadata metadata, SolrParams params, IndexSchema schema) { + ExtractionMetadata metadata, SolrParams params, IndexSchema schema) { return new SolrContentHandler(metadata, params, schema); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java new file mode 100644 index 00000000000..b4a5324575b --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -0,0 +1,450 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
package org.apache.solr.handler.extraction;

import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.SocketTimeoutException;
import java.nio.channels.ClosedChannelException;
import java.time.Duration;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SolrNamedThreadFactory;
import org.apache.solr.handler.extraction.fromtika.BodyContentHandler;
import org.apache.solr.util.RefCounted;
import org.eclipse.jetty.client.HttpClient;
import org.eclipse.jetty.client.InputStreamRequestContent;
import org.eclipse.jetty.client.InputStreamResponseListener;
import org.eclipse.jetty.client.Request;
import org.eclipse.jetty.client.Response;
import org.eclipse.jetty.io.EofException;
import org.eclipse.jetty.util.thread.ScheduledExecutorScheduler;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Extraction backend using the Tika Server. It uses a shared Jetty HttpClient (one per JVM,
 * reference-counted so the last backend closed stops the client and its executor). Documents are
 * PUT to the server's {@code /tika} (XML response) or {@code /rmeta} (recursive JSON response)
 * endpoint and the response is parsed back into neutral {@link ExtractionMetadata}.
 */
public class TikaServerExtractionBackend implements ExtractionBackend {
  /**
   * Default maximum response size (100MB) to prevent excessive memory usage from large documents
   */
  public static final long DEFAULT_MAXCHARS_LIMIT = 100 * 1024 * 1024;

  // Guards lazy init and teardown of the shared HttpClient resources
  private static final Object INIT_LOCK = new Object();
  // Normalized base URL of the Tika Server (no trailing slash)
  private final String baseUrl;
  private static final int DEFAULT_TIMEOUT_SECONDS = 3 * 60;
  private final Duration defaultTimeout;
  private final TikaServerParser tikaServerResponseParser = new TikaServerParser();
  // When true, legacy (pre-Dublin-Core) metadata field names are added alongside new ones
  private boolean tikaMetadataCompatibility;
  private HashMap initArgsMap = new HashMap<>();
  private final long maxCharsLimit;

  // Singleton holder for the shared HttpClient/Executor resources (one per JVM)
  private static volatile RefCounted SHARED_RESOURCES;
  // Per-backend handle (same RefCounted instance as SHARED_RESOURCES) that this instance will
  // decref() on close
  private RefCounted acquiredResourcesRef;

  /** Construct with default timeout, no init args and the default response-size limit. */
  public TikaServerExtractionBackend(String baseUrl) {
    this(baseUrl, DEFAULT_TIMEOUT_SECONDS, null, DEFAULT_MAXCHARS_LIMIT);
  }

  /**
   * @param baseUrl Tika Server base URL; must be a valid http/https URL; trailing slash is stripped
   * @param timeoutSeconds default request timeout; values &lt;= 0 fall back to the 3-minute default
   * @param initArgs optional handler init args (e.g. the metadata-compatibility flag)
   * @param maxCharsLimit maximum bytes read from a Tika Server response; &lt;= 0 means unlimited
   * @throws IllegalArgumentException if baseUrl is null, empty, malformed or not http/https
   */
  public TikaServerExtractionBackend(
      String baseUrl, int timeoutSeconds, NamedList initArgs, long maxCharsLimit) {
    // Validate baseUrl
    if (baseUrl == null || baseUrl.trim().isEmpty()) {
      throw new IllegalArgumentException("baseUrl cannot be null or empty");
    }
    // Validate URL format and scheme
    try {
      java.net.URI uri = new java.net.URI(baseUrl);
      String scheme = uri.getScheme();
      if (scheme == null
          || (!scheme.equalsIgnoreCase("http") && !scheme.equalsIgnoreCase("https"))) {
        throw new IllegalArgumentException(
            "baseUrl must use http or https scheme, got: " + baseUrl);
      }
      uri.toURL(); // Additional validation that it's a valid URL
    } catch (java.net.URISyntaxException | java.net.MalformedURLException e) {
      throw new IllegalArgumentException("Invalid baseUrl: " + baseUrl, e);
    }

    this.maxCharsLimit = maxCharsLimit;
    if (initArgs != null) {
      initArgs.toMap(this.initArgsMap);
    }
    Object metaCompatObh = this.initArgsMap.get(ExtractingParams.TIKASERVER_METADATA_COMPATIBILITY);
    if (metaCompatObh != null) {
      this.tikaMetadataCompatibility = Boolean.parseBoolean(metaCompatObh.toString());
    }
    // Normalize a non-positive timeout to the default before building the Duration below
    if (timeoutSeconds <= 0) {
      timeoutSeconds = DEFAULT_TIMEOUT_SECONDS;
    }
    if (baseUrl.endsWith("/")) {
      this.baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
    } else {
      this.baseUrl = baseUrl;
    }
    this.defaultTimeout =
        Duration.ofSeconds(timeoutSeconds > 0 ? timeoutSeconds : DEFAULT_TIMEOUT_SECONDS);

    // Acquire a reference to the shared resources; keep a handle so we can decref() on close
    acquiredResourcesRef = initializeHttpClient().incref();
  }

  public static final String NAME = "tikaserver";

  @Override
  public String name() {
    return NAME;
  }

  /** Parse the stream via the Tika Server and return plain-text content plus metadata. */
  @Override
  public ExtractionResult extract(InputStream inputStream, ExtractionRequest request)
      throws Exception {
    try (InputStream tikaResponse = callTikaServer(inputStream, request)) {
      ExtractionMetadata md = buildMetadataFromRequest(request);
      // -1 disables the handler's write limit; overall size is bounded by LimitingInputStream
      BodyContentHandler bodyContentHandler = new BodyContentHandler(-1);
      if (request.tikaServerRecursive) {
        tikaServerResponseParser.parseRmetaJson(tikaResponse, bodyContentHandler, md);
      } else {
        tikaServerResponseParser.parseXml(tikaResponse, bodyContentHandler, md);
      }
      if (tikaMetadataCompatibility) {
        appendBackCompatTikaMetadata(md);
      }
      return new ExtractionResult(bodyContentHandler.toString(), md);
    }
  }

  /** Parse the stream via the Tika Server, feeding SAX events to the caller's handler. */
  @Override
  public void extractWithSaxHandler(
      InputStream inputStream,
      ExtractionRequest request,
      ExtractionMetadata md,
      DefaultHandler saxContentHandler)
      throws Exception {
    try (InputStream tikaResponse = callTikaServer(inputStream, request)) {
      if (request.tikaServerRecursive) {
        tikaServerResponseParser.parseRmetaJson(tikaResponse, saxContentHandler, md);
      } else {
        tikaServerResponseParser.parseXml(tikaResponse, saxContentHandler, md);
      }
      if (tikaMetadataCompatibility) {
        appendBackCompatTikaMetadata(md);
      }
    }
  }

  /**
   * Call the Tika Server to extract text and metadata. Depending on {@code
   * request.tikaServerRecursive}, the server will either return XML (false) or a JSON array
   * (true). The recursive mode consumes more memory both on the TikaServer side and on the Solr
   * side.
   *
   * @return InputStream of the response body, either XML or JSON depending on {@code
   *     request.tikaServerRecursive}; wrapped so that reading beyond {@code maxCharsLimit} bytes
   *     fails
   * @throws SolrException GATEWAY_TIMEOUT on timeout, SERVICE_UNAVAILABLE on connection-level
   *     failures, SERVER_ERROR on interruption or unexpected errors, or the mapped HTTP error for
   *     non-2xx responses
   */
  InputStream callTikaServer(InputStream inputStream, ExtractionRequest request) throws Exception {
    String url = baseUrl + (request.tikaServerRecursive ? "/rmeta" : "/tika");

    HttpClient client = acquiredResourcesRef.get().client;

    Request req = client.newRequest(url).method("PUT");
    // Per-request timeout override wins when positive; otherwise the backend default applies
    Duration effectiveTimeout =
        (request.tikaServerTimeoutSeconds != null && request.tikaServerTimeoutSeconds > 0)
            ? Duration.ofSeconds(request.tikaServerTimeoutSeconds)
            : defaultTimeout;
    req.timeout(effectiveTimeout.toMillis(), TimeUnit.MILLISECONDS);

    // Headers
    String accept = (request.tikaServerRecursive ? "application/json" : "text/xml");
    req.headers(h -> h.add("Accept", accept));
    // An explicit stream type takes precedence over the incoming HTTP content type
    String contentType = (request.streamType != null) ? request.streamType : request.contentType;
    if (contentType != null) {
      req.headers(h -> h.add("Content-Type", contentType));
    }
    if (!request.tikaServerRequestHeaders.isEmpty()) {
      req.headers(
          h ->
              request.tikaServerRequestHeaders.forEach(
                  (k, v) -> {
                    if (k != null && v != null) h.add(k, v);
                  }));
    }

    // Resolve a document password locally (explicit or regex-matched) and forward it as a header
    ExtractionMetadata md = buildMetadataFromRequest(request);
    if (request.resourcePassword != null || request.passwordsMap != null) {
      RegexRulesPasswordProvider passwordProvider = new RegexRulesPasswordProvider();
      if (request.resourcePassword != null) {
        passwordProvider.setExplicitPassword(request.resourcePassword);
      }
      if (request.passwordsMap != null) {
        passwordProvider.setPasswordMap(request.passwordsMap);
      }
      String pwd = passwordProvider.getPassword(md);
      if (pwd != null) {
        req.headers(h -> h.add("Password", pwd)); // Tika Server expects this header if provided
      }
    }
    if (request.resourceName != null) {
      // Gives Tika Server a filename hint for type detection
      req.headers(
          h ->
              h.add(
                  "Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""));
    }

    if (contentType != null) {
      req.body(new InputStreamRequestContent(contentType, inputStream));
    } else {
      req.body(new InputStreamRequestContent(inputStream));
    }

    // Async send; the listener lets us stream the response body without buffering it all
    InputStreamResponseListener listener = new InputStreamResponseListener();
    req.send(listener);

    final Response response;
    try {
      response = listener.get(effectiveTimeout.toMillis(), TimeUnit.MILLISECONDS);
    } catch (TimeoutException te) {
      throw new SolrException(
          SolrException.ErrorCode.GATEWAY_TIMEOUT,
          "Timeout after "
              + effectiveTimeout.toMillis()
              + " ms while waiting for response from TikaServer "
              + url,
          te);
    } catch (InterruptedException ie) {
      // Preserve the interrupt flag for callers higher up the stack
      Thread.currentThread().interrupt();
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR,
          "Interrupted while waiting for response from TikaServer " + url,
          ie);
    } catch (ExecutionException ee) {
      // Connection-level failures map to 503 so clients know the backend (not Solr) is down
      Throwable cause = ee.getCause();
      if (cause instanceof ConnectException
          || cause instanceof SocketTimeoutException
          || cause instanceof EofException
          || cause instanceof ClosedChannelException) {
        throw new SolrException(
            SolrException.ErrorCode.SERVICE_UNAVAILABLE,
            "Error communicating with TikaServer "
                + url
                + ": "
                + cause.getClass().getSimpleName()
                + ": "
                + cause.getMessage(),
            cause);
      }
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR,
          "Unexpected error while calling TikaServer " + url,
          ee);
    }

    int code = response.getStatus();
    if (code < 200 || code >= 300) {
      SolrException.ErrorCode errorCode = SolrException.ErrorCode.getErrorCode(code);
      String reason = response.getReason();
      String msg =
          "TikaServer "
              + url
              + " returned status "
              + code
              + (reason != null ? " (" + reason + ")" : "");
      throw new SolrException(errorCode, msg);
    }

    InputStream responseStream = listener.getInputStream();
    // Bound the amount of data we read from Tika Server to avoid excessive memory/CPU usage
    return new LimitingInputStream(responseStream, maxCharsLimit);
  }

  /**
   * InputStream wrapper that throws a BAD_REQUEST SolrException once more than {@code max} bytes
   * have been read or skipped. A non-positive {@code max} disables the limit.
   */
  private static class LimitingInputStream extends InputStream {
    private final InputStream in;
    private final long max;
    private long count;

    LimitingInputStream(InputStream in, long max) {
      this.in = in;
      this.max = max;
      this.count = 0L;
    }

    private void checkLimit(long toAdd) {
      if (max <= 0) return; // non-positive means unlimited
      long newCount = count + toAdd;
      if (newCount > max) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "TikaServer response exceeded the configured maximum size of " + max + " bytes");
      }
      count = newCount;
    }

    @Override
    public int read() throws IOException {
      int b = in.read();
      if (b != -1) {
        checkLimit(1);
      }
      return b;
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
      int n = in.read(b, off, len);
      if (n > 0) {
        checkLimit(n);
      }
      return n;
    }

    @Override
    public long skip(long n) throws IOException {
      long skipped = in.skip(n);
      if (skipped > 0) {
        checkLimit(skipped);
      }
      return skipped;
    }

    @Override
    public void close() throws IOException {
      in.close();
    }

    @Override
    public int available() throws IOException {
      return in.available();
    }
  }

  /** Immutable pair of the shared Jetty client and the executor it runs on. */
  private static final class HttpClientResources {
    final HttpClient client;
    final ExecutorService executor;

    HttpClientResources(HttpClient client, ExecutorService executor) {
      this.client = client;
      this.executor = executor;
    }
  }

  /**
   * RefCounted wrapper whose close() (invoked when the last reference is released) stops the
   * client, shuts down the executor and clears the JVM-wide shared reference.
   */
  private static final class ResourcesRef extends RefCounted {
    ResourcesRef(HttpClientResources r) {
      super(r);
    }

    @Override
    protected void close() {
      // stop client and shutdown executor
      try {
        if (resource.client != null) resource.client.stop();
      } catch (Throwable ignore) {
      }
      try {
        if (resource.executor != null) resource.executor.shutdownNow();
      } catch (Throwable ignore) {
      }
      synchronized (INIT_LOCK) {
        // clear the shared reference when closed
        if (SHARED_RESOURCES == this) {
          SHARED_RESOURCES = null;
        }
      }
    }
  }

  /**
   * Lazily create (or return) the JVM-wide shared HttpClient resources. Double-checked locking on
   * INIT_LOCK with the volatile SHARED_RESOURCES field.
   */
  private static RefCounted initializeHttpClient() {
    RefCounted ref = SHARED_RESOURCES;
    if (ref != null) return ref;
    synchronized (INIT_LOCK) {
      if (SHARED_RESOURCES != null) return SHARED_RESOURCES;
      ThreadFactory tf = new SolrNamedThreadFactory("TikaServerHttpClient");
      ExecutorService exec = ExecutorUtil.newMDCAwareCachedThreadPool(tf);
      HttpClient client = new HttpClient();
      client.setExecutor(exec);
      client.setScheduler(new ScheduledExecutorScheduler("TikaServerHttpClient-scheduler", true));
      try {
        client.start();
      } catch (Exception e) {
        // Don't leak the executor if the client failed to start
        try {
          exec.shutdownNow();
        } catch (Throwable ignore) {
        }
        throw new SolrException(
            SolrException.ErrorCode.SERVER_ERROR, "Failed to start shared Jetty HttpClient", e);
      }
      SHARED_RESOURCES = new ResourcesRef(new HttpClientResources(client, exec));
      return SHARED_RESOURCES;
    }
  }

  // Maps modern (mostly Dublin Core) metadata names to the legacy names older Solr Cell
  // configurations expect; insertion order preserved for predictable application
  private final Map fieldMappings = new LinkedHashMap<>();

  // TODO: Improve backward compatibility by adding more mappings
  {
    fieldMappings.put("dc:title", "title");
    fieldMappings.put("dc:creator", "author");
    fieldMappings.put("dc:description", "description");
    fieldMappings.put("dc:subject", "subject");
    fieldMappings.put("dc:language", "language");
    fieldMappings.put("dc:publisher", "publisher");
    fieldMappings.put("dcterms:created", "created");
    fieldMappings.put("dcterms:modified", "modified");
    fieldMappings.put("meta:author", "Author");
    fieldMappings.put("meta:creation-date", "Creation-Date");
    fieldMappings.put("meta:save-date", "Last-Save-Date");
    fieldMappings.put("meta:keyword", "Keywords");
    fieldMappings.put("pdf:docinfo:keywords", "Keywords");
  }

  /*
   * Appends back-compatible metadata into the given {@code ExtractionMetadata} instance by mapping
   * source fields to target fields, provided that backward compatibility is enabled. If a source
   * field exists and the target field is not yet populated, the values from the source field will
   * be added to the target field.
   */
  private void appendBackCompatTikaMetadata(ExtractionMetadata md) {
    for (Map.Entry mapping : fieldMappings.entrySet()) {
      String sourceField = mapping.getKey();
      String targetField = mapping.getValue();
      if (md.getFirst(sourceField) != null && md.getFirst(targetField) == null) {
        md.add(targetField, md.get(sourceField));
      }
    }
  }

  /** Release this backend's reference to the shared HttpClient (idempotent). */
  @Override
  public void close() {
    RefCounted ref;
    synchronized (INIT_LOCK) {
      ref = acquiredResourcesRef;
      acquiredResourcesRef = null;
    }
    if (ref != null) {
      ref.decref();
    }
  }
}
+ */ +package org.apache.solr.handler.extraction; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.Utils; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class TikaServerParser { + private final SAXParser saxParser; + + public TikaServerParser() { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + try { + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + } catch (Throwable ignore) { + // Some parsers may not support all features; ignore + } + try { + saxParser = factory.newSAXParser(); + } catch (Exception e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + + /** + * Parses response in XML format from Tika Server /tika endpoint. The result is that the metadata + * object is populated and the content handler is called with extracted text. + */ + public void parseXml(InputStream inputStream, ContentHandler handler, ExtractionMetadata metadata) + throws IOException, SAXException { + DefaultHandler xmlHandler = new TikaXmlResponseSaxContentHandler(handler, metadata); + try (Reader reader = + new XmlSanitizingReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { + saxParser.parse(new InputSource(reader), xmlHandler); + } + } + + /** + * Parses response in JSON format from Tika Server /rmeta endpoint. 
The result is that the + * metadata object is populated, and the content handler is called with extracted text. + * + * @param jsonStream - JSON stream to parse + * @param handler - SAX content handler to call with extracted text + * @param md - metadata object to populate + */ + @SuppressWarnings({"rawtypes", "PatternVariableCanBeUsed"}) + void parseRmetaJson(InputStream jsonStream, DefaultHandler handler, ExtractionMetadata md) + throws IOException, SAXException { + Object parsed = Utils.fromJSON(jsonStream); + if (!(parsed instanceof List)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "Unexpected /rmeta response, expected JSON array"); + } + List list = (List) parsed; + for (Object o : list) { + if (!(o instanceof Map)) continue; + Map map = (Map) o; + // Copy metadata + for (Object k : map.keySet()) { + String key = String.valueOf(k); + Object val = map.get(k); + if ("X-TIKA:content".equalsIgnoreCase(key)) { + // handled below + continue; + } + if (val instanceof List) { + for (Object v : (List) val) { + if (v != null) md.add(key, String.valueOf(v)); + } + } else if (val != null) { + md.add(key, String.valueOf(val)); + } + } + Object content = map.get("X-TIKA:content"); + if (content != null) { + String xhtml = String.valueOf(content); + if (!xhtml.isEmpty() && handler != null) { + InputStream inputStream = + new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8)); + try (Reader reader = + new XmlSanitizingReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { + saxParser.parse(new InputSource(reader), handler); + } + } + } + } + } + + /** Custom SAX handler that will extract meta tags from the tika xml and delegate */ + static class TikaXmlResponseSaxContentHandler extends DefaultHandler { + private final ContentHandler delegate; + private final ExtractionMetadata metadata; + private boolean inHead = false; + + public TikaXmlResponseSaxContentHandler(ContentHandler delegate, ExtractionMetadata metadata) { + 
this.delegate = delegate; + this.metadata = metadata; + } + + @Override + public void startDocument() throws SAXException { + if (delegate != null) delegate.startDocument(); + } + + @Override + public void endDocument() throws SAXException { + if (delegate != null) delegate.endDocument(); + } + + @Override + public void startElement( + String uri, String localName, String qName, org.xml.sax.Attributes attributes) + throws SAXException { + String ln = localName != null && !localName.isEmpty() ? localName : qName; + if ("head".equalsIgnoreCase(ln)) { + inHead = true; + } else if (inHead && "meta".equalsIgnoreCase(ln) && attributes != null) { + String name = attributes.getValue("name"); + String content = attributes.getValue("content"); + if (name != null && content != null) { + metadata.add(name, content); + } + } + if (delegate != null) delegate.startElement(uri, localName, qName, attributes); + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + String ln = localName != null && !localName.isEmpty() ? 
localName : qName; + if ("head".equalsIgnoreCase(ln)) { + inHead = false; + } + if (delegate != null) delegate.endElement(uri, localName, qName); + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (delegate != null) delegate.characters(ch, start, length); + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (delegate != null) delegate.ignorableWhitespace(ch, start, length); + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + if (delegate != null) delegate.startPrefixMapping(prefix, uri); + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + if (delegate != null) delegate.endPrefixMapping(prefix); + } + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java new file mode 100644 index 00000000000..78e3ca149fc --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.FilterReader; +import java.io.IOException; +import java.io.Reader; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Filters out null character entities (�, �, etc.) from XML content. + * + *

Removes numeric character entities that resolve to code point 0, such as or + * . Everything else is passed through unchanged. + */ +final class XmlSanitizingReader extends FilterReader { + private static final Pattern NULL_ENTITY_PATTERN = + Pattern.compile("&#(0+|x0+);", Pattern.CASE_INSENSITIVE); + private static final int BUFFER_SIZE = 8192; + private static final int OVERLAP_SIZE = 16; // Max entity length: � + + private final char[] readBuffer = new char[BUFFER_SIZE + OVERLAP_SIZE]; + private final char[] buffer = new char[BUFFER_SIZE + OVERLAP_SIZE]; + private final StringBuilder sb = new StringBuilder(BUFFER_SIZE + OVERLAP_SIZE); + private final StringBuffer result = new StringBuffer(BUFFER_SIZE + OVERLAP_SIZE); + private int bufferPos = 0; + private int bufferLimit = 0; + private int overlapLen = 0; + private boolean eof = false; + + XmlSanitizingReader(Reader in) { + super(in); + } + + @Override + public int read() throws IOException { + if (bufferPos < bufferLimit) { + return buffer[bufferPos++]; + } + if (fillBuffer() == -1) { + return -1; + } + return buffer[bufferPos++]; + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + if (len == 0) return 0; + int totalRead = 0; + while (totalRead < len) { + int available = bufferLimit - bufferPos; + if (available > 0) { + int toCopy = Math.min(available, len - totalRead); + System.arraycopy(buffer, bufferPos, cbuf, off + totalRead, toCopy); + bufferPos += toCopy; + totalRead += toCopy; + } else { + if (fillBuffer() == -1) { + return totalRead == 0 ? 
-1 : totalRead; + } + } + } + return totalRead; + } + + private int fillBuffer() throws IOException { + if (eof) return -1; + + // Copy overlap from end of previous buffer + if (overlapLen > 0) { + System.arraycopy(buffer, bufferLimit - overlapLen, readBuffer, 0, overlapLen); + } + + // Read new data + int read = in.read(readBuffer, overlapLen, BUFFER_SIZE); + if (read == -1) { + eof = true; + if (overlapLen == 0) return -1; + // Process remaining overlap at EOF + read = 0; + } + + // Sanitize without allocating a String + sb.setLength(0); + sb.append(readBuffer, 0, overlapLen + read); + + result.setLength(0); + Matcher matcher = NULL_ENTITY_PATTERN.matcher(sb); + while (matcher.find()) { + matcher.appendReplacement(result, ""); + } + matcher.appendTail(result); + + result.getChars(0, result.length(), buffer, 0); + bufferLimit = result.length(); + bufferPos = overlapLen; + + // Edge case: if sanitization removed characters from overlap at EOF, + // bufferPos might exceed bufferLimit + if (bufferPos > bufferLimit) { + bufferPos = bufferLimit; + } + + // Keep last OVERLAP_SIZE chars for next iteration (unless EOF) + overlapLen = eof ? 0 : Math.min(OVERLAP_SIZE, bufferLimit); + + return bufferLimit - bufferPos; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/BodyContentHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/BodyContentHandler.java new file mode 100644 index 00000000000..34a3c071808 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/BodyContentHandler.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file was copied from Apache Tika 1.28.5 (org.apache.tika.sax.BodyContentHandler). + * It still depends on Tika, see imports. + *

+ * TODO: Find a replacement for this class before removing Tika dependency + *

+ */ +package org.apache.solr.handler.extraction.fromtika; + +import java.io.OutputStream; +import java.io.Writer; +import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.xpath.Matcher; +import org.apache.tika.sax.xpath.MatchingContentHandler; +import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Content handler decorator that only passes everything inside the XHTML <body/> tag to the + * underlying handler. Note that the <body/> tag itself is not passed on. + */ +public class BodyContentHandler extends ContentHandlerDecorator { + + /** The XHTML namespace URI (from Apache Tika 1.28.5 XHTMLContentHandler.XHTML) */ + private static final String XHTML = "http://www.w3.org/1999/xhtml"; + + /** XHTML XPath parser. */ + private static final XPathParser PARSER = new XPathParser("xhtml", XHTML); + + /** The XPath matcher used to select the XHTML body contents. */ + private static final Matcher MATCHER = PARSER.parse("/xhtml:html/xhtml:body/descendant::node()"); + + /** + * Creates a content handler that passes all XHTML body events to the given underlying content + * handler. + * + * @param handler content handler + */ + public BodyContentHandler(ContentHandler handler) { + super(new MatchingContentHandler(handler, MATCHER)); + } + + /** + * Creates a content handler that writes XHTML body character events to the given writer. + * + * @param writer writer + */ + public BodyContentHandler(Writer writer) { + this(new WriteOutContentHandler(writer)); + } + + /** + * Creates a content handler that writes XHTML body character events to the given output stream + * using the default encoding. + * + * @param stream output stream + */ + public BodyContentHandler(OutputStream stream) { + this(new WriteOutContentHandler(stream)); + } + + /** + * Creates a content handler that writes XHTML body character events to an internal string buffer. 
+ * The contents of the buffer can be retrieved using the {@link #toString()} method. + * + *

The internal string buffer is bounded at the given number of characters. If this write limit + * is reached, then a {@link SAXException} is thrown. + * + * @since Apache Tika 0.7 + * @param writeLimit maximum number of characters to include in the string, or -1 to disable the + * write limit + */ + public BodyContentHandler(int writeLimit) { + this(new WriteOutContentHandler(writeLimit)); + } + + /** + * Creates a content handler that writes XHTML body character events to an internal string buffer. + * The contents of the buffer can be retrieved using the {@link #toString()} method. + * + *

The internal string buffer is bounded at 100k characters. If this write limit is reached, + * then a {@link SAXException} is thrown. + */ + public BodyContentHandler() { + this(new WriteOutContentHandler()); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ContentHandlerDecorator.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ContentHandlerDecorator.java new file mode 100644 index 00000000000..8fda4712697 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ContentHandlerDecorator.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file was originally part of Apache Tika 1.28.5 (org.apache.tika.sax.ContentHandlerDecorator) + * and has been copied into the Solr codebase to eliminate the Tika dependency. + */ +package org.apache.solr.handler.extraction.fromtika; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Decorator base class for the {@link ContentHandler} interface. 
This class simply delegates all + * SAX events calls to an underlying decorated handler instance. Subclasses can provide extra + * decoration by overriding one or more of the SAX event methods. + */ +public class ContentHandlerDecorator extends DefaultHandler { + + /** Decorated SAX event handler. */ + private ContentHandler handler; + + /** + * Creates a decorator for the given SAX event handler. + * + * @param handler SAX event handler to be decorated + */ + public ContentHandlerDecorator(ContentHandler handler) { + assert handler != null; + this.handler = handler; + } + + /** + * Creates a decorator that by default forwards incoming SAX events to a dummy content handler + * that simply ignores all the events. Subclasses should use the {@link + * #setContentHandler(ContentHandler)} method to switch to a more usable underlying content + * handler. + */ + protected ContentHandlerDecorator() { + this(new DefaultHandler()); + } + + /** + * Sets the underlying content handler. All future SAX events will be directed to this handler + * instead of the one that was previously used. 
+ * + * @param handler content handler + */ + protected void setContentHandler(ContentHandler handler) { + assert handler != null; + this.handler = handler; + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + try { + handler.startPrefixMapping(prefix, uri); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + try { + handler.endPrefixMapping(prefix); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void processingInstruction(String target, String data) throws SAXException { + try { + handler.processingInstruction(target, data); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void setDocumentLocator(Locator locator) { + handler.setDocumentLocator(locator); + } + + @Override + public void startDocument() throws SAXException { + try { + handler.startDocument(); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void endDocument() throws SAXException { + try { + handler.endDocument(); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void startElement(String uri, String localName, String name, Attributes atts) + throws SAXException { + try { + handler.startElement(uri, localName, name, atts); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void endElement(String uri, String localName, String name) throws SAXException { + try { + handler.endElement(uri, localName, name); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + try { + handler.characters(ch, start, length); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + try { + handler.ignorableWhitespace(ch, 
start, length); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public void skippedEntity(String name) throws SAXException { + try { + handler.skippedEntity(name); + } catch (SAXException e) { + handleException(e); + } + } + + @Override + public String toString() { + return handler.toString(); + } + + /** + * Handle any exceptions thrown by methods in this class. This method provides a single place to + * implement custom exception handling. The default behaviour is simply to re-throw the given + * exception, but subclasses can also provide alternative ways of handling the situation. + * + * @param exception the exception that was thrown + * @throws SAXException the exception (if any) thrown to the client + */ + protected void handleException(SAXException exception) throws SAXException { + throw exception; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ToTextContentHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ToTextContentHandler.java new file mode 100644 index 00000000000..d4255fa2fb1 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ToTextContentHandler.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file was originally part of Apache Tika 1.28.5 (org.apache.tika.sax.ToTextContentHandler) + * and has been copied into the Solr codebase to eliminate the Tika dependency. + */ +package org.apache.solr.handler.extraction.fromtika; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.StringWriter; +import java.io.UnsupportedEncodingException; +import java.io.Writer; +import java.nio.charset.Charset; +import java.util.Locale; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * SAX event handler that writes all character content out to a character stream. No escaping or + * other transformations are made on the character content. + * + *

As of Tika 1.20, this handler ignores content within <script> and <style> tags. + * + * @since Apache Tika 0.10 + */ +public class ToTextContentHandler extends DefaultHandler { + + private static final String STYLE = "STYLE"; + private static final String SCRIPT = "SCRIPT"; + private int styleDepth = 0; + private int scriptDepth = 0; + + /** The character stream. */ + private final Writer writer; + + /** + * Creates a content handler that writes character events to the given writer. + * + * @param writer writer + */ + public ToTextContentHandler(Writer writer) { + this.writer = writer; + } + + /** + * Creates a content handler that writes character events to the given output stream using the + * platform default encoding. + * + * @param stream output stream + */ + public ToTextContentHandler(OutputStream stream) { + this(new OutputStreamWriter(stream, Charset.defaultCharset())); + } + + /** + * Creates a content handler that writes character events to the given output stream using the + * given encoding. + * + * @param stream output stream + * @param encoding output encoding + * @throws UnsupportedEncodingException if the encoding is unsupported + */ + public ToTextContentHandler(OutputStream stream, String encoding) + throws UnsupportedEncodingException { + this(new OutputStreamWriter(stream, encoding)); + } + + /** + * Creates a content handler that writes character events to an internal string buffer. Use the + * {@link #toString()} method to access the collected character content. + */ + public ToTextContentHandler() { + this(new StringWriter()); + } + + /** Writes the given characters to the given character stream. 
*/ + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + + if (styleDepth + scriptDepth != 0) { + return; + } + + try { + writer.write(ch, start, length); + } catch (IOException e) { + throw new SAXException("Error writing: " + new String(ch, start, length), e); + } + } + + /** + * Writes the given ignorable characters to the given character stream. The default implementation + * simply forwards the call to the {@link #characters(char[], int, int)} method. + */ + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + characters(ch, start, length); + } + + /** + * Flushes the character stream so that no characters are forgotten in internal buffers. + * + * @see TIKA-179 + * @throws SAXException if the stream can not be flushed + */ + @Override + public void endDocument() throws SAXException { + try { + writer.flush(); + } catch (IOException e) { + throw new SAXException("Error flushing character output", e); + } + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + String uc = (qName == null) ? "" : qName.toUpperCase(Locale.ENGLISH); + if (uc.equals(STYLE)) { + styleDepth++; + } + if (uc.equals(SCRIPT)) { + scriptDepth++; + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + String uc = (qName == null) ? "" : qName.toUpperCase(Locale.ENGLISH); + if (uc.equals(STYLE)) { + styleDepth--; + } + if (uc.equals(SCRIPT)) { + scriptDepth--; + } + } + + /** + * Returns the contents of the internal string buffer where all the received characters have been + * collected. Only works when this object was constructed using the empty default constructor or + * by passing a {@link StringWriter} to the other constructor. 
+ */ + @Override + public String toString() { + return writer.toString(); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ToXMLContentHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ToXMLContentHandler.java new file mode 100644 index 00000000000..5310e234d7b --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/ToXMLContentHandler.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file was originally part of Apache Tika 1.28.5 (org.apache.tika.sax.ToXMLContentHandler) + * and has been copied into the Solr codebase to eliminate the Tika dependency. + */ +package org.apache.solr.handler.extraction.fromtika; + +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * SAX event handler that serializes the XML document to a character stream. The incoming SAX events + * are expected to be well-formed (properly nested, etc.) 
and to explicitly include namespace + * declaration attributes and corresponding namespace prefixes in element and attribute names. + * + * @since Apache Tika 0.10 + */ +public class ToXMLContentHandler extends ToTextContentHandler { + + private static class ElementInfo { + + private final ElementInfo parent; + + private final Map namespaces; + + public ElementInfo(ElementInfo parent, Map namespaces) { + this.parent = parent; + if (namespaces.isEmpty()) { + this.namespaces = Collections.emptyMap(); + } else { + this.namespaces = new HashMap(namespaces); + } + } + + public String getPrefix(String uri) throws SAXException { + String prefix = namespaces.get(uri); + if (prefix != null) { + return prefix; + } else if (parent != null) { + return parent.getPrefix(uri); + } else if (uri == null || uri.length() == 0) { + return ""; + } else { + throw new SAXException("Namespace " + uri + " not declared"); + } + } + + public String getQName(String uri, String localName) throws SAXException { + String prefix = getPrefix(uri); + if (prefix.length() > 0) { + return prefix + ":" + localName; + } else { + return localName; + } + } + } + + private final String encoding; + + protected boolean inStartElement = false; + + protected final Map namespaces = new HashMap(); + + private ElementInfo currentElement; + + /** + * Creates an XML serializer that writes to the given byte stream using the given character + * encoding. + * + * @param stream output stream + * @param encoding output encoding + * @throws UnsupportedEncodingException if the encoding is unsupported + */ + public ToXMLContentHandler(OutputStream stream, String encoding) + throws UnsupportedEncodingException { + super(stream, encoding); + this.encoding = encoding; + } + + public ToXMLContentHandler(String encoding) { + super(); + this.encoding = encoding; + } + + public ToXMLContentHandler() { + super(); + this.encoding = null; + } + + /** Writes the XML prefix. 
*/
+  @Override
+  public void startDocument() throws SAXException {
+    if (encoding != null) {
+      write("<?xml version=\"1.0\" encoding=\"" + encoding + "\"?>\n");
+    }
+
+    currentElement = null;
+    namespaces.clear();
+  }
+
+  @Override
+  public void startPrefixMapping(String prefix, String uri) throws SAXException {
+    try {
+      if (currentElement != null && prefix.equals(currentElement.getPrefix(uri))) {
+        return;
+      }
+    } catch (SAXException ignore) {
+    }
+    namespaces.put(uri, prefix);
+  }
+
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes atts)
+      throws SAXException {
+    lazyCloseStartElement();
+
+    currentElement = new ElementInfo(currentElement, namespaces);
+
+    write('<');
+    write(currentElement.getQName(uri, localName));
+
+    for (int i = 0; i < atts.getLength(); i++) {
+      write(' ');
+      write(currentElement.getQName(atts.getURI(i), atts.getLocalName(i)));
+      write('=');
+      write('"');
+      char[] ch = atts.getValue(i).toCharArray();
+      writeEscaped(ch, 0, ch.length, true);
+      write('"');
+    }
+
+    for (Map.Entry<String, String> entry : namespaces.entrySet()) {
+      write(' ');
+      write("xmlns");
+      String prefix = entry.getValue();
+      if (prefix.length() > 0) {
+        write(':');
+        write(prefix);
+      }
+      write('=');
+      write('"');
+      char[] ch = entry.getKey().toCharArray();
+      writeEscaped(ch, 0, ch.length, true);
+      write('"');
+    }
+    namespaces.clear();
+
+    inStartElement = true;
+  }
+
+  @Override
+  public void endElement(String uri, String localName, String qName) throws SAXException {
+    if (inStartElement) {
+      write(" />");
+      inStartElement = false;
+    } else {
+      write("</");
+      write(qName);
+      write('>');
+    }
+
+    namespaces.clear();
+
+    // Reset the position in the tree, to avoid endless stack overflow
+    // chains (see TIKA-1070)
+    currentElement = currentElement.parent;
+  }
+
+  @Override
+  public void characters(char[] ch, int start, int length) throws SAXException {
+    lazyCloseStartElement();
+    writeEscaped(ch, start, start + length, false);
+  }
+
+  private void lazyCloseStartElement() throws SAXException {
+    if (inStartElement) {
+      write('>');
+ inStartElement = false; + } + } + + /** + * Writes the given character as-is. + * + * @param ch character to be written + * @throws SAXException if the character could not be written + */ + protected void write(char ch) throws SAXException { + super.characters(new char[] {ch}, 0, 1); + } + + /** + * Writes the given string of character as-is. + * + * @param string string of character to be written + * @throws SAXException if the character string could not be written + */ + protected void write(String string) throws SAXException { + super.characters(string.toCharArray(), 0, string.length()); + } + + /** + * Writes the given characters as-is followed by the given entity. + * + * @param ch character array + * @param from start position in the array + * @param to end position in the array + * @param entity entity code + * @return next position in the array, after the characters plus one entity + * @throws SAXException if the characters could not be written + */ + private int writeCharsAndEntity(char[] ch, int from, int to, String entity) throws SAXException { + super.characters(ch, from, to - from); + write('&'); + write(entity); + write(';'); + return to + 1; + } + + /** + * Writes the given characters with XML meta characters escaped. 
+ * + * @param ch character array + * @param from start position in the array + * @param to end position in the array + * @param attribute whether the characters should be escaped as an attribute value or normal + * character content + * @throws SAXException if the characters could not be written + */ + private void writeEscaped(char[] ch, int from, int to, boolean attribute) throws SAXException { + int pos = from; + while (pos < to) { + if (ch[pos] == '<') { + from = pos = writeCharsAndEntity(ch, from, pos, "lt"); + } else if (ch[pos] == '>') { + from = pos = writeCharsAndEntity(ch, from, pos, "gt"); + } else if (ch[pos] == '&') { + from = pos = writeCharsAndEntity(ch, from, pos, "amp"); + } else if (attribute && ch[pos] == '"') { + from = pos = writeCharsAndEntity(ch, from, pos, "quot"); + } else { + pos++; + } + } + super.characters(ch, from, to - from); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/package-info.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/package-info.java new file mode 100644 index 00000000000..ddd0b2d4eb6 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/fromtika/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Classes in this package are copied from and (C) the Apache Tika project. */ +package org.apache.solr.handler.extraction.fromtika; diff --git a/solr/modules/extraction/src/test-files/extraction/example.html b/solr/modules/extraction/src/test-files/extraction/example.html index 5732f6214bc..2801c3c97d8 100644 --- a/solr/modules/extraction/src/test-files/extraction/example.html +++ b/solr/modules/extraction/src/test-files/extraction/example.html @@ -6,8 +6,8 @@

Here is some text

-
Here is some text in a div
-
This has a link.
+

a h1 tag

+

This has a link in a paragraph.

News
  • diff --git a/solr/modules/extraction/src/test-files/extraction/simple.html b/solr/modules/extraction/src/test-files/extraction/simple.html index 3c807fb1d98..3ec4d4e0d01 100644 --- a/solr/modules/extraction/src/test-files/extraction/simple.html +++ b/solr/modules/extraction/src/test-files/extraction/simple.html @@ -10,7 +10,7 @@ Here is some text

    distinct
    words

    -
    Here is some text in a div
    +

    Here is some text in a h1

    This has a link.