diff --git a/CHANGELOG.md b/CHANGELOG.md index f7c1a47aec42..be82d6e44ecf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ This file lists Solr's raw release notes with details of every change to Solr. M - Added efSearch parameter to knn query, exposed efSearchScaleFactor that is used to calculate efSearch internally #17928 [SOLR-17928](https://issues.apache.org/jira/browse/SOLR-17928) (Puneet Ahuja) (Elia Porciani) - Support indexing primitive float[] values for DenseVectorField via JavaBin [SOLR-17948](https://issues.apache.org/jira/browse/SOLR-17948) (Puneet Ahuja) (Noble Paul) - Enable MergeOnFlushMergePolicy in Solr [SOLR-17984](https://issues.apache.org/jira/browse/SOLR-17984) ([Houston Putman](https://home.apache.org/phonebook.html?uid=houston) @HoustonPutman) +- Add support for stored-only fields in ExportWriter with includeStoredFields=true. The default is false because it can negatively impact performance. [SOLR-18071](https://issues.apache.org/jira/browse/SOLR-18071) (Luke Kot-Zaniewski) ### Changed (30 changes) diff --git a/changelog/unreleased/SOLR-18071-support-stored-fields-export-writer.yml b/changelog/unreleased/SOLR-18071-support-stored-fields-export-writer.yml new file mode 100644 index 000000000000..dbd1b8c02371 --- /dev/null +++ b/changelog/unreleased/SOLR-18071-support-stored-fields-export-writer.yml @@ -0,0 +1,8 @@ +# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc +title: Support including stored fields in Export Writer output. 
+type: added # added, changed, fixed, deprecated, removed, dependency_update, security, other +authors: + - name: Luke Kot-Zaniewski +links: + - name: SOLR-18071 + url: https://issues.apache.org/jira/browse/SOLR-18071 diff --git a/solr/core/src/java/org/apache/solr/handler/export/DoubleFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/DoubleFieldWriter.java index e439560894b4..561d03366786 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/DoubleFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/DoubleFieldWriter.java @@ -34,8 +34,7 @@ public DoubleFieldWriter( } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { double val; SortValue sortValue = sortDoc.getSortValue(this.field); @@ -43,7 +42,7 @@ public boolean write( if (sortValue.isPresent()) { val = (double) sortValue.getCurrentValue(); } else { // empty-value - return false; + return; } } else { // field is not part of 'sort' param, but part of 'fl' param @@ -53,10 +52,9 @@ public boolean write( if (vals != null) { val = Double.longBitsToDouble(vals.longValue()); } else { - return false; + return; } } ew.put(this.field, val); - return true; } } diff --git a/solr/core/src/java/org/apache/solr/handler/export/ExportWriter.java b/solr/core/src/java/org/apache/solr/handler/export/ExportWriter.java index ba88e3ce7637..998730a953f1 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/ExportWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/ExportWriter.java @@ -28,8 +28,11 @@ import java.lang.invoke.MethodHandles; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import 
org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; @@ -99,15 +102,15 @@ public class ExportWriter implements SolrCore.RawWriter, Closeable { public static final String BATCH_SIZE_PARAM = "batchSize"; public static final String QUEUE_SIZE_PARAM = "queueSize"; + public static final String INCLUDE_STORED_FIELDS_PARAM = "includeStoredFields"; public static final int DEFAULT_BATCH_SIZE = 30000; public static final int DEFAULT_QUEUE_SIZE = 150000; private static final FieldWriter EMPTY_FIELD_WRITER = new FieldWriter() { @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, EntryWriter out, int fieldIndex) { - return false; + public void write(SortDoc sortDoc, LeafReaderContext readerContext, EntryWriter out) { + // do nothing } }; @@ -482,45 +485,72 @@ void writeDoc( throws IOException { int ord = sortDoc.ord; LeafReaderContext context = leaves.get(ord); - int fieldIndex = 0; for (FieldWriter fieldWriter : writers) { - if (fieldWriter.write(sortDoc, context, ew, fieldIndex)) { - ++fieldIndex; - } + fieldWriter.write(sortDoc, context, ew); } } public List getFieldWriters(String[] fields, SolrQueryRequest req) throws IOException { DocValuesIteratorCache dvIterCache = new DocValuesIteratorCache(req.getSearcher(), false); - SolrReturnFields solrReturnFields = new SolrReturnFields(fields, req); + boolean includeStoredFields = req.getParams().getBool(INCLUDE_STORED_FIELDS_PARAM, false); List writers = new ArrayList<>(); + Set docValueFields = new LinkedHashSet<>(); + Map storedFields = new LinkedHashMap<>(); + for (String field : req.getSearcher().getFieldNames()) { if (!solrReturnFields.wantsField(field)) { continue; } SchemaField schemaField = req.getSchema().getField(field); - if (!schemaField.hasDocValues()) { - throw new IOException(schemaField + " must have DocValues to use this feature."); - } - boolean multiValued = schemaField.multiValued(); FieldType fieldType = schemaField.getType(); - FieldWriter 
writer; - if (fieldType instanceof SortableTextField && !schemaField.useDocValuesAsStored()) { - if (solrReturnFields.getRequestedFieldNames() != null - && solrReturnFields.getRequestedFieldNames().contains(field)) { - // Explicitly requested field cannot be used due to not having useDocValuesAsStored=true, - // throw exception + Set requestFieldNames = + solrReturnFields.getRequestedFieldNames() == null + ? Set.of() + : solrReturnFields.getRequestedFieldNames(); + + if (canUseDocValues(schemaField, fieldType)) { + // Prefer DocValues when available + docValueFields.add(schemaField); + } else if (schemaField.stored()) { + // Field is stored-only (no usable DocValues) + if (includeStoredFields) { + storedFields.put(field, schemaField); + } else if (requestFieldNames.contains(field)) { + // Explicitly requested field without DocValues and includeStoredFields=false + throw new IOException( + schemaField + + " must have DocValues to use this feature. " + + "Try setting includeStoredFields=true to retrieve this field from stored values."); + } + // Else: glob matched stored-only field without includeStoredFields - silently skip + } else if (requestFieldNames.contains(field)) { + // Explicitly requested field that has neither DocValues nor stored + if (fieldType instanceof SortableTextField && !schemaField.useDocValuesAsStored()) { throw new IOException( schemaField + " Must have useDocValuesAsStored='true' to be used with export writer"); } else { - // Glob pattern matched field cannot be used due to not having useDocValuesAsStored=true - continue; + throw new IOException( + schemaField + " must have DocValues or be stored to use this feature."); } } + // Else: glob matched field with neither DocValues nor stored - silently skip + } + + for (SchemaField schemaField : docValueFields) { + String field = schemaField.getName(); + boolean multiValued = schemaField.multiValued(); + FieldType fieldType = schemaField.getType(); + FieldWriter writer; + + if (schemaField.stored() 
&& !storedFields.isEmpty()) { + // if we're reading StoredFields *anyway*, then we might as well avoid this extra DV lookup + storedFields.put(field, schemaField); + continue; + } DocValuesIteratorCache.FieldDocValuesSupplier docValuesCache = dvIterCache.getSupplier(field); @@ -574,9 +604,24 @@ public List getFieldWriters(String[] fields, SolrQueryRequest req) } writers.add(writer); } + + if (!storedFields.isEmpty()) { + writers.add(new StoredFieldsWriter(storedFields)); + } + return writers; } + private static boolean canUseDocValues(SchemaField schemaField, FieldType fieldType) { + return schemaField.hasDocValues() + // Special handling for SortableTextField: unlike other field types, it requires + // useDocValuesAsStored=true to be included via glob patterns in /export. This + // matches the behavior of /select (which requires useDocValuesAsStored=true for + // all globbed fields) and avoids performance issues. The requirement cannot be + // extended to other field types in /export for backward compatibility reasons. 
+ && (!(fieldType instanceof SortableTextField) || schemaField.useDocValuesAsStored()); + } + SortDoc getSortDoc(SolrIndexSearcher searcher, SortField[] sortFields) throws IOException { SortValue[] sortValues = new SortValue[sortFields.length]; IndexSchema schema = searcher.getSchema(); @@ -591,7 +636,7 @@ SortDoc getSortDoc(SolrIndexSearcher searcher, SortField[] sortFields) throws IO throw new IOException(field + " must have DocValues to use this feature."); } - if (ft instanceof SortableTextField && schemaField.useDocValuesAsStored() == false) { + if (ft instanceof SortableTextField && !schemaField.useDocValuesAsStored()) { throw new IOException( schemaField + " Must have useDocValuesAsStored='true' to be used with export writer"); } diff --git a/solr/core/src/java/org/apache/solr/handler/export/FieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/FieldWriter.java index 1923afb410f7..4b7cf7eb47b6 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/FieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/FieldWriter.java @@ -22,7 +22,15 @@ import org.apache.solr.common.MapWriter; abstract class FieldWriter { - public abstract boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter out, int fieldIndex) + /** + * Writes field values from the document to the output. 
+ * + * @param sortDoc the document being exported + * @param readerContext the leaf reader context for accessing field values + * @param out the output writer to write field values to + * @throws IOException if an I/O error occurs while reading or writing field values + */ + public abstract void write( + SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter out) throws IOException; } diff --git a/solr/core/src/java/org/apache/solr/handler/export/FloatFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/FloatFieldWriter.java index a60c14e6b0ad..68a36f84b717 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/FloatFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/FloatFieldWriter.java @@ -34,8 +34,7 @@ public FloatFieldWriter( } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { float val; SortValue sortValue = sortDoc.getSortValue(this.field); @@ -43,7 +42,7 @@ public boolean write( if (sortValue.isPresent()) { val = (float) sortValue.getCurrentValue(); } else { // empty-value - return false; + return; } } else { // field is not part of 'sort' param, but part of 'fl' param @@ -53,10 +52,9 @@ public boolean write( if (vals != null) { val = Float.intBitsToFloat((int) vals.longValue()); } else { - return false; + return; } } ew.put(this.field, val); - return true; } } diff --git a/solr/core/src/java/org/apache/solr/handler/export/IntFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/IntFieldWriter.java index bf0396d4ab87..fc7c2d174ab8 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/IntFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/IntFieldWriter.java @@ -34,8 +34,7 @@ public IntFieldWriter( } @Override - public boolean write( - SortDoc sortDoc, 
LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { int val; SortValue sortValue = sortDoc.getSortValue(this.field); @@ -43,7 +42,7 @@ public boolean write( if (sortValue.isPresent()) { val = (int) sortValue.getCurrentValue(); } else { // empty-value - return false; + return; } } else { // field is not part of 'sort' param, but part of 'fl' param @@ -53,10 +52,9 @@ public boolean write( if (vals != null) { val = (int) vals.longValue(); } else { - return false; + return; } } ew.put(this.field, val); - return true; } } diff --git a/solr/core/src/java/org/apache/solr/handler/export/LongFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/LongFieldWriter.java index 7961549477cf..38997e5a495c 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/LongFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/LongFieldWriter.java @@ -35,8 +35,7 @@ public LongFieldWriter( } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { long val; SortValue sortValue = sortDoc.getSortValue(this.field); @@ -44,7 +43,7 @@ public boolean write( if (sortValue.isPresent()) { val = (long) sortValue.getCurrentValue(); } else { // empty-value - return false; + return; } } else { // field is not part of 'sort' param, but part of 'fl' param @@ -54,11 +53,10 @@ public boolean write( if (vals != null) { val = vals.longValue(); } else { - return false; + return; } } doWrite(ew, val); - return true; } protected void doWrite(MapWriter.EntryWriter ew, long val) throws IOException { diff --git a/solr/core/src/java/org/apache/solr/handler/export/MultiFieldWriter.java 
b/solr/core/src/java/org/apache/solr/handler/export/MultiFieldWriter.java index 7f5bdee4899f..51ea833f8526 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/MultiFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/MultiFieldWriter.java @@ -61,15 +61,14 @@ public MultiFieldWriter( } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter out, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter out) throws IOException { if (this.fieldType.isPointField()) { SortedNumericDocValues vals = docValuesCache.getSortedNumericDocValues( sortDoc.docId, readerContext.reader(), readerContext.ord); if (vals == null) { - return false; + return; } final SortedNumericDocValues docVals = vals; @@ -82,13 +81,12 @@ public boolean write( w.add(bitsToValue.apply(docVals.nextValue())); } }); - return true; } else { SortedSetDocValues vals = docValuesCache.getSortedSetDocValues( sortDoc.docId, readerContext.reader(), readerContext.ord); if (vals == null) { - return false; + return; } final SortedSetDocValues docVals = vals; @@ -105,7 +103,6 @@ public boolean write( else w.add(fieldType.toObject(f)); } }); - return true; } } diff --git a/solr/core/src/java/org/apache/solr/handler/export/StoredFieldsWriter.java b/solr/core/src/java/org/apache/solr/handler/export/StoredFieldsWriter.java new file mode 100644 index 000000000000..58d502e2579d --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/export/StoredFieldsWriter.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler.export; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.WeakHashMap; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.index.StoredFields; +import org.apache.solr.common.MapWriter.EntryWriter; +import org.apache.solr.schema.BoolField; +import org.apache.solr.schema.DateValueFieldType; +import org.apache.solr.schema.SchemaField; + +class StoredFieldsWriter extends FieldWriter { + + private static final ThreadLocal> + STORED_FIELDS_MAP = ThreadLocal.withInitial(WeakHashMap::new); + private final Map schemaFields; + + public StoredFieldsWriter(Map fieldsToRead) { + this.schemaFields = fieldsToRead; + } + + @Override + public void write(SortDoc sortDoc, LeafReaderContext readerContext, EntryWriter out) + throws IOException { + WeakHashMap map = STORED_FIELDS_MAP.get(); + LeafReader reader = readerContext.reader(); + StoredFields storedFields = map.get(reader.getReaderCacheHelper().getKey()); + if (storedFields == null) { + storedFields = reader.storedFields(); + map.put(reader.getReaderCacheHelper().getKey(), storedFields); + } + ExportVisitor visitor = new ExportVisitor(out); + storedFields.document(sortDoc.docId, visitor); + visitor.flush(); + } + + class ExportVisitor extends StoredFieldVisitor { + + final 
EntryWriter out; + String lastFieldName; + List multiValue = null; + int fieldsVisited; + + public ExportVisitor(EntryWriter out) { + this.out = out; + } + + @Override + public void stringField(FieldInfo fieldInfo, String value) throws IOException { + var schemaField = schemaFields.get(fieldInfo.name); + var fieldType = schemaField == null ? null : schemaField.getType(); + if (fieldType instanceof BoolField) { + // Convert "T"/"F" stored value to boolean true/false + addField(fieldInfo.name, Boolean.valueOf(fieldType.indexedToReadable(value))); + } else { + addField(fieldInfo.name, value); + } + } + + @Override + public void intField(FieldInfo fieldInfo, int value) throws IOException { + addField(fieldInfo.name, value); + } + + @Override + public void longField(FieldInfo fieldInfo, long value) throws IOException { + var schemaField = schemaFields.get(fieldInfo.name); + var fieldType = schemaField == null ? null : schemaField.getType(); + if (fieldType instanceof DateValueFieldType) { + Date date = new Date(value); + addField(fieldInfo.name, date); + } else { + addField(fieldInfo.name, value); + } + } + + @Override + public void floatField(FieldInfo fieldInfo, float value) throws IOException { + addField(fieldInfo.name, value); + } + + @Override + public void doubleField(FieldInfo fieldInfo, double value) throws IOException { + addField(fieldInfo.name, value); + } + + @Override + public Status needsField(FieldInfo fieldInfo) { + return schemaFields.containsKey(fieldInfo.name) ? Status.YES : Status.NO; + } + + private void addField(String fieldName, T value) throws IOException { + if (fieldName.equals(lastFieldName)) { + // assume adding another value to a multi-value field + multiValue.add(value); + return; + } + // new/different field... 
+ flush(); // completes the previous field if there's something to do + fieldsVisited++; + lastFieldName = fieldName; + + if (schemaFields.get(fieldName).multiValued()) { + multiValue = new ArrayList<>(); + multiValue.add(value); + } else { + out.put(fieldName, value); + } + } + + private void flush() throws IOException { + if (multiValue != null) { + out.put(lastFieldName, multiValue); + multiValue = null; + } + } + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java index 2f8d0963e3a1..228f3c1c743a 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java @@ -59,8 +59,7 @@ public StringFieldWriter( } @Override - public boolean write( - SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew, int fieldIndex) + public void write(SortDoc sortDoc, LeafReaderContext readerContext, MapWriter.EntryWriter ew) throws IOException { StringValue stringValue = (StringValue) sortDoc.getSortValue(this.field); BytesRef ref = null; @@ -74,7 +73,7 @@ public boolean write( if (stringValue.currentOrd == -1) { // Null sort value - return false; + return; } if (this.lastOrd == stringValue.currentOrd) { @@ -89,7 +88,7 @@ public boolean write( docValuesCache.getSortedDocValues( sortDoc.docId, readerContext.reader(), readerContext.ord); if (vals == null) { - return false; + return; } int ord = vals.ordValue(); @@ -102,7 +101,6 @@ public boolean write( } writeBytes(ew, ref, fieldType); - return true; } protected void writeBytes(MapWriter.EntryWriter ew, BytesRef ref, FieldType fieldType) diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-sortingresponse.xml b/solr/core/src/test-files/solr/collection1/conf/schema-sortingresponse.xml index d821c3935f2b..5674b1dd7b2f 100644 --- 
a/solr/core/src/test-files/solr/collection1/conf/schema-sortingresponse.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-sortingresponse.xml @@ -33,7 +33,7 @@ - + @@ -105,7 +105,7 @@ - + @@ -128,6 +128,22 @@ + + + + + + + + + + + + + + + + id diff --git a/solr/core/src/test/org/apache/solr/handler/export/TestExportWriter.java b/solr/core/src/test/org/apache/solr/handler/export/TestExportWriter.java index f4836a93ce2f..84b413c9f8e5 100644 --- a/solr/core/src/test/org/apache/solr/handler/export/TestExportWriter.java +++ b/solr/core/src/test/org/apache/solr/handler/export/TestExportWriter.java @@ -17,6 +17,7 @@ package org.apache.solr.handler.export; import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; @@ -952,7 +953,7 @@ private void testSortingOutput() throws Exception { s.contains("\"status\":400}")); assertTrue( "Should have a cause when exporting sortabledv_m, it does not have useDocValuesAsStored='true'", - s.contains("Must have useDocValuesAsStored='true' to be used with export writer")); + s.contains("includeStoredFields=true")); s = h.query( @@ -970,7 +971,7 @@ private void testSortingOutput() throws Exception { s.contains("\"status\":400}")); assertTrue( "Should have a cause when exporting sortabledv, it does not have useDocValuesAsStored='true'", - s.contains("Must have useDocValuesAsStored='true' to be used with export writer")); + s.contains("includeStoredFields=true")); } private void assertJsonEquals(String actual, String expected) { @@ -1292,9 +1293,7 @@ public void testExpr() throws Exception { assertTrue("doc doesn't have exception", doc.containsKey(StreamParams.EXCEPTION)); assertTrue( "wrong exception message", - doc.get(StreamParams.EXCEPTION) - .toString() - .contains("Must have useDocValuesAsStored='true'")); + doc.get(StreamParams.EXCEPTION).toString().contains("includeStoredFields=true")); } @Test @@ -1476,4 
+1475,299 @@ private void addField(SolrInputDocument doc, String type, String value, boolean doc.addField("number_" + type + (mv ? "s" : "") + "_ni_t", value); doc.addField("number_" + type + (mv ? "s" : "") + "_ni_p", value); } + + @Test + public void testIncludeStoredFieldsExplicitRequest() throws Exception { + // Test that stored-only fields are returned when includeStoredFields=true + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "str_s_stored", "hello", + "num_i_stored", "42", + "num_l_stored", "1234567890123", + "num_f_stored", "3.14", + "num_d_stored", "2.71828", + "date_dt_stored", "2024-01-15T10:30:00Z", + "bool_b_stored", "true")); + assertU(commit()); + + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", + "id,str_s_stored,num_i_stored,num_l_stored,num_f_stored,num_d_stored,date_dt_stored,bool_b_stored", + "sort", "intdv asc", + "includeStoredFields", "true")); + + assertJsonEquals( + resp, + """ + { + "responseHeader":{"status":0}, + "response":{ + "numFound":1, + "docs":[{ + "id":"1", + "str_s_stored":"hello", + "num_i_stored":42, + "num_l_stored":1234567890123, + "num_f_stored":3.14, + "num_d_stored":2.71828, + "date_dt_stored":"2024-01-15T10:30:00Z", + "bool_b_stored":true}]}} + """); + } + + @Test + public void testIncludeStoredFieldsErrorWithoutParam() throws Exception { + // Test that error with hint is thrown when requesting stored-only field without + // includeStoredFields + clearIndex(); + + assertU(adoc("id", "1", "intdv", "1", "str_s_stored", "hello")); + assertU(commit()); + + // Request stored-only field without includeStoredFields=true should error + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "id,str_s_stored", + "sort", "intdv asc")); + + assertTrue( + "Expected error message to contain hint about includeStoredFields", + resp.contains("includeStoredFields=true")); + assertTrue("Expected error message to mention the field", resp.contains("str_s_stored")); + } + + 
@Test + public void testIncludeStoredFieldsGlobSkipsWithoutParam() throws Exception { + // Test that glob pattern silently skips stored-only fields when includeStoredFields=false + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "stringdv", "docvalue_string", + "str_s_stored", "stored_string")); + assertU(commit()); + + // Explicit fl with stored-only field should error + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "id,intdv,stringdv,str_s_stored", + "sort", "intdv asc")); + + // Should error because str_s_stored is explicitly requested + assertTrue( + "Expected error for explicitly requested stored-only field", resp.contains("str_s_stored")); + assertTrue( + "Expected hint about includeStoredFields", resp.contains("includeStoredFields=true")); + + // Now test with glob - should silently skip stored-only fields and succeed + resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "intdv,*", + "sort", "intdv asc")); + + assertJsonEquals( + resp, + """ + { + "responseHeader":{"status":0}, + "response":{ + "numFound":1, + "docs":[{ + "id":"1", + "intdv":1, + "stringdv":"docvalue_string"}]}} + """); + } + + @Test + public void testIncludeStoredFieldsGlobIncludesWithParam() throws Exception { + // Test that glob pattern includes stored-only fields when includeStoredFields=true + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "stringdv", "docvalue_string", + "str_s_stored", "stored_string")); + assertU(commit()); + + // Glob fl=* with includeStoredFields=true should include stored-only fields + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "*", + "sort", "intdv asc", + "includeStoredFields", "true")); + + assertJsonEquals( + resp, + """ + { + "responseHeader":{"status":0}, + "response":{ + "numFound":1, + "docs":[{ + "intdv":1, + "stringdv":"docvalue_string", + "id":"1", + "str_s_stored":"stored_string"}]}} + """); + } + + @Test + public void 
testIncludeStoredFieldsMultiValued() throws Exception { + // Test that multi-valued stored-only fields work correctly + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "strs_ss_stored", "value1", + "strs_ss_stored", "value2", + "strs_ss_stored", "value3", + "nums_is_stored", "10", + "nums_is_stored", "20", + "nums_is_stored", "30")); + assertU(commit()); + + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "id,strs_ss_stored,nums_is_stored", + "sort", "intdv asc", + "includeStoredFields", "true")); + + assertJsonEquals( + resp, + """ + { + "responseHeader":{"status":0}, + "response":{ + "numFound":1, + "docs":[{ + "id":"1", + "strs_ss_stored":["value1","value2","value3"], + "nums_is_stored":[10,20,30]}]}} + """); + } + + @Test + public void testIncludeStoredFieldsAllTypes() throws Exception { + // Test all supported stored field types including Date + clearIndex(); + + assertU( + adoc( + "id", "1", + "intdv", "1", + "str_s_stored", "test_string", + "num_i_stored", "123", + "num_l_stored", "9876543210", + "num_f_stored", "1.5", + "num_d_stored", "2.5", + "date_dt_stored", "2025-12-25T00:00:00Z", + "bool_b_stored", "false")); + assertU( + adoc( + "id", "2", + "intdv", "2", + "str_s_stored", "another_string", + "num_i_stored", "456", + "num_l_stored", "1234567890", + "num_f_stored", "2.5", + "num_d_stored", "3.5", + "date_dt_stored", "2025-06-15T12:30:00Z", + "bool_b_stored", "true")); + assertU(commit()); + + String resp = + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", + "id,str_s_stored,num_i_stored,num_l_stored,num_f_stored,num_d_stored,date_dt_stored,bool_b_stored", + "sort", "intdv asc", + "includeStoredFields", "true")); + + assertJsonEquals( + resp, + """ + { + "responseHeader":{"status":0}, + "response":{ + "numFound":2, + "docs":[{ + "id":"1", + "str_s_stored":"test_string", + "num_i_stored":123, + "num_l_stored":9876543210, + "num_f_stored":1.5, + "num_d_stored":2.5, + 
"date_dt_stored":"2025-12-25T00:00:00Z", + "bool_b_stored":false}, + { + "id":"2", + "str_s_stored":"another_string", + "num_i_stored":456, + "num_l_stored":1234567890, + "num_f_stored":2.5, + "num_d_stored":3.5, + "date_dt_stored":"2025-06-15T12:30:00Z", + "bool_b_stored":true}]}} + """); + } + + @Test + public void testSortingWithoutDocValues() throws Exception { + // Attempting to sort on a field without DocValues should fail + clearIndex(); + + assertU( + adoc( + "id", "1", + "sorted_i_stored", "0")); + assertU(commit()); + + IOException ex = + expectThrows( + IOException.class, + () -> + h.query( + req( + "qt", "/export", + "q", "*:*", + "fl", "id", + "sort", "sorted_i_stored asc", + "includeStoredFields", "true"))); + + assertTrue( + "Error message should mention DocValues requirement", + ex.getMessage().contains("DocValues")); + } } diff --git a/solr/solr-ref-guide/modules/query-guide/pages/exporting-result-sets.adoc b/solr/solr-ref-guide/modules/query-guide/pages/exporting-result-sets.adoc index bbd31c7b358f..fc6f4d6a7ef3 100644 --- a/solr/solr-ref-guide/modules/query-guide/pages/exporting-result-sets.adoc +++ b/solr/solr-ref-guide/modules/query-guide/pages/exporting-result-sets.adoc @@ -25,7 +25,9 @@ The cases where this functionality may be useful include: session analysis, dist == Field Requirements -All the fields being sorted and exported must have docValues set to `true`. +All the fields being sorted must have docValues set to `true`. +By default, fields in the field list (`fl`) must also have docValues. +However, you can include stored-only fields (fields without docValues) by setting the `includeStoredFields` parameter to `true`. For more information, see the section on xref:indexing-guide:docvalues.adoc[]. == The /export RequestHandler @@ -44,6 +46,12 @@ Filter queries are also supported. An optional parameter `batchSize` determines the size of the internal buffers for partial results. 
The default value is `30000` but users may want to specify smaller values to limit the memory use (at the cost of degraded performance) or higher values to improve export performance (the relationship is not linear and larger values don't bring proportionally larger performance increases). +An optional parameter `includeStoredFields` (default `false`) enables exporting fields that only have stored values (no docValues). +When set to `true`, fields without docValues but with stored values can be included in the field list (`fl`). +Note that retrieving stored fields may significantly impact export performance compared to docValues fields, as stored fields require additional I/O operations. +If all requested fields are `docValues=true` then the data will only be read from docValues. +This behavior applies to fields that are also `stored=true` and does not depend on the value of the `includeStoredFields` parameter. + The supported response writers are `json` and `javabin`. For backward compatibility reasons `wt=xsort` is also supported as input, but `wt=xsort` behaves same as `wt=json`. The default output format is `json`. @@ -58,8 +66,8 @@ http://localhost:8983/solr/core_name/export?q=my-query&sort=severity+desc,timest === Specifying the Sort Criteria The `sort` property defines how documents will be sorted in the exported result set. -Results can be sorted by any field that has a field type of int,long, float, double, string. -The sort fields must be single valued fields. +Results can be sorted by any field that has a field type of int, long, float, double, string. +The sort fields must be single valued fields and must have docValues enabled. The export performance will get slower as you add more sort fields. If there is enough physical memory available outside of the JVM to load up the sort fields then the performance will be linearly slower with addition of sort fields. 
@@ -71,6 +79,10 @@ The `fl` property defines the fields that will be exported with the result set. Any of the field types that can be sorted (i.e., int, long, float, double, string, date, boolean) can be used in the field list. The fields can be single or multi-valued. +By default, fields in the field list must have docValues enabled. +However, when the `includeStoredFields` parameter is set to `true`, fields with only stored values (no docValues) can also be included. +Note that sort fields still require docValues, regardless of this setting. + Wildcard patterns can be used for the field list (e.g. `fl=*_i`) and will be expanded to the list of fields that match the pattern and are able to be exported, see <<Field Requirements>>. Returning scores is not supported at this time. @@ -105,6 +117,61 @@ http://localhost:8983/solr/core_name/export?q=my-query&sort=reporter+desc,&fl=re (Note that the `over` parameter must use one of the fields requested in the `fl` parameter). +== Comparison with Cursors + +The `/export` handler and xref:pagination-of-results.adoc#fetching-a-large-number-of-sorted-results-cursors[cursor-based pagination] offer different trade-offs for streaming large result sets. + +[cols="h,2,2"] +|=== +| |Export |Cursors + +|Advantages +a| +* Query executed once -- efficient +* Consistent snapshot (no duplicates or missing docs) +* Lower latency to the first document (typically) +* Decoupled reader and writer create a smoother flow +a| +* Intrinsic support for sharded collections +* Flexible sort criteria +* Resumable across requests and restarts +* Full `SearchHandler` features (highlighting, etc.)
+ +|Disadvantages +a| +* Requires streaming expressions for distributed queries +* Sort criteria can only be fields with docValues; no score +* Must consume in a single session +* A long session may prevent old segments from being removed in a timely manner +a| +* Query re-executed for each page -- inefficient +* Possible duplicates or missing docs with concurrent updates +* Higher latency to the first document (typically) +* Uneven flow; large batches needed for throughput +|=== + +=== Details + +With cursors, the query is re-executed for each page of results. +In contrast, `/export` runs the filter query once and the resulting segment-level bitmasks are applied once per segment, after which the documents are simply iterated over. +Additionally, the segments that existed when the stream was opened are held open for the duration of the export, eliminating the disappearing or duplicate document issues that can occur with cursors. +However, this means IndexReaders are kept around for longer periods of time, which delays cleanup of memory and disk resources until the export completes. + +The `/export` handler has significantly lower latency until the first document is returned, because the internal batch size is decoupled from the response message size. +With cursors, you typically need to set the `rows` parameter to a high value (e.g., 10k-100k depending on `fl`/document size) to achieve decent throughput, provided you have enough memory (rows * shards * `fl`-size). +However, this creates a "glugging" effect: when you request a large batch, Solr must build the entire payload and send it over the wire while your client waits (assuming a sharded collection). +Only after receiving and decoding this large payload can the client request the next batch, but in the interim Solr sits idle on this request. +With the `/export` handler, these steps are decoupled; Solr can continue sorting and decoding/encoding documents while waiting for more demand from the client.
+ +The advantage of cursors is _flexibility_. +Cursors impose no constraints on the sort criteria except that you must include a unique key, which isn't a real constraint. +Cursors work as part of `SearchHandler` and thus can include most or all of its capabilities, such as highlighting. +A `cursorMark` can be persisted and resumed later, even across restarts, or never continued if enough results were consumed to satisfy the use case. +An `/export` stream must be consumed in a single session. +Cursors also support distributed queries by default while `/export` does not, although distributed exports can be achieved using +xref:streaming-expressions.adoc[streaming expressions], which are built on top of the `/export` handler. + == Distributed Support See the section xref:streaming-expressions.adoc[] for distributed support. +