diff --git a/changelog/unreleased/SOLR-17492.yml b/changelog/unreleased/SOLR-17492.yml
new file mode 100644
index 00000000000..50ebce08559
--- /dev/null
+++ b/changelog/unreleased/SOLR-17492.yml
@@ -0,0 +1,8 @@
+# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
+title: Introduce advice on how to scale Solr from 1 node to hundreds of nodes in Ref Guide
+type: added # added, changed, fixed, deprecated, removed, dependency_update, security, other
+authors:
+ - name: Eric Pugh
+links:
+ - name: SOLR-17492
+ url: https://issues.apache.org/jira/browse/SOLR-17492
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index c9616523957..be8af1b6ec9 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -45,6 +45,8 @@ apache-tomcat = "6.0.53"
apache-zookeeper = "3.9.4"
# @keep for version alignment
apiguardian = "1.1.2"
+# @keep Antora mermaid extension version used in ref-guide
+antora-mermaid-extension = "0.0.8"
# @keep Asciidoctor mathjax version used in ref-guide
asciidoctor-mathjax = "0.0.9"
# @keep Asciidoctor tabs version used in ref-guide
diff --git a/solr/solr-ref-guide/build.gradle b/solr/solr-ref-guide/build.gradle
index 5b53514d698..1473f79a4c5 100644
--- a/solr/solr-ref-guide/build.gradle
+++ b/solr/solr-ref-guide/build.gradle
@@ -245,6 +245,14 @@ task downloadAsciidoctorMathjaxExtension(type: NpmTask) {
outputs.dir("${project.ext.nodeProjectDir}/node_modules/@djencks/asciidoctor-mathjax")
}
+task downloadAntoraMermaidExtension(type: NpmTask) {
+ group = 'Build Dependency Download'
+ args = ["install", "@sntke/antora-mermaid-extension@${libs.versions.antora.mermaid.extension.get()}"]
+
+ inputs.property("antora-mermaid-extension version", libs.versions.antora.mermaid.extension.get())
+ outputs.dir("${project.ext.nodeProjectDir}/node_modules/@sntke/antora-mermaid-extension")
+}
+
task downloadAsciidoctorTabsExtension(type: NpmTask) {
group = 'Build Dependency Download'
args = ["install", "-D", "@asciidoctor/tabs@${libs.versions.asciidoctor.tabs.get()}"]
@@ -260,6 +268,7 @@ task downloadAntora {
dependsOn tasks.downloadAntoraCli
dependsOn tasks.downloadAntoraSiteGenerator
dependsOn tasks.downloadAntoraLunrExtension
+ dependsOn tasks.downloadAntoraMermaidExtension
dependsOn tasks.downloadAsciidoctorMathjaxExtension
dependsOn tasks.downloadAsciidoctorTabsExtension
}
diff --git a/solr/solr-ref-guide/modules/deployment-guide/deployment-nav.adoc b/solr/solr-ref-guide/modules/deployment-guide/deployment-nav.adoc
index 55301601e3e..54f3a1599ef 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/deployment-nav.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/deployment-nav.adoc
@@ -18,7 +18,7 @@
.Deployment Guide
* xref:solr-control-script-reference.adoc[]
-
+* xref:deployment-topology-overview.adoc[]
* Installation & Deployment
** xref:system-requirements.adoc[]
** xref:installing-solr.adoc[]
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/deployment-topology-overview.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/deployment-topology-overview.adoc
new file mode 100644
index 00000000000..15e876b9b4b
--- /dev/null
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/deployment-topology-overview.adoc
@@ -0,0 +1,302 @@
+= Deployment Topology Overview
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+////
+This page has a number of graphs to help you visualize different Solr deployment strategies.
+
+The graphs are developed using Mermaid syntax.
+
+The site https://mermaid.live/ allows you to play with those diagrams in real time.
+////
+
+This section embodies the Solr community's thoughts on best practices for deploying Solr depending on your needs.
+
+== Overview
+There is a logical progression of topologies for scaling Solr, based on the number of nodes you anticipate needing to meet your indexing, querying, and storage requirements.
+
+[mermaid,width=100%]
+....
+flowchart LR
+ node1((1. Single
+   Node))
+ node2((2. Two Nodes))
+ node3((3. Three or
+   Five Nodes))
+ node4((4. Six to Twelve
+   Nodes))
+ node5((5. Twelve to
+   Twenty Five
+   Nodes))
+ node6((6. Twenty Six
+   and Beyond))
+
+ node1 --- node2
+ node2 --- node3
+ node3 --- node4
+ node4 --- node5
+ node5 --- node6
+
+ classDef uniform padding:20px,min-width:120px,min-height:120px
+ class node1,node2,node3,node4,node5,node6 uniform
+....
+
+
+== Solr from Smallest to Largest
+
+When you start Solr, it already includes the underpinnings required to let Solr scale in a smooth fashion: the coordination service ZooKeeper.
+ZooKeeper is the unifying technology that supports maintaining cluster state from a single node up to many hundreds of nodes.
+
+=== Simplest Setup
+
+If you only need a single Solr node, then it's perfectly reasonable to start Solr with `bin/solr start`.
+You will have a single Solr node running in SolrCloud mode, with all the lovely APIs and features that SolrCloud provides.
+
+[mermaid]
+....
+flowchart TD
+ node1[node1]
+....
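+
+As a rough sketch of what this might look like in practice (the collection name `mycollection` is purely illustrative):
+
+[source,bash]
+----
+# Start a single node in SolrCloud mode with its embedded ZooKeeper
+bin/solr start
+
+# Create a collection on that node
+bin/solr create -c mycollection
+
+# Confirm the node is up
+bin/solr status
+----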
+
+Use this approach when:
+
+* You are just playing around with Solr.
+* You aren't worried about High Availability or Failover.
+* You want the simplest deployment approach.
+
+
+=== Introducing Failover
+
+The next most common setup after a single node is having two separate nodes running on separate machines, with one as the xref:cluster-types.adoc#leaders[Leader] and the other as the Follower.
+
+This leverages a simple ZooKeeper xref:getting-started:solr-glossary.adoc#ensemble[Ensemble] setup.
+You start the first Solr node with the embedded ZooKeeper, and then the second Solr node is configured to connect to the ZooKeeper embedded in the first node.
+
+[mermaid]
+....
+flowchart LR
+ node1["solr1 | zk"]
+ node2[solr2]
+
+ node1 --- node2
+....
+
+This is basically what you are setting up when you run the `bin/solr start -e cloud` example.
+
+[source,bash]
+----
+mkdir -p ./two_nodes/node1/solr
+mkdir -p ./two_nodes/node2/solr
+bin/solr start -p 8983 --solr-home "./two_nodes/node1/solr"
+bin/solr start -p 8984 --solr-home "./two_nodes/node2/solr" -z 127.0.0.1:9983
+----
+
+The embedded ZooKeeper started by the first node listens on the Solr port plus 1000 (here `9983`), which is why the second node's `-z` option points at `127.0.0.1:9983`.
+
+Use this approach when:
+
+* You want failover, but you aren't worried about high availability. You have a load balancer in front of the two Solr nodes that detects when one goes down and redirects traffic to the remaining node for querying.
+* You are prepared to handle the consequences for indexing operations if one of the nodes goes down.
+
+=== Adding High Availability
+
+You can then scale beyond two nodes by running either three or five Solr nodes.
+We specify three or five Solr nodes because ZooKeeper is running on every single node, and therefore requires an odd number of nodes in order to maintain a quorum and prevent Split Brain issues in the cluster.
+
+
+NOTE: This approach requires https://github.com/apache/solr/pull/2391 to be completed! We are leaving it in the Ref Guide as a pointer to the future.
+
+NOTE: What is the difference between failover and high availability?
+Failover, in the two-node situation, means that you can still issue queries and keep your application running, but you can no longer index data because ZooKeeper can't form a quorum.
+This is often referred to as a Split Brain situation.
+High Availability means that all aspects of your service continue uninterrupted in the event of any node going down.
+
+
+
+[mermaid,width=100%]
+....
+flowchart LR
+ node1[node1]
+ node2[node2]
+ node3[node3]
+ node4[node4]
+ node5[node5]
+
+ node1 --- node2
+ node2 --- node3
+ node3 --- node4
+ node4 --- node5
+ node5 --- node1
+
+ classDef uniform padding:20px,min-width:120px,min-height:120px
+ class node1,node2,node3,node4,node5 uniform
+....
+
+Use these approaches when:
+
+* You want to be able to split your logical Collection across multiple Shards and distribute Replicas around the cluster.
+* You don't want to go through the effort of deploying a separate ZK ensemble independently. And honestly, you don't need to either.
+* You want true High Availability. With three nodes, you can lose one and continue. With five nodes you can lose two nodes and still continue.
+
+
+Some cons to this approach are:
+
+* Having five ZooKeeper instances all updating each other is fine, but it starts to break down if you expand to seven or nine ZooKeeper instances forming the quorum.
+* We currently don't have any flexible resizing of the quorum. You need to select the appropriate size when setting up your cluster.
+
+=== Moving Beyond the Basic Cluster
+
+NOTE: This isn't yet fleshed out as to how it works!
+
+Solr has a concept of node xref:deployment-guide:node-roles.adoc#roles[Roles] that can be leveraged to establish a set of Solr nodes that run embedded ZooKeeper, and then a larger set of Solr nodes that connect to those ZooKeepers.
+We currently have the concept of "data" nodes that host shards and replicas, and we can introduce a "zookeeper" node that also runs the embedded ZooKeeper process.
+
+This will work well as you grow from six to twelve nodes in your cluster.
+
+[mermaid,width=100%]
+....
+flowchart LR
+ subgraph zk[ZooKeeper Ensemble]
+ node1[data, zookeeper]
+ node2[data, zookeeper]
+ node3[data, zookeeper]
+ end
+
+ node4[data]
+ node5[data]
+ node6[data]
+ node7[data]
+ node8[data]
+ node9[data]
+
+ node1 --- node2
+ node2 --- node3
+ node3 --- node1
+ node3 --- node4
+ node4 --- node5
+ node5 --- node6
+ node6 --- node7
+ node7 --- node8
+ node8 --- node9
+ node9 --- node1
+
+ style node1 fill:#ffff00
+ style node2 fill:#ffff00
+ style node3 fill:#ffff00
+....
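+
+As a purely hypothetical sketch of how such nodes might be started, building on the existing `solr.node.roles` startup property (the `zookeeper` role shown here does not exist yet):
+
+[source,bash]
+----
+# Hypothetical only: the "zookeeper" role is not yet implemented.
+# Roles are assigned via the solr.node.roles system property, for example in solr.in.sh:
+SOLR_OPTS="$SOLR_OPTS -Dsolr.node.roles=data:on,zookeeper:on"
+
+# Nodes that only host data would set:
+SOLR_OPTS="$SOLR_OPTS -Dsolr.node.roles=data:on"
+----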
+
+=== Separating Out the ZooKeeper Workload
+
+As the load on your cluster goes up, sharing ZooKeeper workloads with Solr workloads may become a bottleneck.
+At this point you may want to run separate ZooKeeper nodes on their own servers.
+
+[mermaid,width=100%]
+....
+flowchart LR
+ subgraph zk[ZooKeeper Ensemble]
+ direction TB
+ zk1[zookeeper]
+ zk2[zookeeper]
+ zk3[zookeeper]
+ end
+
+ subgraph solr[Solr Nodes]
+ direction TB
+ node4[node4] --- node5[node5] --- node6[node6] --- node7[node7] --- node8[node8]
+ node9[node9] --- node10[node10] --- node11[node11] --- node12[node12] --- node13[node13]
+ node14[node14] --- node15[node15] --- node16[node16] --- node17[node17] --- node18[node18]
+ node19[node19] --- node20[node20]
+ end
+
+ zk --- solr
+
+ style zk1 fill:#ffff00
+ style zk2 fill:#ffff00
+ style zk3 fill:#ffff00
+....
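+
+With an external ensemble, each Solr node points at ZooKeeper via `ZK_HOST` instead of starting an embedded ZooKeeper.
+A minimal sketch, assuming three ZooKeeper hosts named `zk1` through `zk3` on the default client port:
+
+[source,bash]
+----
+# In solr.in.sh (or solr.in.cmd on Windows) on every Solr node:
+ZK_HOST="zk1:2181,zk2:2181,zk3:2181"
+
+# Or pass the ensemble directly when starting a node:
+bin/solr start -z zk1:2181,zk2:2181,zk3:2181
+----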
+
+Use this approach when:
+
+* You are growing beyond 12 Solr nodes, up to around 25 Solr nodes.
+* You are leveraging all the features of SolrCloud to support multiple collections and different types of query and load characteristics, especially tuning shard and replica counts.
+* You may need to move to five ZooKeeper nodes in the ensemble to support your traffic.
+
+Some cons to this approach are:
+
+* You are responsible for configuring and maintaining the external ZooKeeper ensemble.
+* You need to define how you will handle failover/HA for the ZooKeeper ensemble itself.
+
+=== Going Massive Means Going Kubernetes
+
+Beyond 25 nodes, you really need to think about more advanced tooling for managing all your nodes.
+We discourage rolling your own ZooKeeper orchestration, as there are many pitfalls.
+Instead, use a well-supported container orchestrator with support for Solr and ZooKeeper.
+For Kubernetes, we provide the https://solr.apache.org/operator/[Solr Operator] subproject.
+There are also third-party Helm charts available.
+
+[mermaid,width=100%]
+....
+flowchart TB
+ subgraph kubernetes[Kubernetes]
+ operator[Solr Operator]
+
+ subgraph solr[Solr Pods]
+ direction TB
+ node1[node1] --- node2[node2] --- node3[node3] --- node4[node4] --- node5[node5]
+ node6[node6] --- node7[node7] --- node8[node8] --- node9[node9] --- node10[node10]
+ node11[node11] --- node12[node12] --- node13[node13] --- node14[node14] --- node15[node15]
+ node16[node16] --- node17[node17] --- node18[node18] --- node19[node19] --- node20[node20]
+ node21[node21] --- node22[node22] --- node23[node23] --- node24[node24] --- node25[node25]
+ node26[node26] --- node27[node27] --- node28[node28] --- node29[node29] --- node30[node30]
+ end
+
+ subgraph zk[ZooKeeper Ensemble]
+ direction TB
+ zk1[zookeeper]
+ zk2[zookeeper]
+ zk3[zookeeper]
+ zk1 --- zk2
+ zk2 --- zk3
+ zk3 --- zk1
+ end
+ end
+
+ operator --> solr
+ solr --- zk
+
+ style operator fill:#00ffff
+ style zk1 fill:#ffff00
+ style zk2 fill:#ffff00
+ style zk3 fill:#ffff00
+....
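+
+As a hedged sketch of what getting started with the Solr Operator might look like (the Helm release names and replica count below are illustrative; see the Solr Operator documentation for the current, exact steps):
+
+[source,bash]
+----
+# Add the Apache Solr Helm repository and install the operator
+# (the operator's CRDs may need to be installed first; see the Solr Operator docs)
+helm repo add apache-solr https://solr.apache.org/charts
+helm repo update
+helm install solr-operator apache-solr/solr-operator
+
+# Create a small SolrCloud cluster (release name and replica count are illustrative)
+helm install example apache-solr/solr --set replicas=3
+----
+
+The operator then handles rolling restarts, scaling, and coordinating the ZooKeeper ensemble for you.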
+
+Use this approach when:
+
+* You need to deploy more than 25 Solr nodes.
+* You have the operational maturity to manage massive data sets and fleets of Kubernetes pods.
+* You want a standardized approach to deployment, scaling, and management.
+* You may adopt this earlier if you are already a Kubernetes-savvy organization.
+
+Some cons to this approach are:
+
+* Kubernetes has a steep learning curve; it's advisable to have experienced team members or consultants.
+* Managing stateful applications like Solr in Kubernetes requires careful planning for persistence and recovery.
+
+== What about User Managed Solr?
+
+The User Managed mode is no longer recommended.
+Historically, it was primarily used because running a separate ZooKeeper cluster was viewed as difficult and expensive.
+These days, running an embedded ZooKeeper inside your Solr node is straightforward, eliminating the main reason for User Managed deployments.
+Additionally, User Managed mode doesn't support all the features and APIs that SolrCloud provides.
+
+== What about Embedding Solr in my Java Application?
+
+{solr-javadocs}/core/org/apache/solr/client/solrj/embedded/EmbeddedSolrServer.html[Embedded Solr] is used extensively in Solr's own unit testing strategy.
+It's also frequently used to build dedicated indexes in distributed systems like Spark.
+However, it means that your application's dependencies are intertwined with Solr's dependencies, and keep in mind that the primary focus of the Solr community is delivering a standalone search engine, not a library.
+YMMV.
+
+== What about [YOUR SPECIFIC NEED]?
+
+There are Solr use cases that require extreme scaling on certain specific axes, whether that is a massive multi-tenant use case, extreme query load, or extreme ingestion performance.
+
+Each of these requirements will bring its own specific best practices that you will need to embrace, and will have its own impact on how you deploy Solr.
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/installing-solr.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/installing-solr.adoc
index 15cc9898a02..44c66e14851 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/installing-solr.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/installing-solr.adoc
@@ -59,7 +59,7 @@ A very good blog post that discusses the issues to consider is https://lucidwork
One thing to note when planning your installation is that a hard limit exists in Lucene for the number of documents in a single index: approximately 2.14 billion documents (2,147,483,647 to be exact).
In practice, it is highly unlikely that such a large number of documents would fit and perform well in a single index, and you will likely need to distribute your index across a cluster before you ever approach this number.
-If you know you will exceed this number of documents in total before you've even started indexing, it's best to plan your installation with xref:cluster-types.adoc#solrcloud-mode[SolrCloud] as part of your design from the start.
+Fortunately, Solr is configured by default to be deployed in xref:cluster-types.adoc#solrcloud-mode[SolrCloud] mode, which lets you scale up.
== Package Installation
@@ -84,9 +84,7 @@ This directory includes several important scripts that will make using Solr easi
solr and solr.cmd::: This is xref:solr-control-script-reference.adoc[Solr's Control Script], also known as `bin/solr` (*nix) / `bin/solr.cmd` (Windows).
This script is the preferred tool to start and stop Solr.
-You can also create collections or cores, configure authentication, and work with configuration files when running in SolrCloud mode.
-
-post::: The xref:indexing-guide:post-tool.adoc[], which provides a simple command line interface for POSTing content to Solr.
+You can also create collections or cores, configure authentication, work with configuration files, and even index documents into Solr.
solr.in.sh and solr.in.cmd:::
These are property files for *nix and Windows systems, respectively.
@@ -193,8 +191,7 @@ For instance, to launch the "techproducts" example, you would do:
bin/solr start -e techproducts
----
-Currently, the available examples you can run are: techproducts, schemaless, and cloud.
-See the section xref:solr-control-script-reference.adoc#running-with-example-configurations[Running with Example Configurations] for details on each example.
+See the section xref:solr-control-script-reference.adoc#running-with-example-configurations[Running with Example Configurations] for details on all the examples available.
.Going deeper with SolrCloud
NOTE: Running the `cloud` example demonstrates running multiple nodes of Solr using xref:cluster-types.adoc#solrcloud-mode[SolrCloud] mode.
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/taking-solr-to-production.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/taking-solr-to-production.adoc
index e91be0fd62c..0161350c22e 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/taking-solr-to-production.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/taking-solr-to-production.adoc
@@ -249,10 +249,15 @@ You can also refer to xref:jvm-settings.adoc[] for tuning your memory and garbag
The `bin/solr` script uses the `-XX:+CrashOnOutOfMemoryError` JVM option to crash Solr on `OutOfMemoryError` exceptions.
This behavior is recommended. In SolrCloud mode ZooKeeper will be immediately notified that a node has experienced a non-recoverable error.
-=== Going to Production with SolrCloud
-To run Solr in SolrCloud mode, you need to set the `ZK_HOST` variable in the include file to point to your ZooKeeper ensemble.
-Running the embedded ZooKeeper is not supported in production environments.
+=== Going to Production with SolrCloud and Embedded ZooKeeper
+
+By default, Solr runs in SolrCloud mode with an embedded ZooKeeper; no additional configuration is required.
+
+=== Going to Production with SolrCloud and an External ZooKeeper Ensemble
+
+To run Solr in SolrCloud mode with an external ZooKeeper ensemble, you need to set the `ZK_HOST` variable in the include file to point to your ZooKeeper ensemble.
+
For instance, if you have a ZooKeeper ensemble hosted on the following three hosts on the default client port 2181 (zk1, zk2, and zk3), then you would set:
[source,bash]
@@ -260,7 +265,11 @@ For instance, if you have a ZooKeeper ensemble hosted on the following three hos
ZK_HOST=zk1,zk2,zk3
----
-When the `ZK_HOST` variable is set, Solr will launch in "cloud" mode.
+When the `ZK_HOST` variable is set, Solr will launch and connect to the defined ZooKeepers instead of starting an embedded ZooKeeper.
+
+See xref:zookeeper-ensemble.adoc[ZooKeeper Ensemble] for more on setting up ZooKeeper.
+
+See xref:deployment-topology-overview.adoc[Deployment Topology Overview] for more details on different approaches for deployment.
==== ZooKeeper chroot
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/zookeeper-ensemble.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/zookeeper-ensemble.adoc
index d422b89572a..c1683ac1633 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/zookeeper-ensemble.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/zookeeper-ensemble.adoc
@@ -17,12 +17,9 @@
// specific language governing permissions and limitations
// under the License.
-Although Solr comes bundled with http://zookeeper.apache.org[Apache ZooKeeper], you are strongly encouraged to use an external ZooKeeper setup in production.
+Although Solr comes bundled with http://zookeeper.apache.org[Apache ZooKeeper], depending on your scaling needs you may need to use an external ZooKeeper setup in production.
-While using Solr's embedded ZooKeeper instance is fine for getting started, you shouldn't use this in production because it does not provide any failover: if the Solr instance that hosts ZooKeeper shuts down, ZooKeeper is also shut down.
-Any shards or Solr instances that rely on it will not be able to communicate with it or each other.
-
-The solution to this problem is to set up an external ZooKeeper _ensemble_, which is a number of servers running ZooKeeper that communicate with each other to coordinate the activities of the cluster.
+See xref:deployment-topology-overview.adoc[Deployment Topology Overview] for more information on different approaches.
== How Many ZooKeeper Nodes?
diff --git a/solr/solr-ref-guide/playbook.template.yml b/solr/solr-ref-guide/playbook.template.yml
index ad283d03392..72cb9e0479e 100644
--- a/solr/solr-ref-guide/playbook.template.yml
+++ b/solr/solr-ref-guide/playbook.template.yml
@@ -57,3 +57,7 @@ antora:
extensions:
- require: '@antora/lunr-extension'
index_latest_only: true
+ - require: '@sntke/antora-mermaid-extension'
+ mermaid_library_url: https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.mjs
+ mermaid_initialize_options:
+ start_on_load: true