Changes from all commits
25 commits
d93a1ea
Look at some sections that make editorial comments and rework them a …
epugh Oct 19, 2024
677d71f
Add Diagramming from text capability
epugh Oct 19, 2024
e8a1655
Add in a single page to talk about deployment strategies.
epugh Oct 19, 2024
ebbe649
First pass through thinking about scaling up Solr.
epugh Oct 19, 2024
b379820
More
epugh Oct 19, 2024
728c4b2
Fix up references
epugh Oct 21, 2024
37ceed0
General framework for thinking about this is in place
epugh Oct 22, 2024
5904cb9
Merge remote-tracking branch 'upstream/main' into SOLR-17492
epugh Aug 18, 2025
6aad26b
update to latest build processes
epugh Aug 18, 2025
5b193b6
fix link to glossary
epugh Aug 18, 2025
8524a4d
Down to one last broken link
epugh Aug 18, 2025
b34a7b5
Merge remote-tracking branch 'upstream/main' into SOLR-17492
epugh Sep 14, 2025
570daef
Start separating out use case specific suggestions from the overall t…
epugh Sep 14, 2025
db95000
Reviewed with Mike Drob, David Smiley, Kevin Risden, and Jason G.
epugh Sep 14, 2025
437f8ab
Remove as it's just not ready to move forward with.
epugh Oct 9, 2025
66917b6
Text review
epugh Oct 9, 2025
4ca9274
Merge remote-tracking branch 'upstream/main' into SOLR-17492
epugh Oct 9, 2025
4ebdc6e
Revamps. Down to one specific "TBD" that prevents us from merging.
epugh Oct 9, 2025
8aea294
Add some internal linking
epugh Oct 13, 2025
314cb99
Remove editorial content.
epugh Oct 13, 2025
56f9136
Provide specific details for testing out the Failover approach with t…
epugh Oct 13, 2025
6140d5d
Merge remote-tracking branch 'upstream/main' into SOLR-17492
epugh Jan 28, 2026
5f6090f
Replace kroki with mermaid js based charts
epugh Jan 28, 2026
8cebec3
reformatting diagrams to be clearer
epugh Jan 28, 2026
64f1cc7
this is big enough ref guide change to be worth highlighting to users
epugh Jan 28, 2026
8 changes: 8 additions & 0 deletions changelog/unreleased/SOLR-17492.yml
@@ -0,0 +1,8 @@
# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
title: Introduce advice on how to scale Solr from 1 node to hundreds of nodes in Ref Guide
type: added # added, changed, fixed, deprecated, removed, dependency_update, security, other
authors:
- name: Eric Pugh
links:
- name: SOLR-17492
url: https://issues.apache.org/jira/browse/SOLR-17492
2 changes: 2 additions & 0 deletions gradle/libs.versions.toml
@@ -45,6 +45,8 @@ apache-tomcat = "6.0.53"
apache-zookeeper = "3.9.4"
# @keep for version alignment
apiguardian = "1.1.2"
# @keep Antora mermaid extension version used in ref-guide
antora-mermaid-extension = "0.0.8"
# @keep Asciidoctor mathjax version used in ref-guide
asciidoctor-mathjax = "0.0.9"
# @keep Asciidoctor tabs version used in ref-guide
9 changes: 9 additions & 0 deletions solr/solr-ref-guide/build.gradle
@@ -245,6 +245,14 @@ task downloadAsciidoctorMathjaxExtension(type: NpmTask) {
outputs.dir("${project.ext.nodeProjectDir}/node_modules/@djencks/asciidoctor-mathjax")
}

task downloadAntoraMermaidExtension(type: NpmTask) {
group = 'Build Dependency Download'
args = ["install", "@sntke/antora-mermaid-extension@${libs.versions.antora.mermaid.extension.get()}"]

inputs.property("antora-mermaid-extension version", libs.versions.antora.mermaid.extension.get())
outputs.dir("${project.ext.nodeProjectDir}/node_modules/@sntke/antora-mermaid-extension")
}

task downloadAsciidoctorTabsExtension(type: NpmTask) {
group = 'Build Dependency Download'
args = ["install", "-D", "@asciidoctor/tabs@${libs.versions.asciidoctor.tabs.get()}"]
@@ -260,6 +268,7 @@ task downloadAntora {
dependsOn tasks.downloadAntoraCli
dependsOn tasks.downloadAntoraSiteGenerator
dependsOn tasks.downloadAntoraLunrExtension
dependsOn tasks.downloadAntoraMermaidExtension
dependsOn tasks.downloadAsciidoctorMathjaxExtension
dependsOn tasks.downloadAsciidoctorTabsExtension
}
@@ -18,7 +18,7 @@
.Deployment Guide

* xref:solr-control-script-reference.adoc[]

* xref:deployment-topology-overview.adoc[]
* Installation & Deployment
** xref:system-requirements.adoc[]
** xref:installing-solr.adoc[]
@@ -0,0 +1,302 @@
= Deployment Topology Overview
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

////
This page has a number of graphs to help you visualize different Solr deployment strategies.

The graphs are developed using Mermaid syntax.

The site https://mermaid.live/ allows you to play with those diagrams in real time.
////

This section captures the Solr community's best practices for deploying Solr, depending on your needs.

== Overview
There is a logical progression of topologies for scaling Solr based on the number of nodes you anticipate needing to meet your indexing, querying, and storage requirements.

[mermaid,width=100%]
....
flowchart LR
node1((1. Single<br/>Node))
node2((2. Two Nodes))
node3((3. Three or<br/>Five Nodes))
node4((4. Six to Twelve<br/>Nodes))
node5((5. Twelve to<br/>Twenty Five<br/>Nodes))
node6((6. Twenty Six<br/>and Beyond))

node1 --- node2
node2 --- node3
node3 --- node4
node4 --- node5
node5 --- node6

classDef uniform padding:20px,min-width:120px,min-height:120px
class node1,node2,node3,node4,node5,node6 uniform
....


== Solr from smallest to largest

When you start Solr, it already includes the underpinning required to let it scale in a smooth fashion: the coordination service ZooKeeper.
ZooKeeper is the unifying technology that supports maintaining cluster state from a single node up to many hundreds of nodes.

=== Simplest Setup

If you only need a single Solr node, it's perfectly reasonable to start Solr with `bin/solr start`.
You will have a single Solr node running in SolrCloud mode, with all the lovely APIs and features that SolrCloud provides.

[mermaid]
....
flowchart TD
node1[node1]
....
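
As a minimal sketch, a single-node session might look like the following.
The collection name `films` is purely illustrative, and flag names can vary slightly between Solr versions:

----
# Start a single node in SolrCloud mode
bin/solr start

# Create a collection (one shard, one replica by default)
bin/solr create -c films

# Query it
curl "http://localhost:8983/solr/films/select?q=*:*"
----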

Use this approach when:

* You are just playing around with Solr.
* You aren't worried about High Availability or Failover.
* You want the simplest deployment approach.


=== Introducing Failover

The next most common setup after a single node is two Solr nodes running on separate machines, one acting as the xref:cluster-types.adoc#leaders[Leader] and the other as the Follower.

This leverages a simple ZooKeeper xref:getting-started:solr-glossary.adoc#ensemble[Ensemble] setup.
You start the first Solr node with embedded ZooKeeper, and configure the second Solr node to connect to the ZooKeeper instance embedded in the first.

[mermaid]
....
flowchart LR
node1["solr1 | zk"]
node2[solr2]

node1 --- node2
....

This is essentially what you are setting up when you run the `bin/solr start -e cloud` example.
To build it by hand instead:

----
mkdir -p ./two_nodes/node1/solr
mkdir -p ./two_nodes/node2/solr
bin/solr start -p 8983 --solr-home "./two_nodes/node1/solr"
bin/solr start -p 8984 --solr-home "./two_nodes/node2/solr" -z 127.0.0.1:9983
----
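
Once both nodes are running, you can verify that they joined the same cluster by asking the Collections API for the cluster status; the `live_nodes` section of the response should list both nodes (host and port here assume the setup above):

----
curl "http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS"
----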

Use this approach when:

* You want failover, but not full high availability: a load balancer in front of the two Solr nodes detects when one goes down and redirects query traffic to the remaining node.
* You are prepared to handle the consequences for indexing operations if one of the nodes goes down.

=== Adding High Availability

You can then scale beyond two nodes by running either three or five Solr nodes.
We specify three or five because ZooKeeper runs on every node, and a ZooKeeper ensemble requires an odd number of members to maintain a quorum and avoid split-brain issues in the cluster.


NOTE: This approach requires https://github.com/apache/solr/pull/2391 to be completed! We are leaving it in the Ref Guide as a pointer to the future.

NOTE: What is the difference between failover and high availability? Failover, in the two-node situation, means that you can still issue queries and keep your application running, but you can no longer index data because ZooKeeper can't form a quorum. High availability means that all aspects of your service continue uninterrupted in the event of any single node going down.



[mermaid,width=100%]
....
flowchart LR
node1[node1]
node2[node2]
node3[node3]
node4[node4]
node5[node5]

node1 --- node2
node2 --- node3
node3 --- node4
node4 --- node5
node5 --- node1

classDef uniform padding:20px,min-width:120px,min-height:120px
class node1,node2,node3,node4,node5 uniform
....

Use these approaches when:

* You want to be able to split your logical Collection across multiple Shards and distribute Replicas around the cluster (see the collection creation sketch below).
* You don't want the effort of deploying a separate ZooKeeper ensemble independently, and at this scale you don't need to.
* You want true High Availability. With three nodes, you can lose one and continue. With five nodes, you can lose two and still continue.


Some cons to this approach are:

* Having five ZooKeeper instances all updating each other is fine, but it starts to break down if you expand to seven or nine ZooKeeper instances forming the quorum.
* There is currently no flexible resizing of the quorum; you need to select the appropriate size when setting up your cluster.
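
As a sketch of the first point above, once your three or five nodes are up you might create a collection with two shards and two replicas per shard, letting Solr distribute them across the cluster.
The collection name `mycoll` is illustrative, and the flag names may differ slightly across Solr versions:

----
bin/solr create -c mycoll --shards 2 --replication-factor 2
----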

=== Moving Beyond the Basic Cluster

NOTE: This isn't yet fleshed out as to how it works!

Solr has a concept of node xref:deployment-guide:node-roles.adoc#roles[Roles] that could be leveraged to establish a small set of Solr nodes that run embedded ZooKeeper, with a larger set of Solr nodes connecting to them.
We currently have the concept of "data" nodes that host shards and replicas, and we could introduce a "zookeeper" role for nodes that also run the embedded ZooKeeper process.

This approach should work well as you grow from six to twelve nodes in your cluster.
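
A purely hypothetical sketch of what starting such a cluster might look like, assuming a future `zookeeper` role alongside the existing `data` role (the role name and property syntax are illustrative, since this feature is not finalized):

----
# First nodes host data and run embedded ZooKeeper (hypothetical role)
bin/solr start -p 8983 -Dsolr.node.roles=data:on,zookeeper:on

# Remaining nodes host data only and connect to the ZooKeeper nodes
bin/solr start -p 8984 -z host1:9983 -Dsolr.node.roles=data:on
----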

[mermaid,width=100%]
....
flowchart LR
subgraph zk[ZooKeeper Ensemble]
node1[data, zookeeper]
node2[data, zookeeper]
node3[data, zookeeper]
end

node4[data]
node5[data]
node6[data]
node7[data]
node8[data]
node9[data]

node1 --- node2
node2 --- node3
node3 --- node1
node3 --- node4
node4 --- node5
node5 --- node6
node6 --- node7
node7 --- node8
node8 --- node9
node9 --- node1

style node1 fill:#ffff00
style node2 fill:#ffff00
style node3 fill:#ffff00
....

=== Separating out ZooKeeper workload

As cluster load goes up, sharing ZooKeeper and Solr workloads on the same nodes may become a bottleneck.
At this point you may want to run separate ZooKeeper nodes on their own servers.

[mermaid,width=100%]
....
flowchart LR
subgraph zk[ZooKeeper Ensemble]
direction TB
zk1[zookeeper]
zk2[zookeeper]
zk3[zookeeper]
end

subgraph solr[Solr Nodes]
direction TB
node4[node4] --- node5[node5] --- node6[node6] --- node7[node7] --- node8[node8]
node9[node9] --- node10[node10] --- node11[node11] --- node12[node12] --- node13[node13]
node14[node14] --- node15[node15] --- node16[node16] --- node17[node17] --- node18[node18]
node19[node19] --- node20[node20]
end

zk --- solr

style zk1 fill:#ffff00
style zk2 fill:#ffff00
style zk3 fill:#ffff00
....
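
To point each Solr node at the external ensemble, pass the ZooKeeper connection string at startup (the hosts and the `/solr` chroot here are illustrative):

----
# Create the chroot once, before the first node starts
bin/solr zk mkroot /solr -z zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181

# Start each Solr node against the external ensemble
bin/solr start -z zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181/solr
----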

Use this approach when:

* You are scaling beyond 12 Solr nodes, up to around 25 Solr nodes.
* You are leveraging all the features of SolrCloud to support multiple collections with differing query and load characteristics, especially tuning shard and replica counts.
* You may need to expand to five ZooKeeper nodes in the ensemble to support the traffic.

Some cons to this approach are:

* You are responsible for configuring and maintaining the external ZooKeeper ensemble.
* You need to define how you will handle failover/HA for the ZooKeeper ensemble itself.

=== Going massive means going Kubernetes

Beyond 25 nodes, you really need to think about more advanced tooling for managing all your nodes.
We discourage rolling your own ZooKeeper orchestration, as there are many pitfalls.
Instead, use a well-supported container orchestrator with support for Solr and ZooKeeper.
For Kubernetes, we provide the https://solr.apache.org/operator/[Solr Operator] subproject.
There are also 3rd party Helm charts available.
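
As a sketch, bootstrapping with the Solr Operator's published Helm chart and a minimal SolrCloud resource might look like the following (the release and cluster names are illustrative):

----
helm repo add apache-solr https://solr.apache.org/charts
helm repo update
helm install solr-operator apache-solr/solr-operator

# Minimal SolrCloud resource; the operator provisions the pods and ZooKeeper
kubectl apply -f - <<EOF
apiVersion: solr.apache.org/v1beta1
kind: SolrCloud
metadata:
  name: example
spec:
  replicas: 3
EOF
----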

[mermaid,width=100%]
....
flowchart TB
subgraph kubernetes[Kubernetes]
operator[Solr Operator]

subgraph solr[Solr Pods]
direction TB
node1[node1] --- node2[node2] --- node3[node3] --- node4[node4] --- node5[node5]
node6[node6] --- node7[node7] --- node8[node8] --- node9[node9] --- node10[node10]
node11[node11] --- node12[node12] --- node13[node13] --- node14[node14] --- node15[node15]
node16[node16] --- node17[node17] --- node18[node18] --- node19[node19] --- node20[node20]
node21[node21] --- node22[node22] --- node23[node23] --- node24[node24] --- node25[node25]
node26[node26] --- node27[node27] --- node28[node28] --- node29[node29] --- node30[node30]
end

subgraph zk[ZooKeeper Ensemble]
direction TB
zk1[zookeeper]
zk2[zookeeper]
zk3[zookeeper]
zk1 --- zk2
zk2 --- zk3
zk3 --- zk1
end
end

operator --> solr
solr --- zk

style operator fill:#00ffff
style zk1 fill:#ffff00
style zk2 fill:#ffff00
style zk3 fill:#ffff00
....

Use this approach when:

* You need to deploy more than 25 Solr nodes.
* You have the operational maturity to manage massive data sets and fleets of Kubernetes pods.
* You want a standardized approach to deployment, scaling, and management.
* You may adopt this earlier if you are already a Kubernetes-savvy organization.

Some cons to this approach are:

* Kubernetes has a steep learning curve; it's advisable to have experienced team members or consultants.
* Managing stateful applications like Solr in Kubernetes requires careful planning for persistence and recovery.

== What about User Managed Solr?

The User Managed mode is no longer recommended.
Historically, it was primarily used because running a separate ZooKeeper cluster was viewed as difficult and expensive.
These days, running an embedded ZooKeeper inside of your Solr node is straightforward, eliminating the main reason for User Managed deployments.
Additionally, User Managed mode doesn't support all the features and APIs that SolrCloud provides.

== What about Embedding Solr in my Java Application?

{solr-javadocs}/core/org/apache/solr/client/solrj/embedded/EmbeddedSolrServer.html[Embedded Solr] is used extensively in Solr's own unit testing strategy.
It's also frequently used to build dedicated indexes in distributed systems like Spark.
However, be aware that your application's dependencies become intertwined with Solr's, and that the primary focus of the Solr community is delivering a standalone search engine, not a library.
YMMV.

== What about [YOUR SPECIFIC NEED]?

There are Solr use cases that require extreme scaling on certain specific axes, whether that is a massive multi-tenant use case, extreme query load, or extreme ingestion performance.

Each of these requirements brings its own best practices that you will need to embrace, and has its own impact on how you deploy Solr.
@@ -59,7 +59,7 @@ A very good blog post that discusses the issues to consider is https://lucidwork

One thing to note when planning your installation is that a hard limit exists in Lucene for the number of documents in a single index: approximately 2.14 billion documents (2,147,483,647 to be exact).
In practice, it is highly unlikely that such a large number of documents would fit and perform well in a single index, and you will likely need to distribute your index across a cluster before you ever approach this number.
If you know you will exceed this number of documents in total before you've even started indexing, it's best to plan your installation with xref:cluster-types.adoc#solrcloud-mode[SolrCloud] as part of your design from the start.
Fortunately, Solr is configured by default to run in xref:cluster-types.adoc#solrcloud-mode[SolrCloud] mode, letting you scale up.

== Package Installation

@@ -84,9 +84,7 @@ This directory includes several important scripts that will make using Solr easier.

solr and solr.cmd::: This is xref:solr-control-script-reference.adoc[Solr's Control Script], also known as `bin/solr` (*nix) / `bin/solr.cmd` (Windows).
This script is the preferred tool to start and stop Solr.
You can also create collections or cores, configure authentication, and work with configuration files when running in SolrCloud mode.

post::: The xref:indexing-guide:post-tool.adoc[], which provides a simple command line interface for POSTing content to Solr.
You can also create collections or cores, configure authentication, work with configuration files, and even index documents into Solr.

solr.in.sh and solr.in.cmd:::
These are property files for *nix and Windows systems, respectively.
@@ -193,8 +191,7 @@ For instance, to launch the "techproducts" example, you would do:
bin/solr start -e techproducts
----

Currently, the available examples you can run are: techproducts, schemaless, and cloud.
See the section xref:solr-control-script-reference.adoc#running-with-example-configurations[Running with Example Configurations] for details on each example.
See the section xref:solr-control-script-reference.adoc#running-with-example-configurations[Running with Example Configurations] for details on all the examples available.

.Going deeper with SolrCloud
NOTE: Running the `cloud` example demonstrates running multiple nodes of Solr using xref:cluster-types.adoc#solrcloud-mode[SolrCloud] mode.