Skip to content
96 changes: 75 additions & 21 deletions builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,21 @@ import (
)

var defaultBuilderOpts = &BuilderOpts{
Encoder: 1,
RegistryTableSize: 10000,
RegistryMRUSize: 2,
Encoder: 1,
RegistryTableSize: 10000,
RegistryMRUSize: 2,
UnfinishedNodesStackSize: 64,
BuilderNodePoolingConfig: BuilderNodePoolingConfig{
// This value should always be significantly larger than
// RegistryTableSize * RegistryMRUSize because that defines
// how many items can be stored in the registry, and if the
// pools MaxSize is not much larger than that, then all the
// BuilderNodes will end up stuck in the registry and not
// returned to the pool which will cause the building process
// to begin allocating a lot.
MaxSize: 100000,
MaxTransitionSize: 32,
},
}

// A Builder is used to build a new FST. When possible data is
Expand Down Expand Up @@ -50,9 +62,10 @@ func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) {
if opts == nil {
opts = defaultBuilderOpts
}
builderNodePool := &builderNodePool{}

builderNodePool := newBuilderNodePool(opts.BuilderNodePoolingConfig)
rv := &Builder{
unfinished: newUnfinishedNodes(builderNodePool),
unfinished: newUnfinishedNodes(builderNodePool, opts),
registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize),
builderNodePool: builderNodePool,
opts: opts,
Expand Down Expand Up @@ -83,6 +96,7 @@ func (b *Builder) Reset(w io.Writer) error {
if err != nil {
return err
}

return nil
}

Expand Down Expand Up @@ -156,12 +170,22 @@ func (b *Builder) compileFrom(iState int) error {
func (b *Builder) compile(node *builderNode) (int, error) {
if node.final && len(node.trans) == 0 &&
node.finalOutput == 0 {
// We're done with this node so its safe to put it back in the pool.
b.builderNodePool.Put(node)
return 0, nil
}
found, addr, entry := b.registry.entry(node)
if found {
// This node already existed in the registry (and thus the registry
// did not assume ownership of it) so its safe to put it back in
// the pool.
b.builderNodePool.Put(node)
return addr, nil
}
// If the node was not found in the registry, then the registry will
// have assumed ownership of it and is responsible for returning it
// to the pool.

addr, err := b.encoder.encodeState(node, b.lastAddr)
if err != nil {
return 0, err
Expand Down Expand Up @@ -194,10 +218,15 @@ func (u *unfinishedNodes) Reset() {
u.pushEmpty(false)
}

func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes {
func newUnfinishedNodes(p *builderNodePool, opts *BuilderOpts) *unfinishedNodes {
initialSize := opts.UnfinishedNodesStackSize
if initialSize <= 0 {
initialSize = defaultBuilderOpts.UnfinishedNodesStackSize
}

rv := &unfinishedNodes{
stack: make([]*builderNodeUnfinished, 0, 64),
cache: make([]builderNodeUnfinished, 64),
stack: make([]*builderNodeUnfinished, 0, initialSize),
cache: make([]builderNodeUnfinished, initialSize),
builderNodePool: p,
}
rv.pushEmpty(false)
Expand Down Expand Up @@ -416,21 +445,40 @@ func outputCat(l, r uint64) uint64 {
return l + r
}

// BuilderNodePoolingConfig is the configuration struct for the BuilderNodePool.
// Note that unsafe.SizeOf(transition{}) is 24 bytes and unsafe.SizeOf(BuilderNode{})
// is 48 bytes so the maximum amount of memory used by the pool should be approximately
// MaxSize * (48 + 24 * MaxTransitionSize) not including the extra space required
// by the G.C. Note if an F.S.T construction never requires this many BuilderNodes then
// the maximum size of the pool will never be reached as it is allocated lazily.
type BuilderNodePoolingConfig struct {
// Maximum number of builder nodes can be retained in the pool.
MaxSize int
// Maximum size of the transitions array for an individual builder node.
MaxTransitionSize int
}

// builderNodePool pools builderNodes using a singly linked list.
//
// NB: builderNode lifecylce is described by the following interactions -
// +------------------------+ +----------------------+
// | Unfinished Nodes | Transfer once | Registry |
// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) |
// +------------------------+ marked frozen +----------------------+
// ^ |
// | |
// | Put()
// | Get() on +-------------------+ when
// +-new char--------| builderNode Pool |<-----------evicted
// +-------------------+
// The lifecycle is as follows:
//
// 1. Builder retrieves a node from the pool using Get() whenever it needs one.
// 2. After a node is compiled it is either:
// a. Discarded and immediately returned to the pool.
// b. Transferred to the registry (which assumes ownership of it) and will
// return it to the pool when it evicts the node to make room for another,
// or when the entire registry is Reset().
type builderNodePool struct {
head *builderNode
config BuilderNodePoolingConfig
size int
head *builderNode
}

func newBuilderNodePool(config BuilderNodePoolingConfig) *builderNodePool {
// Pool will lazy alloc.
return &builderNodePool{
config: config,
}
}

func (p *builderNodePool) Get() *builderNode {
Expand All @@ -439,14 +487,20 @@ func (p *builderNodePool) Get() *builderNode {
}
head := p.head
p.head = p.head.next
p.size--
return head
}

func (p *builderNodePool) Put(v *builderNode) {
if v == nil {
if v == nil ||
p.size >= p.config.MaxSize ||
cap(v.trans) > p.config.MaxTransitionSize {
// Don't store nil or allow the pool to violate its config.
return
}

v.reset()
v.next = p.head
p.head = v
p.size++
}
19 changes: 18 additions & 1 deletion registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ type registryCell struct {
node *builderNode
}

// Registry is used as a form of LRU so that the number of nodes that need to be kept
// in memory is reduced. When the builder is compiling the FST and is presented with
// compiling a given node, it can check the registry to see if an equivalent node has
// already been compiled. If so, the registry will return the address of the already
// compiled node and the builder can use that. If an equivalent node has not already
// been compiled (or was, but has since been evicted from the LRU), the builder will
// recompile it into the encoder and then add it to the registry for future use.
type registry struct {
builderNodePool *builderNodePool
table []registryCell
Expand All @@ -40,7 +47,12 @@ func newRegistry(p *builderNodePool, tableSize, mruSize int) *registry {
func (r *registry) Reset() {
var empty registryCell
for i := range r.table {
r.builderNodePool.Put(r.table[i].node)
if r.table[i].node != nil {
// Only try and return to the pool if the node actually exists to
// avoid excessive function call overhead in the scenario where many
// of the cells are empty.
r.builderNodePool.Put(r.table[i].node)
}
r.table[i] = empty
}
}
Expand Down Expand Up @@ -77,6 +89,10 @@ func (r *registry) hash(b *builderNode) int {

type registryCache []registryCell

// The registry is responsible for returning BuilderNodes that it controls to the BuilderNodePool once
// they are evicted. As a result, all the codepaths in the entry method that return false (entry was not
// found and the registry is assuming ownership of this node) will return the corresponding evicted node to
// the builderNodePool.
func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, int, *registryCell) {
if len(r) == 1 {
if r[0].node != nil && r[0].node.equiv(node) {
Expand All @@ -93,6 +109,7 @@ func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, in
return true, addr, nil
}
}

// no match
last := len(r) - 1
pool.Put(r[last].node)
Expand Down
8 changes: 5 additions & 3 deletions vellum.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@ var ErrIteratorDone = errors.New("iterator-done")
// BuilderOpts is a structure to let advanced users customize the behavior
// of the builder and some aspects of the generated FST.
type BuilderOpts struct {
Encoder int
RegistryTableSize int
RegistryMRUSize int
Encoder int
RegistryTableSize int
RegistryMRUSize int
UnfinishedNodesStackSize int
BuilderNodePoolingConfig BuilderNodePoolingConfig
}

// New returns a new Builder which will stream out the
Expand Down
28 changes: 24 additions & 4 deletions vellum_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,26 @@ func TestRoundTripSimple(t *testing.T) {
}

func TestRoundTripThousand(t *testing.T) {
dataset := thousandTestWords
randomThousandVals := randomValues(dataset)
b, err := New(nil, nil)
if err != nil {
t.Fatalf("error creating builder: %v", err)
}

testRoundTripThousand(t, b)
}

func TestRoundTripThousandBuilderIsReusable(t *testing.T) {
b, err := New(nil, nil)
if err != nil {
t.Fatalf("error creating builder: %v", err)
}

for i := 0; i < 1000; i++ {
testRoundTripThousand(t, b)
}
}

func testRoundTripThousand(t *testing.T, b *Builder) {
f, err := ioutil.TempFile("", "vellum")
if err != nil {
t.Fatal(err)
Expand All @@ -167,11 +184,14 @@ func TestRoundTripThousand(t *testing.T) {
}
}()

b, err := New(f, nil)
err = b.Reset(f)
if err != nil {
t.Fatalf("error creating builder: %v", err)
t.Fatalf("error resetting builder: %v", err)
}

dataset := thousandTestWords
randomThousandVals := randomValues(dataset)

err = insertStrings(b, dataset, randomThousandVals)
if err != nil {
t.Fatalf("error inserting thousand words: %v", err)
Expand Down