Skip to content

Commit 6707376

Browse files
committed
fix(unixfs): align HAMT sharding threshold with JS implementation
HAMT sharding threshold comparison was historically implemented as `>` in JS and `>=` in Go: - JS: https://github.com/ipfs/helia/blob/005c2a7/packages/unixfs/src/commands/utils/is-over-shard-threshold.ts#L31 - Go: https://github.com/ipfs/boxo/blob/319662c/ipld/unixfs/io/directory.go#L438 This inconsistency meant a directory exactly at the 256 KiB threshold would stay basic in JS but convert to HAMT in Go, producing different CIDs for the same input. This commit changes Go to use `>` (matching JS), so a directory exactly at the threshold now stays as a basic (flat) directory. This aligns cross-implementation behavior for CID determinism per IPIP-499. Also adds SizeEstimationMode to MkdirOpts so MFS directories respect the configured estimation mode instead of always using the global default.
1 parent 4ff72d0 commit 6707376

File tree

6 files changed

+124
-20
lines changed

6 files changed

+124
-20
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ The following emojis are used to highlight certain changes:
3333

3434
### Fixed
3535

36+
- 🛠 `ipld/unixfs/io`: fixed HAMT sharding threshold comparison to use `>` instead of `>=`. A directory exactly at the threshold now stays as a basic (flat) directory, aligning behavior with code documentation and the JS implementation. This is a theoretical breaking change, but unlikely to impact real-world users as it requires a directory to be exactly at the threshold boundary. If you depend on the old behavior, adjust `HAMTShardingSize` to be 1 byte lower. See [IPIP-499](https://github.com/ipfs/specs/pull/499).
3637
- `bitswap/network`: Fixed goroutine leak that could cause bitswap to stop serving blocks after extended uptime. The root cause is `stream.Close()` blocking indefinitely when remote peers are unresponsive during multistream handshake ([go-libp2p#3448](https://github.com/libp2p/go-libp2p/pull/3448)). This PR ([#1083](https://github.com/ipfs/boxo/pull/1083)) adds a localized fix specific to bitswap's `SendMessage` by setting a read deadline before closing streams.
3738

3839
### Security

ipld/unixfs/io/directory.go

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ var log = logging.Logger("unixfs")
2626
// The size is not the *exact* block size of the encoded BasicDirectory but just
2727
// the estimated size based byte length of links name and CID (BasicDirectory's
2828
// ProtoNode doesn't use the Data field so this estimate is pretty accurate).
29+
//
30+
// Threshold behavior: directory converts to HAMT when estimatedSize > HAMTShardingSize.
31+
// A directory exactly at the threshold stays basic (threshold value is NOT included).
2932
var HAMTShardingSize = int(256 * units.KiB)
3033

3134
// DefaultShardWidth is the default value used for hamt sharding width.
@@ -325,7 +328,8 @@ func NewBasicDirectory(dserv ipld.DAGService, opts ...DirectoryOption) (*BasicDi
325328
return nil, err
326329
}
327330

328-
// Scan node links (if any) to restore estimated size.
331+
// Initialize all size tracking fields from the node.
332+
// This must be done after node is created and options applied.
329333
basicDir.computeEstimatedSizeAndTotalLinks()
330334

331335
return basicDir, nil
@@ -338,7 +342,7 @@ func NewBasicDirectoryFromNode(dserv ipld.DAGService, node *mdag.ProtoNode) *Bas
338342
basicDir.node = node
339343
basicDir.dserv = dserv
340344

341-
// Scan node links (if any) to restore estimated size.
345+
// Initialize all size tracking fields from the node.
342346
basicDir.computeEstimatedSizeAndTotalLinks()
343347

344348
return basicDir
@@ -487,8 +491,12 @@ func (d *BasicDirectory) GetSizeEstimationMode() SizeEstimationMode {
487491
return HAMTSizeEstimation // fall back to global
488492
}
489493

494+
// computeEstimatedSizeAndTotalLinks initializes all size tracking fields from the current node.
495+
// This includes estimatedSize (for links-based estimation), totalLinks count,
496+
// and cachedBlockSize (for block-based estimation when that mode is active).
490497
func (d *BasicDirectory) computeEstimatedSizeAndTotalLinks() {
491498
d.estimatedSize = 0
499+
d.totalLinks = 0
492500
// err is just breaking the iteration and we always return nil
493501
_ = d.ForEachLink(context.TODO(), func(l *ipld.Link) error {
494502
d.addToEstimatedSize(l.Name, l.Cid)
@@ -497,6 +505,12 @@ func (d *BasicDirectory) computeEstimatedSizeAndTotalLinks() {
497505
})
498506
// ForEachLink will never fail traversing the BasicDirectory
499507
// and neither the inner callback `addToEstimatedSize`.
508+
509+
// Initialize cachedBlockSize for block-based estimation.
510+
// Check both instance-specific mode and global mode.
511+
if d.GetSizeEstimationMode() == SizeEstimationBlock && d.node != nil {
512+
d.cachedBlockSize, _ = calculateBlockSize(d.node)
513+
}
500514
}
501515

502516
func (d *BasicDirectory) addToEstimatedSize(name string, linkCid cid.Cid) {
@@ -568,7 +582,9 @@ func (d *BasicDirectory) needsToSwitchByLinkSize(name string, nodeToAdd ipld.Nod
568582
operationSizeChange += linksize.LinkSizeFunction(name, nodeToAdd.Cid())
569583
}
570584

571-
switchShardingSize := d.estimatedSize+operationSizeChange >= HAMTShardingSize
585+
// Switch to HAMT when size exceeds threshold (> not >=).
586+
// Directory exactly at threshold stays basic.
587+
switchShardingSize := d.estimatedSize+operationSizeChange > HAMTShardingSize
572588
switchMaxLinks := d.checkMaxLinksExceeded(nodeToAdd, entryToRemove)
573589
return switchShardingSize || switchMaxLinks, nil
574590
}
@@ -610,7 +626,9 @@ func (d *BasicDirectory) needsToSwitchByBlockSize(name string, nodeToAdd ipld.No
610626
return false, err
611627
}
612628

613-
switchShardingSize := exactSize >= HAMTShardingSize
629+
// Switch to HAMT when size exceeds threshold (> not >=).
630+
// Directory exactly at threshold stays basic.
631+
switchShardingSize := exactSize > HAMTShardingSize
614632
switchMaxLinks := d.checkMaxLinksExceeded(nodeToAdd, entryToRemove)
615633
return switchShardingSize || switchMaxLinks, nil
616634
}
@@ -1041,9 +1059,9 @@ func (d *HAMTDirectory) sizeBelowThreshold(ctx context.Context, sizeChange int)
10411059
}
10421060

10431061
partialSize += d.linkSizeFor(linkResult.Link)
1044-
if partialSize+sizeChange >= HAMTShardingSize {
1045-
// We have already fetched enough shards to assert we are above the
1046-
// threshold, so no need to keep fetching.
1062+
// Check if size exceeds threshold (> not >=, matching upgrade logic).
1063+
// Early exit: no need to enumerate more links once we know we're above.
1064+
if partialSize+sizeChange > HAMTShardingSize {
10471065
below = false
10481066
break
10491067
}

ipld/unixfs/io/directory_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -990,3 +990,4 @@ func TestHAMTDirectorySizeEstimationMode(t *testing.T) {
990990
assert.Equal(t, SizeEstimationBlock, hamtDir.GetSizeEstimationMode())
991991
})
992992
}
993+

ipld/unixfs/io/profile_test.go

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,18 +130,19 @@ func TestProfileHAMTThresholdBehavior(t *testing.T) {
130130
assert.False(t, isHAMT, "should remain BasicDirectory when below threshold (200 < 300)")
131131
})
132132

133-
t.Run("at threshold switches to HAMTDirectory", func(t *testing.T) {
133+
t.Run("at threshold stays BasicDirectory", func(t *testing.T) {
134134
dir, err := NewDirectory(ds)
135135
require.NoError(t, err)
136136

137137
// Add 3 entries = 300 bytes, exactly at threshold
138+
// With > comparison, at threshold stays basic
138139
for i := range 3 {
139140
err = dir.AddChild(ctx, fmt.Sprintf("e%d", i), child)
140141
require.NoError(t, err)
141142
}
142143

143144
_, isHAMT := dir.(*DynamicDirectory).Directory.(*HAMTDirectory)
144-
assert.True(t, isHAMT, "should switch to HAMTDirectory when at threshold (300 >= 300)")
145+
assert.False(t, isHAMT, "should stay BasicDirectory when exactly at threshold (300 == 300)")
145146
})
146147

147148
t.Run("above threshold switches to HAMTDirectory", func(t *testing.T) {
@@ -270,4 +271,79 @@ func TestProfileHAMTThresholdBehavior(t *testing.T) {
270271
assert.Greater(t, actualBlockSize, linksEstimate,
271272
"links-based estimation should underestimate actual block size")
272273
})
274+
275+
t.Run("SizeEstimationBlock exact threshold boundary", func(t *testing.T) {
276+
saveAndRestoreGlobals(t)
277+
278+
// Test that the HAMT switch happens exactly when size > threshold (not >=)
279+
HAMTSizeEstimation = SizeEstimationBlock
280+
281+
ds := mdtest.Mock()
282+
ctx := t.Context()
283+
child := ft.EmptyDirNode()
284+
require.NoError(t, ds.Add(ctx, child))
285+
286+
// First, determine the exact block size after adding entries
287+
testDir, err := NewDirectory(ds, WithSizeEstimationMode(SizeEstimationBlock))
288+
require.NoError(t, err)
289+
290+
// Add entries and track sizes at each step
291+
var sizes []int
292+
for i := 0; i < 10; i++ {
293+
err = testDir.AddChild(ctx, fmt.Sprintf("entry%d", i), child)
294+
require.NoError(t, err)
295+
296+
node, err := testDir.GetNode()
297+
require.NoError(t, err)
298+
pn, ok := node.(*mdag.ProtoNode)
299+
require.True(t, ok)
300+
size, err := calculateBlockSize(pn)
301+
require.NoError(t, err)
302+
sizes = append(sizes, size)
303+
}
304+
305+
// Set threshold to exactly the size after 5 entries
306+
// (entry0..entry4 = 5 entries)
307+
exactThreshold := sizes[4]
308+
t.Logf("threshold set to exactly %d bytes (size after 5 entries)", exactThreshold)
309+
310+
t.Run("at exact threshold stays BasicDirectory", func(t *testing.T) {
311+
HAMTShardingSize = exactThreshold
312+
313+
dir, err := NewDirectory(ds, WithSizeEstimationMode(SizeEstimationBlock))
314+
require.NoError(t, err)
315+
316+
// Add 5 entries to reach exactly threshold size
317+
for i := 0; i < 5; i++ {
318+
err = dir.AddChild(ctx, fmt.Sprintf("entry%d", i), child)
319+
require.NoError(t, err)
320+
}
321+
322+
node, err := dir.GetNode()
323+
require.NoError(t, err)
324+
pn := node.(*mdag.ProtoNode)
325+
actualSize, _ := calculateBlockSize(pn)
326+
t.Logf("actual size: %d, threshold: %d", actualSize, exactThreshold)
327+
328+
_, isHAMT := dir.(*DynamicDirectory).Directory.(*HAMTDirectory)
329+
assert.False(t, isHAMT, "should stay BasicDirectory when size equals threshold (%d == %d)", actualSize, exactThreshold)
330+
})
331+
332+
t.Run("one byte over threshold switches to HAMTDirectory", func(t *testing.T) {
333+
// Set threshold to size[4] - 1 so that size[4] is > threshold
334+
HAMTShardingSize = sizes[4] - 1
335+
336+
dir, err := NewDirectory(ds, WithSizeEstimationMode(SizeEstimationBlock))
337+
require.NoError(t, err)
338+
339+
// Add 5 entries - last one should trigger HAMT
340+
for i := 0; i < 5; i++ {
341+
err = dir.AddChild(ctx, fmt.Sprintf("entry%d", i), child)
342+
require.NoError(t, err)
343+
}
344+
345+
_, isHAMT := dir.(*DynamicDirectory).Directory.(*HAMTDirectory)
346+
assert.True(t, isHAMT, "should switch to HAMTDirectory when size exceeds threshold (%d > %d)", sizes[4], HAMTShardingSize)
347+
})
348+
})
273349
}

mfs/dir.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,16 @@ func NewDirectory(ctx context.Context, name string, node ipld.Node, parent paren
8989
// The directory is added to the DAGService. To create a new MFS
9090
// root use NewEmptyRootFolder instead.
9191
func NewEmptyDirectory(ctx context.Context, name string, parent parent, dserv ipld.DAGService, prov provider.MultihashProvider, opts MkdirOpts) (*Directory, error) {
92-
db, err := uio.NewDirectory(dserv,
92+
dirOpts := []uio.DirectoryOption{
9393
uio.WithMaxLinks(opts.MaxLinks),
9494
uio.WithMaxHAMTFanout(opts.MaxHAMTFanout),
9595
uio.WithStat(opts.Mode, opts.ModTime),
9696
uio.WithCidBuilder(opts.CidBuilder),
97-
)
97+
}
98+
if opts.SizeEstimationMode != nil {
99+
dirOpts = append(dirOpts, uio.WithSizeEstimationMode(*opts.SizeEstimationMode))
100+
}
101+
db, err := uio.NewDirectory(dserv, dirOpts...)
98102
if err != nil {
99103
return nil, err
100104
}
@@ -580,6 +584,7 @@ func (d *Directory) setNodeData(data []byte, links []*ipld.Link) error {
580584
// We need to carry our desired settings.
581585
db.SetMaxLinks(d.unixfsDir.GetMaxLinks())
582586
db.SetMaxHAMTFanout(d.unixfsDir.GetMaxHAMTFanout())
587+
db.SetSizeEstimationMode(d.unixfsDir.GetSizeEstimationMode())
583588
d.unixfsDir = db
584589

585590
return nil

mfs/ops.go

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"strings"
1010
"time"
1111

12+
uio "github.com/ipfs/boxo/ipld/unixfs/io"
1213
cid "github.com/ipfs/go-cid"
1314
ipld "github.com/ipfs/go-ipld-format"
1415
)
@@ -120,13 +121,14 @@ func PutNode(r *Root, path string, nd ipld.Node) error {
120121

121122
// MkdirOpts is used by Mkdir
122123
type MkdirOpts struct {
123-
Mkparents bool
124-
Flush bool
125-
CidBuilder cid.Builder
126-
Mode os.FileMode
127-
ModTime time.Time
128-
MaxLinks int
129-
MaxHAMTFanout int
124+
Mkparents bool
125+
Flush bool
126+
CidBuilder cid.Builder
127+
Mode os.FileMode
128+
ModTime time.Time
129+
MaxLinks int
130+
MaxHAMTFanout int
131+
SizeEstimationMode *uio.SizeEstimationMode
130132
}
131133

132134
// Mkdir creates a directory at 'path' under the directory 'd', creating
@@ -157,8 +159,9 @@ func Mkdir(r *Root, pth string, opts MkdirOpts) error {
157159

158160
// opts to make the parents leave MkParents and Flush as false.
159161
parentsOpts := MkdirOpts{
160-
MaxLinks: opts.MaxLinks,
161-
MaxHAMTFanout: opts.MaxHAMTFanout,
162+
MaxLinks: opts.MaxLinks,
163+
MaxHAMTFanout: opts.MaxHAMTFanout,
164+
SizeEstimationMode: opts.SizeEstimationMode,
162165
}
163166

164167
for i, d := range parts[:len(parts)-1] {

0 commit comments

Comments
 (0)