diff --git a/README.md b/README.md index 6c7faad..603eafa 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ struct itemx { uint8_t md[20]; /* sha1 message digest */ uint32_t sid; /* owner slab id */ uint32_t offset; /* item offset from owner slab base */ + rel_time_t expiry; /* expiry in secs */ uint64_t cas; /* cas */ } __attribute__ ((__packed__)); ``` @@ -54,9 +55,9 @@ Each index entry contains both object-specific information (key name, &c.) and d To further reduce the memory consumed by the index, we store the SHA-1 hash of the key in each index entry, instead of the key itself. The SHA-1 hash acts as the unique identifier for each object. The on-disk object format contains the complete object key and value. False positives from SHA-1 hash collisions are detected after object retrieval from the disk by comparison with the requested key. If there are collisions on the write path, new objects with the same hash key simply overwrite previous objects. -The index entry (struct itemx) on a 64-bit system is 44 bytes in size. It is possible to further reduce index entry size to 28 bytes, if CAS is unsupported, MD5 hashing is used, and the next pointer is reduced to 4 bytes. +The index entry (struct itemx) on a 64-bit system is 48 bytes in size. It is possible to further reduce index entry size to 32 bytes, if CAS is unsupported, MD5 hashing is used, and the next pointer is reduced to 4 bytes. -At this point, it is instructive to consider the relative size of fatcache's index and the on-disk data. With a 44 byte index entry, an index consuming 44 MB of memory can address 1M objects. If the average object size is 1 KB, then a 44 MB index can address 1 GB of on-disk storage - a 23x memory overcommit. If the average object size is 500 bytes, then a 44 MB index can address 500 MB of SSD - a 11x memory overcommit. Index size and object size relate in this way to determine the addressable capacity of the SSD. 
+At this point, it is instructive to consider the relative size of fatcache's index and the on-disk data. With a 48 byte index entry, an index consuming 48 MB of memory can address 1M objects. If the average object size is 1 KB, then a 48 MB index can address 1 GB of on-disk storage - a 21x memory overcommit. If the average object size is 500 bytes, then a 48 MB index can address 500 MB of SSD - a 10x memory overcommit. Index size and object size relate in this way to determine the addressable capacity of the SSD. ## Build diff --git a/src/fc_item.c b/src/fc_item.c index 640efb4..aa05664 100644 --- a/src/fc_item.c +++ b/src/fc_item.c @@ -24,23 +24,6 @@ extern struct settings settings; static uint64_t cas_id; -/* - * Return true if the item has expired, otherwise return false. Items - * with expiry of 0 are considered as unexpirable. - */ -bool -item_expired(struct item *it) -{ - ASSERT(it->magic == ITEM_MAGIC); - - if(it->expiry != 0 && it->expiry < time_now()) { - itemx_removex(it->hash, it->md); - return true; - } else { - return false; - } -} - /* * Return the owner slab of item it. 
*/ @@ -96,7 +79,6 @@ item_get(uint8_t *key, uint8_t nkey, uint8_t cid, uint32_t ndata, it->cid = cid; it->nkey = nkey; it->ndata = ndata; - it->expiry = expiry; it->flags = flags; fc_memcpy(it->md, md, sizeof(it->md)); it->hash = hash; @@ -105,9 +87,9 @@ item_get(uint8_t *key, uint8_t nkey, uint8_t cid, uint32_t ndata, log_debug(LOG_VERB, "get it '%.*s' at offset %"PRIu32" with cid %"PRIu8 " expiry %u", it->nkey, item_key(it), it->offset, it->cid, - it->expiry); + expiry); - itemx_putx(it->hash, it->md, it->sid, it->offset, ++cas_id); + itemx_putx(it->hash, it->md, it->sid, it->offset, expiry, ++cas_id); return it; } diff --git a/src/fc_item.h b/src/fc_item.h index c033a69..3d58e31 100644 --- a/src/fc_item.h +++ b/src/fc_item.h @@ -28,7 +28,6 @@ struct item { uint8_t unused[2]; /* unused */ uint8_t nkey; /* key length */ uint32_t ndata; /* date length */ - rel_time_t expiry; /* expiry in secs */ uint32_t flags; /* flags opaque to the server */ uint8_t md[20]; /* key message digest */ uint32_t hash; /* key hash */ @@ -88,7 +87,6 @@ item_data(struct item *it) return it->end + it->nkey; } -bool item_expired(struct item *it); struct slab *item_to_slab(struct item *it); uint8_t item_slabcid(uint8_t nkey, uint32_t ndata); diff --git a/src/fc_itemx.c b/src/fc_itemx.c index 80cc841..1070261 100644 --- a/src/fc_itemx.c +++ b/src/fc_itemx.c @@ -32,6 +32,26 @@ static struct itemx_tqh free_itemxq; /* free itemx q */ static struct itemx *istart; /* itemx memory start */ static struct itemx *iend; /* itemx memory end */ +/* + * Return true if the itemx has expired, otherwise return false. Itemx + * with expiry of 0 are considered as unexpirable. + */ +bool +itemx_expired(struct itemx *itx) +{ + uint32_t hash; + + ASSERT(itx != NULL); + + if(itx->expiry != 0 && itx->expiry < time_now()) { + hash = sha1_hash(itx->md); + itemx_removex(hash, itx->md); + return true; + } else { + return false; + } +} + /* * Returns true, if there are no free item indexes, otherwise * return false. 
@@ -178,7 +198,7 @@ itemx_getx(uint32_t hash, uint8_t *md) void itemx_putx(uint32_t hash, uint8_t *md, uint32_t sid, uint32_t offset, - uint64_t cas) + rel_time_t expiry, uint64_t cas) { struct itemx *itx; struct itemx_tqh *bucket; @@ -188,6 +208,7 @@ itemx_putx(uint32_t hash, uint8_t *md, uint32_t sid, uint32_t offset, itx = itemx_get(); itx->sid = sid; itx->offset = offset; + itx->expiry = expiry; itx->cas = cas; fc_memcpy(itx->md, md, sizeof(itx->md)); diff --git a/src/fc_itemx.h b/src/fc_itemx.h index 94858f1..7798238 100644 --- a/src/fc_itemx.h +++ b/src/fc_itemx.h @@ -25,6 +25,7 @@ struct itemx { uint8_t md[20]; /* sha1 message digest */ uint32_t sid; /* owner slab id */ uint32_t offset; /* item offset from owner slab base */ + rel_time_t expiry; /* expiry in secs */ uint64_t cas; /* cas */ } __attribute__ ((__packed__)); @@ -34,8 +35,9 @@ rstatus_t itemx_init(void); void itemx_deinit(void); bool itemx_empty(void); +bool itemx_expired(struct itemx *itx); struct itemx *itemx_getx(uint32_t hash, uint8_t *md); -void itemx_putx(uint32_t hash, uint8_t *md, uint32_t sid, uint32_t ioff, uint64_t cas); +void itemx_putx(uint32_t hash, uint8_t *md, uint32_t sid, uint32_t ioff, rel_time_t expiry, uint64_t cas); bool itemx_removex(uint32_t hash, uint8_t *md); #endif diff --git a/src/fc_request.c b/src/fc_request.c index 5fe2716..d708fef 100644 --- a/src/fc_request.c +++ b/src/fc_request.c @@ -190,6 +190,10 @@ req_process_get(struct context *ctx, struct conn *conn, struct msg *msg) return; } + if (itemx_expired(itx)) { + rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_FOUND); + return; + } /* * On a hit, we read the item with address [sid, offset] and respond * with item value if the item hasn't expired yet. 
@@ -199,10 +203,6 @@ req_process_get(struct context *ctx, struct conn *conn, struct msg *msg) rsp_send_error(ctx, conn, msg, MSG_RSP_SERVER_ERROR, errno); return; } - if (item_expired(it)) { - rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_FOUND); - return; - } rsp_send_value(ctx, conn, msg, it, itx->cas); } @@ -254,17 +254,12 @@ static void req_process_add(struct context *ctx, struct conn *conn, struct msg *msg) { struct itemx *itx; - struct item *it; /* add, adds only if the mapping is not present */ itx = itemx_getx(msg->hash, msg->md); - if (itx != NULL) { - it = slab_read_item(itx->sid, itx->offset); - /* if the item hasn't expired yet */ - if(!item_expired(it)) { - rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_STORED); - return; - } + if (itx != NULL && !itemx_expired(itx)) { + rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_STORED); + return; } req_process_set(ctx, conn, msg); @@ -274,18 +269,10 @@ static void req_process_replace(struct context *ctx, struct conn *conn, struct msg *msg) { struct itemx *itx; - struct item *it; /* replace, only replaces if the mapping is present */ itx = itemx_getx(msg->hash, msg->md); - if (itx == NULL) { - rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_STORED); - return; - } - - /* if the item has expired */ - it = slab_read_item(itx->sid, itx->offset); - if(item_expired(it)) { + if (itx == NULL || itemx_expired(itx)) { rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_STORED); return; } @@ -339,7 +326,7 @@ req_process_concat(struct context *ctx, struct conn *conn, struct msg *msg) /* 1). look up existing itemx */ itx = itemx_getx(msg->hash, msg->md); - if (itx == NULL) { + if (itx == NULL || itemx_expired(itx)) { /* 2a). 
miss -> return NOT_STORED */ rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_STORED); return; @@ -351,10 +338,6 @@ req_process_concat(struct context *ctx, struct conn *conn, struct msg *msg) rsp_send_error(ctx, conn, msg, MSG_RSP_SERVER_ERROR, errno); return; } - if (item_expired(oit)) { - rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_STORED); - return; - } ndata = msg->vlen + oit->ndata; cid = item_slabcid(nkey, ndata); @@ -410,7 +393,7 @@ req_process_num(struct context *ctx, struct conn *conn, struct msg *msg) /* 1). look up existing itemx */ itx = itemx_getx(msg->hash, msg->md); - if (itx == NULL) { + if (itx == NULL || itemx_expired(itx)) { /* 2a). miss -> return NOT_FOUND */ rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_FOUND); return; @@ -422,10 +405,6 @@ req_process_num(struct context *ctx, struct conn *conn, struct msg *msg) rsp_send_error(ctx, conn, msg, MSG_RSP_SERVER_ERROR, errno); return; } - if (item_expired(it)) { - rsp_send_status(ctx, conn, msg, MSG_RSP_NOT_FOUND); - return; - } /* 3). sanity check item data to be a number */ status = fc_atou64(item_data(it), it->ndata, &cnum);