Build Your Own Database From Scratch EBook·Paperback
⟵ prev Contents next ⟶

🔥 My other Book: Build Your Own Redis

05. B-Tree: The Practice (Part II)

Following the previous chapter on B-tree implementation.

5.1 The B-Tree Deletion

Step 1: Delete From Leaf Nodes

The code for deleting a key from a leaf node is just like other nodeReplace* functions.

// remove a key from a leaf node
func leafDelete(new BNode, old BNode, idx uint16) {
    new.setHeader(BNODE_LEAF, old.nkeys()-1)
    nodeAppendRange(new, old, 0, 0, idx)
    nodeAppendRange(new, old, idx, idx+1, old.nkeys()-(idx+1))
}

Step 2: Recursive Deletion

The structure is similar to the insertion.

// delete a key from the tree
func treeDelete(tree *BTree, node BNode, key []byte) BNode {
    // where to find the key?
    idx := nodeLookupLE(node, key)
    // act depending on the node type
    switch node.btype() {
    case BNODE_LEAF:
        if !bytes.Equal(key, node.getKey(idx)) {
            return BNode{} // not found
        }
        // delete the key in the leaf
        new := BNode{data: make([]byte, BTREE_PAGE_SIZE)}
        leafDelete(new, node, idx)
        return new
    case BNODE_NODE:
        return nodeDelete(tree, node, idx, key)
    default:
        panic("bad node!")
    }
}

Step 3: Handle Internal Nodes

The difference is that we need to merge nodes instead of splitting nodes. A node may be merged into one of its left or right siblings. The nodeReplace* functions are for updating links.

// part of the treeDelete()
func nodeDelete(tree *BTree, node BNode, idx uint16, key []byte) BNode {
    // recurse into the kid
    kptr := node.getPtr(idx)
    updated := treeDelete(tree, tree.get(kptr), key)
    if len(updated.data) == 0 {
        return BNode{} // not found
    }
    tree.del(kptr)

    new := BNode{data: make([]byte, BTREE_PAGE_SIZE)}
    // check for merging
    mergeDir, sibling := shouldMerge(tree, node, idx, updated)
    switch {
    case mergeDir < 0: // left
        merged := BNode{data: make([]byte, BTREE_PAGE_SIZE)}
        nodeMerge(merged, sibling, updated)
        tree.del(node.getPtr(idx - 1))
        nodeReplace2Kid(new, node, idx-1, tree.new(merged), merged.getKey(0))
    case mergeDir > 0: // right
        merged := BNode{data: make([]byte, BTREE_PAGE_SIZE)}
        nodeMerge(merged, updated, sibling)
        tree.del(node.getPtr(idx + 1))
        nodeReplace2Kid(new, node, idx, tree.new(merged), merged.getKey(0))
    case mergeDir == 0:
        if updated.nkeys() == 0 {
            // kid is empty after deletion and has no sibling to merge with.
            // this happens when its parent has only one kid.
            // discard the empty kid and return the parent as an empty node.
            assert(node.nkeys() == 1 && idx == 0)
            new.setHeader(BNODE_NODE, 0)
            // the empty node will be eliminated before reaching root.
        } else {
            nodeReplaceKidN(tree, new, node, idx, updated)
        }
    }
    return new
}

Extra care regarding empty nodes: If a node has no siblings, it cannot be merged, even if all its keys are deleted. In this case, we need to remove the empty node, this will also cause its parent to become an empty node, the empty node will propagate upwords until eventually merged.

// merge 2 nodes into 1
func nodeMerge(new BNode, left BNode, right BNode) {
    new.setHeader(left.btype(), left.nkeys()+right.nkeys())
    nodeAppendRange(new, left, 0, 0, left.nkeys())
    nodeAppendRange(new, right, left.nkeys(), 0, right.nkeys())
}

Step 4: The Conditions for Merging

The conditions for merging are:

  1. The node is smaller than 1/4 of a page (this is arbitrary).
  2. Has a sibling and the merged result does not exceed one page.
// should the updated kid be merged with a sibling?
func shouldMerge(
    tree *BTree, node BNode,
    idx uint16, updated BNode,
) (int, BNode) {
    if updated.nbytes() > BTREE_PAGE_SIZE/4 {
        return 0, BNode{}
    }

    if idx > 0 {
        sibling := tree.get(node.getPtr(idx - 1))
        merged := sibling.nbytes() + updated.nbytes() - HEADER
        if merged <= BTREE_PAGE_SIZE {
            return -1, sibling
        }
    }
    if idx+1 < node.nkeys() {
        sibling := tree.get(node.getPtr(idx + 1))
        merged := sibling.nbytes() + updated.nbytes() - HEADER
        if merged <= BTREE_PAGE_SIZE {
            return +1, sibling
        }
    }
    return 0, BNode{}
}

The deletion code is done.

5.2 The Root Node

We need to keep track of the root node as the tree grows and shrinks. Let’s start with deletion.

This is the final interface for B-tree deletion. The height of the tree will be reduced by one when:

  1. The root node is not a leaf.
  2. The root node has only one child.
func (tree *BTree) Delete(key []byte) bool {
    assert(len(key) != 0)
    assert(len(key) <= BTREE_MAX_KEY_SIZE)
    if tree.root == 0 {
        return false
    }

    updated := treeDelete(tree, tree.get(tree.root), key)
    if len(updated.data) == 0 {
        return false // not found
    }

    tree.del(tree.root)
    if updated.btype() == BNODE_NODE && updated.nkeys() == 1 {
        // remove a level
        tree.root = updated.getPtr(0)
    } else {
        tree.root = tree.new(updated)
    }
    return true
}

And below is the final interface for insertion:

// the interface
func (tree *BTree) Insert(key []byte, val []byte) {
    assert(len(key) != 0)
    assert(len(key) <= BTREE_MAX_KEY_SIZE)
    assert(len(val) <= BTREE_MAX_VAL_SIZE)

    if tree.root == 0 {
        // create the first node
        root := BNode{data: make([]byte, BTREE_PAGE_SIZE)}
        root.setHeader(BNODE_LEAF, 2)
        // a dummy key, this makes the tree cover the whole key space.
        // thus a lookup can always find a containing node.
        nodeAppendKV(root, 0, 0, nil, nil)
        nodeAppendKV(root, 1, 0, key, val)
        tree.root = tree.new(root)
        return
    }

    node := tree.get(tree.root)
    tree.del(tree.root)

    node = treeInsert(tree, node, key, val)
    nsplit, splitted := nodeSplit3(node)
    if nsplit > 1 {
        // the root was split, add a new level.
        root := BNode{data: make([]byte, BTREE_PAGE_SIZE)}
        root.setHeader(BNODE_NODE, nsplit)
        for i, knode := range splitted[:nsplit] {
            ptr, key := tree.new(knode), knode.getKey(0)
            nodeAppendKV(root, uint16(i), ptr, key, nil)
        }
        tree.root = tree.new(root)
    } else {
        tree.root = tree.new(splitted[0])
    }
}

It does two things:

  1. A new root node is created when the old root is split into multiple nodes.
  2. When inserting the first key, create the first leaf node as the root.

There is a little trick here. We insert an empty key into the tree when we create the first node. The empty key is the lowest possible key by sorting order, it makes the lookup function nodeLookupLE always successful, eliminating the case of failing to find a node that contains the input key.

5.3 Testing the B-Tree

Since our data structure code is pure data structure code (without IO), the page allocation code is isolated via 3 callbacks. Below is the container code for testing our B-tree, it keeps pages in an in-memory hashmap without persisting them to disk. In the next chapter, we’ll implement persistence without modifying the B-tree code.

type C struct {
    tree  BTree
    ref   map[string]string
    pages map[uint64]BNode
}

func newC() *C {
    pages := map[uint64]BNode{}
    return &C{
        tree: BTree{
            get: func(ptr uint64) BNode {
                node, ok := pages[ptr]
                assert(ok)
                return node
            },
            new: func(node BNode) uint64 {
                assert(node.nbytes() <= BTREE_PAGE_SIZE)
                key := uint64(uintptr(unsafe.Pointer(&node.data[0])))
                assert(pages[key].data == nil)
                pages[key] = node
                return key
            },
            del: func(ptr uint64) {
                _, ok := pages[ptr]
                assert(ok)
                delete(pages, ptr)
            },
        },
        ref:   map[string]string{},
        pages: pages,
    }
}

We use a reference map to record each B-tree update, so that we can verify the correctness of a B-tree later.

func (c *C) add(key string, val string) {
    c.tree.Insert([]byte(key), []byte(val))
    c.ref[key] = val
}

func (c *C) del(key string) bool {
    delete(c.ref, key)
    return c.tree.Delete([]byte(key))
}

Test cases are left to the reader as an exercise.

5.4 Closing Remarks

This B-tree implementation is pretty minimal, but minimal is good for the purpose of learning. Real-world implementations can be much more complicated and contain practical optimizations.

There are some easy improvements to our B-tree implementation:

  1. Use different formats for leaf nodes and internal nodes. Leaf nodes do not need pointers and internal nodes do not need values. This saves some space.
  2. One of the lengths of the key or value is redundant — the length of the KV pair can be inferred from the offset of the next key.
  3. The first key of a node is not needed because it’s inherited from a link of its parent.
  4. Add a checksum to detect data corruption.

The next step in building a KV store is to persist our B-tree to the disk, which is the topic of the next chapter.

See also:
codecrafters.io offers “Build Your Own X” courses in many programming languages.
Including Redis, Git, SQLite, Docker, and more.
Check it out

⟵ prev Contents next ⟶