Build Your Own Database From Scratch - James Smith
Build Your Own Database From Scratch - James Smith
Build Your Own Database From Scratch - James Smith
James Smith
2023-05-17
OceanofPDF.com
Build Your Own
Database From Scratch
00. Introduction
0.1 What is This Book About?
0.2 How to Use This Book?
0.3 Topic One: Persistence
0.4 Topic Two: Indexing
0.5 Topic Three: Concurrency
01. Files vs Databases
1.1 Persisting Data to Files
1.2 Atomic Renaming
1.3 fsync
1.4 Append-Only Logs
02. Indexing
2.1 Key-Value Store and
Relational DB
2.2 Hashtables
2.3 B-Trees
2.4 LSM-Trees
03. B-Tree: The Ideas
3.1 The Intuitions of the B-Tree
and BST
3.2 B-tree and Nested Arrays
3.3 B-Tree Operations
3.4 Immutable Data Structures
04. B-Tree: The Practice (Part I)
4.1 The Node Format
4.2 Data Types
4.3 Decoding the B-tree Node
4.4 The B-Tree Insertion
05. B-Tree: The Practice (Part II)
5.1 The B-Tree Deletion
5.2 The Root Node
5.3 Testing the B-Tree
5.4 Closing Remarks
06. Persist to Disk
6.1 The Method for Persisting
Data
6.2 mmap-Based IO
6.3 The Master Page
6.4 Allocating Disk Pages
6.5 Initializing the Database
6.6 Update Operations
07. Free List: Reusing Pages
7.1 Design the Free List
7.2 The Free List Datatype
7.3 The Free List
Implementation
7.4 Manage Disk Pages
08. Rows and Columns
8.1 Introduction
8.2 Data Structures
8.3 Point Query
8.4 Updates
8.5 Create New Tables
09. Range Query
9.1 B-Tree Iterator
9.2 Data Serialization
9.3 Range Query
10. Secondary Index
10.1 Index Definitions
10.2 Maintaining Indexes
10.3 Using Secondary Indexes
11. Atomic Transactions
11.1 KV Transaction Interfaces
11.2 DB Transaction Interfaces
11.3 Implementing the KV
Transaction
12. Concurrent Readers and
Writers
12.1 The Readers-Writer
Problem
12.2 Analysing the
Implementation
12.3 Concurrent Transactions
12.4 The Free List
12.5 Closing Remarks
13. Query Language: Parser
13.1 The Grammar
13.2 Operator Precedence and
Recursion
13.3 Parsing Expressions
13.4 Parsing Statements
14. Query Language: Execution
14.1 Introduction
14.2 Expression Evaluation
14.3 Fetching Rows
14.4 Executing Statements
14.5 Further Steps
OceanofPDF.com
00. Introduction
https://build-your-own.org
OceanofPDF.com
01. Files vs Databases
This chapter shows the limitations of
simply dumping data to files and the
problems that databases solve.
_, err = fp.Write(data)
return err
}
This naive approach has some
drawbacks:
_, err = fp.Write(data)
if err != nil {
os.Remove(tmp)
return err
}
1.3 fsync
To fix the problem, we must flush the
data to the disk before renaming it.
The Linux syscall for this is “fsync”.
_, err = fp.Write(data)
if err != nil {
os.Remove(tmp)
return err
}
OceanofPDF.com
02. Indexing
2.2 Hashtables
2.3 B-Trees
2.4 LSM-Trees
How to query:
How to update:
5. When updating a key, the key is
inserted into a file from the top
level first.
6. If the file size exceeds a
threshold, merge it with the next
level.
7. The file size threshold increases
exponentially with each level,
which means that the amount of
data also increases exponentially.
Let’s analyze how this works. For
queries:
For updates:
OceanofPDF.com
03. B-Tree: The Ideas
parent parent
/ | \ => / | | \
L1 L2 L6 L1 L3 L4 L6
new_root
/ \
root N1 N2
/ | \ => / | | \
L1 L2 L6 L1 L3 L4 L6
OceanofPDF.com
04. B-Tree: The Practice
(Part I)
This chapter implements an immutable
B+ tree in Golang. The
implementation is minimal and thus is
easy to follow.
const (
BNODE_NODE = 1 // internal nodes without
values
BNODE_LEAF = 2 // leaf nodes with values
)
const HEADER = 4
func init() {
node1max := HEADER + 8 + 2 + 4 +
BTREE_MAX_KEY_SIZE + BTREE_MAX_VAL_SIZE
assert(node1max <= BTREE_PAGE_SIZE)
}
// header
func (node BNode) btype() uint16 {
return
binary.LittleEndian.Uint16(node.data)
}
func (node BNode) nkeys() uint16 {
return
binary.LittleEndian.Uint16(node.data[2:4])
}
func (node BNode) setHeader(btype uint16,
nkeys uint16) {
binary.LittleEndian.PutUint16(node.data[0:2],
btype)
binary.LittleEndian.PutUint16(node.data[2:4],
nkeys)
}
// pointers
func (node BNode) getPtr(idx uint16) uint64
{
assert(idx < node.nkeys())
pos := HEADER + 8*idx
return
binary.LittleEndian.Uint64(node.data[pos:])
}
func (node BNode) setPtr(idx uint16, val
uint64) {
assert(idx < node.nkeys())
pos := HEADER + 8*idx
binary.LittleEndian.PutUint64(node.data[pos:],
val)
}
binary.LittleEndian.PutUint16(node.data[offsetPos(node,
idx):], offset)
}
// key-values
func (node BNode) kvPos(idx uint16) uint16 {
assert(idx <= node.nkeys())
return HEADER + 8*node.nkeys() +
2*node.nkeys() + node.getOffset(idx)
}
func (node BNode) getKey(idx uint16) []byte
{
assert(idx < node.nkeys())
pos := node.kvPos(idx)
klen :=
binary.LittleEndian.Uint16(node.data[pos:])
return node.data[pos+4:][:klen]
}
func (node BNode) getVal(idx uint16) []byte
{
assert(idx < node.nkeys())
pos := node.kvPos(idx)
klen :=
binary.LittleEndian.Uint16(node.data[pos+0:])
vlen :=
binary.LittleEndian.Uint16(node.data[pos+2:])
return node.data[pos+4+klen:][:vlen]
}
// pointers
for i := uint16(0); i < n; i++ {
new.setPtr(dstNew+i,
old.getPtr(srcOld+i))
}
// offsets
dstBegin := new.getOffset(dstNew)
srcBegin := old.getOffset(srcOld)
for i := uint16(1); i <= n; i++ { //
NOTE: the range is [1, n]
offset := dstBegin +
old.getOffset(srcOld+i) - srcBegin
new.setOffset(dstNew+i, offset)
}
// KVs
begin := old.kvPos(srcOld)
end := old.kvPos(srcOld + n)
copy(new.data[new.kvPos(dstNew):],
old.data[begin:end])
}
binary.LittleEndian.PutUint16(new.data[pos+0:],
uint16(len(key)))
binary.LittleEndian.PutUint16(new.data[pos+2:],
uint16(len(val)))
copy(new.data[pos+4:], key)
copy(new.data[pos+4+uint16(len(key)):],
val)
// the offset of the next key
new.setOffset(idx+1,
new.getOffset(idx)+4+uint16((len(key)+len(val))))
}
Step 3: Recursive Insertion
OceanofPDF.com
05. B-Tree: The Practice
(Part II)
Following the previous chapter on B-
tree implementation.
if idx > 0 {
sibling := tree.get(node.getPtr(idx
- 1))
merged := sibling.nbytes() +
updated.nbytes() - HEADER
if merged <= BTREE_PAGE_SIZE {
return -1, sibling
}
}
if idx+1 < node.nkeys() {
sibling := tree.get(node.getPtr(idx
+ 1))
merged := sibling.nbytes() +
updated.nbytes() - HEADER
if merged <= BTREE_PAGE_SIZE {
return +1, sibling
}
}
return 0, BNode{}
}
updated := treeDelete(tree,
tree.get(tree.root), key)
if len(updated.data) == 0 {
return false // not found
}
tree.del(tree.root)
if updated.btype() == BNODE_NODE &&
updated.nkeys() == 1 {
// remove a level
tree.root = updated.getPtr(0)
} else {
tree.root = tree.new(updated)
}
return true
}
// the interface
func (tree *BTree) Insert(key []byte, val
[]byte) {
assert(len(key) != 0)
assert(len(key) <= BTREE_MAX_KEY_SIZE)
assert(len(val) <= BTREE_MAX_VAL_SIZE)
if tree.root == 0 {
// create the first node
root := BNode{data: make([]byte,
BTREE_PAGE_SIZE)}
root.setHeader(BNODE_LEAF, 2)
// a dummy key, this makes the tree
cover the whole key space.
// thus a lookup can always find a
containing node.
nodeAppendKV(root, 0, 0, nil, nil)
nodeAppendKV(root, 1, 0, key, val)
tree.root = tree.new(root)
return
}
node := tree.get(tree.root)
tree.del(tree.root)
type C struct {
tree BTree
ref map[string]string
pages map[uint64]BNode
}
func newC() *C {
pages := map[uint64]BNode{}
return &C{
tree: BTree{
get: func(ptr uint64) BNode {
node, ok := pages[ptr]
assert(ok)
return node
},
new: func(node BNode) uint64 {
assert(node.nbytes() <=
BTREE_PAGE_SIZE)
key :=
uint64(uintptr(unsafe.Pointer(&node.data[0])))
assert(pages[key].data ==
nil)
pages[key] = node
return key
},
del: func(ptr uint64) {
_, ok := pages[ptr]
assert(ok)
delete(pages, ptr)
},
},
ref: map[string]string{},
pages: pages,
}
}
OceanofPDF.com
06. Persist to Disk
The B-tree data structure from the
previous chapter can be dumped to
disk easily. Let’s build a simple KV
store on top of it. Since our B-tree
implementation is immutable, we’ll
allocate disk space in an append-only
manner, reusing disk space is
deferred to the next chapter.
6.2 mmap-Based IO
The contents of a disk file can be
mapped from a virtual address using
the mmap syscall. Reading from this
address initiates transparent disk IO,
which is the same as reading the file
via the read syscall, but without the
need for a user-space buffer and the
overhead of a syscall. The mapped
address is a proxy to the page cache,
modifying data via it is the same as
the write syscall.
mmap is convenient, and we’ll use it for
our KV store. However, the use of
mmap is not essential.
if fi.Size()%BTREE_PAGE_SIZE != 0 {
return 0, nil, errors.New("File
size is not a multiple of page size.")
}
mmapSize := 64 << 20
assert(mmapSize%BTREE_PAGE_SIZE == 0)
for mmapSize < int(fi.Size()) {
mmapSize *= 2
}
// mmapSize can be larger than the file
syscall.PROT_READ|syscall.PROT_WRITE,
syscall.MAP_SHARED,
)
if err != nil {
return 0, nil, fmt.Errorf("mmap:
%w", err)
}
type KV struct {
Path string
// internals
fp *os.File
tree BTree
mmap struct {
file int // file size, can
be larger than the database size
total int // mmap size, can
be larger than the file size
chunks [][]byte // multiple mmaps,
can be non-continuous
}
page struct {
flushed uint64 // database size
in number of pages
temp [][]byte // newly allocated
pages
}
}
syscall.PROT_READ|syscall.PROT_WRITE,
syscall.MAP_SHARED,
)
if err != nil {
return fmt.Errorf("mmap: %w", err)
}
db.mmap.total += db.mmap.total
db.mmap.chunks = append(db.mmap.chunks,
chunk)
return nil
}
The size of the new mapping
increases exponentially so that we
don’t have to call mmap frequently.
| the_master_page | pages... |
tree_root | pages... |
| btree_root | page_used | ^
^
| | |
|
+------------+----------------------+
|
|
|
+-----------------------
----------------+
data := db.mmap.chunks[0]
root :=
binary.LittleEndian.Uint64(data[16:])
used :=
binary.LittleEndian.Uint64(data[24:])
db.tree.root = root
db.page.flushed = used
return nil
}
Below is the function for updating the
master page. Unlike the code for
reading, it doesn’t use the mapped
address for writing. This is because
modifying a page via mmap is not
atomic. The kernel could flush the
page midway and corrupt the disk
file, while a small write that doesn’t
cross the page boundary is
guaranteed to be atomic.
// update the master page. it must be
atomic.
func masterStore(db *KV) error {
var data [32]byte
copy(data[:16], []byte(DB_SIG))
binary.LittleEndian.PutUint64(data[16:],
db.tree.root)
binary.LittleEndian.PutUint64(data[24:],
db.page.flushed)
// NOTE: Updating the page via mmap is
not atomic.
// Use the `pwrite()` syscall
instead.
_, err := db.fp.WriteAt(data[:], 0)
if err != nil {
return fmt.Errorf("write master
page: %w", err)
}
return nil
}
type KV struct {
// omitted...
page struct {
flushed uint64 // database size
in number of pages
temp [][]byte // newly allocated
pages
}
}
// callback for BTree, allocate a new page.
func (db *KV) pageNew(node BNode) uint64 {
// TODO: reuse deallocated pages
assert(len(node.data) <=
BTREE_PAGE_SIZE)
ptr := db.page.flushed +
uint64(len(db.page.temp))
db.page.temp = append(db.page.temp,
node.data)
return ptr
}
db.mmap.file = fileSize
return nil
}
// btree callbacks
db.tree.get = db.pageGet
db.tree.new = db.pageNew
db.tree.del = db.pageDel
// done
return nil
fail:
db.Close()
return fmt.Errorf("KV.Open: %w", err)
}
// cleanups
func (db *KV) Close() {
for _, chunk := range db.mmap.chunks {
err := syscall.Munmap(chunk)
assert(err == nil)
}
_ = db.fp.Close()
}
// read the db
func (db *KV) Get(key []byte) ([]byte,
bool) {
return db.tree.Get(key)
}
// update the db
func (db *KV) Set(key []byte, val []byte)
error {
db.tree.Insert(key, val)
return flushPages(db)
}
OceanofPDF.com
07. Free List: Reusing
Pages
Since our B-tree is immutable, every
update to the KV store creates new
nodes in the path instead of updating
current nodes, leaving some nodes
unreachable from the latest version.
We need to reuse these unreachable
nodes from old versions, otherwise,
the database file will grow
indefinitely.
const BNODE_FREE_LIST = 3
const FREE_LIST_HEADER = 4 + 8 + 8
const FREE_LIST_CAP = (BTREE_PAGE_SIZE -
FREE_LIST_HEADER) / 8
// done
flnSetTotal(fl.get(fl.head),
uint64(total+len(freed)))
}
func flPush(fl *FreeList, freed []uint64,
reuse []uint64) {
for len(freed) > 0 {
new := BNode{make([]byte,
BTREE_PAGE_SIZE)}
if len(reuse) > 0 {
// reuse a pointer from the
list
fl.head, reuse = reuse[0],
reuse[1:]
fl.use(fl.head, new)
} else {
// or append a page to house
the new node
fl.head = fl.new(new)
}
}
assert(len(reuse) == 0)
}
7.4 Manage Disk Pages
type KV struct {
// omitted...
page struct {
flushed uint64 // database size in
number of pages
nfree int // number of pages
taken from the free list
nappend int // number of pages
to be appended
// newly allocated or deallocated
pages keyed by the pointer.
// nil value denotes a deallocated
page.
updates map[uint64][]byte
}
}
Step 5: Done
OceanofPDF.com
08. Rows and Columns
8.1 Introduction
const (
TYPE_ERROR = 0
TYPE_BYTES = 1
TYPE_INT64 = 2
)
// table cell
type Value struct {
Type uint32
I64 int64
Str []byte
}
// table row
type Record struct {
Cols []string
Vals []Value
}
// table definition
type TableDef struct {
// user defined
Name string
Types []uint32 // column types
Cols []string // column names
PKeys int // the first `PKeys`
columns are the primary key
// auto-assigned B-tree key prefixes for
different tables
Prefix uint32
}
rec.Cols = append(rec.Cols,
tdef.Cols[tdef.PKeys:]...)
rec.Vals = append(rec.Vals,
values[tdef.PKeys:]...)
return true, nil
}
tdef := &TableDef{}
err = json.Unmarshal(rec.Get("def").Str,
tdef)
assert(err == nil)
return tdef
}
8.4 Updates
// add a record
func (db *DB) Set(table string, rec Record,
mode int) (bool, error) {
tdef := getTableDef(db, table)
if tdef == nil {
return false, fmt.Errorf("table not
found: %s", table)
}
return dbUpdate(db, tdef, rec, mode)
}
func (db *DB) Insert(table string, rec
Record) (bool, error) {
return db.Set(table, rec,
MODE_INSERT_ONLY)
}
func (db *DB) Update(table string, rec
Record) (bool, error) {
return db.Set(table, rec,
MODE_UPDATE_ONLY)
}
func (db *DB) Upsert(table string, rec
Record) (bool, error) {
return db.Set(table, rec, MODE_UPSERT)
}
Three steps:
assert(tdef.Prefix >
TABLE_PREFIX_MIN)
} else {
meta.AddStr("val", make([]byte, 4))
}
// update the next prefix
binary.LittleEndian.PutUint32(meta.Get("val").Str,
tdef.Prefix+1)
_, err = dbUpdate(db, TDEF_META, *meta,
0)
if err != nil {
return err
}
// B-tree iterator
type BIter struct {
tree *BTree
path []BNode // from root to leaf
pos []uint16 // indexes into nodes
}
const (
CMP_GE = +3 // >=
CMP_GT = +2 // >
CMP_LT = -2 // <
CMP_LE = -3 // <=
)
// order-preserving encoding
func encodeValues(out []byte, vals []Value)
[]byte {
for _, v := range vals {
switch v.Type {
case TYPE_INT64:
var buf [8]byte
u := uint64(v.I64) + (1 << 63)
binary.BigEndian.PutUint64(buf[:], u)
out = append(out, buf[:]...)
case TYPE_BYTES:
out = append(out,
escapeString(v.Str)...)
out = append(out, 0) // null-
terminated
default:
panic("what?")
}
}
return out
}
req.tdef = tdef
OceanofPDF.com
10. Secondary Index
In this chapter, we’ll add extra indexes
(also known as secondary indexes) to
our database. Queries will no longer
be restricted to the primary key.
// table definition
type TableDef struct {
// user defined
Name string
Types []uint32 // column types
Cols []string // column names
PKeys int // the first `PKeys`
columns are the primary key
Indexes [][]string
// auto-assigned B-tree key prefixes for
different tables/indexes
Prefix uint32
IndexPrefixes []uint32
}
To find a row via an index, the index
must contain a copy of the primary
key. We’ll accomplish this by
appending primary key columns to the
index; this also makes the index key
unique, which is assumed by the B-
tree lookup code.
binary.LittleEndian.PutUint32(meta.Get("val").Str,
tdef.Prefix+ntree)
_, err = dbUpdate(db, TDEF_META, *meta,
0)
if err != nil {
return err
}
// store the definition
// omited...
}
const (
INDEX_ADD = 1
INDEX_DEL = 2
)
// maintain indexes
if req.Updated && !req.Added {
decodeValues(req.Old,
values[tdef.PKeys:]) // get the old row
indexOp(db, tdef, Record{tdef.Cols,
values}, INDEX_DEL)
}
if req.Updated {
indexOp(db, tdef, rec, INDEX_ADD)
}
return added, nil
}
// delete a record by its primary key
func dbDelete(db *DB, tdef *TableDef, rec
Record) (bool, error) {
// omitted...
// maintain indexes
if deleted {
// likewise...
}
return true, nil
}
10.3 Using Secondary Indexes
pos := 0
if len(in) > 0 && in[0] >= 0xfe {
out[0] = 0xfe
out[1] = in[0]
pos += 2
in = in[1:]
}
// omitted...
return out
}
tdef := sc.tdef
rec.Cols = tdef.Cols
rec.Vals = rec.Vals[:0]
key, val := sc.iter.Deref()
if sc.indexNo < 0 {
// primary key, decode the KV pair
// omitted...
} else {
// secondary index
// The "value" part of the KV store
is not used by indexes
assert(len(val) == 0)
// select an index
indexNo, err := findIndex(tdef,
req.Key1.Cols)
if err != nil {
return err
}
index, prefix := tdef.Cols[:tdef.PKeys],
tdef.Prefix
if indexNo >= 0 {
index, prefix =
tdef.Indexes[indexNo],
tdef.IndexPrefixes[indexNo]
}
req.db = db
req.tdef = tdef
req.indexNo = indexNo
Step 5: Congratulations
OceanofPDF.com
11. Atomic Transactions
A transaction allows multiple updates
to be performed atomically (all or
nothing). In the last chapter, updating
a row can result in multiple KV
updates (due to secondary indexes),
which are not atomic and can lead to
corruption if interrupted (not crash-
resistant). Implementing transactions
fixes this.
// KV transaction
type KVTX struct {
// later...
}
// begin a transaction
func (kv *KV) Begin(tx *KVTX)
// end a transaction: commit updates
func (kv *KV) Commit(tx *KVTX) error
// end a transaction: rollback
func (kv *KV) Abort(tx *KVTX)
// KV operations
func (tx *KVTX) Get(key []byte) ([]byte,
bool) {
return tx.db.tree.Get(key)
}
func (tx *KVTX) Seek(key []byte, cmp int)
*BIter {
return tx.db.tree.Seek(key, cmp)
}
func (tx *KVTX) Update(req *InsertReq) bool
{
tx.db.tree.InsertEx(req)
return req.Added
}
func (tx *KVTX) Del(req *DeleteReq) bool {
return tx.db.tree.DeleteEx(req)
}
11.2 DB Transaction Interfaces
// DB transaction
type DBTX struct {
kv KVTX
db *DB
}
// KV transaction
type KVTX struct {
db *KV
// for the rollback
tree struct {
root uint64
}
free struct {
head uint64
}
}
// begin a transaction
func (kv *KV) Begin(tx *KVTX) {
tx.db = kv
tx.tree.root = kv.tree.root
tx.free.head = kv.free.head
}
// rollback the tree and other in-memory
data structures.
func rollbackTX(tx *KVTX) {
kv := tx.db
kv.tree.root = tx.tree.root
kv.free.head = tx.free.head
kv.page.nfree = 0
kv.page.nappend = 0
kv.page.updates = map[uint64][]byte{}
}
OceanofPDF.com
12. Concurrent Readers
and Writers
3 major changes:
type KV struct {
// omitted...
mu sync.Mutex
writer sync.Mutex
// omitted...
}
type KV struct {
Path string
// internals
fp *os.File
// mod 1: moved the B-tree and the free
list
tree struct {
root uint64
}
free FreeListData
mmap struct {
// same; omitted...
}
// mod 2: moved the page management
page struct {
flushed uint64 // database size in
number of pages
}
// mod 3: mutexes
mu sync.Mutex
writer sync.Mutex
// mod 4: version number and the reader
list
version uint64
readers ReaderList // heap, for
tracking the minimum reader version
}
// implements heap.Interface
type ReaderList []*KVReader
// read-only KV transactions
type KVReader struct {
// the snapshot
version uint64
tree BTree
mmap struct {
chunks [][]byte // copied from
struct KV. read-only.
}
// for removing from the heap
index int
}
// KV transaction
type KVTX struct {
KVReader
db *KV
free FreeList
page struct {
nappend int // number of pages to
be appended
// newly allocated or deallocated
pages keyed by the pointer.
// nil value denotes a deallocated
page.
updates map[uint64][]byte
}
}
// begin a transaction
func (kv *KV) Begin(tx *KVTX) {
tx.db = kv
tx.page.updates = map[uint64][]byte{}
tx.mmap.chunks = kv.mmap.chunks
kv.writer.Lock()
tx.version = kv.version
// btree
tx.tree.root = kv.tree.root
tx.tree.get = tx.pageGet
tx.tree.new = tx.pageNew
tx.tree.del = tx.pageDel
// freelist
tx.free.FreeListData = kv.free
tx.free.version = kv.version
tx.free.get = tx.pageGet
tx.free.new = tx.pageAppend
tx.free.use = tx.pageUse
tx.free.minReader = kv.version
kv.mu.Lock()
if len(kv.readers) > 0 {
tx.free.minReader =
kv.readers[0].version
}
kv.mu.Unlock()
}
// begin a transaction
func (kv *KV) Begin(tx *KVTX) {
// omitted...
tx.free.minReader = kv.version
kv.mu.Lock()
if len(kv.readers) > 0 {
tx.free.minReader =
kv.readers[0].version
}
kv.mu.Unlock()
}
return ptr
}
// a < b
func versionBefore(a, b uint64) bool {
return int64(a-b) < 0
}
OceanofPDF.com
13. Query Language:
Parser
The last thing to add to our database is
a query language. A query language
exposes all the functionality we have
implemented as a human interface.
13.1.1 Statements
13.1.2 Conditions
However, the conditions in our
language differ from SQL. Unlike
SQL, which uses the WHERE clause to
select rows, we separate conditions
into indexing conditions and non-
indexing conditions.
13.1.3 Expressions
The language also contains arbitrary
expressions in the SELECT statement,
FILTER conditions, and the UPDATE
statement. Expressions are just
recursive binary or unary operations.
-a
a * b, a / b
a + b, a - b
a = b, a < b, ... -- all comparisons
NOT a
a AND b
a OR b
(a, b, c, ...) -- tuple
13.2 Operator Precedence and
Recursion
// syntax tree
type QLNode struct {
Value // Type, I64, Str
Kids []QLNode
}
a
a + b
a + b + c + ...
def parse_add():
node = parse_column()
while parse('+'):
right = parse_column()
node = QLNode(type='+', kids=[node,
right])
return node
Now we add the multiplication
operator, which has a different
precedence. Let’s revise the
expression a + b, the subexpression a
or b could be a multiplication, which
should be applied before the addition.
(e.g.: when the b is c * d). We’ll add a
level of recursion to handle this:
def parse_add():
node = parse_mul()
while parse('+'):
right = parse_mul()
node = QLNode(type='+', kids=[node,
right])
return node
def parse_mul():
node = parse_column()
while parse('*'):
right = parse_column()
node = QLNode(type='*', kids=[node,
right])
return node
p.idx += len(kw)
}
return true
}
13.3.2 Generalization
The pExprOr should recurse into the AND
operator (pExprAnd) according to the
precedence list. But there are many
precedences, so let’s generalize this.
func pExprBinop(
p *Parser, node *QLNode,
ops []string, types []uint32, next
func(*Parser, *QLNode),
) {
assert(len(ops) == len(types))
left := QLNode{}
next(p, &left)
*node = left
}
end := p.idx
if !(end < len(p.input) &&
isSymStart(p.input[end])) {
return false
}
end++
for end < len(p.input) &&
isSym(p.input[end]) {
end++
}
if
pKeywordSet[strings.ToLower(string(p.input[p.idx:end]))]
{
return false // not allowed
}
node.Type = QL_SYM
node.Str = p.input[p.idx:end]
p.idx = end
return true
}
// stmt: select
type QLSelect struct {
QLScan
Names []string // expr AS name
Output []QLNode // expression list
}
// stmt: update
type QLUpdate struct {
QLScan
Names []string
Values []QLNode
}
// stmt: insert
type QLInsert struct {
Table string
Mode int
Names []string
Values [][]QLNode
}
// stmt: delete
type QLDelete struct {
QLScan
}
// SELECT xxx
pSelectExprList(p, &stmt)
// FROM table
if !pKeyword(p, "from") {
pErr(p, nil, "expect `FROM` table")
}
stmt.Table = pMustSym(p)
// INDEX BY xxx FILTER yyy LIMIT zzz
pScan(p, &stmt.QLScan)
if p.err != nil {
return nil
}
return &stmt
}
OceanofPDF.com
14. Query Language:
Execution
14.1 Introduction
// output
for _, irec := range records {
orec := Record{Cols: req.Names}
for _, node := range req.Output {
ctx := QLEvalContex{env: irec}
qlEval(&ctx, node)
if ctx.err != nil {
return nil, ctx.err
}
orec.Vals = append(orec.Vals,
ctx.out)
}
out = append(out, orec)
}
return out, nil
}
It does 2 things:
switch node.Type {
// refer to a column
case QL_SYM:
if v :=
ctx.env.Get(string(node.Str)); v != nil {
ctx.out = *v
} else {
qlErr(ctx, "unknown column:
%s", node.Str)
}
// a literal value
case QL_I64, QL_STR:
ctx.out = node.Value
// more; omitted...
default:
panic("not implemented")
}
}
// unary ops
case QL_NEG:
qlEval(ctx, node.Kids[0])
if ctx.out.Type == TYPE_INT64 {
ctx.out.I64 = -ctx.out.I64
} else {
qlErr(ctx, "QL_NEG type error")
}
Corresponding examples:
if req.Key2.Type != 0 {
sc.Key2, sc.Cmp2, err =
qlEvalScanKey(req.Key1)
if err != nil {
return err
}
}
rec := Record{}
if ok {
sc.Deref(&rec)
}
// `FILTER`
if ok && req.Filter.Type != 0 {
ctx := QLEvalContex{env: rec}
qlEval(&ctx, req.Filter)
if ctx.err != nil {
return nil, ctx.err
}
if ctx.out.Type != TYPE_INT64 {
return nil,
errors.New("filter is not of boolean type")
}
ok = (ctx.out.I64 != 0)
}
if ok {
out = append(out, rec)
}
sc.Next()
}
return out, nil
}
tdef := getTableDef(&tx.DBReader,
req.Table)
pk := tdef.Cols[:tdef.PKeys]
for _, rec := range records {
key := Record{Cols: pk}
for _, col := range pk {
key.Vals = append(key.Vals,
*rec.Get(col))
}
deleted, err :=
tx.Delete(req.Table, key)
assert(err == nil && deleted) //
deleting an existing row
}
return uint64(len(records)), nil
}
https://build-your-own.org/
OceanofPDF.com