Skip to content

Commit b0eaa4c

Browse files
author
Amit Kapila
committed
Avoid creation of the free space map for small heap relations, take 2.
Previously, all heaps had FSMs. For very small tables, this means that the FSM took up more space than the heap did. This is wasteful, so now we refrain from creating the FSM for heaps with 4 pages or fewer. If the last known target block has insufficient space, we still try to insert into some other page before giving up and extending the relation, since doing otherwise leads to table bloat. Testing showed that trying every page penalized performance slightly, so we compromise and try every other page. This way, we visit at most two pages. Any pages with wasted free space become visible at next relation extension, so we still control table bloat. As a bonus, directly attempting one or two pages can even be faster than consulting the FSM would have been. Once the FSM is created for a heap we don't remove it even if somebody deletes all the rows from the corresponding relation. We don't think it is a useful optimization as it is quite likely that relation will again grow to the same size. Author: John Naylor, Amit Kapila Reviewed-by: Amit Kapila Tested-by: Mithun C Y Discussion: https://www.postgresql.org/message-id/CAJVSVGWvB13PzpbLEecFuGFc5V2fsO736BsdTakPiPAcdMM5tQ@mail.gmail.com
1 parent be12aa4 commit b0eaa4c

File tree

16 files changed

+543
-102
lines changed

16 files changed

+543
-102
lines changed

contrib/pageinspect/expected/page.out

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,69 @@
11
CREATE EXTENSION pageinspect;
2-
CREATE TABLE test1 (a int, b int);
3-
INSERT INTO test1 VALUES (16777217, 131584);
4-
VACUUM test1; -- set up FSM
2+
CREATE TABLE test_rel_forks (a int);
3+
-- Make sure there are enough blocks in the heap for the FSM to be created.
4+
INSERT INTO test_rel_forks SELECT i from generate_series(1,2000) i;
5+
-- set up FSM and VM
6+
VACUUM test_rel_forks;
57
-- The page contents can vary, so just test that it can be read
68
-- successfully, but don't keep the output.
7-
SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
9+
SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
810
main_0
911
--------
1012
8192
1113
(1 row)
1214

13-
SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
14-
ERROR: block number 1 is out of range for relation "test1"
15-
SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
15+
SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
16+
ERROR: block number 100 is out of range for relation "test_rel_forks"
17+
SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
1618
fsm_0
1719
-------
1820
8192
1921
(1 row)
2022

21-
SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
22-
fsm_1
23-
-------
24-
8192
25-
(1 row)
26-
27-
SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
23+
SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 20)) AS fsm_20;
24+
ERROR: block number 20 is out of range for relation "test_rel_forks"
25+
SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
2826
vm_0
2927
------
3028
8192
3129
(1 row)
3230

33-
SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
34-
ERROR: block number 1 is out of range for relation "test1"
31+
SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
32+
ERROR: block number 1 is out of range for relation "test_rel_forks"
3533
SELECT octet_length(get_raw_page('xxx', 'main', 0));
3634
ERROR: relation "xxx" does not exist
37-
SELECT octet_length(get_raw_page('test1', 'xxx', 0));
35+
SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
3836
ERROR: invalid fork name
3937
HINT: Valid fork names are "main", "fsm", "vm", and "init".
40-
SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
38+
SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
39+
fsm_page_contents
40+
-------------------
41+
0: 39 +
42+
1: 39 +
43+
3: 39 +
44+
7: 39 +
45+
15: 39 +
46+
31: 39 +
47+
63: 39 +
48+
127: 39 +
49+
255: 39 +
50+
511: 39 +
51+
1023: 39 +
52+
2047: 39 +
53+
4095: 39 +
54+
fp_next_slot: 0 +
55+
56+
(1 row)
57+
58+
SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
4159
?column?
4260
----------
4361
t
4462
(1 row)
4563

64+
DROP TABLE test_rel_forks;
65+
CREATE TABLE test1 (a int, b int);
66+
INSERT INTO test1 VALUES (16777217, 131584);
4667
SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
4768
pagesize | version
4869
----------+---------
@@ -62,26 +83,6 @@ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bi
6283
{"\\x01000001","\\x00020200"}
6384
(1 row)
6485

65-
SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
66-
fsm_page_contents
67-
-------------------
68-
0: 254 +
69-
1: 254 +
70-
3: 254 +
71-
7: 254 +
72-
15: 254 +
73-
31: 254 +
74-
63: 254 +
75-
127: 254 +
76-
255: 254 +
77-
511: 254 +
78-
1023: 254 +
79-
2047: 254 +
80-
4095: 254 +
81-
fp_next_slot: 0 +
82-
83-
(1 row)
84-
8586
DROP TABLE test1;
8687
-- check that using any of these functions with a partitioned table or index
8788
-- would fail

contrib/pageinspect/sql/page.sql

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,35 @@
11
CREATE EXTENSION pageinspect;
22

3-
CREATE TABLE test1 (a int, b int);
4-
INSERT INTO test1 VALUES (16777217, 131584);
3+
CREATE TABLE test_rel_forks (a int);
4+
-- Make sure there are enough blocks in the heap for the FSM to be created.
5+
INSERT INTO test_rel_forks SELECT i from generate_series(1,2000) i;
56

6-
VACUUM test1; -- set up FSM
7+
-- set up FSM and VM
8+
VACUUM test_rel_forks;
79

810
-- The page contents can vary, so just test that it can be read
911
-- successfully, but don't keep the output.
1012

11-
SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
12-
SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
13+
SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
14+
SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
1315

14-
SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
15-
SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
16+
SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
17+
SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 20)) AS fsm_20;
1618

17-
SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
18-
SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
19+
SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
20+
SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
1921

2022
SELECT octet_length(get_raw_page('xxx', 'main', 0));
21-
SELECT octet_length(get_raw_page('test1', 'xxx', 0));
23+
SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
24+
25+
SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
26+
27+
SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
2228

23-
SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
29+
DROP TABLE test_rel_forks;
30+
31+
CREATE TABLE test1 (a int, b int);
32+
INSERT INTO test1 VALUES (16777217, 131584);
2433

2534
SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
2635

@@ -29,8 +38,6 @@ SELECT page_checksum(get_raw_page('test1', 0), 0) IS NOT NULL AS silly_checksum_
2938
SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits)
3039
FROM heap_page_items(get_raw_page('test1', 0));
3140

32-
SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
33-
3441
DROP TABLE test1;
3542

3643
-- check that using any of these functions with a partitioned table or index

doc/src/sgml/storage.sgml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -590,12 +590,13 @@ tuple would otherwise be too big.
590590
<indexterm><primary>FSM</primary><see>Free Space Map</see></indexterm>
591591

592592
<para>
593-
Each heap and index relation, except for hash indexes, has a Free Space Map
594-
(FSM) to keep track of available space in the relation. It's stored
595-
alongside the main relation data in a separate relation fork, named after the
596-
filenode number of the relation, plus a <literal>_fsm</literal> suffix. For example,
597-
if the filenode of a relation is 12345, the FSM is stored in a file called
598-
<filename>12345_fsm</filename>, in the same directory as the main relation file.
593+
Each heap relation, unless it is very small, and each index relation, except
594+
for hash indexes, has a Free Space Map (FSM) to keep track of available
595+
space in the relation. It's stored alongside the main relation data in a
596+
separate relation fork, named after the filenode number of the relation, plus
597+
a <literal>_fsm</literal> suffix. For example, if the filenode of a relation
598+
is 12345, the FSM is stored in a file called <filename>12345_fsm</filename>,
599+
in the same directory as the main relation file.
599600
</para>
600601

601602
<para>

src/backend/access/brin/brin.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1150,7 +1150,7 @@ terminate_brin_buildstate(BrinBuildState *state)
11501150
freespace = PageGetFreeSpace(page);
11511151
blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
11521152
ReleaseBuffer(state->bs_currentInsertBuf);
1153-
RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
1153+
RecordPageWithFreeSpace(state->bs_irel, blk, freespace, InvalidBlockNumber);
11541154
FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
11551155
}
11561156

src/backend/access/brin/brin_pageops.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
310310

311311
if (extended)
312312
{
313-
RecordPageWithFreeSpace(idxrel, newblk, freespace);
313+
RecordPageWithFreeSpace(idxrel, newblk, freespace, InvalidBlockNumber);
314314
FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
315315
}
316316

@@ -461,7 +461,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
461461

462462
if (extended)
463463
{
464-
RecordPageWithFreeSpace(idxrel, blk, freespace);
464+
RecordPageWithFreeSpace(idxrel, blk, freespace, InvalidBlockNumber);
465465
FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
466466
}
467467

@@ -654,7 +654,7 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
654654

655655
/* Measure free space and record it */
656656
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
657-
br_page_get_freespace(page));
657+
br_page_get_freespace(page), InvalidBlockNumber);
658658
}
659659

660660
/*
@@ -703,7 +703,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
703703
/* Choose initial target page, re-using existing target if known */
704704
newblk = RelationGetTargetBlock(irel);
705705
if (newblk == InvalidBlockNumber)
706-
newblk = GetPageWithFreeSpace(irel, itemsz);
706+
newblk = GetPageWithFreeSpace(irel, itemsz, true);
707707

708708
/*
709709
* Loop until we find a page with sufficient free space. By the time we
@@ -895,7 +895,7 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
895895
* pages whose FSM records were forgotten in a crash.
896896
*/
897897
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
898-
br_page_get_freespace(page));
898+
br_page_get_freespace(page), InvalidBlockNumber);
899899
}
900900

901901

src/backend/access/heap/hio.c

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,14 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
246246
* Immediately update the bottom level of the FSM. This has a good
247247
* chance of making this page visible to other concurrently inserting
248248
* backends, and we want that to happen without delay.
249+
*
250+
* Since we know the table will end up with extraBlocks additional
251+
* pages, we pass the final number to avoid possible unnecessary
252+
* system calls and to make sure the FSM is created when we add the
253+
* first new page.
249254
*/
250-
RecordPageWithFreeSpace(relation, blockNum, freespace);
255+
RecordPageWithFreeSpace(relation, blockNum, freespace,
256+
firstBlock + extraBlocks);
251257
}
252258
while (--extraBlocks > 0);
253259

@@ -384,20 +390,9 @@ RelationGetBufferForTuple(Relation relation, Size len,
384390
* We have no cached target page, so ask the FSM for an initial
385391
* target.
386392
*/
387-
targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
388-
389-
/*
390-
* If the FSM knows nothing of the rel, try the last page before we
391-
* give up and extend. This avoids one-tuple-per-page syndrome during
392-
* bootstrapping or in a recently-started system.
393-
*/
394-
if (targetBlock == InvalidBlockNumber)
395-
{
396-
BlockNumber nblocks = RelationGetNumberOfBlocks(relation);
397-
398-
if (nblocks > 0)
399-
targetBlock = nblocks - 1;
400-
}
393+
targetBlock = GetPageWithFreeSpace(relation,
394+
len + saveFreeSpace,
395+
false);
401396
}
402397

403398
loop:
@@ -504,6 +499,13 @@ RelationGetBufferForTuple(Relation relation, Size len,
504499
{
505500
/* use this page as future insert target, too */
506501
RelationSetTargetBlock(relation, targetBlock);
502+
503+
/*
504+
* In case we used an in-memory map of available blocks, reset it
505+
* for next use.
506+
*/
507+
FSMClearLocalMap();
508+
507509
return buffer;
508510
}
509511

@@ -563,9 +565,12 @@ RelationGetBufferForTuple(Relation relation, Size len,
563565

564566
/*
565567
* Check if some other backend has extended a block for us while
566-
* we were waiting on the lock.
568+
* we were waiting on the lock. We only check the FSM -- if there
569+
* isn't one we don't recheck the number of blocks.
567570
*/
568-
targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
571+
targetBlock = GetPageWithFreeSpace(relation,
572+
len + saveFreeSpace,
573+
true);
569574

570575
/*
571576
* If some other waiter has already extended the relation, we
@@ -670,5 +675,11 @@ RelationGetBufferForTuple(Relation relation, Size len,
670675
*/
671676
RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
672677

678+
/*
679+
* In case we used an in-memory map of available blocks, reset it for next
680+
* use.
681+
*/
682+
FSMClearLocalMap();
683+
673684
return buffer;
674685
}

0 commit comments

Comments
 (0)