
Commit 787eba7
When creating a large hash index, pre-sort the index entries by estimated bucket number, so as to ensure locality of access to the index during the insertion step. Without this, building an index significantly larger than available RAM takes a very long time because of thrashing. On the other hand, sorting is just useless overhead when the index does fit in RAM. We choose to sort when the initial index size exceeds effective_cache_size.

This is a revised version of work by Tom Raney and Shreya Bhargava.
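In outline, the commit makes two decisions visible in the diffs below: whether to pre-sort at all (initial bucket count versus effective_cache_size), and what key to sort on (the hash code masked down to the initial bucket range). A minimal standalone sketch of that arithmetic, for orientation only; log2_ceil, estimate_bucket, and want_presort are illustrative names, not functions from this commit:

#include <stdint.h>

/* Smallest i such that (1 << i) >= n; mirrors _hash_log2 in the hash AM.
 * (Assumes n <= 2^31, which holds for any realistic bucket count.) */
static uint32_t
log2_ceil(uint32_t n)
{
	uint32_t	i = 0;

	while ((UINT32_C(1) << i) < n)
		i++;
	return i;
}

/* Expected bucket for a tuple: its hash code masked down to the current
 * bucket range.  If num_buckets is not a power of 2, some masked values
 * name buckets that don't exist yet; the insert path copes with that. */
static uint32_t
estimate_bucket(uint32_t hashcode, uint32_t num_buckets)
{
	uint32_t	hash_mask = (UINT32_C(1) << log2_ceil(num_buckets)) - 1;

	return hashcode & hash_mask;
}

/* Sort decision: pre-sort only when the initial index, at one page per
 * bucket, exceeds effective_cache_size (given here in pages). */
static int
want_presort(uint32_t num_buckets, uint32_t cache_size_pages)
{
	return num_buckets >= cache_size_pages;
}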
1 parent: ec6550c

8 files changed: +324 −39 lines

src/backend/access/hash/Makefile

Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,7 @@
 # Makefile for access/hash
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.14 2008/02/19 10:30:06 petere Exp $
+#    $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.15 2008/03/16 23:15:08 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -13,6 +13,6 @@ top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
-       hashsearch.o hashutil.o
+       hashsearch.o hashsort.o hashutil.o
 
 include $(top_srcdir)/src/backend/common.mk

src/backend/access/hash/hash.c

Lines changed: 38 additions & 7 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.99 2008/03/15 20:46:31 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.100 2008/03/16 23:15:08 tgl Exp $
  *
  * NOTES
  *	  This file contains only the public interface routines.
@@ -22,13 +22,15 @@
 #include "access/hash.h"
 #include "catalog/index.h"
 #include "commands/vacuum.h"
+#include "optimizer/cost.h"
 #include "optimizer/plancat.h"
 
 
 /* Working state for hashbuild and its callback */
 typedef struct
 {
-	double		indtuples;
+	HSpool	   *spool;			/* NULL if not using spooling */
+	double		indtuples;		/* # tuples accepted into index */
 } HashBuildState;
 
 static void hashbuildCallback(Relation index,
@@ -51,6 +53,7 @@ hashbuild(PG_FUNCTION_ARGS)
 	IndexBuildResult *result;
 	BlockNumber relpages;
 	double		reltuples;
+	uint32		num_buckets;
 	HashBuildState buildstate;
 
 	/*
@@ -61,19 +64,43 @@
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
-	/* estimate the number of rows currently present in the table */
+	/* Estimate the number of rows currently present in the table */
 	estimate_rel_size(heap, NULL, &relpages, &reltuples);
 
-	/* initialize the hash index metadata page and initial buckets */
-	_hash_metapinit(index, reltuples);
+	/* Initialize the hash index metadata page and initial buckets */
+	num_buckets = _hash_metapinit(index, reltuples);
 
-	/* build the index */
+	/*
+	 * If we just insert the tuples into the index in scan order, then
+	 * (assuming their hash codes are pretty random) there will be no locality
+	 * of access to the index, and if the index is bigger than available RAM
+	 * then we'll thrash horribly.  To prevent that scenario, we can sort the
+	 * tuples by (expected) bucket number.  However, such a sort is useless
+	 * overhead when the index does fit in RAM.  We choose to sort if the
+	 * initial index size exceeds effective_cache_size.
+	 *
+	 * NOTE: this test will need adjustment if a bucket is ever different
+	 * from one page.
+	 */
+	if (num_buckets >= (uint32) effective_cache_size)
+		buildstate.spool = _h_spoolinit(index, num_buckets);
+	else
+		buildstate.spool = NULL;
+
+	/* prepare to build the index */
 	buildstate.indtuples = 0;
 
 	/* do the heap scan */
 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
 								   hashbuildCallback, (void *) &buildstate);
 
+	if (buildstate.spool)
+	{
+		/* sort the tuples and insert them into the index */
+		_h_indexbuild(buildstate.spool);
+		_h_spooldestroy(buildstate.spool);
+	}
+
 	/*
 	 * Return statistics
 	 */
@@ -110,7 +137,11 @@ hashbuildCallback(Relation index,
 		return;
 	}
 
-	_hash_doinsert(index, itup);
+	/* Either spool the tuple for sorting, or just put it into the index */
+	if (buildstate->spool)
+		_h_spool(itup, buildstate->spool);
+	else
+		_hash_doinsert(index, itup);
 
 	buildstate->indtuples += 1;
 
src/backend/access/hash/hashpage.c

Lines changed: 20 additions & 3 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.73 2008/03/15 20:46:31 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.74 2008/03/16 23:15:08 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -315,13 +315,14 @@ _hash_chgbufaccess(Relation rel,
 * the initial buckets, and the initial bitmap page.
 *
 * The initial number of buckets is dependent on num_tuples, an estimate
- * of the number of tuples to be loaded into the index initially.
+ * of the number of tuples to be loaded into the index initially.  The
+ * chosen number of buckets is returned.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
-void
+uint32
 _hash_metapinit(Relation rel, double num_tuples)
 {
 	HashMetaPage metap;
@@ -437,11 +438,22 @@ _hash_metapinit(Relation rel, double num_tuples)
 	metap->hashm_ovflpoint = log2_num_buckets;
 	metap->hashm_firstfree = 0;
 
+	/*
+	 * Release buffer lock on the metapage while we initialize buckets.
+	 * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
+	 * won't accomplish anything.  It's a bad idea to hold buffer locks
+	 * for long intervals in any case, since that can block the bgwriter.
+	 */
+	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+
 	/*
 	 * Initialize the first N buckets
 	 */
 	for (i = 0; i < num_buckets; i++)
 	{
+		/* Allow interrupts, in case N is huge */
+		CHECK_FOR_INTERRUPTS();
+
 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
 		pg = BufferGetPage(buf);
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -453,13 +465,18 @@ _hash_metapinit(Relation rel, double num_tuples)
 		_hash_wrtbuf(rel, buf);
 	}
 
+	/* Now reacquire buffer lock on metapage */
+	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+
 	/*
 	 * Initialize first bitmap page
 	 */
 	_hash_initbitmap(rel, metap, num_buckets + 1);
 
 	/* all done */
 	_hash_wrtbuf(rel, metabuf);
+
+	return num_buckets;
 }
 
 /*

src/backend/access/hash/hashsort.c

Lines changed: 116 additions & 0 deletions

@@ -0,0 +1,116 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashsort.c
+ *		Sort tuples for insertion into a new hash index.
+ *
+ * When building a very large hash index, we pre-sort the tuples by bucket
+ * number to improve locality of access to the index, and thereby avoid
+ * thrashing.  We use tuplesort.c to sort the given index tuples into order.
+ *
+ * Note: if the number of rows in the table has been underestimated,
+ * bucket splits may occur during the index build.  In that case we'd
+ * be inserting into two or more buckets for each possible masked-off
+ * hash code value.  That's no big problem though, since we'll still have
+ * plenty of locality of access.
+ *
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *		$PostgreSQL: pgsql/src/backend/access/hash/hashsort.c,v 1.1 2008/03/16 23:15:08 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "miscadmin.h"
+#include "utils/tuplesort.h"
+
+
+/*
+ * Status record for spooling/sorting phase.
+ */
+struct HSpool
+{
+	Tuplesortstate *sortstate;	/* state data for tuplesort.c */
+	Relation	index;
+};
+
+
+/*
+ * create and initialize a spool structure
+ */
+HSpool *
+_h_spoolinit(Relation index, uint32 num_buckets)
+{
+	HSpool	   *hspool = (HSpool *) palloc0(sizeof(HSpool));
+	uint32		hash_mask;
+
+	hspool->index = index;
+
+	/*
+	 * Determine the bitmask for hash code values.  Since there are currently
+	 * num_buckets buckets in the index, the appropriate mask can be computed
+	 * as follows.
+	 *
+	 * Note: at present, the passed-in num_buckets is always a power of 2,
+	 * so we could just compute num_buckets - 1.  We prefer not to assume
+	 * that here, though.
+	 */
+	hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;
+
+	/*
+	 * We size the sort area as maintenance_work_mem rather than work_mem to
+	 * speed index creation.  This should be OK since a single backend can't
+	 * run multiple index creations in parallel.
+	 */
+	hspool->sortstate = tuplesort_begin_index_hash(index,
+												   hash_mask,
+												   maintenance_work_mem,
+												   false);
+
+	return hspool;
+}
+
+/*
+ * clean up a spool structure and its substructures.
+ */
+void
+_h_spooldestroy(HSpool *hspool)
+{
+	tuplesort_end(hspool->sortstate);
+	pfree(hspool);
+}
+
+/*
+ * spool an index entry into the sort file.
+ */
+void
+_h_spool(IndexTuple itup, HSpool *hspool)
+{
+	tuplesort_putindextuple(hspool->sortstate, itup);
+}
+
+/*
+ * given a spool loaded by successive calls to _h_spool,
+ * create an entire index.
+ */
+void
+_h_indexbuild(HSpool *hspool)
+{
+	IndexTuple	itup;
+	bool		should_free;
+
+	tuplesort_performsort(hspool->sortstate);
+
+	while ((itup = tuplesort_getindextuple(hspool->sortstate,
+										   true, &should_free)) != NULL)
+	{
+		_hash_doinsert(hspool->index, itup);
+		if (should_free)
+			pfree(itup);
+	}
+}
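tuplesort_begin_index_hash itself lives in tuplesort.c/tuplesort.h, presumably among the three changed files not reproduced in this excerpt. Conceptually, all it needs beyond ordinary tuplesort setup is a comparator that orders index tuples by expected bucket, i.e. by masked hash code. A hedged sketch of that comparison rule (illustrative only; the real comparator operates on tuplesort's internal tuple representation):

#include <stdint.h>

/* Illustrative comparison rule: order two index entries by expected
 * bucket number.  Entries with equal masked hash codes can stay in any
 * order, since they land in the same bucket either way. */
static int
compare_by_bucket(uint32_t hash_a, uint32_t hash_b, uint32_t hash_mask)
{
	uint32_t	bucket_a = hash_a & hash_mask;
	uint32_t	bucket_b = hash_b & hash_mask;

	if (bucket_a < bucket_b)
		return -1;
	if (bucket_a > bucket_b)
		return 1;
	return 0;
}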

src/backend/access/nbtree/nbtsort.c

Lines changed: 3 additions & 3 deletions

@@ -57,7 +57,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.114 2008/01/01 19:45:46 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.115 2008/03/16 23:15:08 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -158,8 +158,8 @@ _bt_spoolinit(Relation index, bool isunique, bool isdead)
 	 * work_mem.
 	 */
 	btKbytes = isdead ? work_mem : maintenance_work_mem;
-	btspool->sortstate = tuplesort_begin_index(index, isunique,
-											   btKbytes, false);
+	btspool->sortstate = tuplesort_begin_index_btree(index, isunique,
+													 btKbytes, false);
 
 	return btspool;
 }
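The nbtsort.c hunk is just a rename at the call site: the former tuplesort_begin_index becomes tuplesort_begin_index_btree, the counterpart of the new tuplesort_begin_index_hash used by hashsort.c. The tuplesort.h declarations implied by the two call sites would look roughly like this (inferred from the calls above, not quoted from the header diff):

extern Tuplesortstate *tuplesort_begin_index_btree(Relation indexRel,
												   bool enforceUnique,
												   int workMem,
												   bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_hash(Relation indexRel,
												  uint32 hash_mask,
												  int workMem,
												  bool randomAccess);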
