Skip to content

Commit e1a6679

Browse files
committed
Fix O(N^2) behavior in pg_dump when many objects are in dependency loops.
Combining the loop workspace with the record of already-processed objects might have been a cute trick, but it behaves horridly if there are many dependency loops to repair: the time spent in the first step of findLoop() grows as O(N^2). Instead use a separate flag array indexed by dump ID, which we can check in constant time. The length of the workspace array is now never more than the actual length of a dependency chain, which should be reasonably short in all cases of practical interest. The code is noticeably easier to understand this way, too. Per gripe from Mike Roest. Since this is a longstanding performance bug, backpatch to all supported versions.
1 parent b77da19 commit e1a6679

File tree

1 file changed

+59
-60
lines changed

1 file changed

+59
-60
lines changed

src/bin/pg_dump/pg_dump_sort.c

Lines changed: 59 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -106,11 +106,11 @@ static bool TopoSort(DumpableObject **objs,
106106
static void addHeapElement(int val, int *heap, int heapLength);
107107
static int removeHeapElement(int *heap, int heapLength);
108108
static void findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs);
109-
static bool findLoop(DumpableObject *obj,
109+
static int findLoop(DumpableObject *obj,
110110
DumpId startPoint,
111+
bool *processed,
111112
DumpableObject **workspace,
112-
int depth,
113-
int *newDepth);
113+
int depth);
114114
static void repairDependencyLoop(DumpableObject **loop,
115115
int nLoop);
116116
static void describeDumpableObject(DumpableObject *obj,
@@ -500,58 +500,52 @@ static void
500500
findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
501501
{
502502
/*
503-
* We use a workspace array, the initial part of which stores objects
504-
* already processed, and the rest of which is used as temporary space to
505-
* try to build a loop in. This is convenient because we do not care
506-
* about loops involving already-processed objects (see notes above); we
507-
* can easily reject such loops in findLoop() because of this
508-
* representation. After we identify and process a loop, we can add it to
509-
* the initial part of the workspace just by moving the boundary pointer.
510-
*
511-
* When we determine that an object is not part of any interesting loop,
512-
* we also add it to the initial part of the workspace. This is not
513-
* necessary for correctness, but saves later invocations of findLoop()
514-
* from uselessly chasing references to such an object.
515-
*
516-
* We make the workspace large enough to hold all objects in the original
517-
* universe. This is probably overkill, but it's provably enough space...
503+
* We use two data structures here. One is a bool array processed[],
504+
* which is indexed by dump ID and marks the objects already processed
505+
* during this invocation of findDependencyLoops(). The other is a
506+
* workspace[] array of DumpableObject pointers, in which we try to build
507+
* lists of objects constituting loops. We make workspace[] large enough
508+
* to hold all the objects, which is huge overkill in most cases but could
509+
* theoretically be necessary if there is a single dependency chain
510+
* linking all the objects.
518511
*/
512+
bool *processed;
519513
DumpableObject **workspace;
520-
int initiallen;
521514
bool fixedloop;
522515
int i;
523516

517+
processed = (bool *) calloc(getMaxDumpId() + 1, sizeof(bool));
524518
workspace = (DumpableObject **) malloc(totObjs * sizeof(DumpableObject *));
525-
if (workspace == NULL)
519+
if (processed == NULL || workspace == NULL)
526520
exit_horribly(NULL, modulename, "out of memory\n");
527-
initiallen = 0;
528521
fixedloop = false;
529522

530523
for (i = 0; i < nObjs; i++)
531524
{
532525
DumpableObject *obj = objs[i];
533-
int newlen;
526+
int looplen;
527+
int j;
534528

535-
workspace[initiallen] = NULL; /* see test below */
529+
looplen = findLoop(obj, obj->dumpId, processed, workspace, 0);
536530

537-
if (findLoop(obj, obj->dumpId, workspace, initiallen, &newlen))
531+
if (looplen > 0)
538532
{
539-
/* Found a loop of length newlen - initiallen */
540-
repairDependencyLoop(&workspace[initiallen], newlen - initiallen);
541-
/* Add loop members to workspace */
542-
initiallen = newlen;
533+
/* Found a loop, repair it */
534+
repairDependencyLoop(workspace, looplen);
543535
fixedloop = true;
536+
/* Mark loop members as processed */
537+
for (j = 0; j < looplen; j++)
538+
processed[workspace[j]->dumpId] = true;
544539
}
545540
else
546541
{
547542
/*
548-
* Didn't find a loop, but add this object to workspace anyway,
549-
* unless it's already present. We piggyback on the test that
550-
* findLoop() already did: it won't have tentatively added obj to
551-
* workspace if it's already present.
543+
* There's no loop starting at this object, but mark it processed
544+
* anyway. This is not necessary for correctness, but saves later
545+
* invocations of findLoop() from uselessly chasing references to
546+
* such an object.
552547
*/
553-
if (workspace[initiallen] == obj)
554-
initiallen++;
548+
processed[obj->dumpId] = true;
555549
}
556550
}
557551

@@ -560,45 +554,51 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
560554
exit_horribly(NULL, modulename, "could not identify dependency loop\n");
561555

562556
free(workspace);
557+
free(processed);
563558
}
564559

565560
/*
566561
* Recursively search for a circular dependency loop that doesn't include
567-
* any existing workspace members.
562+
* any already-processed objects.
568563
*
569564
* obj: object we are examining now
570565
* startPoint: dumpId of starting object for the hoped-for circular loop
571-
* workspace[]: work array for previously processed and current objects
566+
* processed[]: flag array marking already-processed objects
567+
* workspace[]: work array in which we are building list of loop members
572568
* depth: number of valid entries in workspace[] at call
573-
* newDepth: if successful, set to new number of workspace[] entries
574569
*
575-
* On success, *newDepth is set and workspace[] entries depth..*newDepth-1
576-
* are filled with pointers to the members of the loop.
570+
* On success, the length of the loop is returned, and workspace[] is filled
571+
* with pointers to the members of the loop. On failure, we return 0.
577572
*
578573
* Note: it is possible that the given starting object is a member of more
579574
* than one cycle; if so, we will find an arbitrary one of the cycles.
580575
*/
581-
static bool
576+
static int
582577
findLoop(DumpableObject *obj,
583578
DumpId startPoint,
579+
bool *processed,
584580
DumpableObject **workspace,
585-
int depth,
586-
int *newDepth)
581+
int depth)
587582
{
588583
int i;
589584

590585
/*
591-
* Reject if obj is already present in workspace. This test serves three
592-
* purposes: it prevents us from finding loops that overlap
593-
* previously-processed loops, it prevents us from going into infinite
594-
* recursion if we are given a startPoint object that links to a cycle
595-
* it's not a member of, and it guarantees that we can't overflow the
596-
* allocated size of workspace[].
586+
* Reject if obj is already processed. This test prevents us from finding
587+
* loops that overlap previously-processed loops.
588+
*/
589+
if (processed[obj->dumpId])
590+
return 0;
591+
592+
/*
593+
* Reject if obj is already present in workspace. This test prevents us
594+
* from going into infinite recursion if we are given a startPoint object
595+
* that links to a cycle it's not a member of, and it guarantees that we
596+
* can't overflow the allocated size of workspace[].
597597
*/
598598
for (i = 0; i < depth; i++)
599599
{
600600
if (workspace[i] == obj)
601-
return false;
601+
return 0;
602602
}
603603

604604
/*
@@ -612,10 +612,7 @@ findLoop(DumpableObject *obj,
612612
for (i = 0; i < obj->nDeps; i++)
613613
{
614614
if (obj->dependencies[i] == startPoint)
615-
{
616-
*newDepth = depth;
617-
return true;
618-
}
615+
return depth;
619616
}
620617

621618
/*
@@ -624,18 +621,20 @@ findLoop(DumpableObject *obj,
624621
for (i = 0; i < obj->nDeps; i++)
625622
{
626623
DumpableObject *nextobj = findObjectByDumpId(obj->dependencies[i]);
624+
int newDepth;
627625

628626
if (!nextobj)
629627
continue; /* ignore dependencies on undumped objects */
630-
if (findLoop(nextobj,
631-
startPoint,
632-
workspace,
633-
depth,
634-
newDepth))
635-
return true;
628+
newDepth = findLoop(nextobj,
629+
startPoint,
630+
processed,
631+
workspace,
632+
depth);
633+
if (newDepth > 0)
634+
return newDepth;
636635
}
637636

638-
return false;
637+
return 0;
639638
}
640639

641640
/*

0 commit comments

Comments
 (0)