Skip to content

gh-114746: Avoid quadratic behavior in free-threaded GC #114817

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 1, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 29 additions & 73 deletions Python/gc_free_threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct collection_state {
GCState *gcstate;
Py_ssize_t collected;
Py_ssize_t uncollectable;
Py_ssize_t long_lived_total;
struct worklist unreachable;
struct worklist legacy_finalizers;
struct worklist wrcb_to_call;
Expand Down Expand Up @@ -443,7 +444,7 @@ scan_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area,
else {
// object is reachable, restore `ob_tid`; we're done with these objects
gc_restore_tid(op);
state->gcstate->long_lived_total++;
state->long_lived_total++;
}

return true;
Expand Down Expand Up @@ -605,6 +606,8 @@ get_gc_state(void)
/* Initialize per-interpreter GC state.
 *
 * Only the generation-0 allocation threshold is set here (2000); a
 * collection becomes eligible once more than this many GC-tracked
 * objects have been allocated since the last collection (see
 * gc_should_collect / _PyObject_GC_Link elsewhere in this file).
 */
void
_PyGC_InitState(GCState *gcstate)
{
    // TODO: move to pycore_runtime_init.h once the incremental GC lands.
    gcstate->generations[0].threshold = 2000;
}


Expand Down Expand Up @@ -885,62 +888,6 @@ invoke_gc_callback(PyThreadState *tstate, const char *phase,
assert(!_PyErr_Occurred(tstate));
}


/* Find the oldest generation (highest numbered) whose count exceeds its
 * threshold. Objects in that generation and all younger generations will
 * be collected. Returns -1 if no generation needs collecting. */
static int
gc_select_generation(GCState *gcstate)
{
    for (int gen = NUM_GENERATIONS - 1; gen >= 0; gen--) {
        if (gcstate->generations[gen].count <= gcstate->generations[gen].threshold) {
            continue;
        }

        /* Avoid quadratic performance degradation in the number of tracked
           objects (see also issue #4074):

           To limit the cost of garbage collection, there are two strategies:
           - make each collection faster, e.g. by scanning fewer objects;
           - do fewer collections.
           This heuristic is about the latter strategy.

           In addition to the various configurable thresholds, we only
           trigger a full collection if the ratio

               long_lived_pending / long_lived_total

           is above a given value (hardwired to 25%).

           The reason is that, while "non-full" collections (i.e.,
           collections of the young and middle generations) always examine
           roughly the same number of objects -- determined by the
           aforementioned thresholds -- the cost of a full collection is
           proportional to the total number of long-lived objects, which is
           virtually unbounded.

           Indeed, it has been remarked that doing a full collection every
           <constant number> of object creations entails a dramatic
           performance degradation in workloads that consist of creating and
           storing lots of long-lived objects (e.g. building a large list of
           GC-tracked objects would show quadratic performance instead of
           the expected linear one: see issue #4074).

           Using the above ratio instead yields amortized linear performance
           in the total number of objects (the effect of which can be
           summarized thusly: "each full garbage collection is more and more
           costly as the number of objects grows, but we do fewer and fewer
           of them").

           This heuristic was suggested by Martin von Löwis on python-dev in
           June 2008. His original analysis and proposal can be found at:
           http://mail.python.org/pipermail/python-dev/2008-June/080579.html
        */
        if (gen == NUM_GENERATIONS - 1
            && gcstate->long_lived_pending < gcstate->long_lived_total / 4)
        {
            /* Not enough long-lived churn to justify a full collection;
               fall through to the younger generations. */
            continue;
        }
        return gen;
    }
    /* No generation exceeded its threshold. */
    return -1;
}

static void
cleanup_worklist(struct worklist *worklist)
{
Expand All @@ -952,6 +899,21 @@ cleanup_worklist(struct worklist *worklist)
}
}

/* Decide whether an automatic collection should run.
 *
 * Fast checks first: GC must be enabled, the generation-0 threshold must
 * be non-zero, and the (atomically read) generation-0 count must exceed
 * it. On top of that, the threshold is scaled with the number of live
 * long-lived objects to avoid quadratic behavior as the heap grows. A few
 * tests rely on immediate scheduling of the GC, so the scaled threshold
 * is bypassed when generations[1].threshold is set to zero. */
static bool
gc_should_collect(GCState *gcstate)
{
    int count = _Py_atomic_load_int_relaxed(&gcstate->generations[0].count);
    int threshold = gcstate->generations[0].threshold;
    if (!gcstate->enabled || threshold == 0 || count <= threshold) {
        return false;
    }
    if (gcstate->generations[1].threshold == 0) {
        /* Test mode: ignore the live-object scaling below. */
        return true;
    }
    return count > gcstate->long_lived_total / 4;
}

static void
gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
{
Expand Down Expand Up @@ -1029,15 +991,10 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
return 0;
}

if (generation == GENERATION_AUTO) {
// Select the oldest generation that needs collecting. We will collect
// objects from that generation and all generations younger than it.
generation = gc_select_generation(gcstate);
if (generation < 0) {
// No generation needs to be collected.
_Py_atomic_store_int(&gcstate->collecting, 0);
return 0;
}
if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(gcstate)) {
// Don't collect if the threshold is not exceeded.
_Py_atomic_store_int(&gcstate->collecting, 0);
return 0;
}

assert(generation >= 0 && generation < NUM_GENERATIONS);
Expand Down Expand Up @@ -1082,6 +1039,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)

m = state.collected;
n = state.uncollectable;
gcstate->long_lived_total = state.long_lived_total;

if (gcstate->debug & _PyGC_DEBUG_STATS) {
double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
Expand Down Expand Up @@ -1523,12 +1481,10 @@ _PyObject_GC_Link(PyObject *op)
{
PyThreadState *tstate = _PyThreadState_GET();
GCState *gcstate = &tstate->interp->gc;
gcstate->generations[0].count++; /* number of allocated GC objects */
if (gcstate->generations[0].count > gcstate->generations[0].threshold &&
gcstate->enabled &&
gcstate->generations[0].threshold &&
!_Py_atomic_load_int_relaxed(&gcstate->collecting) &&
!_PyErr_Occurred(tstate))
gcstate->generations[0].count++;

if (gc_should_collect(gcstate) &&
!_Py_atomic_load_int_relaxed(&gcstate->collecting))
{
_Py_ScheduleGC(tstate->interp);
}
Expand All @@ -1537,7 +1493,7 @@ _PyObject_GC_Link(PyObject *op)
/* Run an automatic (heap-triggered) garbage collection on this thread.
 *
 * Delegates to gc_collect_main with generation 0 and reason
 * _Py_GC_REASON_HEAP; for that reason, gc_collect_main itself re-checks
 * gc_should_collect() before doing any work, so calling this when the
 * thresholds are not exceeded is a cheap no-op.
 *
 * Fix: the scraped span contained both the pre-change call
 * (gc_collect_main(tstate, GENERATION_AUTO, ...)) and the post-change
 * call in sequence — as written that would reference the removed
 * GENERATION_AUTO selector and invoke collection twice. Resolved to the
 * single post-change call.
 */
void
_Py_RunGC(PyThreadState *tstate)
{
    gc_collect_main(tstate, 0, _Py_GC_REASON_HEAP);
}

static PyObject *
Expand Down