From 605f930346309cb5a1ef2d46439fbb6122a26f01 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 27 Jun 2023 16:19:53 -0700 Subject: [PATCH 1/5] Write macro expansions to opcode_metadata.h --- Python/opcode_metadata.h | 11 +++++ Python/optimizer.c | 4 +- Tools/cases_generator/generate_cases.py | 54 +++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 5 deletions(-) diff --git a/Python/opcode_metadata.h b/Python/opcode_metadata.h index 4e31a5e69614bb..b44285599f3766 100644 --- a/Python/opcode_metadata.h +++ b/Python/opcode_metadata.h @@ -1131,10 +1131,18 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = { [STORE_FAST] = { .nuops = 1, .uops = { { STORE_FAST, 0, 0 } } }, [POP_TOP] = { .nuops = 1, .uops = { { POP_TOP, 0, 0 } } }, [PUSH_NULL] = { .nuops = 1, .uops = { { PUSH_NULL, 0, 0 } } }, + [END_FOR] = { .nuops = 2, .uops = { { POP_TOP, 0, 0 }, { POP_TOP, 0, 0 } } }, [END_SEND] = { .nuops = 1, .uops = { { END_SEND, 0, 0 } } }, [UNARY_NEGATIVE] = { .nuops = 1, .uops = { { UNARY_NEGATIVE, 0, 0 } } }, [UNARY_NOT] = { .nuops = 1, .uops = { { UNARY_NOT, 0, 0 } } }, [UNARY_INVERT] = { .nuops = 1, .uops = { { UNARY_INVERT, 0, 0 } } }, + [BINARY_OP_MULTIPLY_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_MULTIPLY_INT, 0, 0 } } }, + [BINARY_OP_ADD_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_ADD_INT, 0, 0 } } }, + [BINARY_OP_SUBTRACT_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_SUBTRACT_INT, 0, 0 } } }, + [BINARY_OP_MULTIPLY_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_MULTIPLY_FLOAT, 0, 0 } } }, + [BINARY_OP_ADD_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_ADD_FLOAT, 0, 0 } } }, + [BINARY_OP_SUBTRACT_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_SUBTRACT_FLOAT, 0, 0 } } }, + [BINARY_OP_ADD_UNICODE] = { .nuops = 2, .uops = { { _GUARD_BOTH_UNICODE, 0, 0 }, { 
_BINARY_OP_ADD_UNICODE, 0, 0 } } }, [BINARY_SLICE] = { .nuops = 1, .uops = { { BINARY_SLICE, 0, 0 } } }, [STORE_SLICE] = { .nuops = 1, .uops = { { STORE_SLICE, 0, 0 } } }, [BINARY_SUBSCR_LIST_INT] = { .nuops = 1, .uops = { { BINARY_SUBSCR_LIST_INT, 0, 0 } } }, @@ -1162,6 +1170,9 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = { [DELETE_ATTR] = { .nuops = 1, .uops = { { DELETE_ATTR, 0, 0 } } }, [STORE_GLOBAL] = { .nuops = 1, .uops = { { STORE_GLOBAL, 0, 0 } } }, [DELETE_GLOBAL] = { .nuops = 1, .uops = { { DELETE_GLOBAL, 0, 0 } } }, + [LOAD_LOCALS] = { .nuops = 1, .uops = { { _LOAD_LOCALS, 0, 0 } } }, + [LOAD_NAME] = { .nuops = 2, .uops = { { _LOAD_LOCALS, 0, 0 }, { _LOAD_FROM_DICT_OR_GLOBALS, 0, 0 } } }, + [LOAD_FROM_DICT_OR_GLOBALS] = { .nuops = 1, .uops = { { _LOAD_FROM_DICT_OR_GLOBALS, 0, 0 } } }, [DELETE_DEREF] = { .nuops = 1, .uops = { { DELETE_DEREF, 0, 0 } } }, [LOAD_FROM_DICT_OR_DEREF] = { .nuops = 1, .uops = { { LOAD_FROM_DICT_OR_DEREF, 0, 0 } } }, [LOAD_DEREF] = { .nuops = 1, .uops = { { LOAD_DEREF, 0, 0 } } }, diff --git a/Python/optimizer.c b/Python/optimizer.c index 0a6cc5c92f04f5..52880a503abf35 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -474,6 +474,8 @@ PyUnstable_Optimizer_NewUOpOptimizer(void) } opt->optimize = uop_optimize; opt->resume_threshold = UINT16_MAX; - opt->backedge_threshold = 0; + // Need at least 3 iterations to settle specializations. + // A few lower bits of the counter are reserved for other flags. 
+ opt->backedge_threshold = 3 << OPTIMIZER_BITS_IN_COUNTER; return (PyObject *)opt; } diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index d66b28139e43fb..1adbd215377a2f 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -1246,14 +1246,18 @@ def write_metadata(self) -> None: pass case parser.InstDef(name=name): instr = self.instrs[name] + # Since an 'op' is not a bytecode, it has no expansion if instr.kind != "op" and instr.is_viable_uop(): + # Double check there aren't any used cache effects. + # If this fails, see write_macro_expansions(). + assert not [e for e in instr.cache_effects if e.name != UNUSED], \ + (instr.name, instr.cache_effects) self.out.emit( f"[{name}] = " f"{{ .nuops = 1, .uops = {{ {{ {name}, 0, 0 }} }} }}," ) case parser.Macro(): - # TODO: emit expansion if all parts are viable uops - pass + self.write_macro_expansions(self.macro_instrs[thing.name]) case parser.Pseudo(): pass case _: @@ -1303,7 +1307,7 @@ def write_pseudo_instrs(self) -> None: def write_uop_defines(self) -> None: """Write '#define XXX NNN' for each uop""" self.out.emit("") - counter = 300 + counter = 300 # TODO: Avoid collision with pseudo instructions def add(name: str) -> None: nonlocal counter self.out.emit(f"#define {name} {counter}") @@ -1314,6 +1318,47 @@ def add(name: str) -> None: if instr.kind == "op" and instr.is_viable_uop(): add(instr.name) + def write_macro_expansions(self, mac: MacroInstruction) -> None: + """Write the macro expansions for a macro-instruction.""" + # TODO: Refactor to share code with write_body(), is_viable_uop(), etc. + offset = 0 # Cache effect offset + expansions: list[tuple[str, int, int]] = [] # [(name, size, offset), ...] 
+ for part in mac.parts: + match part: + case parser.CacheEffect(): + offset += part.size + case Component(): + # All component instructions must be viable uops + if not part.instr.is_viable_uop(): + print(f"Part {part.instr.name} of {mac.name} is not a viable uop") + return + effect = None + effect_offset = 0 + for eff in part.instr.cache_effects: + if eff.name != UNUSED: + assert effect is None, \ + f"{mac.name} has multiple cache effects {eff.name}" + effect = eff + effect_offset = offset + offset += eff.size + if effect is not None: + # We don't seem to have examples of this yet; remove the print once we do + print(f"Part {part.instr.name} of {mac.name} has cache effect {effect.name}") + assert not part.instr.instr_flags.HAS_ARG_FLAG, \ + f"{mac.name} has both cache effect and arg flag" + exp = (part.instr.name, effect.size, effect_offset) + else: + exp = (part.instr.name, 0, 0) + expansions.append(exp) + case _: + typing.assert_never(part) + assert len(expansions) > 0, f"Macro {mac.name} has empty expansion?!" + pieces = [f"{{ {name}, {size}, {offset} }}" for name, size, offset in expansions] + self.out.emit( + f"[{mac.name}] = " + f"{{ .nuops = {len(expansions)}, .uops = {{ {', '.join(pieces)} }} }}," + ) + def emit_metadata_entry( self, name: str, fmt: str, flags: InstructionFlags ) -> None: @@ -1379,6 +1424,7 @@ def write_executor_instructions(self) -> None: for thing in self.everything: match thing: case OverriddenInstructionPlaceHolder(): + # TODO: Is this helpful? 
self.write_overridden_instr_place_holder(thing) case parser.InstDef(): instr = self.instrs[thing.name] @@ -1388,7 +1434,7 @@ def write_executor_instructions(self) -> None: instr.write(self.out, tier=TIER_TWO) self.out.emit("break;") case parser.Macro(): - pass # TODO + pass case parser.Pseudo(): pass case _: From bbc59518f525bb8c78fa20f1c5d5ca0b3ce3fc93 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 27 Jun 2023 16:45:10 -0700 Subject: [PATCH 2/5] Print uop names in uops debug output This would be simpler if the numeric opcode values were generated by generate_cases.py. --- Python/ceval.c | 6 +++--- Python/opcode_metadata.h | 17 +++++++++++++++++ Python/optimizer.c | 4 ++-- Tools/cases_generator/generate_cases.py | 12 ++++++++---- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index e19860d04a7914..65b6f2481665de 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -2817,10 +2817,10 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject oparg = (int)operand; #ifdef LLTRACE if (lltrace >= 3) { - const char *opname = opcode < 256 ? _PyOpcode_OpName[opcode] : ""; + const char *opname = opcode < 256 ? 
_PyOpcode_OpName[opcode] : _PyOpcode_uop_name[opcode]; int stack_level = (int)(stack_pointer - _PyFrame_Stackbase(frame)); - fprintf(stderr, " uop %s %d, operand %" PRIu64 ", stack_level %d\n", - opname, opcode, operand, stack_level); + fprintf(stderr, " uop %s, operand %" PRIu64 ", stack_level %d\n", + opname, operand, stack_level); } #endif pc++; diff --git a/Python/opcode_metadata.h b/Python/opcode_metadata.h index b44285599f3766..614192a1a59adc 100644 --- a/Python/opcode_metadata.h +++ b/Python/opcode_metadata.h @@ -913,6 +913,7 @@ struct opcode_macro_expansion { #ifndef NEED_OPCODE_METADATA extern const struct opcode_metadata _PyOpcode_opcode_metadata[512]; extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256]; +extern const char *_PyOpcode_uop_name[512]; #else const struct opcode_metadata _PyOpcode_opcode_metadata[512] = { [NOP] = { true, INSTR_FMT_IX, 0 }, @@ -1218,4 +1219,20 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = { [COPY] = { .nuops = 1, .uops = { { COPY, 0, 0 } } }, [SWAP] = { .nuops = 1, .uops = { { SWAP, 0, 0 } } }, }; +const char *_PyOpcode_uop_name[512] = { + [300] = "EXIT_TRACE", + [301] = "SET_IP", + [302] = "_GUARD_BOTH_INT", + [303] = "_BINARY_OP_MULTIPLY_INT", + [304] = "_BINARY_OP_ADD_INT", + [305] = "_BINARY_OP_SUBTRACT_INT", + [306] = "_GUARD_BOTH_FLOAT", + [307] = "_BINARY_OP_MULTIPLY_FLOAT", + [308] = "_BINARY_OP_ADD_FLOAT", + [309] = "_BINARY_OP_SUBTRACT_FLOAT", + [310] = "_GUARD_BOTH_UNICODE", + [311] = "_BINARY_OP_ADD_UNICODE", + [312] = "_LOAD_LOCALS", + [313] = "_LOAD_FROM_DICT_OR_GLOBALS", +}; #endif diff --git a/Python/optimizer.c b/Python/optimizer.c index 52880a503abf35..9d77ab4ff879bb 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -325,8 +325,8 @@ translate_bytecode_to_trace( } #define ADD_TO_TRACE(OPCODE, OPERAND) \ if (lltrace >= 2) { \ - const char *opname = (OPCODE) < 256 ? 
_PyOpcode_OpName[(OPCODE)] : ""; \ - fprintf(stderr, " ADD_TO_TRACE(%s %d, %" PRIu64 ")\n", opname, (OPCODE), (uint64_t)(OPERAND)); \ + const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : _PyOpcode_uop_name[(OPCODE)]; \ + fprintf(stderr, " ADD_TO_TRACE(%s, %" PRIu64 ")\n", opname, (uint64_t)(OPERAND)); \ } \ trace[trace_length].opcode = (OPCODE); \ trace[trace_length].operand = (OPERAND); \ diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 1adbd215377a2f..957917bea34906 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -1182,7 +1182,8 @@ def write_metadata(self) -> None: self.write_pseudo_instrs() - self.write_uop_defines() + self.out.emit("") + self.write_uop_items(lambda name, counter: f"#define {name} {counter}") self.write_stack_effect_functions() @@ -1213,6 +1214,7 @@ def write_metadata(self) -> None: self.out.emit("#ifndef NEED_OPCODE_METADATA") self.out.emit("extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];") self.out.emit("extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];") + self.out.emit("extern const char *_PyOpcode_uop_name[512];") self.out.emit("#else") self.out.emit("const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {") @@ -1263,6 +1265,9 @@ def write_metadata(self) -> None: case _: typing.assert_never(thing) + with self.out.block("const char *_PyOpcode_uop_name[512] =", ";"): + self.write_uop_items(lambda name, counter: f"[{counter}] = \"{name}\",") + self.out.emit("#endif") with open(self.pymetadata_filename, "w") as f: @@ -1304,13 +1309,12 @@ def write_pseudo_instrs(self) -> None: self.out.emit(f" ((OP) == {op}) || \\") self.out.emit(f" 0") - def write_uop_defines(self) -> None: + def write_uop_items(self, make_text: typing.Callable[[str, int], str]) -> None: """Write '#define XXX NNN' for each uop""" - self.out.emit("") counter = 300 # TODO: Avoid collision with pseudo 
instructions def add(name: str) -> None: nonlocal counter - self.out.emit(f"#define {name} {counter}") + self.out.emit(make_text(name, counter)) counter += 1 add("EXIT_TRACE") add("SET_IP") From 60403c9dc88afdb33b234cbf9cddc51862ab1c0f Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 27 Jun 2023 16:50:47 -0700 Subject: [PATCH 3/5] Only output uop names in debug mode --- Python/opcode_metadata.h | 4 ++++ Tools/cases_generator/generate_cases.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/Python/opcode_metadata.h b/Python/opcode_metadata.h index 614192a1a59adc..1bd3848a1dc633 100644 --- a/Python/opcode_metadata.h +++ b/Python/opcode_metadata.h @@ -913,7 +913,9 @@ struct opcode_macro_expansion { #ifndef NEED_OPCODE_METADATA extern const struct opcode_metadata _PyOpcode_opcode_metadata[512]; extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256]; +#ifdef Py_DEBUG extern const char *_PyOpcode_uop_name[512]; +#endif #else const struct opcode_metadata _PyOpcode_opcode_metadata[512] = { [NOP] = { true, INSTR_FMT_IX, 0 }, @@ -1219,6 +1221,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = { [COPY] = { .nuops = 1, .uops = { { COPY, 0, 0 } } }, [SWAP] = { .nuops = 1, .uops = { { SWAP, 0, 0 } } }, }; +#ifdef Py_DEBUG const char *_PyOpcode_uop_name[512] = { [300] = "EXIT_TRACE", [301] = "SET_IP", @@ -1236,3 +1239,4 @@ const char *_PyOpcode_uop_name[512] = { [313] = "_LOAD_FROM_DICT_OR_GLOBALS", }; #endif +#endif diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 957917bea34906..0b6ad839fb48c2 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -1214,7 +1214,9 @@ def write_metadata(self) -> None: self.out.emit("#ifndef NEED_OPCODE_METADATA") self.out.emit("extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];") self.out.emit("extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];") + 
self.out.emit("#ifdef Py_DEBUG") self.out.emit("extern const char *_PyOpcode_uop_name[512];") + self.out.emit("#endif") self.out.emit("#else") self.out.emit("const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {") @@ -1265,8 +1267,10 @@ def write_metadata(self) -> None: case _: typing.assert_never(thing) + self.out.emit("#ifdef Py_DEBUG") with self.out.block("const char *_PyOpcode_uop_name[512] =", ";"): self.write_uop_items(lambda name, counter: f"[{counter}] = \"{name}\",") + self.out.emit("#endif") self.out.emit("#endif") From 9e35908e77512338843821ab7b1db544835b8c92 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 27 Jun 2023 20:33:58 -0700 Subject: [PATCH 4/5] Attempt to silence c-analyzer (Why can't I run this locally on macOS?) --- Python/opcode_metadata.h | 4 ++-- Tools/cases_generator/generate_cases.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Python/opcode_metadata.h b/Python/opcode_metadata.h index 1bd3848a1dc633..cfdc09294e9dee 100644 --- a/Python/opcode_metadata.h +++ b/Python/opcode_metadata.h @@ -914,7 +914,7 @@ struct opcode_macro_expansion { extern const struct opcode_metadata _PyOpcode_opcode_metadata[512]; extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256]; #ifdef Py_DEBUG -extern const char *_PyOpcode_uop_name[512]; +extern const char * const _PyOpcode_uop_name[512]; #endif #else const struct opcode_metadata _PyOpcode_opcode_metadata[512] = { @@ -1222,7 +1222,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = { [SWAP] = { .nuops = 1, .uops = { { SWAP, 0, 0 } } }, }; #ifdef Py_DEBUG -const char *_PyOpcode_uop_name[512] = { +const char * const _PyOpcode_uop_name[512] = { [300] = "EXIT_TRACE", [301] = "SET_IP", [302] = "_GUARD_BOTH_INT", diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 0b6ad839fb48c2..9fcdcbef96a319 100644 --- a/Tools/cases_generator/generate_cases.py +++ 
b/Tools/cases_generator/generate_cases.py @@ -1215,7 +1215,7 @@ def write_metadata(self) -> None: self.out.emit("extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];") self.out.emit("extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];") self.out.emit("#ifdef Py_DEBUG") - self.out.emit("extern const char *_PyOpcode_uop_name[512];") + self.out.emit("extern const char * const _PyOpcode_uop_name[512];") self.out.emit("#endif") self.out.emit("#else") @@ -1268,7 +1268,7 @@ def write_metadata(self) -> None: typing.assert_never(thing) self.out.emit("#ifdef Py_DEBUG") - with self.out.block("const char *_PyOpcode_uop_name[512] =", ";"): + with self.out.block("const char * const _PyOpcode_uop_name[512] =", ";"): self.write_uop_items(lambda name, counter: f"[{counter}] = \"{name}\",") self.out.emit("#endif") From ea90c43d9d511847f2c591ada61aa967144cdce5 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 28 Jun 2023 10:57:35 -0700 Subject: [PATCH 5/5] Make Instruction and MacroInstruction pre-compute active cache effects --- Tools/cases_generator/generate_cases.py | 175 +++++++++++------------- 1 file changed, 81 insertions(+), 94 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 9fcdcbef96a319..7c99e3b929b463 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -300,6 +300,13 @@ def emit_macros(cls, out: Formatter): f"(_PyOpcode_opcode_metadata[(OP)].flags & ({name}))") +@dataclasses.dataclass +class ActiveCacheEffect: + """Wraps a CacheEffect that is actually used, in context.""" + effect: parser.CacheEffect + offset: int + + FORBIDDEN_NAMES_IN_UOPS = ( "resume_with_error", # Proxy for "goto", which isn't an IDENTIFIER "unbound_local_error", @@ -344,6 +351,7 @@ class Instruction: unmoved_names: frozenset[str] instr_fmt: str instr_flags: InstructionFlags + active_caches: list[ActiveCacheEffect] # Set later family: 
parser.Family | None = None @@ -375,15 +383,19 @@ def __init__(self, inst: parser.InstDef): self.instr_flags = InstructionFlags.fromInstruction(inst) + self.active_caches = [] + offset = 0 + for effect in self.cache_effects: + if effect.name != UNUSED: + self.active_caches.append(ActiveCacheEffect(effect, offset)) + offset += effect.size + if self.instr_flags.HAS_ARG_FLAG: fmt = "IB" else: fmt = "IX" - cache = "C" - for ce in self.cache_effects: - for _ in range(ce.size): - fmt += cache - cache = "0" + if offset: + fmt += "C" + "0"*(offset-1) self.instr_fmt = fmt def is_viable_uop(self) -> bool: @@ -392,18 +404,11 @@ def is_viable_uop(self) -> bool: return False if self.instr_flags.HAS_ARG_FLAG: # If the instruction uses oparg, it cannot use any caches - for c in self.cache_effects: - if c.name != UNUSED: - return False + if self.active_caches: + return False else: # If it doesn't use oparg, it can have one cache entry - caches: list[parser.CacheEffect] = [] - cache_offset = 0 - for c in self.cache_effects: - if c.name != UNUSED: - caches.append(c) - cache_offset += c.size - if len(caches) > 1: + if len(self.active_caches) > 1: return False for forbidden in FORBIDDEN_NAMES_IN_UOPS: # TODO: Don't check in '#ifdef ENABLE_SPECIALIZATION' regions @@ -458,7 +463,7 @@ def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None: # out.emit(f"next_instr += OPSIZE({self.inst.name}) - 1;") - self.write_body(out, 0, tier=tier) + self.write_body(out, 0, self.active_caches, tier=tier) # Skip the rest if the block always exits if self.always_exits: @@ -492,33 +497,30 @@ def write_body( self, out: Formatter, dedent: int, - cache_adjust: int = 0, + active_caches: list[ActiveCacheEffect], tier: Tiers = TIER_ONE, ) -> None: """Write the instruction body.""" # Write cache effect variable declarations and initializations - cache_offset = cache_adjust - for ceffect in self.cache_effects: - if ceffect.name != UNUSED: - bits = ceffect.size * BITS_PER_CODE_UNIT - if bits == 64: - # 
NOTE: We assume that 64-bit data in the cache - # is always an object pointer. - # If this becomes false, we need a way to specify - # syntactically what type the cache data is. - typ = "PyObject *" - func = "read_obj" - else: - typ = f"uint{bits}_t " - func = f"read_u{bits}" - if tier == TIER_ONE: - out.emit( - f"{typ}{ceffect.name} = {func}(&next_instr[{cache_offset}].cache);" - ) - else: - out.emit(f"{typ}{ceffect.name} = operand;") - cache_offset += ceffect.size - assert cache_offset == self.cache_offset + cache_adjust + for active in active_caches: + ceffect = active.effect + bits = ceffect.size * BITS_PER_CODE_UNIT + if bits == 64: + # NOTE: We assume that 64-bit data in the cache + # is always an object pointer. + # If this becomes false, we need a way to specify + # syntactically what type the cache data is. + typ = "PyObject *" + func = "read_obj" + else: + typ = f"uint{bits}_t " + func = f"read_u{bits}" + if tier == TIER_ONE: + out.emit( + f"{typ}{ceffect.name} = {func}(&next_instr[{active.offset}].cache);" + ) + else: + out.emit(f"{typ}{ceffect.name} = operand;") # Write the body, substituting a goto for ERROR_IF() and other stuff assert dedent <= 0 @@ -583,8 +585,9 @@ class Component: instr: Instruction input_mapping: StackEffectMapping output_mapping: StackEffectMapping + active_caches: list[ActiveCacheEffect] - def write_body(self, out: Formatter, cache_adjust: int) -> None: + def write_body(self, out: Formatter) -> None: with out.block(""): input_names = {ieffect.name for _, ieffect in self.input_mapping} for var, ieffect in self.input_mapping: @@ -593,7 +596,7 @@ def write_body(self, out: Formatter, cache_adjust: int) -> None: if oeffect.name not in input_names: out.declare(oeffect, None) - self.instr.write_body(out, dedent=-4, cache_adjust=cache_adjust) + self.instr.write_body(out, -4, self.active_caches) for var, oeffect in self.output_mapping: out.assign(var, oeffect) @@ -611,6 +614,7 @@ class MacroInstruction: instr_flags: InstructionFlags 
macro: parser.Macro parts: list[Component | parser.CacheEffect] + cache_offset: int predicted: bool = False @@ -873,11 +877,11 @@ def effect_counts(self, name: str) -> tuple[int, int, int]: cache = instr.cache_offset input = len(instr.input_effects) output = len(instr.output_effects) - elif macro := self.macro_instrs.get(name): - cache, input, output = 0, 0, 0 - for part in macro.parts: + elif mac := self.macro_instrs.get(name): + cache = mac.cache_offset + input, output = 0, 0 + for part in mac.parts: if isinstance(part, Component): - cache += part.instr.cache_offset # A component may pop what the previous component pushed, # so we offset the input/output counts by that. delta_i = len(part.instr.input_effects) @@ -885,9 +889,6 @@ def effect_counts(self, name: str) -> tuple[int, int, int]: offset = min(delta_i, output) input += delta_i - offset output += delta_o - offset - else: - assert isinstance(part, parser.CacheEffect), part - cache += part.size else: assert False, f"Unknown instruction {name!r}" return cache, input, output @@ -906,29 +907,25 @@ def analyze_macro(self, macro: parser.Macro) -> MacroInstruction: stack, initial_sp = self.stack_analysis(components) sp = initial_sp parts: list[Component | parser.CacheEffect] = [] - format = "IB" flags = InstructionFlags.newEmpty() - cache = "C" + offset = 0 for component in components: match component: case parser.CacheEffect() as ceffect: parts.append(ceffect) - for _ in range(ceffect.size): - format += cache - cache = "0" + offset += ceffect.size case Instruction() as instr: - part, sp = self.analyze_instruction(instr, stack, sp) + part, sp, offset = self.analyze_instruction(instr, stack, sp, offset) parts.append(part) - for ce in instr.cache_effects: - for _ in range(ce.size): - format += cache - cache = "0" flags.add(instr.instr_flags) case _: typing.assert_never(component) final_sp = sp + format = "IB" + if offset: + format += "C" + "0"*(offset-1) return MacroInstruction( - macro.name, stack, initial_sp, 
final_sp, format, flags, macro, parts + macro.name, stack, initial_sp, final_sp, format, flags, macro, parts, offset ) def analyze_pseudo(self, pseudo: parser.Pseudo) -> PseudoInstruction: @@ -941,8 +938,8 @@ def analyze_pseudo(self, pseudo: parser.Pseudo) -> PseudoInstruction: return PseudoInstruction(pseudo.name, targets, fmts[0], targets[0].instr_flags) def analyze_instruction( - self, instr: Instruction, stack: list[StackEffect], sp: int - ) -> tuple[Component, int]: + self, instr: Instruction, stack: list[StackEffect], sp: int, offset: int + ) -> tuple[Component, int, int]: input_mapping: StackEffectMapping = [] for ieffect in reversed(instr.input_effects): sp -= 1 @@ -951,7 +948,12 @@ def analyze_instruction( for oeffect in instr.output_effects: output_mapping.append((stack[sp], oeffect)) sp += 1 - return Component(instr, input_mapping, output_mapping), sp + active_effects: list[ActiveCacheEffect] = [] + for ceffect in instr.cache_effects: + if ceffect.name != UNUSED: + active_effects.append(ActiveCacheEffect(ceffect, offset)) + offset += ceffect.size + return Component(instr, input_mapping, output_mapping, active_effects), sp, offset def check_macro_components( self, macro: parser.Macro @@ -1030,7 +1032,7 @@ def stack_analysis( def get_stack_effect_info( self, thing: parser.InstDef | parser.Macro | parser.Pseudo - ) -> tuple[AnyInstruction | None, str, str]: + ) -> tuple[AnyInstruction | None, str | None, str | None]: def effect_str(effects: list[StackEffect]) -> str: n_effect, sym_effect = list_effect_size(effects) if sym_effect: @@ -1108,6 +1110,7 @@ def write_stack_effect_functions(self) -> None: continue instr, popped, pushed = self.get_stack_effect_info(thing) if instr is not None: + assert popped is not None and pushed is not None popped_data.append((instr, popped)) pushed_data.append((instr, pushed)) @@ -1254,8 +1257,7 @@ def write_metadata(self) -> None: if instr.kind != "op" and instr.is_viable_uop(): # Double check there aren't any used cache 
effects. # If this fails, see write_macro_expansions(). - assert not [e for e in instr.cache_effects if e.name != UNUSED], \ - (instr.name, instr.cache_effects) + assert not instr.active_caches, (instr.name, instr.cache_effects) self.out.emit( f"[{name}] = " f"{{ .nuops = 1, .uops = {{ {{ {name}, 0, 0 }} }} }}," @@ -1332,34 +1334,19 @@ def write_macro_expansions(self, mac: MacroInstruction) -> None: offset = 0 # Cache effect offset expansions: list[tuple[str, int, int]] = [] # [(name, size, offset), ...] for part in mac.parts: - match part: - case parser.CacheEffect(): - offset += part.size - case Component(): - # All component instructions must be viable uops - if not part.instr.is_viable_uop(): - print(f"Part {part.instr.name} of {mac.name} is not a viable uop") - return - effect = None - effect_offset = 0 - for eff in part.instr.cache_effects: - if eff.name != UNUSED: - assert effect is None, \ - f"{mac.name} has multiple cache effects {eff.name}" - effect = eff - effect_offset = offset - offset += eff.size - if effect is not None: - # We don't seem to have examples of this yet; remove the print once we do - print(f"Part {part.instr.name} of {mac.name} has cache effect {effect.name}") - assert not part.instr.instr_flags.HAS_ARG_FLAG, \ - f"{mac.name} has both cache effect and arg flag" - exp = (part.instr.name, effect.size, effect_offset) - else: - exp = (part.instr.name, 0, 0) - expansions.append(exp) - case _: - typing.assert_never(part) + if isinstance(part, Component): + # All component instructions must be viable uops + if not part.instr.is_viable_uop(): + print(f"NOTE: Part {part.instr.name} of {mac.name} is not a viable uop") + return + if part.instr.instr_flags.HAS_ARG_FLAG or not part.active_caches: + size, offset = 0, 0 + else: + # If this assert triggers, is_viable_uop() lied + assert len(part.active_caches) == 1, (mac.name, part.instr.name) + cache = part.active_caches[0] + size, offset = cache.effect.size, cache.offset + 
expansions.append((part.instr.name, size, offset)) assert len(expansions) > 0, f"Macro {mac.name} has empty expansion?!" pieces = [f"{{ {name}, {size}, {offset} }}" for name, size, offset in expansions] self.out.emit( @@ -1483,7 +1470,7 @@ def write_macro(self, mac: MacroInstruction) -> None: cache_adjust += size case Component() as comp: last_instr = comp.instr - comp.write_body(self.out, cache_adjust) + comp.write_body(self.out) cache_adjust += comp.instr.cache_offset if cache_adjust: