// Copyright 2023 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_COMPILER_TURBOSHAFT_PRETENURING_PROPAGATION_REDUCER_H_ #define V8_COMPILER_TURBOSHAFT_PRETENURING_PROPAGATION_REDUCER_H_ #include "src/compiler/turboshaft/assembler.h" #include "src/compiler/turboshaft/phase.h" #include "src/compiler/turboshaft/reducer-traits.h" #include "src/compiler/turboshaft/utils.h" #include "src/zone/zone-allocator.h" #include "src/zone/zone-containers.h" #include "src/zone/zone.h" namespace v8::internal::compiler::turboshaft { // This reducer propagates pretenuring (= allocations of Old objects rather than // Young objects) throughout the graph: if a young allocation is stored in an // old allocation, then we'll make it old instead. The idea being that 1) if an // object is stored in an old object, it makes sense for it for be considered // old, and 2) this reduces the size of the remembered sets. // For instance, if we have: // // a = Allocate(old) // b = Allocate(young) // c = Allocate(young) // b.x = c // a.x = b // // Then we'll make `b` and `c` allocate Old objects directly, because they'll be // stored to old objects (transitively for `c`, but it still counts). // On the other hand, if we have: // // a = Allocate(old) // b = Allocate(young) // c = Allocate(young) // a.x = c // b.c = a // // Then we'll allocate `c` Old as well (because it is stored in an old pointer), // but `b` will stay young, since it's not stored in an Old object (it contains // a pointer to an old object, but that's probably not a good reason to make it // old). // // // Implementation // // In a first phase, we iterate the input graph and create a directed graph // (called `store_graph_`) where each node points to the nodes stored in it (so, // an edge between `a` and `b` means that `b` is stored in `a`). We also collect // all of the old allocations in a separate list (`old_allocs_`). The // `store_graph_` of the first example above will thus be: // // a -----> b -----> c // // And the graph of the second example will be: // // a -----> c b -----> a // // (it contains two unconnected subgraphs) // // Then, in a second phase, we iterate the old allocations (`old_allocs_`), and // for each one, we do a DFS in `store_graph_`, marking all of the nodes we // encounter as old, and stopping on old nodes (which 1- prevents infinite loops // easily, and 2- is sound because all of the initially-old pointers are roots // of this phase). On the 2 examples above, `a` will be the old entry in // `old_allocs_`, so in both cases will do a single DFS starting from `a`. In // the 1st case, it's easy to see that this DFS will encounter `b` and `c` // (which will thus both become Old), while in the 2nd case, this DFS will only // reach `c` (which will thus become Old, while `b` won't be changed). // // To be more precise, we also record Phi inputs in `store_graph_`, so that if // we have something like: // // a = Allocate(old) // b = Allocate(young) // c = Allocate(young) // p = Phi(b, c) // a.x = p // // Then we oldify both `b` and `c`. In this case, `store_graph_` would be // // --------> b // / // a ----> p // \ // --------> c // // Which means that in the second phase, we'll start a DFS on `a` (the only old // allocation), move to `p` (the only node reachable from `a`), and the oldify // `b` and `c` (which are reachable from `p`). // // // Limitation: when a Phi of old allocations is used as the left-hand side of a // Store where the value being stored is a young allocation, we don't oldify the // young allocation. For instance, we won't oldify `a` in this example: // // a = Allocate(young) // b = Allocate(old) // c = Allocate(old) // p = Phi(b, c) // p.x = a // // The reason being that the store_graph sturcture isn't well suited for this, // since an edge Phi->Node can mean either that Node is stored (via a StoreOp) // in Phi, or that Node is an input of Phi. The `store_graph_` for the example // above will thus look like: // // ------> b // / // p ------> a // \ // ------> c // // In order to oldify `a`, we would need to register `p` in `old_allocs_`, // except that we should only do this when `p` is actually old, and we discover // that only in the second phase. // Consider for instance this more complex example: // // a = Allocate(old) // b = Allocate(young) // c = Allocate(young) // d = Allocate(young) // a.x = b // a.y = c // p = Phi(b, c) // p.x = d // // The graph will be: // // -----> b <----- // / \ // a p -----> d // \ / // -----> c <----- // // And the only entry in `old_allocs_` will be `a`. During the DFS from `a`, // allocations `b` and `c` will be oldified. At this point, `p` will point to // edges to 2 old (`b` and `c`) and 1 young (`d`) nodes. // We could look at all Phis in `store_graph_` and consider one by one for being // roots of an oldifying DFS: if all of the inputs of a phi `p` (in the sense // OpIndex inputs in the input_graph) are Old, then start an oldifying DFS from // `p`. However, the worst case complexity would be something like O(n^2) where // `n` is the number of Phis in the graph (since we could end up checking all // Phis but only finding a single one that is old, but the DFS could make a // single other phi old, thus repeating the process). This complexity could be // made linear by maintaining additional datastructures on the side, but there // isn't much evidence that this optimization would be often useful in practice. class PretenuringPropagationAnalyzer { … }; // Forward delcaration template <class Next> class MemoryOptimizationReducer; template <class Next> class PretenuringPropagationReducer : public Next { … }; } // namespace v8::internal::compiler::turboshaft #endif // V8_COMPILER_TURBOSHAFT_PRETENURING_PROPAGATION_REDUCER_H_