// Copyright 2019 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_REGEXP_REGEXP_COMPILER_H_ #define V8_REGEXP_REGEXP_COMPILER_H_ #include <bitset> #include "src/base/small-vector.h" #include "src/base/strings.h" #include "src/regexp/regexp-flags.h" #include "src/regexp/regexp-nodes.h" namespace v8 { namespace internal { class DynamicBitSet; class Isolate; regexp_compiler_constants // namespace regexp_compiler_constants inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) { … } // Details of a quick mask-compare check that can look ahead in the // input stream. class QuickCheckDetails { … }; // Improve the speed that we scan for an initial point where a non-anchored // regexp can match by using a Boyer-Moore-like table. This is done by // identifying non-greedy non-capturing loops in the nodes that eat any // character one at a time. For example in the middle of the regexp // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly // inserted at the start of any non-anchored regexp. // // When we have found such a loop we look ahead in the nodes to find the set of // characters that can come at given distances. For example for the regexp // /.?foo/ we know that there are at least 3 characters ahead of us, and the // sets of characters that can occur are [any, [f, o], [o]]. We find a range in // the lookahead info where the set of characters is reasonably constrained. In // our example this is from index 1 to 2 (0 is not constrained). We can now // look 3 characters ahead and if we don't find one of [f, o] (the union of // [f, o] and [o]) then we can skip forwards by the range size (in this case 2). // // For Unicode input strings we do the same, but modulo 128. // // We also look at the first string fed to the regexp and use that to get a hint // of the character frequencies in the inputs. This affects the assessment of // whether the set of characters is 'reasonably constrained'. // // We also have another lookahead mechanism (called quick check in the code), // which uses a wide load of multiple characters followed by a mask and compare // to determine whether a match is possible at this point. enum ContainedInLattice { … }; inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { … } class BoyerMoorePositionInfo : public ZoneObject { … }; class BoyerMooreLookahead : public ZoneObject { … }; // There are many ways to generate code for a node. This class encapsulates // the current way we should be generating. In other words it encapsulates // the current state of the code generator. The effect of this is that we // generate code for paths that the matcher can take through the regular // expression. A given node in the regexp can be code-generated several times // as it can be part of several traces. For example for the regexp: // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code // to match foo is generated only once (the traces have a common prefix). The // code to store the capture is deferred and generated (twice) after the places // where baz has been matched. class Trace { … }; class GreedyLoopState { … }; struct PreloadState { … }; // Analysis performs assertion propagation and computes eats_at_least_ values. // See the comments on AssertionPropagator and EatsAtLeastPropagator for more // details. RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, RegExpNode* node); class FrequencyCollator { … }; class RegExpCompiler { … }; // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. class UnicodeRangeSplitter { … }; // We need to check for the following characters: 0x39C 0x3BC 0x178. // TODO(jgruber): Move to CharacterRange. bool RangeContainsLatin1Equivalents(CharacterRange range); } // namespace internal } // namespace v8 #endif // V8_REGEXP_REGEXP_COMPILER_H_