/*
 * Copyright (C) 2013-2019 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "BinarySwitch.h"

#if ENABLE(JIT)

#include <wtf/ListDump.h>

namespace JSC {

namespace BinarySwitchInternal {
static constexpr bool verbose = false;
}

static unsigned globalCounter; // We use a different seed every time we are invoked.

BinarySwitch::BinarySwitch(GPRReg value, const Vector<int64_t>& cases, Type type)
    : m_type(type)
    , m_value(value)
    , m_weakRandom(globalCounter++)
    , m_index(0)
    , m_caseIndex(UINT_MAX)
{
    if (cases.isEmpty())
        return;

    if (BinarySwitchInternal::verbose)
        dataLog("Original cases: ", listDump(cases), "\n");

    for (unsigned i = 0; i < cases.size(); ++i)
        m_cases.append(Case(cases[i], i));

    std::sort(m_cases.begin(), m_cases.end());

    if (BinarySwitchInternal::verbose)
        dataLog("Sorted cases: ", listDump(m_cases), "\n");

#if ASSERT_ENABLED
    for (unsigned i = 1; i < m_cases.size(); ++i)
        ASSERT(m_cases[i - 1] < m_cases[i], i, m_cases.size(), m_cases[i].value, m_cases[i].index);
#endif

    build(0, false, m_cases.size());
}

BinarySwitch::~BinarySwitch()
{
}

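// Callers drive code generation by calling advance() in a loop: each time it returns true,
// the caller emits the body for the currently selected case; once it returns false, the
// switch is fully emitted and m_fallThrough holds the jumps that should be linked to the
// default (fall-through) target.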
bool BinarySwitch::advance(MacroAssembler& jit)
{
    if (m_cases.isEmpty()) {
        m_fallThrough.append(jit.jump());
        return false;
    }

    if (m_index == m_branches.size()) {
        RELEASE_ASSERT(m_jumpStack.isEmpty());
        return false;
    }

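    // Interpret the branch codes pre-computed by build(). The comparison codes emit a branch
    // and keep going; Pop links the most recently pushed jump here; ExecuteCase hands control
    // back to the caller so it can emit the body of that case.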
    for (;;) {
        const BranchCode& code = m_branches[m_index++];
        switch (code.kind) {
        case NotEqualToFallThrough:
            switch (m_type) {
            case Int32:
                m_fallThrough.append(jit.branch32(
                    MacroAssembler::NotEqual, m_value,
                    MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value))));
                break;
            case IntPtr:
                m_fallThrough.append(jit.branchPtr(
                    MacroAssembler::NotEqual, m_value,
                    MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value)))));
                break;
            }
            break;
        case NotEqualToPush:
            switch (m_type) {
            case Int32:
                m_jumpStack.append(jit.branch32(
                    MacroAssembler::NotEqual, m_value,
                    MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value))));
                break;
            case IntPtr:
                m_jumpStack.append(jit.branchPtr(
                    MacroAssembler::NotEqual, m_value,
                    MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value)))));
                break;
            }
            break;
        case LessThanToPush:
            switch (m_type) {
            case Int32:
                m_jumpStack.append(jit.branch32(
                    MacroAssembler::LessThan, m_value,
                    MacroAssembler::Imm32(static_cast<int32_t>(m_cases[code.index].value))));
                break;
            case IntPtr:
                m_jumpStack.append(jit.branchPtr(
                    MacroAssembler::LessThan, m_value,
                    MacroAssembler::ImmPtr(bitwise_cast<const void*>(static_cast<intptr_t>(m_cases[code.index].value)))));
                break;
            }
            break;
        case Pop:
            m_jumpStack.takeLast().link(&jit);
            break;
        case ExecuteCase:
            m_caseIndex = code.index;
            return true;
        }
    }
}

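// Adapts WeakRandom to the uniform random bit generator interface that std::shuffle expects.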
class RandomNumberGenerator {
public:
    using result_type = uint32_t;

    RandomNumberGenerator(WeakRandom& weakRandom)
        : m_weakRandom(weakRandom)
    {
    }

    uint32_t operator()()
    {
        return m_weakRandom.getUint32();
    }

    static constexpr uint32_t min() { return std::numeric_limits<uint32_t>::min(); }
    static constexpr uint32_t max() { return std::numeric_limits<uint32_t>::max(); }

private:
    WeakRandom& m_weakRandom;
};

void BinarySwitch::build(unsigned start, bool hardStart, unsigned end)
{
    if (BinarySwitchInternal::verbose)
        dataLog("Building with start = ", start, ", hardStart = ", hardStart, ", end = ", end, "\n");

    auto append = [&] (const BranchCode& code) {
        if (BinarySwitchInternal::verbose)
            dataLog("==> ", code, "\n");
        m_branches.append(code);
    };

    unsigned size = end - start;

    RELEASE_ASSERT(size);

    // This code uses some random numbers to keep things balanced. It's important to keep in mind
    // that this does not improve average-case throughput under the assumption that all cases fire
    // with equal probability. It just ensures that there will not be some switch structure that
    // when combined with some input will always produce pathologically good or pathologically bad
    // performance.

    const unsigned leafThreshold = 3;

    if (size <= leafThreshold) {
        if (BinarySwitchInternal::verbose)
            dataLog("It's a leaf.\n");

        // It turns out that for exactly three cases or less, it's better to just compare each
        // case individually. This saves 1/6 of a branch on average, and up to 1/3 of a branch in
        // extreme cases where the divide-and-conquer bottoms out in a lot of 3-case subswitches.
        //
        // This assumes that we care about the cost of hitting some case more than we care about
        // bottoming out in a default case. I believe that in most places where we use switch
        // statements, we are more likely to hit one of the cases than we are to fall through to
        // default. Intuitively, if we wanted to improve the performance of default, we would
        // reduce the value of leafThreshold to 2 or even to 1. See below for a deeper discussion.

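        // Detect whether, by the time we reach this leaf, the value is already pinned to one of
        // the leaf's cases: the leaf's values form a consecutive run, the range below is closed
        // off (a hard start or an adjacent smaller case), and the case just above the leaf is
        // adjacent as well. When that holds, a value that fails the equality checks against all
        // but one of the leaf's cases can only be the remaining case, so the final
        // NotEqualToFallThrough test is skipped below.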
        bool allConsecutive = false;

        if ((hardStart || (start && m_cases[start - 1].value == m_cases[start].value - 1))
            && start + size < m_cases.size()
            && m_cases[start + size - 1].value == m_cases[start + size].value - 1) {
            allConsecutive = true;
            for (unsigned i = 0; i < size - 1; ++i) {
                if (m_cases[start + i].value + 1 != m_cases[start + i + 1].value) {
                    allConsecutive = false;
                    break;
                }
            }
        }

        if (BinarySwitchInternal::verbose)
            dataLog("allConsecutive = ", allConsecutive, "\n");

        Vector<unsigned, 3> localCaseIndices;
        for (unsigned i = 0; i < size; ++i)
            localCaseIndices.append(start + i);

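        // Randomize the order in which the leaf's cases are compared, so that no particular
        // input value is deterministically the slowest one to match.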
        std::shuffle(
            localCaseIndices.begin(), localCaseIndices.end(),
            RandomNumberGenerator(m_weakRandom));

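        // For all but the last (shuffled) case: branch away on inequality, emit the case body
        // on the equal path, then pop/link the inequality branch so the next comparison follows.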
        for (unsigned i = 0; i < size - 1; ++i) {
            append(BranchCode(NotEqualToPush, localCaseIndices[i]));
            append(BranchCode(ExecuteCase, localCaseIndices[i]));
            append(BranchCode(Pop));
        }

        if (!allConsecutive)
            append(BranchCode(NotEqualToFallThrough, localCaseIndices.last()));

        append(BranchCode(ExecuteCase, localCaseIndices.last()));
        return;
    }

    if (BinarySwitchInternal::verbose)
        dataLog("It's not a leaf.\n");

    // There are two different strategies we could consider here:
    //
    // Isolate median and split: pick a median and check if the comparison value is equal to it;
    // if so, execute the median case. Otherwise check if the value is less than the median, and
    // recurse left or right based on this. This has two subvariants: we could either first test
    // equality for the median and then do the less-than, or we could first do the less-than and
    // then check equality on the not-less-than path.
    //
    // Ignore median and split: do a less-than comparison on a value that splits the cases in two
    // equal-sized halves. Recurse left or right based on the comparison. Do not test for equality
    // against the median (or anything else); let the recursion handle those equality comparisons
    // once we bottom out in a list that has 3 cases or fewer (see above).
    //
    // I'll refer to these strategies as Isolate and Ignore. I initially believed that Isolate
    // would be faster since it leads to less branching for some lucky cases. It turns out that
    // Isolate is almost a total fail in the average, assuming all cases are equally likely. How
    // bad Isolate is depends on whether you believe that doing two consecutive branches based on
    // the same comparison is cheaper than doing the compare/branches separately. This is
    // difficult to evaluate. For small immediates that aren't blinded, we just care about
    // avoiding a second compare instruction. For large immediates or when blinding is in play, we
    // also care about the instructions used to materialize the immediate a second time. Isolate
    // can help with both costs since it involves first doing a < compare+branch on some value,
    // followed by a == compare+branch on the same exact value (or vice-versa). Ignore will do a <
    // compare+branch on some value, and then the == compare+branch on that same value will happen
    // much later.
    //
    // To evaluate these costs, I wrote the recurrence relation for Isolate and Ignore, assuming
    // that ComparisonCost is the cost of a compare+branch and ChainedComparisonCost is the cost
    // of a compare+branch on some value that you've just done another compare+branch for. These
    // recurrence relations compute the total cost incurred if you executed the switch statement
    // on each matching value. So the average cost of hitting some case can be computed as
    // Isolate[n]/n or Ignore[n]/n, respectively for the two relations.
    //
    // Isolate[1] = ComparisonCost
    // Isolate[2] = (2 + 1) * ComparisonCost
    // Isolate[3] = (3 + 2 + 1) * ComparisonCost
    // Isolate[n_] := With[
    //     {medianIndex = Floor[n/2] + If[EvenQ[n], RandomInteger[], 1]},
    //     ComparisonCost + ChainedComparisonCost +
    //     (ComparisonCost * (medianIndex - 1) + Isolate[medianIndex - 1]) +
    //     (2 * ComparisonCost * (n - medianIndex) + Isolate[n - medianIndex])]
    //
    // Ignore[1] = ComparisonCost
    // Ignore[2] = (2 + 1) * ComparisonCost
    // Ignore[3] = (3 + 2 + 1) * ComparisonCost
    // Ignore[n_] := With[
    //     {medianIndex = If[EvenQ[n], n/2, Floor[n/2] + RandomInteger[]]},
    //     (medianIndex * ComparisonCost + Ignore[medianIndex]) +
    //     ((n - medianIndex) * ComparisonCost + Ignore[n - medianIndex])]
    //
    // This does not account for the average cost of hitting the default case. See further below
    // for a discussion of that.
    //
    // It turns out that for ComparisonCost = 1 and ChainedComparisonCost = 1, Ignore is always
    // better than Isolate. If we assume that ChainedComparisonCost = 0, then Isolate wins for
    // switch statements that have 20 cases or fewer, though the margin of victory is never large
    // - it might sometimes save an average of 0.3 ComparisonCost. For larger switch statements,
    // we see divergence between the two with Ignore winning. This is of course rather
    // unrealistic since the chained comparison is never free. For ChainedComparisonCost = 0.5, we
    // see Isolate winning for 10 cases or fewer, by maybe 0.2 ComparisonCost. Again we see
    // divergence for large switches with Ignore winning, for example if a switch statement has
    // 100 cases then Ignore saves one branch on average.
    //
    // Our current JIT backends don't provide for optimization for chained comparisons, except for
    // reducing the code for materializing the immediate if the immediates are large or blinding
    // comes into play. Probably our JIT backends live somewhere north of
    // ChainedComparisonCost = 0.5.
    //
    // This implies that using the Ignore strategy is likely better. If we wanted to incorporate
    // the Isolate strategy, we'd want to determine the switch size threshold at which the two
    // cross over and then use Isolate for switches that are smaller than that size.
    //
    // The average cost of hitting the default case is similar, but involves a different cost for
    // the base cases: you have to assume that you will always fail each branch. For the Ignore
    // strategy we would get this recurrence relation; the same kind of thing happens to the
    // Isolate strategy:
    //
    // Ignore[1] = ComparisonCost
    // Ignore[2] = (2 + 2) * ComparisonCost
    // Ignore[3] = (3 + 3 + 3) * ComparisonCost
    // Ignore[n_] := With[
    //     {medianIndex = If[EvenQ[n], n/2, Floor[n/2] + RandomInteger[]]},
    //     (medianIndex * ComparisonCost + Ignore[medianIndex]) +
    //     ((n - medianIndex) * ComparisonCost + Ignore[n - medianIndex])]
    //
    // This means that if we cared about the default case more, we would likely reduce
    // leafThreshold. Reducing it to 2 would reduce the average cost of the default case by 1/3
    // in the most extreme cases (num switch cases = 3, 6, 12, 24, ...). But it would also
    // increase the average cost of taking one of the non-default cases by 1/3. Typically the
    // difference is 1/6 in either direction. This makes it a very simple trade-off: if we believe
    // that the default case is more important then we would want leafThreshold to be 2, and the
    // default case would become 1/6 faster on average. But we believe that most switch statements
    // are more likely to take one of the cases than the default, so we use leafThreshold = 3
    // and get a 1/6 speed-up on average for taking an explicit case.

    unsigned medianIndex = (start + end) / 2;

    if (BinarySwitchInternal::verbose)
        dataLog("medianIndex = ", medianIndex, "\n");

    // We want medianIndex to point to the thing we will do a less-than compare against. We want
    // this less-than compare to split the current sublist into equal-sized sublists, or
    // nearly-equal-sized with some randomness if we're in the odd case. With the above
    // calculation, in the odd case we will have medianIndex pointing at either the element we
    // want or the element to the left of the one we want. Consider the case of five elements:
    //
    //     0 1 2 3 4
    //
    // start will be 0, end will be 5. The average is 2.5, which rounds down to 2. If we do
    // value < 2, then we will split the list into 2 elements on the left and three on the right.
    // That's pretty good, but in this odd case we'd like to at random choose 3 instead to ensure
    // that we don't become unbalanced on the right. This does not improve throughput since one
    // side will always get shafted, and that side might still be odd, in which case it will also
    // have two sides and one of them will get shafted - and so on. We just want to avoid
    // deterministic pathologies.
    //
    // In the even case, we will always end up pointing at the element we want:
    //
    //     0 1 2 3
    //
    // start will be 0, end will be 4. So, the average is 2, which is what we'd like.
    if (size & 1) {
        RELEASE_ASSERT(medianIndex - start + 1 == end - medianIndex);
        medianIndex += m_weakRandom.getUint32() & 1;
    } else
        RELEASE_ASSERT(medianIndex - start == end - medianIndex);

    RELEASE_ASSERT(medianIndex > start);
    RELEASE_ASSERT(medianIndex + 1 < end);

    if (BinarySwitchInternal::verbose)
        dataLog("fixed medianIndex = ", medianIndex, "\n");

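    // Emit the less-than test against the median. The taken (less-than) branch is pushed onto
    // the jump stack; the not-less-than half [medianIndex, end) is built inline on the
    // fall-through path with a hard lower bound, then the pushed branch is popped/linked and
    // the lower half [start, medianIndex) is built.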
    append(BranchCode(LessThanToPush, medianIndex));
    build(medianIndex, true, end);
    append(BranchCode(Pop));
    build(start, hardStart, medianIndex);
}

void BinarySwitch::Case::dump(PrintStream& out) const
{
    out.print("<value: ", value, ", index: ", index, ">");
}

void BinarySwitch::BranchCode::dump(PrintStream& out) const
{
    switch (kind) {
    case NotEqualToFallThrough:
        out.print("NotEqualToFallThrough");
        break;
    case NotEqualToPush:
        out.print("NotEqualToPush");
        break;
    case LessThanToPush:
        out.print("LessThanToPush");
        break;
    case Pop:
        out.print("Pop");
        break;
    case ExecuteCase:
        out.print("ExecuteCase");
        break;
    }

    if (index != UINT_MAX)
        out.print("(", index, ")");
}

} // namespace JSC

#endif // ENABLE(JIT)