Grok  10.0.3
math-inl.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Include guard (still compiled once per target)
17 #if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
18  defined(HWY_TARGET_TOGGLE)
19 #ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
20 #undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
21 #else
22 #define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
23 #endif
24 
25 #include "hwy/highway.h"
26 
28 namespace hwy {
29 namespace HWY_NAMESPACE {
30 
39 template <class D, class V>
40 HWY_INLINE V Acos(const D d, V x);
41 template <class D, class V>
43  return Acos(d, x);
44 }
45 
54 template <class D, class V>
55 HWY_INLINE V Acosh(const D d, V x);
56 template <class D, class V>
58  return Acosh(d, x);
59 }
60 
69 template <class D, class V>
70 HWY_INLINE V Asin(const D d, V x);
71 template <class D, class V>
73  return Asin(d, x);
74 }
75 
84 template <class D, class V>
85 HWY_INLINE V Asinh(const D d, V x);
86 template <class D, class V>
88  return Asinh(d, x);
89 }
90 
99 template <class D, class V>
100 HWY_INLINE V Atan(const D d, V x);
101 template <class D, class V>
103  return Atan(d, x);
104 }
105 
114 template <class D, class V>
115 HWY_INLINE V Atanh(const D d, V x);
116 template <class D, class V>
118  return Atanh(d, x);
119 }
120 
129 template <class D, class V>
130 HWY_INLINE V Cos(const D d, V x);
131 template <class D, class V>
133  return Cos(d, x);
134 }
135 
144 template <class D, class V>
145 HWY_INLINE V Exp(const D d, V x);
146 template <class D, class V>
148  return Exp(d, x);
149 }
150 
159 template <class D, class V>
160 HWY_INLINE V Expm1(const D d, V x);
161 template <class D, class V>
163  return Expm1(d, x);
164 }
165 
174 template <class D, class V>
175 HWY_INLINE V Log(const D d, V x);
176 template <class D, class V>
178  return Log(d, x);
179 }
180 
189 template <class D, class V>
190 HWY_INLINE V Log10(const D d, V x);
191 template <class D, class V>
193  return Log10(d, x);
194 }
195 
204 template <class D, class V>
205 HWY_INLINE V Log1p(const D d, V x);
206 template <class D, class V>
208  return Log1p(d, x);
209 }
210 
219 template <class D, class V>
220 HWY_INLINE V Log2(const D d, V x);
221 template <class D, class V>
223  return Log2(d, x);
224 }
225 
234 template <class D, class V>
235 HWY_INLINE V Sin(const D d, V x);
236 template <class D, class V>
238  return Sin(d, x);
239 }
240 
249 template <class D, class V>
250 HWY_INLINE V Sinh(const D d, V x);
251 template <class D, class V>
253  return Sinh(d, x);
254 }
255 
264 template <class D, class V>
265 HWY_INLINE V Tanh(const D d, V x);
266 template <class D, class V>
268  return Tanh(d, x);
269 }
270 
272 // Implementation
274 namespace impl {
275 
276 // Estrin's Scheme is a faster method for evaluating large polynomials on
277 // super scalar architectures. It works by factoring the Horner's Method
278 // polynomial into power of two sub-trees that can be evaluated in parallel.
279 // Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme
280 template <class T>
281 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) {
282  return MulAdd(c1, x, c0);
283 }
284 template <class T>
285 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
286  T x2 = Mul(x, x);
287  return MulAdd(x2, c2, MulAdd(c1, x, c0));
288 }
289 template <class T>
290 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
291  T x2 = Mul(x, x);
292  return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
293 }
294 template <class T>
295 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
296  T x2 = Mul(x, x);
297  T x4 = Mul(x2, x2);
298  return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
299 }
300 template <class T>
301 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
302  T x2 = Mul(x, x);
303  T x4 = Mul(x2, x2);
304  return MulAdd(x4, MulAdd(c5, x, c4),
305  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
306 }
307 template <class T>
308 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
309  T c6) {
310  T x2 = Mul(x, x);
311  T x4 = Mul(x2, x2);
312  return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
313  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
314 }
315 template <class T>
316 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
317  T c6, T c7) {
318  T x2 = Mul(x, x);
319  T x4 = Mul(x2, x2);
320  return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
321  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
322 }
323 template <class T>
324 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
325  T c6, T c7, T c8) {
326  T x2 = Mul(x, x);
327  T x4 = Mul(x2, x2);
328  T x8 = Mul(x4, x4);
329  return MulAdd(x8, c8,
330  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
331  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
332 }
333 template <class T>
334 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
335  T c6, T c7, T c8, T c9) {
336  T x2 = Mul(x, x);
337  T x4 = Mul(x2, x2);
338  T x8 = Mul(x4, x4);
339  return MulAdd(x8, MulAdd(c9, x, c8),
340  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
341  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
342 }
343 template <class T>
344 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
345  T c6, T c7, T c8, T c9, T c10) {
346  T x2 = Mul(x, x);
347  T x4 = Mul(x2, x2);
348  T x8 = Mul(x4, x4);
349  return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
350  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
351  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
352 }
353 template <class T>
354 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
355  T c6, T c7, T c8, T c9, T c10, T c11) {
356  T x2 = Mul(x, x);
357  T x4 = Mul(x2, x2);
358  T x8 = Mul(x4, x4);
359  return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
360  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
361  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
362 }
363 template <class T>
364 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
365  T c6, T c7, T c8, T c9, T c10, T c11,
366  T c12) {
367  T x2 = Mul(x, x);
368  T x4 = Mul(x2, x2);
369  T x8 = Mul(x4, x4);
370  return MulAdd(
371  x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
372  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
373  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
374 }
375 template <class T>
376 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
377  T c6, T c7, T c8, T c9, T c10, T c11,
378  T c12, T c13) {
379  T x2 = Mul(x, x);
380  T x4 = Mul(x2, x2);
381  T x8 = Mul(x4, x4);
382  return MulAdd(x8,
383  MulAdd(x4, MulAdd(c13, x, c12),
384  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
385  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
386  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
387 }
388 template <class T>
389 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
390  T c6, T c7, T c8, T c9, T c10, T c11,
391  T c12, T c13, T c14) {
392  T x2 = Mul(x, x);
393  T x4 = Mul(x2, x2);
394  T x8 = Mul(x4, x4);
395  return MulAdd(x8,
396  MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
397  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
398  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
399  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
400 }
401 template <class T>
402 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
403  T c6, T c7, T c8, T c9, T c10, T c11,
404  T c12, T c13, T c14, T c15) {
405  T x2 = Mul(x, x);
406  T x4 = Mul(x2, x2);
407  T x8 = Mul(x4, x4);
408  return MulAdd(x8,
409  MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
410  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
411  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
412  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
413 }
414 template <class T>
415 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
416  T c6, T c7, T c8, T c9, T c10, T c11,
417  T c12, T c13, T c14, T c15, T c16) {
418  T x2 = Mul(x, x);
419  T x4 = Mul(x2, x2);
420  T x8 = Mul(x4, x4);
421  T x16 = Mul(x8, x8);
422  return MulAdd(
423  x16, c16,
424  MulAdd(x8,
425  MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
426  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
427  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
428  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
429 }
430 template <class T>
431 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
432  T c6, T c7, T c8, T c9, T c10, T c11,
433  T c12, T c13, T c14, T c15, T c16, T c17) {
434  T x2 = Mul(x, x);
435  T x4 = Mul(x2, x2);
436  T x8 = Mul(x4, x4);
437  T x16 = Mul(x8, x8);
438  return MulAdd(
439  x16, MulAdd(c17, x, c16),
440  MulAdd(x8,
441  MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
442  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
443  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
444  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
445 }
446 template <class T>
447 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
448  T c6, T c7, T c8, T c9, T c10, T c11,
449  T c12, T c13, T c14, T c15, T c16, T c17,
450  T c18) {
451  T x2 = Mul(x, x);
452  T x4 = Mul(x2, x2);
453  T x8 = Mul(x4, x4);
454  T x16 = Mul(x8, x8);
455  return MulAdd(
456  x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
457  MulAdd(x8,
458  MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
459  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
460  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
461  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
462 }
463 
464 template <class FloatOrDouble>
465 struct AsinImpl {};
466 template <class FloatOrDouble>
467 struct AtanImpl {};
468 template <class FloatOrDouble>
469 struct CosSinImpl {};
470 template <class FloatOrDouble>
471 struct ExpImpl {};
472 template <class FloatOrDouble>
473 struct LogImpl {};
474 
475 template <>
476 struct AsinImpl<float> {
477  // Polynomial approximation for asin(x) over the range [0, 0.5).
478  template <class D, class V>
479  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
480  const auto k0 = Set(d, +0.1666677296f);
481  const auto k1 = Set(d, +0.07495029271f);
482  const auto k2 = Set(d, +0.04547423869f);
483  const auto k3 = Set(d, +0.02424046025f);
484  const auto k4 = Set(d, +0.04197454825f);
485 
486  return Estrin(x2, k0, k1, k2, k3, k4);
487  }
488 };
489 
490 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
491 
492 template <>
493 struct AsinImpl<double> {
494  // Polynomial approximation for asin(x) over the range [0, 0.5).
495  template <class D, class V>
496  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
497  const auto k0 = Set(d, +0.1666666666666497543);
498  const auto k1 = Set(d, +0.07500000000378581611);
499  const auto k2 = Set(d, +0.04464285681377102438);
500  const auto k3 = Set(d, +0.03038195928038132237);
501  const auto k4 = Set(d, +0.02237176181932048341);
502  const auto k5 = Set(d, +0.01735956991223614604);
503  const auto k6 = Set(d, +0.01388715184501609218);
504  const auto k7 = Set(d, +0.01215360525577377331);
505  const auto k8 = Set(d, +0.006606077476277170610);
506  const auto k9 = Set(d, +0.01929045477267910674);
507  const auto k10 = Set(d, -0.01581918243329996643);
508  const auto k11 = Set(d, +0.03161587650653934628);
509 
510  return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11);
511  }
512 };
513 
514 #endif
515 
516 template <>
517 struct AtanImpl<float> {
518  // Polynomial approximation for atan(x) over the range [0, 1.0).
519  template <class D, class V>
520  HWY_INLINE V AtanPoly(D d, V x) {
521  const auto k0 = Set(d, -0.333331018686294555664062f);
522  const auto k1 = Set(d, +0.199926957488059997558594f);
523  const auto k2 = Set(d, -0.142027363181114196777344f);
524  const auto k3 = Set(d, +0.106347933411598205566406f);
525  const auto k4 = Set(d, -0.0748900920152664184570312f);
526  const auto k5 = Set(d, +0.0425049886107444763183594f);
527  const auto k6 = Set(d, -0.0159569028764963150024414f);
528  const auto k7 = Set(d, +0.00282363896258175373077393f);
529 
530  const auto y = Mul(x, x);
531  return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
532  }
533 };
534 
535 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
536 
537 template <>
538 struct AtanImpl<double> {
539  // Polynomial approximation for atan(x) over the range [0, 1.0).
540  template <class D, class V>
541  HWY_INLINE V AtanPoly(D d, V x) {
542  const auto k0 = Set(d, -0.333333333333311110369124);
543  const auto k1 = Set(d, +0.199999999996591265594148);
544  const auto k2 = Set(d, -0.14285714266771329383765);
545  const auto k3 = Set(d, +0.111111105648261418443745);
546  const auto k4 = Set(d, -0.090908995008245008229153);
547  const auto k5 = Set(d, +0.0769219538311769618355029);
548  const auto k6 = Set(d, -0.0666573579361080525984562);
549  const auto k7 = Set(d, +0.0587666392926673580854313);
550  const auto k8 = Set(d, -0.0523674852303482457616113);
551  const auto k9 = Set(d, +0.0466667150077840625632675);
552  const auto k10 = Set(d, -0.0407629191276836500001934);
553  const auto k11 = Set(d, +0.0337852580001353069993897);
554  const auto k12 = Set(d, -0.0254517624932312641616861);
555  const auto k13 = Set(d, +0.016599329773529201970117);
556  const auto k14 = Set(d, -0.00889896195887655491740809);
557  const auto k15 = Set(d, +0.00370026744188713119232403);
558  const auto k16 = Set(d, -0.00110611831486672482563471);
559  const auto k17 = Set(d, +0.000209850076645816976906797);
560  const auto k18 = Set(d, -1.88796008463073496563746e-5);
561 
562  const auto y = Mul(x, x);
563  return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
564  k12, k13, k14, k15, k16, k17, k18),
565  Mul(y, x), x);
566  }
567 };
568 
569 #endif
570 
571 template <>
572 struct CosSinImpl<float> {
573  // Rounds float toward zero and returns as int32_t.
574  template <class D, class V>
576  return ConvertTo(Rebind<int32_t, D>(), x);
577  }
578 
579  template <class D, class V>
580  HWY_INLINE V Poly(D d, V x) {
581  const auto k0 = Set(d, -1.66666597127914428710938e-1f);
582  const auto k1 = Set(d, +8.33307858556509017944336e-3f);
583  const auto k2 = Set(d, -1.981069071916863322258e-4f);
584  const auto k3 = Set(d, +2.6083159809786593541503e-6f);
585 
586  const auto y = Mul(x, x);
587  return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
588  }
589 
590  template <class D, class V, class VI32>
591  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
592  // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2
593  const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f);
594  const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f);
595  const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f);
596  const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f);
597 
598  // Extended precision modular arithmetic.
599  const V qf = ConvertTo(d, q);
600  x = MulAdd(qf, kHalfPiPart0f, x);
601  x = MulAdd(qf, kHalfPiPart1f, x);
602  x = MulAdd(qf, kHalfPiPart2f, x);
603  x = MulAdd(qf, kHalfPiPart3f, x);
604  return x;
605  }
606 
607  template <class D, class V, class VI32>
608  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
609  // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi
610  const V kPiPart0f = Set(d, -3.140625f);
611  const V kPiPart1f = Set(d, -0.0009670257568359375f);
612  const V kPiPart2f = Set(d, -6.2771141529083251953e-7f);
613  const V kPiPart3f = Set(d, -1.2154201256553420762e-10f);
614 
615  // Extended precision modular arithmetic.
616  const V qf = ConvertTo(d, q);
617  x = MulAdd(qf, kPiPart0f, x);
618  x = MulAdd(qf, kPiPart1f, x);
619  x = MulAdd(qf, kPiPart2f, x);
620  x = MulAdd(qf, kPiPart3f, x);
621  return x;
622  }
623 
624  // (q & 2) == 0 ? -0.0 : +0.0
625  template <class D, class VI32>
627  const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
628  return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo)));
629  }
630 
631  // ((q & 1) ? -0.0 : +0.0)
632  template <class D, class VI32>
634  const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
635  return BitCast(d, ShiftLeft<31>(And(q, kOne)));
636  }
637 };
638 
639 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
640 
641 template <>
642 struct CosSinImpl<double> {
643  // Rounds double toward zero and returns as int32_t.
644  template <class D, class V>
645  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
646  return DemoteTo(Rebind<int32_t, D>(), x);
647  }
648 
649  template <class D, class V>
650  HWY_INLINE V Poly(D d, V x) {
651  const auto k0 = Set(d, -0.166666666666666657414808);
652  const auto k1 = Set(d, +0.00833333333333332974823815);
653  const auto k2 = Set(d, -0.000198412698412696162806809);
654  const auto k3 = Set(d, +2.75573192239198747630416e-6);
655  const auto k4 = Set(d, -2.50521083763502045810755e-8);
656  const auto k5 = Set(d, +1.60590430605664501629054e-10);
657  const auto k6 = Set(d, -7.64712219118158833288484e-13);
658  const auto k7 = Set(d, +2.81009972710863200091251e-15);
659  const auto k8 = Set(d, -7.97255955009037868891952e-18);
660 
661  const auto y = Mul(x, x);
662  return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
663  }
664 
665  template <class D, class V, class VI32>
666  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
667  // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2
668  const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914);
669  const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8);
670  const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16);
671  const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24);
672 
673  // Extended precision modular arithmetic.
674  const V qf = PromoteTo(d, q);
675  x = MulAdd(qf, kHalfPiPart0d, x);
676  x = MulAdd(qf, kHalfPiPart1d, x);
677  x = MulAdd(qf, kHalfPiPart2d, x);
678  x = MulAdd(qf, kHalfPiPart3d, x);
679  return x;
680  }
681 
682  template <class D, class V, class VI32>
683  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
684  // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi
685  const V kPiPart0d = Set(d, -3.1415926218032836914);
686  const V kPiPart1d = Set(d, -3.1786509424591713469e-8);
687  const V kPiPart2d = Set(d, -1.2246467864107188502e-16);
688  const V kPiPart3d = Set(d, -1.2736634327021899816e-24);
689 
690  // Extended precision modular arithmetic.
691  const V qf = PromoteTo(d, q);
692  x = MulAdd(qf, kPiPart0d, x);
693  x = MulAdd(qf, kPiPart1d, x);
694  x = MulAdd(qf, kPiPart2d, x);
695  x = MulAdd(qf, kPiPart3d, x);
696  return x;
697  }
698 
699  // (q & 2) == 0 ? -0.0 : +0.0
700  template <class D, class VI32>
701  HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) {
702  const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
703  return BitCast(
704  d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo))));
705  }
706 
707  // ((q & 1) ? -0.0 : +0.0)
708  template <class D, class VI32>
709  HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) {
710  const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
711  return BitCast(
712  d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne))));
713  }
714 };
715 
716 #endif
717 
718 template <>
719 struct ExpImpl<float> {
720  // Rounds float toward zero and returns as int32_t.
721  template <class D, class V>
723  return ConvertTo(Rebind<int32_t, D>(), x);
724  }
725 
726  template <class D, class V>
727  HWY_INLINE V ExpPoly(D d, V x) {
728  const auto k0 = Set(d, +0.5f);
729  const auto k1 = Set(d, +0.166666671633720397949219f);
730  const auto k2 = Set(d, +0.0416664853692054748535156f);
731  const auto k3 = Set(d, +0.00833336077630519866943359f);
732  const auto k4 = Set(d, +0.00139304355252534151077271f);
733  const auto k5 = Set(d, +0.000198527617612853646278381f);
734 
735  return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
736  }
737 
738  // Computes 2^x, where x is an integer.
739  template <class D, class VI32>
740  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
741  const Rebind<int32_t, D> di32;
742  const VI32 kOffset = Set(di32, 0x7F);
743  return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
744  }
745 
746  // Sets the exponent of 'x' to 2^e.
747  template <class D, class V, class VI32>
748  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
749  const VI32 y = ShiftRight<1>(e);
750  return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
751  }
752 
753  template <class D, class V, class VI32>
754  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
755  // kLn2Part0f + kLn2Part1f ~= -ln(2)
756  const V kLn2Part0f = Set(d, -0.693145751953125f);
757  const V kLn2Part1f = Set(d, -1.428606765330187045e-6f);
758 
759  // Extended precision modular arithmetic.
760  const V qf = ConvertTo(d, q);
761  x = MulAdd(qf, kLn2Part0f, x);
762  x = MulAdd(qf, kLn2Part1f, x);
763  return x;
764  }
765 };
766 
767 template <>
768 struct LogImpl<float> {
769  template <class D, class V>
771  const Rebind<int32_t, D> di32;
772  const Rebind<uint32_t, D> du32;
773  const auto kBias = Set(di32, 0x7F);
774  return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
775  }
776 
777  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
778  template <class D, class V>
779  HWY_INLINE V LogPoly(D d, V x) {
780  const V k0 = Set(d, 0.66666662693f);
781  const V k1 = Set(d, 0.40000972152f);
782  const V k2 = Set(d, 0.28498786688f);
783  const V k3 = Set(d, 0.24279078841f);
784 
785  const V x2 = Mul(x, x);
786  const V x4 = Mul(x2, x2);
787  return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
788  }
789 };
790 
791 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
792 template <>
793 struct ExpImpl<double> {
794  // Rounds double toward zero and returns as int32_t.
795  template <class D, class V>
796  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
797  return DemoteTo(Rebind<int32_t, D>(), x);
798  }
799 
800  template <class D, class V>
801  HWY_INLINE V ExpPoly(D d, V x) {
802  const auto k0 = Set(d, +0.5);
803  const auto k1 = Set(d, +0.166666666666666851703837);
804  const auto k2 = Set(d, +0.0416666666666665047591422);
805  const auto k3 = Set(d, +0.00833333333331652721664984);
806  const auto k4 = Set(d, +0.00138888888889774492207962);
807  const auto k5 = Set(d, +0.000198412698960509205564975);
808  const auto k6 = Set(d, +2.4801587159235472998791e-5);
809  const auto k7 = Set(d, +2.75572362911928827629423e-6);
810  const auto k8 = Set(d, +2.75573911234900471893338e-7);
811  const auto k9 = Set(d, +2.51112930892876518610661e-8);
812  const auto k10 = Set(d, +2.08860621107283687536341e-9);
813 
814  return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
815  Mul(x, x), x);
816  }
817 
818  // Computes 2^x, where x is an integer.
819  template <class D, class VI32>
820  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
821  const Rebind<int32_t, D> di32;
822  const Rebind<int64_t, D> di64;
823  const VI32 kOffset = Set(di32, 0x3FF);
824  return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
825  }
826 
827  // Sets the exponent of 'x' to 2^e.
828  template <class D, class V, class VI32>
829  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
830  const VI32 y = ShiftRight<1>(e);
831  return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
832  }
833 
834  template <class D, class V, class VI32>
835  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
836  // kLn2Part0d + kLn2Part1d ~= -ln(2)
837  const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
838  const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);
839 
840  // Extended precision modular arithmetic.
841  const V qf = PromoteTo(d, q);
842  x = MulAdd(qf, kLn2Part0d, x);
843  x = MulAdd(qf, kLn2Part1d, x);
844  return x;
845  }
846 };
847 
848 template <>
849 struct LogImpl<double> {
850  template <class D, class V>
851  HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
852  const Rebind<int64_t, D> di64;
853  const Rebind<uint64_t, D> du64;
854  return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
855  Set(di64, 0x3FF));
856  }
857 
858  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
859  template <class D, class V>
860  HWY_INLINE V LogPoly(D d, V x) {
861  const V k0 = Set(d, 0.6666666666666735130);
862  const V k1 = Set(d, 0.3999999999940941908);
863  const V k2 = Set(d, 0.2857142874366239149);
864  const V k3 = Set(d, 0.2222219843214978396);
865  const V k4 = Set(d, 0.1818357216161805012);
866  const V k5 = Set(d, 0.1531383769920937332);
867  const V k6 = Set(d, 0.1479819860511658591);
868 
869  const V x2 = Mul(x, x);
870  const V x4 = Mul(x2, x2);
871  return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
872  (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
873  }
874 };
875 
876 #endif
877 
878 template <class D, class V, bool kAllowSubnormals = true>
879 HWY_INLINE V Log(const D d, V x) {
880  // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
881  using T = TFromD<D>;
882  impl::LogImpl<T> impl;
883 
884  constexpr bool kIsF32 = (sizeof(T) == 4);
885 
886  // Float Constants
887  const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
888  : static_cast<T>(0.693147180369123816490));
889  const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
890  : static_cast<T>(1.90821492927058770002e-10));
891  const V kOne = Set(d, static_cast<T>(+1.0));
892  const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
893  : static_cast<T>(2.2250738585072014e-308));
894  const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
895  : static_cast<T>(1.8014398509481984e+16));
896 
897  // Integer Constants
898  using TI = MakeSigned<T>;
899  const Rebind<TI, D> di;
900  using VI = decltype(Zero(di));
901  const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
902  : static_cast<TI>(0xFFFFFFFFLL));
903  const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
904  : static_cast<TI>(0x3FE6A09E00000000LL));
905  const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
906  : static_cast<TI>(0x3FF0000000000000LL));
907  const VI kExpScale =
908  Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
909  const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
910  : static_cast<TI>(0xFFFFF00000000LL));
911 
912  // Scale up 'x' so that it is no longer denormalized.
913  VI exp_bits;
914  V exp;
915  if (kAllowSubnormals == true) {
916  const auto is_denormal = Lt(x, kMinNormal);
917  x = IfThenElse(is_denormal, Mul(x, kScale), x);
918 
919  // Compute the new exponent.
920  exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
921  const VI exp_scale =
922  BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
923  exp = ConvertTo(
924  d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
925  } else {
926  // Compute the new exponent.
927  exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
928  exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
929  }
930 
931  // Renormalize.
932  const V y = Or(And(x, BitCast(d, kLowerBits)),
933  BitCast(d, Add(And(exp_bits, kManMask), kMagic)));
934 
935  // Approximate and reconstruct.
936  const V ym1 = Sub(y, kOne);
937  const V z = Div(ym1, Add(y, kOne));
938 
939  return MulSub(
940  exp, kLn2Hi,
941  Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
942 }
943 
944 } // namespace impl
945 
946 template <class D, class V>
947 HWY_INLINE V Acos(const D d, V x) {
948  using T = TFromD<D>;
949 
950  const V kZero = Zero(d);
951  const V kHalf = Set(d, static_cast<T>(+0.5));
952  const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
953  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
954 
955  const V sign_x = And(SignBit(d), x);
956  const V abs_x = Xor(x, sign_x);
957  const auto mask = Lt(abs_x, kHalf);
958  const V yy =
959  IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
960  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
961 
962  impl::AsinImpl<T> impl;
963  const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
964 
965  const V t_plus_y = Add(t, y);
966  const V z =
967  IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
968  Add(t_plus_y, t_plus_y));
969  return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
970 }
971 
972 template <class D, class V>
973 HWY_INLINE V Acosh(const D d, V x) {
974  using T = TFromD<D>;
975 
976  const V kLarge = Set(d, static_cast<T>(268435456.0));
977  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
978  const V kOne = Set(d, static_cast<T>(+1.0));
979  const V kTwo = Set(d, static_cast<T>(+2.0));
980 
981  const auto is_x_large = Gt(x, kLarge);
982  const auto is_x_gt_2 = Gt(x, kTwo);
983 
984  const V x_minus_1 = Sub(x, kOne);
985  const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
986  const V y1 =
987  Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
988  const V y2 =
989  IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
990  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
991 
992  const auto is_pole = Eq(y2, kOne);
993  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
994  return Add(IfThenElse(is_x_gt_2, z,
995  IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
996  IfThenElseZero(is_x_large, kLog2));
997 }
998 
999 template <class D, class V>
1000 HWY_INLINE V Asin(const D d, V x) {
1001  using T = TFromD<D>;
1002 
1003  const V kHalf = Set(d, static_cast<T>(+0.5));
1004  const V kTwo = Set(d, static_cast<T>(+2.0));
1005  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
1006 
1007  const V sign_x = And(SignBit(d), x);
1008  const V abs_x = Xor(x, sign_x);
1009  const auto mask = Lt(abs_x, kHalf);
1010  const V yy =
1011  IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
1012  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
1013 
1014  impl::AsinImpl<T> impl;
1015  const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
1016  const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
1017  return Or(IfThenElse(mask, z0, z1), sign_x);
1018 }
1019 
1020 template <class D, class V>
1021 HWY_INLINE V Asinh(const D d, V x) {
1022  using T = TFromD<D>;
1023 
1024  const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
1025  const V kLarge = Set(d, static_cast<T>(268435456.0));
1026  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
1027  const V kOne = Set(d, static_cast<T>(+1.0));
1028  const V kTwo = Set(d, static_cast<T>(+2.0));
1029 
1030  const V sign_x = And(SignBit(d), x); // Extract the sign bit
1031  const V abs_x = Xor(x, sign_x);
1032 
1033  const auto is_x_large = Gt(abs_x, kLarge);
1034  const auto is_x_lt_2 = Lt(abs_x, kTwo);
1035 
1036  const V x2 = Mul(x, x);
1037  const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));
1038 
1039  const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
1040  const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
1041  const V y2 =
1042  IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
1043  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
1044 
1045  const auto is_pole = Eq(y2, kOne);
1046  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
1047  const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
1048  const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
1049  return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
1050  sign_x);
1051 }
1052 
1053 template <class D, class V>
1054 HWY_INLINE V Atan(const D d, V x) {
1055  using T = TFromD<D>;
1056 
1057  const V kOne = Set(d, static_cast<T>(+1.0));
1058  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
1059 
1060  const V sign = And(SignBit(d), x);
1061  const V abs_x = Xor(x, sign);
1062  const auto mask = Gt(abs_x, kOne);
1063 
1064  impl::AtanImpl<T> impl;
1065  const auto divisor = IfThenElse(mask, abs_x, kOne);
1066  const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
1067  return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
1068 }
1069 
1070 template <class D, class V>
1071 HWY_INLINE V Atanh(const D d, V x) {
1072  using T = TFromD<D>;
1073 
1074  const V kHalf = Set(d, static_cast<T>(+0.5));
1075  const V kOne = Set(d, static_cast<T>(+1.0));
1076 
1077  const V sign = And(SignBit(d), x); // Extract the sign bit
1078  const V abs_x = Xor(x, sign);
1079  return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
1080  Xor(kHalf, sign));
1081 }
1082 
1083 template <class D, class V>
1084 HWY_INLINE V Cos(const D d, V x) {
1085  using T = TFromD<D>;
1086  impl::CosSinImpl<T> impl;
1087 
1088  // Float Constants
1089  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
1090 
1091  // Integer Constants
1092  const Rebind<int32_t, D> di32;
1093  using VI32 = decltype(Zero(di32));
1094  const VI32 kOne = Set(di32, 1);
1095 
1096  const V y = Abs(x); // cos(x) == cos(|x|)
1097 
1098  // Compute the quadrant, q = int(|x| / pi) * 2 + 1
1099  const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);
1100 
1101  // Reduce range, apply sign, and approximate.
1102  return impl.Poly(
1103  d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
1104 }
1105 
1106 template <class D, class V>
1107 HWY_INLINE V Exp(const D d, V x) {
1108  using T = TFromD<D>;
1109 
1110  const V kHalf = Set(d, static_cast<T>(+0.5));
1111  const V kLowerBound =
1112  Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
1113  const V kNegZero = Set(d, static_cast<T>(-0.0));
1114  const V kOne = Set(d, static_cast<T>(+1.0));
1115  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
1116 
1117  impl::ExpImpl<T> impl;
1118 
1119  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
1120  const auto q =
1121  impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
1122 
1123  // Reduce, approximate, and then reconstruct.
1124  const V y = impl.LoadExpShortRange(
1125  d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
1126  return IfThenElseZero(Ge(x, kLowerBound), y);
1127 }
1128 
1129 template <class D, class V>
1130 HWY_INLINE V Expm1(const D d, V x) {
1131  using T = TFromD<D>;
1132 
1133  const V kHalf = Set(d, static_cast<T>(+0.5));
1134  const V kLowerBound =
1135  Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
1136  const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
1137  const V kNegOne = Set(d, static_cast<T>(-1.0));
1138  const V kNegZero = Set(d, static_cast<T>(-0.0));
1139  const V kOne = Set(d, static_cast<T>(+1.0));
1140  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
1141 
1142  impl::ExpImpl<T> impl;
1143 
1144  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
1145  const auto q =
1146  impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
1147 
1148  // Reduce, approximate, and then reconstruct.
1149  const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
1150  const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
1151  Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
1152  return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
1153 }
1154 
1155 template <class D, class V>
1156 HWY_INLINE V Log(const D d, V x) {
1157  return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x);
1158 }
1159 
1160 template <class D, class V>
1161 HWY_INLINE V Log10(const D d, V x) {
1162  using T = TFromD<D>;
1163  return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
1164 }
1165 
1166 template <class D, class V>
1167 HWY_INLINE V Log1p(const D d, V x) {
1168  using T = TFromD<D>;
1169  const V kOne = Set(d, static_cast<T>(+1.0));
1170 
1171  const V y = Add(x, kOne);
1172  const auto is_pole = Eq(y, kOne);
1173  const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
1174  const auto non_pole =
1175  Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
1176  return IfThenElse(is_pole, x, non_pole);
1177 }
1178 
1179 template <class D, class V>
1180 HWY_INLINE V Log2(const D d, V x) {
1181  using T = TFromD<D>;
1182  return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
1183 }
1184 
1185 template <class D, class V>
1186 HWY_INLINE V Sin(const D d, V x) {
1187  using T = TFromD<D>;
1188  impl::CosSinImpl<T> impl;
1189 
1190  // Float Constants
1191  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
1192  const V kHalf = Set(d, static_cast<T>(0.5));
1193 
1194  // Integer Constants
1195  const Rebind<int32_t, D> di32;
1196  using VI32 = decltype(Zero(di32));
1197 
1198  const V abs_x = Abs(x);
1199  const V sign_x = Xor(abs_x, x);
1200 
1201  // Compute the quadrant, q = int((|x| / pi) + 0.5)
1202  const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));
1203 
1204  // Reduce range, apply sign, and approximate.
1205  return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
1206  Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
1207 }
1208 
1209 template <class D, class V>
1210 HWY_INLINE V Sinh(const D d, V x) {
1211  using T = TFromD<D>;
1212  const V kHalf = Set(d, static_cast<T>(+0.5));
1213  const V kOne = Set(d, static_cast<T>(+1.0));
1214  const V kTwo = Set(d, static_cast<T>(+2.0));
1215 
1216  const V sign = And(SignBit(d), x); // Extract the sign bit
1217  const V abs_x = Xor(x, sign);
1218  const V y = Expm1(d, abs_x);
1219  const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
1220  return Xor(z, sign); // Reapply the sign bit
1221 }
1222 
1223 template <class D, class V>
1224 HWY_INLINE V Tanh(const D d, V x) {
1225  using T = TFromD<D>;
1226  const V kLimit = Set(d, static_cast<T>(18.714973875));
1227  const V kOne = Set(d, static_cast<T>(+1.0));
1228  const V kTwo = Set(d, static_cast<T>(+2.0));
1229 
1230  const V sign = And(SignBit(d), x); // Extract the sign bit
1231  const V abs_x = Xor(x, sign);
1232  const V y = Expm1(d, Mul(abs_x, kTwo));
1233  const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
1234  return Xor(z, sign); // Reapply the sign bit
1235 }
1236 
1237 // NOLINTNEXTLINE(google-readability-namespace-comments)
1238 } // namespace HWY_NAMESPACE
1239 } // namespace hwy
1241 
1242 #endif // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
#define HWY_NOINLINE
Definition: base.h:63
#define HWY_INLINE
Definition: base.h:62
#define HWY_MAYBE_UNUSED
Definition: base.h:73
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1)
Definition: math-inl.h:281
HWY_INLINE V Log(const D d, V x)
Definition: math-inl.h:879
d
Definition: rvv-inl.h:1742
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_NOINLINE V CallSin(const D d, VecArg< V > x)
Definition: math-inl.h:237
V VecArg
Definition: ops/shared-inl.h:306
HWY_NOINLINE V CallAsin(const D d, VecArg< V > x)
Definition: math-inl.h:72
HWY_INLINE V Atan(const D d, V x)
Highway SIMD version of std::atan(x).
Definition: math-inl.h:1054
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6309
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6301
HWY_INLINE V Cos(const D d, V x)
Highway SIMD version of std::cos(x).
Definition: math-inl.h:1084
HWY_NOINLINE V CallAcos(const D d, VecArg< V > x)
Definition: math-inl.h:42
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6314
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1784
HWY_INLINE V Sin(const D d, V x)
Highway SIMD version of std::sin(x).
Definition: math-inl.h:1186
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:2105
HWY_INLINE V Exp(const D d, V x)
Highway SIMD version of std::exp(x).
Definition: math-inl.h:1107
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:6318
HWY_INLINE V Log10(const D d, V x)
Highway SIMD version of std::log10(x).
Definition: math-inl.h:1161
HWY_INLINE V Log1p(const D d, V x)
Highway SIMD version of std::log1p(x).
Definition: math-inl.h:1167
HWY_NOINLINE V CallExpm1(const D d, VecArg< V > x)
Definition: math-inl.h:162
HWY_NOINLINE V CallLog1p(const D d, VecArg< V > x)
Definition: math-inl.h:207
HWY_INLINE V Atanh(const D d, V x)
Highway SIMD version of std::atanh(x).
Definition: math-inl.h:1071
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3273
HWY_NOINLINE V CallLog10(const D d, VecArg< V > x)
Definition: math-inl.h:192
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:2212
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:6274
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition: emu128-inl.h:325
HWY_NOINLINE V CallLog2(const D d, VecArg< V > x)
Definition: math-inl.h:222
HWY_NOINLINE V CallExp(const D d, VecArg< V > x)
Definition: math-inl.h:147
HWY_NOINLINE V CallAtanh(const D d, VecArg< V > x)
Definition: math-inl.h:117
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1838
HWY_INLINE V Log2(const D d, V x)
Highway SIMD version of std::log2(x).
Definition: math-inl.h:1180
HWY_INLINE V Acos(const D d, V x)
Highway SIMD version of std::acos(x).
Definition: math-inl.h:947
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:312
HWY_NOINLINE V CallAtan(const D d, VecArg< V > x)
Definition: math-inl.h:102
HWY_INLINE V Acosh(const D d, V x)
Highway SIMD version of std::acosh(x).
Definition: math-inl.h:973
HWY_NOINLINE V CallLog(const D d, VecArg< V > x)
Definition: math-inl.h:177
HWY_INLINE V Tanh(const D d, V x)
Highway SIMD version of std::tanh(x).
Definition: math-inl.h:1224
HWY_INLINE V Log(const D d, V x)
Highway SIMD version of std::log(x).
Definition: math-inl.h:1156
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1934
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:988
HWY_INLINE V Asin(const D d, V x)
Highway SIMD version of std::asin(x).
Definition: math-inl.h:1000
HWY_INLINE V Asinh(const D d, V x)
Highway SIMD version of std::asinh(x).
Definition: math-inl.h:1021
HWY_NOINLINE V CallAsinh(const D d, VecArg< V > x)
Definition: math-inl.h:87
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:6278
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:195
HWY_INLINE V Expm1(const D d, V x)
Highway SIMD version of std::expm1(x).
Definition: math-inl.h:1130
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:1011
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:2219
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1983
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1817
HWY_NOINLINE V CallCos(const D d, VecArg< V > x)
Definition: math-inl.h:132
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1898
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:3091
HWY_NOINLINE V CallSinh(const D d, VecArg< V > x)
Definition: math-inl.h:252
HWY_INLINE V Sinh(const D d, V x)
Highway SIMD version of std::sinh(x).
Definition: math-inl.h:1210
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1949
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:6287
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:6283
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2911
HWY_NOINLINE V CallTanh(const D d, VecArg< V > x)
Definition: math-inl.h:267
typename D::T TFromD
Definition: ops/shared-inl.h:191
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1971
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
HWY_NOINLINE V CallAcosh(const D d, VecArg< V > x)
Definition: math-inl.h:57
Definition: aligned_allocator.h:27
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:505
#define HWY_NAMESPACE
Definition: set_macros-inl.h:82
HWY_INLINE V AsinPoly(D d, V x2, V)
Definition: math-inl.h:479
Definition: math-inl.h:465
HWY_INLINE V AtanPoly(D d, V x)
Definition: math-inl.h:520
Definition: math-inl.h:467
HWY_INLINE Vec< Rebind< float, D > > CosSignFromQuadrant(D d, VI32 q)
Definition: math-inl.h:626
HWY_INLINE V SinReduce(D d, V x, VI32 q)
Definition: math-inl.h:608
HWY_INLINE Vec< Rebind< int32_t, D > > ToInt32(D, V x)
Definition: math-inl.h:575
HWY_INLINE V Poly(D d, V x)
Definition: math-inl.h:580
HWY_INLINE Vec< Rebind< float, D > > SinSignFromQuadrant(D d, VI32 q)
Definition: math-inl.h:633
HWY_INLINE V CosReduce(D d, V x, VI32 q)
Definition: math-inl.h:591
Definition: math-inl.h:469
HWY_INLINE V ExpReduce(D d, V x, VI32 q)
Definition: math-inl.h:754
HWY_INLINE Vec< Rebind< int32_t, D > > ToInt32(D, V x)
Definition: math-inl.h:722
HWY_INLINE V ExpPoly(D d, V x)
Definition: math-inl.h:727
HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e)
Definition: math-inl.h:748
HWY_INLINE Vec< D > Pow2I(D d, VI32 x)
Definition: math-inl.h:740
Definition: math-inl.h:471
HWY_INLINE V LogPoly(D d, V x)
Definition: math-inl.h:779
HWY_INLINE Vec< Rebind< int32_t, D > > Log2p1NoSubnormal(D, V x)
Definition: math-inl.h:770
Definition: math-inl.h:473