S

#### Simd

##### Guest

Code:

```
import numba as nb
@nb.njit
def neumaier_sum(arr):
s = arr[0]
c = 0.0
for i in range(1, len(arr)):
t = s + arr[i]
if abs(s) >= abs(arr[i]):
c += (s - t) + arr[i]
else:
c += (arr[i] - t) + s
s = t
return s + c
```

This works well but it is at least four times slower than it would be were you to add fastmath=True. Unfortunately, fastmath allows rebracketing the sum (associativity) which has the effect of ruining its accuracy so we can't do that.

Here is a test to show the results with different ways of summing. First we make an array of length 1001.

Code:

```
import numpy as np
n = 10 ** 3 + 1
a = np.full(n, 0.01, dtype=np.float64)
a[0] = 10**10
a[-1] = -10**10
```

We

`import math`

so we can use `fsum`

to see the correct answer. We also define a version Neumaier using fastmath.
Code:

```
# Bad version using fastmath
@nb.njit(fastmath=True)
def neumaier_sum_fm(arr):
s = arr[0]
c = 0.0
for i in range(1, len(arr)):
t = s + arr[i]
if abs(s) >= abs(arr[i]):
c += (s - t) + arr[i]
else:
c += (arr[i] - t) + s
s = t
return s + c
```

The results are:

Code:

```
math.fsum : 9.99
nb_neumaier_sum : 9.99
nb_neumaier_sum_fm: 9.99001693725586
```

These are the timings:

Code:

```
%timeit nb_neumaier_sum_fm(a)
350 ns ± 0.983 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
%timeit nb_neumaier_sum(a)
1.5 µs ± 18.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
```

Is there any way to speed up this code but have it output exactly what it does now?

The assembly is as follows. The comments were added by chatgpt so please let me know if any of it is wrong.

Code:

```
movq 8(%rsp), %rcx # Load the address of the array into rcx
movq 16(%rsp), %rax # Load the length of the array into rax
vmovsd (%rcx), %xmm1 # Load the first element of the array into xmm1 (s = arr[0])
leaq -1(%rax), %rdx # Calculate (length of array - 1) into rdx
testq %rdx, %rdx # Test if rdx is zero
jle .LBB0_1 # If rdx <= 0, jump to .LBB0_1
movabsq $.LCPI0_0, %rsi # Load address of constant value (mask for abs)
movl $1, %edx # Initialize loop counter (i = 1)
vxorpd %xmm0, %xmm0, %xmm0 # Clear xmm0 (c = 0.0)
vmovapd %xmm1, %xmm3 # Copy xmm1 (s) to xmm3
vmovapd (%rsi), %xmm2 # Load abs mask into xmm2
.p2align 4, 0x90
.LBB0_3:
vmovsd (%rcx,%rdx,8), %xmm4 # Load arr[i] into xmm4
vandpd %xmm2, %xmm3, %xmm5 # Compute abs(s) into xmm5
incq %rdx # Increment rdx (i++)
vaddsd %xmm4, %xmm3, %xmm1 # Compute t = s + arr[i]
vandpd %xmm2, %xmm4, %xmm6 # Compute abs(arr[i]) into xmm6
vcmpnlesd %xmm5, %xmm6, %xmm5 # Compare abs(s) and abs(arr[i])
vsubsd %xmm1, %xmm3, %xmm6 # Compute (s - t) into xmm6
vaddsd %xmm6, %xmm4, %xmm6 # Compute (s - t) + arr[i] into xmm6
vsubsd %xmm1, %xmm4, %xmm4 # Compute (arr[i] - t) into xmm4
vaddsd %xmm4, %xmm3, %xmm3 # Compute (arr[i] - t) + s into xmm3
vblendvpd %xmm5, %xmm3, %xmm6, %xmm3 # Conditional blend based on the comparison
vaddsd %xmm3, %xmm0, %xmm0 # Update c
vmovapd %xmm1, %xmm3 # Update s (s = t)
cmpq %rdx, %rax # Compare loop counter with array length
jne .LBB0_3 # If not equal, loop again
vaddsd %xmm1, %xmm0, %xmm0 # Final addition (s + c)
xorl %eax, %eax # Clear eax
vmovsd %xmm0, (%rdi) # Store the result
retq # Return from function
```

<pre><code>import numba as nb

@nb.njit

def neumaier_sum(arr):

s = arr[0]

c = 0.0

for i in range(1, len(arr)):

t = s + arr

if abs(s) >= abs(arr

if abs(s) >= abs(arr

*):*

c += (s - t) + arrc += (s - t) + arr

else:

c += (arrelse:

c += (arr

*- t) + s*

s = t

return s + c

</code></pre>

<p>This works well but it is at least four times slower than it would be were you to add fastmath=True. Unfortunately, fastmath allows rebracketing the sum (associativity) which has the effect of ruining its accuracy so we can't do that.</p>

<p>Here is a test to show the results with different ways of summing. First we make an array of length 1001.</p>

<pre><code>import numpy as np

n = 10 ** 3 + 1

a = np.full(n, 0.01, dtype=np.float64)

a[0] = 10**10

a[-1] = -10**10

</code></pre>

<p>We <code>import math</code> so we can use <code>fsum</code> to see the correct answer. We also define a version Neumaier using fastmath.</p>

<pre><code># Bad version using fastmath

@nb.njit(fastmath=True)

def neumaier_sum_fm(arr):

s = arr[0]

c = 0.0

for i in range(1, len(arr)):

t = s + arrs = t

return s + c

</code></pre>

<p>This works well but it is at least four times slower than it would be were you to add fastmath=True. Unfortunately, fastmath allows rebracketing the sum (associativity) which has the effect of ruining its accuracy so we can't do that.</p>

<p>Here is a test to show the results with different ways of summing. First we make an array of length 1001.</p>

<pre><code>import numpy as np

n = 10 ** 3 + 1

a = np.full(n, 0.01, dtype=np.float64)

a[0] = 10**10

a[-1] = -10**10

</code></pre>

<p>We <code>import math</code> so we can use <code>fsum</code> to see the correct answer. We also define a version Neumaier using fastmath.</p>

<pre><code># Bad version using fastmath

@nb.njit(fastmath=True)

def neumaier_sum_fm(arr):

s = arr[0]

c = 0.0

for i in range(1, len(arr)):

t = s + arr

if abs(s) >= abs(arrif abs(s) >= abs(arr

*):*

c += (s - t) + arrc += (s - t) + arr

else:

c += (arrelse:

c += (arr

*- t) + s*

s = t

return s + c

</code></pre>

<p>The results are:</p>

<pre><code>math.fsum : 9.99

nb_neumaier_sum : 9.99

nb_neumaier_sum_fm: 9.99001693725586

</code></pre>

<p>These are the timings:</p>

<pre><code> %timeit nb_neumaier_sum_fm(a)

350 ns ± 0.983 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

%timeit nb_neumaier_sum(a)

1.5 µs ± 18.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

</code></pre>

<p>Is there any way to speed up this code but have it output exactly what it does now?</p>

<hr />

<p>The assembly is as follows. The comments were added by chatgpt so please let me know if any of it is wrong.</p>

<pre><code> movq 8(%rsp), %rcx # Load the address of the array into rcx

movq 16(%rsp), %rax # Load the length of the array into rax

vmovsd (%rcx), %xmm1 # Load the first element of the array into xmm1 (s = arr[0])

leaq -1(%rax), %rdx # Calculate (length of array - 1) into rdx

testq %rdx, %rdx # Test if rdx is zero

jle .LBB0_1 # If rdx <= 0, jump to .LBB0_1

movabsq $.LCPI0_0, %rsi # Load address of constant value (mask for abs)

movl $1, %edx # Initialize loop counter (i = 1)

vxorpd %xmm0, %xmm0, %xmm0 # Clear xmm0 (c = 0.0)

vmovapd %xmm1, %xmm3 # Copy xmm1 (s) to xmm3

vmovapd (%rsi), %xmm2 # Load abs mask into xmm2

.p2align 4, 0x90

.LBB0_3:

vmovsd (%rcx,%rdx,8), %xmm4 # Load arrs = t

return s + c

</code></pre>

<p>The results are:</p>

<pre><code>math.fsum : 9.99

nb_neumaier_sum : 9.99

nb_neumaier_sum_fm: 9.99001693725586

</code></pre>

<p>These are the timings:</p>

<pre><code> %timeit nb_neumaier_sum_fm(a)

350 ns ± 0.983 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

%timeit nb_neumaier_sum(a)

1.5 µs ± 18.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

</code></pre>

<p>Is there any way to speed up this code but have it output exactly what it does now?</p>

<hr />

<p>The assembly is as follows. The comments were added by chatgpt so please let me know if any of it is wrong.</p>

<pre><code> movq 8(%rsp), %rcx # Load the address of the array into rcx

movq 16(%rsp), %rax # Load the length of the array into rax

vmovsd (%rcx), %xmm1 # Load the first element of the array into xmm1 (s = arr[0])

leaq -1(%rax), %rdx # Calculate (length of array - 1) into rdx

testq %rdx, %rdx # Test if rdx is zero

jle .LBB0_1 # If rdx <= 0, jump to .LBB0_1

movabsq $.LCPI0_0, %rsi # Load address of constant value (mask for abs)

movl $1, %edx # Initialize loop counter (i = 1)

vxorpd %xmm0, %xmm0, %xmm0 # Clear xmm0 (c = 0.0)

vmovapd %xmm1, %xmm3 # Copy xmm1 (s) to xmm3

vmovapd (%rsi), %xmm2 # Load abs mask into xmm2

.p2align 4, 0x90

.LBB0_3:

vmovsd (%rcx,%rdx,8), %xmm4 # Load arr

*into xmm4*

vandpd %xmm2, %xmm3, %xmm5 # Compute abs(s) into xmm5

incq %rdx # Increment rdx (i++)

vaddsd %xmm4, %xmm3, %xmm1 # Compute t = s + arrvandpd %xmm2, %xmm3, %xmm5 # Compute abs(s) into xmm5

incq %rdx # Increment rdx (i++)

vaddsd %xmm4, %xmm3, %xmm1 # Compute t = s + arr

vandpd %xmm2, %xmm4, %xmm6 # Compute abs(arrvandpd %xmm2, %xmm4, %xmm6 # Compute abs(arr

*) into xmm6*

vcmpnlesd %xmm5, %xmm6, %xmm5 # Compare abs(s) and abs(arrvcmpnlesd %xmm5, %xmm6, %xmm5 # Compare abs(s) and abs(arr

*)*

vsubsd %xmm1, %xmm3, %xmm6 # Compute (s - t) into xmm6

vaddsd %xmm6, %xmm4, %xmm6 # Compute (s - t) + arrvsubsd %xmm1, %xmm3, %xmm6 # Compute (s - t) into xmm6

vaddsd %xmm6, %xmm4, %xmm6 # Compute (s - t) + arr

*into xmm6*

vsubsd %xmm1, %xmm4, %xmm4 # Compute (arrvsubsd %xmm1, %xmm4, %xmm4 # Compute (arr

*- t) into xmm4*

vaddsd %xmm4, %xmm3, %xmm3 # Compute (arrvaddsd %xmm4, %xmm3, %xmm3 # Compute (arr

*- t) + s into xmm3*

vblendvpd %xmm5, %xmm3, %xmm6, %xmm3 # Conditional blend based on the comparison

vaddsd %xmm3, %xmm0, %xmm0 # Update c

vmovapd %xmm1, %xmm3 # Update s (s = t)

cmpq %rdx, %rax # Compare loop counter with array length

jne .LBB0_3 # If not equal, loop again

vaddsd %xmm1, %xmm0, %xmm0 # Final addition (s + c)

xorl %eax, %eax # Clear eax

vmovsd %xmm0, (%rdi) # Store the result

retq # Return from function

</code></pre>vblendvpd %xmm5, %xmm3, %xmm6, %xmm3 # Conditional blend based on the comparison

vaddsd %xmm3, %xmm0, %xmm0 # Update c

vmovapd %xmm1, %xmm3 # Update s (s = t)

cmpq %rdx, %rax # Compare loop counter with array length

jne .LBB0_3 # If not equal, loop again

vaddsd %xmm1, %xmm0, %xmm0 # Final addition (s + c)

xorl %eax, %eax # Clear eax

vmovsd %xmm0, (%rdi) # Store the result

retq # Return from function

</code></pre>