Performance benchmarks#

This notebook benchmarks mlx-sparse against dense MLX matrix operations across a range of matrix sizes, densities, and right-hand-side widths.

Environment: Apple M5, 10-core GPU, macOS 26.0, MLX 0.31, mlx-sparse 0.0.1b0

The key insight: sparse operations scale with the number of non-zeros nnz, not with n². At low densities the advantage is dramatic.

Cost(SpMV) ≈ O(nnz) = O(density × n²)
Cost(dense) ≈ O(n²)
Speedup ≈ 1 / density (ideal)

import time
import statistics
import mlx.core as mx
import numpy as np
import mlx_sparse as ms

# NOTE(review): presumably selects mlx-sparse's CPU backend for everything
# below, while the Environment line above lists a GPU — confirm which device
# the recorded timings actually reflect.
ms.use_cpu()

def bench(fn, warmup=5, iters=50):
    """Time `fn` and report its median latency in milliseconds.

    Args:
        fn: Zero-argument callable; whatever it returns is handed to
            `mx.eval`.
        warmup: Number of untimed calls made before measuring.
        iters: Number of timed calls.

    Returns:
        Median of the `iters` per-call wall-clock durations, in ms.
    """
    def timed_call():
        # One sample: the clock spans both calling `fn` and `mx.eval`.
        start = time.perf_counter()
        mx.eval(fn())
        return time.perf_counter() - start

    # Untimed warm-up calls come first.
    for _ in range(warmup):
        mx.eval(fn())

    samples = [timed_call() for _ in range(iters)]
    # perf_counter reports seconds; scale to milliseconds.
    return statistics.median(samples) * 1000

def make_csr(n, density, seed=0):
    """Build a random n-by-n float32 CSR matrix and its dense counterpart.

    Args:
        n: Number of rows and of columns.
        density: Forwarded as `density` to `ms.random.rand`.
        seed: Forwarded as `rng` to `ms.random.rand`.

    Returns:
        Tuple `(sparse, dense)`: the CSR matrix and the result of calling
        `todense()` on it, the latter already passed through `mx.eval`.
    """
    rand_options = dict(
        density=density,
        format="csr",
        dtype=mx.float32,
        rng=seed,
        index_dtype=mx.int32,
    )
    sparse_mat = ms.random.rand(n, n, **rand_options)
    dense_mat = sparse_mat.todense()
    # Evaluate the dense copy before handing it back.
    mx.eval(dense_mat)
    return sparse_mat, dense_mat

SpMV: sparse vs dense matrix-vector product#

Increasing matrix size from 2k to 16k while the density decreases from 0.05% to 0.003%.

# Header row of the SpMV results table.
print(f"{'shape':<14} {'nnz':<8} {'density':<10} {'sparse_ms':<12} {'dense_ms':<12} {'speedup'}")

# (matrix side length, density) pairs; density shrinks as n grows.
spmv_cases = (
    (2048, 0.00050),
    (4096, 0.00025),
    (8192, 0.0001),
    (16384, 0.00003),
)

for n, density in spmv_cases:
    A, D = make_csr(n, density)

    # Random float32 vector of length n, used as the right-hand side.
    host_vec = np.random.randn(n).astype(np.float32)
    x = mx.array(host_vec)

    def sparse_matvec():
        return A @ x

    def dense_matvec():
        return D @ x

    # Sparse product is timed first, then the dense reference.
    t_sp = bench(sparse_matvec)
    t_dn = bench(dense_matvec)

    shape_str = f"({n},{n})"
    print(f"{shape_str:<14} {A.nnz:<8} {density*100:.3f}%     "
          f"{t_sp:.3f} ms   {t_dn:.3f} ms   {t_dn/t_sp:.1f}x")

shape          nnz      density    sparse_ms    dense_ms     speedup
(2048,2048)    2097     0.050%     0.109 ms   0.262 ms   2.4x
(4096,4096)    4194     0.025%     0.141 ms   0.962 ms   6.8x
(8192,8192)    6711     0.010%     0.134 ms   3.874 ms   29.0x
(16384,16384)  8053     0.003%     0.158 ms   15.600 ms   98.8x

SpMM: sparse vs dense matrix-matrix product#

Two right-hand-side widths (k=16, k=64) at increasing matrix size.

# Header row of the SpMM results table.
print(f"{'shape':<14} {'k':<5} {'density':<10} {'sparse_ms':<12} {'dense_ms':<12} {'speedup'}")

# Every (size, density) pair is run with both right-hand-side widths,
# k=16 before k=64.
size_density_pairs = [(2048, 0.001), (4096, 0.00025), (8192, 0.0001)]
rhs_widths = (16, 64)
spmm_cases = [
    (side, dens, width)
    for side, dens in size_density_pairs
    for width in rhs_widths
]

for n, density, k in spmm_cases:
    A, D = make_csr(n, density)

    # Random float32 right-hand side with k columns.
    host_rhs = np.random.randn(n, k).astype(np.float32)
    B = mx.array(host_rhs)

    def sparse_matmul():
        return A @ B

    def dense_matmul():
        return D @ B

    t_sp = bench(sparse_matmul)
    t_dn = bench(dense_matmul)

    shape_str = f"({n},{n})"
    print(f"{shape_str:<14} {k:<5} {density*100:.3f}%     "
          f"{t_sp:.3f} ms   {t_dn:.3f} ms   {t_dn/t_sp:.1f}x")

shape          k     density    sparse_ms    dense_ms     speedup
(2048,2048)    16    0.100%     0.108 ms   0.454 ms   4.2x
(2048,2048)    64    0.100%     0.096 ms   0.559 ms   5.8x
(4096,4096)    16    0.025%     0.096 ms   2.008 ms   20.9x
(4096,4096)    64    0.025%     0.125 ms   2.387 ms   19.2x
(8192,8192)    16    0.010%     0.102 ms   7.511 ms   73.7x
(8192,8192)    64    0.010%     0.120 ms   9.784 ms   81.2x

Density crossover: when does dense win?#

Below some density threshold, sparse is faster. Above it, the overhead of indirect memory access means dense is better. The threshold varies with matrix size.

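We sweep density at n=4096 to look for the crossover point. In the run recorded below, sparse stays faster at every density tested (up to 5%), so the crossover lies above the swept range.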

n = 4096
# One random float32 vector, reused for every density in the sweep.
host_vec = np.random.randn(n).astype(np.float32)
x = mx.array(host_vec)

print(f"n={n}  SpMV crossover analysis (M5)")
print(f"\n{'density':<12} {'nnz':<10} {'sparse_ms':<12} {'dense_ms':<12} {'ratio (dense/sparse)'}")

# Densities to sweep, from sparsest to densest.
sweep_densities = (0.00001, 0.00005, 0.00025, 0.001, 0.005, 0.01, 0.025, 0.05)

for density in sweep_densities:
    A, D = make_csr(n, density)

    def sparse_matvec():
        return A @ x

    def dense_matvec():
        return D @ x

    t_sp = bench(sparse_matvec)
    t_dn = bench(dense_matvec)

    # ratio > 1 means the dense product took longer than the sparse one.
    ratio = t_dn / t_sp
    if ratio > 1:
        winner = "[sparse faster]"
    else:
        winner = "[dense faster] "

    print(f"{density*100:.4f}%     {A.nnz:<10} {t_sp:.3f} ms   {t_dn:.3f} ms   "
          f"{ratio:.1f}x   {winner}")

n=4096  SpMV crossover analysis (M5)

density      nnz        sparse_ms    dense_ms     ratio (dense/sparse)
0.0010%     168        0.137 ms   1.079 ms   7.8x   [sparse faster]
0.0050%     839        0.094 ms   0.904 ms   9.6x   [sparse faster]
0.0250%     4194       0.090 ms   0.883 ms   9.8x   [sparse faster]
0.1000%     16777      0.090 ms   0.903 ms   10.0x   [sparse faster]
0.5000%     83886      0.091 ms   0.874 ms   9.6x   [sparse faster]
1.0000%     167772     0.100 ms   0.882 ms   8.8x   [sparse faster]
2.5000%     419430     0.127 ms   0.884 ms   7.0x   [sparse faster]
5.0000%     838861     0.163 ms   0.882 ms   5.4x   [sparse faster]

Batched SpMM: multiple RHS tensors#

Batched SpMM (A @ B_batch, where B_batch is rank-3) has no loop overhead: it reshapes internally and runs a single kernel call.

n, density, k = 4096, 0.001, 16
# Only the sparse matrix is used in this section; the dense copy is discarded.
A, _ = make_csr(n, density)
print(f"Batched SpMM vs loop-based reference (n={n}, density={density*100:.1f}%, k={k})")
print(f"\n{'batch':<7} {'batched_ms':<12} {'loop_ms':<10} {'speedup'}")

batch_sizes = (1, 4, 8, 16, 32)

for batch in batch_sizes:
    # Rank-3 float32 right-hand side of shape (batch, n, k).
    host_rhs = np.random.randn(batch, n, k).astype(np.float32)
    B_batch = mx.array(host_rhs)

    # Batched call: one product against the whole rank-3 operand.
    def batched_spmm():
        return A @ B_batch

    t_batched = bench(batched_spmm)

    # Loop reference: one product per slice along the leading axis,
    # stacked back together afterwards.
    slices = [B_batch[i] for i in range(batch)]

    def loop_spmm():
        return mx.stack([A @ rhs for rhs in slices])

    t_loop = bench(loop_spmm)

    print(f"{batch:<7} {t_batched:.3f} ms   {t_loop:.3f} ms  {t_loop/t_batched:.1f}x")

Batched SpMM vs loop-based reference (n=4096, density=0.1%, k=16)

batch   batched_ms   loop_ms    speedup
1       0.123 ms   0.118 ms  1.0x
4       0.148 ms   0.416 ms  2.8x
8       0.178 ms   0.834 ms  4.7x
16      0.268 ms   1.472 ms  5.5x
32      0.403 ms   3.082 ms  7.6x

Performance by dtype#

float16 and bfloat16 can be faster than float32 because they move fewer bytes through memory; on some hardware they merely match float32 because throughput is compute-bound rather than bandwidth-bound.

n, density = 8192, 0.0001
# float32 CSR template; each dtype variant below reuses its indices/indptr.
A_base = ms.random.rand(
    n, n,
    density=density,
    format="csr",
    dtype=mx.float32,
    rng=0,
    index_dtype=mx.int32,
)

print(f"SpMV timing by dtype (n={n}, density={density*100:.2f}%)")
print(f"\n{'dtype':<11} {'sparse_ms':<12} {'dense_ms':<11} {'speedup'}")

# (MLX dtype, label printed in the first column) pairs.
dtype_cases = [
    (mx.float32,  "float32 "),
    (mx.float16,  "float16 "),
    (mx.bfloat16, "bfloat16"),
]

for mlx_dtype, label in dtype_cases:
    # Cast only the stored values; the index arrays are passed through as-is.
    data = A_base.data.astype(mlx_dtype)
    csr_parts = (data, A_base.indices, A_base.indptr)
    Ai = ms.csr_array(
        csr_parts,
        shape=A_base.shape,
        sorted_indices=True,
        canonical=True,
    )

    # Dense reference in the same dtype, evaluated before timing starts.
    Di = Ai.todense().astype(mlx_dtype)
    mx.eval(Di)

    # Random vector drawn as float32, then cast to the dtype under test.
    host_vec = np.random.randn(n).astype(np.float32)
    xi = mx.array(host_vec).astype(mlx_dtype)

    def sparse_matvec():
        return Ai @ xi

    def dense_matvec():
        return Di @ xi

    t_sp = bench(sparse_matvec)
    t_dn = bench(dense_matvec)

    print(f"{label:<11} {t_sp:.3f} ms   {t_dn:.3f} ms  {t_dn/t_sp:.1f}x")

SpMV timing by dtype (n=8192, density=0.01%)

dtype       sparse_ms    dense_ms    speedup
float32     0.149 ms   3.922 ms  26.3x
float16     0.101 ms   6.763 ms  67.0x
bfloat16    0.102 ms   6.805 ms  66.7x