Dtypes and device execution#

mlx-sparse supports four value dtypes and two index dtypes across both CPU and Metal GPU.

Value dtype	Metal GPU
`float32`	Supported
`float16`	Supported
`bfloat16`	Supported
`complex64`	Supported

Index dtype	Notes
`int32`	Default. Handles matrices up to ~2 billion non-zeros.
`int64`	For very large matrices. Use only when required.

Device selection is global: `ms.use_gpu()` routes all subsequent operations to the Metal GPU, and `ms.use_cpu()` routes them to the CPU. The selection stays in effect until the other function is called.

import mlx.core as mx
import numpy as np
import mlx_sparse as ms

All value dtypes on GPU#

# Route every following mlx-sparse operation to the Metal GPU.
device = ms.use_gpu()
print(f"Device: {device}")

# A single float32 CSR matrix and dense vector are the starting point; each
# loop iteration below re-casts them to the dtype under test.
rng = np.random.default_rng(42)
A_base = ms.random.rand(
    128,
    128,
    density=0.05,
    format="csr",
    dtype=mx.float32,
    rng=42,
    index_dtype=mx.int32,
)
x_np = rng.standard_normal(128).astype(np.float32)

# (value dtype, fixed-width label used to align the printed table)
dtype_labels = [
    (mx.float32,  "float32  "),
    (mx.float16,  "float16  "),
    (mx.bfloat16, "bfloat16 "),
    (mx.complex64, "complex64"),
]

print("\nSpMV results:")
for mlx_dtype, label in dtype_labels:
    is_complex = mlx_dtype == mx.complex64

    # Matrix values are cast on the MLX side for every dtype; the dense
    # vector is cast in NumPy for complex64 and in MLX otherwise.
    data = A_base.data.astype(mlx_dtype)
    if is_complex:
        x = mx.array(x_np.astype(np.complex64))
    else:
        x = mx.array(x_np).astype(mlx_dtype)

    # Rebuild the matrix around the re-typed values, reusing the index buffers.
    A_typed = ms.csr_array(
        (data, A_base.indices, A_base.indptr),
        shape=A_base.shape,
        sorted_indices=True,
        canonical=True,
    )
    y = A_typed @ x

    # Cast the result to float32 (or keep complex64) before handing it to
    # NumPy for display; `y` itself keeps the dtype under test.
    sample = y.astype(mx.complex64 if is_complex else mx.float32)
    mx.eval(y, sample)

    # Report the entry with the largest magnitude.
    sample_np = np.array(sample)
    peak_idx = int(np.argmax(np.abs(sample_np)))
    peak = sample_np[peak_idx]
    value = f"{complex(peak):.4f}" if is_complex else f"{float(peak):.4f}"
    print(f"  {label}  A.data.dtype={A_typed.data.dtype}  "
          f"y.dtype={y.dtype}  y[{peak_idx}]={value}")

Device: Device(gpu, 0)

SpMV results:
  float32    A.data.dtype=mlx.core.float32  y.dtype=mlx.core.float32  y[43]=6.4706
  float16    A.data.dtype=mlx.core.float16  y.dtype=mlx.core.float16  y[43]=6.4688
  bfloat16   A.data.dtype=mlx.core.bfloat16  y.dtype=mlx.core.bfloat16  y[43]=6.4688
  complex64  A.data.dtype=mlx.core.complex64  y.dtype=mlx.core.complex64  y[43]=6.4706+0.0000j

Index dtypes: int32 vs int64#

# The base matrix is reused as-is for the int32 case.
A_i32 = A_base

# Same values and sparsity structure, with both index buffers cast to int64.
indices_i64 = A_base.indices.astype(mx.int64)
indptr_i64 = A_base.indptr.astype(mx.int64)
A_i64 = ms.csr_array(
    (A_base.data, indices_i64, indptr_i64),
    shape=A_base.shape,
    sorted_indices=True,
    canonical=True,
)

print("int32 indices:", A_i32)
print("int64 indices:", A_i64)

# Multiply both variants by the same float32 vector and evaluate together.
x_f32 = mx.array(x_np)
y32 = A_i32 @ x_f32
y64 = A_i64 @ x_f32
mx.eval(y32, y64)

# Convert once to NumPy, then compare the two results.
y32_np = np.array(y32)
y64_np = np.array(y64)
match = np.allclose(y32_np, y64_np)
print(f"\nint32 y[0]={float(y32_np[0]):.4f}  "
      f"int64 y[0]={float(y64_np[0]):.4f}  match={match}")

int32 indices: CSRArray(shape=(128, 128), nnz=819, dtype=mlx.core.float32, index_dtype=mlx.core.int32, sorted_indices=True, has_canonical_format=True)
int64 indices: CSRArray(shape=(128, 128), nnz=819, dtype=mlx.core.float32, index_dtype=mlx.core.int64, sorted_indices=True, has_canonical_format=True)

int32 y[0]=0.0000  int64 y[0]=0.0000  match=True

CPU vs GPU: same results, different device#

The CSRArray buffers are MLX arrays and are device-agnostic: MLX arrays live in unified memory, which both the CPU and the GPU can access. Switching device only changes where the kernel runs. No data is copied.

# Run the same product on the CPU ...
ms.use_cpu()
y_cpu = A_i32 @ x_f32
mx.eval(y_cpu)

# ... and again on the GPU, using the very same matrix and vector objects.
ms.use_gpu()
y_gpu = A_i32 @ x_f32
mx.eval(y_gpu)

# Convert each result to NumPy once, then show the first entries and the
# largest absolute difference between the two devices.
cpu_np = np.array(y_cpu)
gpu_np = np.array(y_gpu)
max_diff = np.max(np.abs(cpu_np - gpu_np))

print("CPU result y[0:4]:", cpu_np[:4].round(4))
print("GPU result y[0:4]:", gpu_np[:4].round(4))
print(f"max diff CPU vs GPU: {max_diff:.2e}")

CPU result y[0:4]: [ 0.      0.2457 -1.2481  0.5915]
GPU result y[0:4]: [ 0.      0.2457 -1.2481  0.5915]
max diff CPU vs GPU: 0.00e+00

Dtype mismatch error#

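The matrix values and the dense vector must share the same dtype. The product below does not cast either operand implicitly: a mismatch raises a clear error instead. To fix it, cast one side explicitly (see "Casting between dtypes").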

ms.use_gpu()
x_f16 = x_f32.astype(mx.float16)

try:
    # A is float32, x is float16. MLX is lazily evaluated, so the result is
    # also forced with mx.eval inside the try in case the dtype check only
    # fires when the computation actually runs.
    y_mismatch = A_i32 @ x_f16
    mx.eval(y_mismatch)
except (TypeError, ValueError) as e:
    print(f"Caught expected error: {str(e)[:60]}...")
else:
    # Make a missing error visible instead of finishing silently.
    print("No error raised: the dtype mismatch was not detected.")

Caught expected error: csr_matvec requires sparse data and RHS to have the same dty...

Casting between dtypes#

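Use `A.data.astype(...)` to cast the matrix values, then rebuild the matrix with `ms.csr_array`, reusing the original `indices` and `indptr` buffers. Cast the dense operand to the same dtype before multiplying.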

# Only the values are cast; the index buffers are passed through unchanged.
data_f16 = A_i32.data.astype(mx.float16)
A_f16 = ms.csr_array(
    (data_f16, A_i32.indices, A_i32.indptr),
    shape=A_i32.shape,
    sorted_indices=True,
    canonical=True,
)
print(f"float32 -> float16 cast: A_f16.data.dtype = {A_f16.data.dtype}")

# The dense vector is cast to float16 as well so both operands match.
x_f16_matched = x_f32.astype(mx.float16)
y_f16 = A_f16 @ x_f16_matched
mx.eval(y_f16)

# Compare against the float32 GPU result (`y_gpu`), widening the float16
# output to float32 on the NumPy side before subtracting.
reference = np.array(y_gpu)
half_result = np.array(y_f16).astype(np.float32)
err = np.max(np.abs(reference - half_result))
print(f"max error after cast: {err:.2e}")

float32 -> float16 cast: A_f16.data.dtype = mlx.core.float16
max error after cast: 2.22e-03

Complex64: Hermitian transpose#

For complex64 matrices the .H property returns the conjugate transpose. Multiplying Aᴴ @ A gives a Hermitian (self-adjoint) matrix.

n = 8
n_entries = 16  # number of COO (row, col, value) triplets drawn

# Draw order matters for reproducibility: real parts, imaginary parts,
# row indices, then column indices.
rng2 = np.random.default_rng(1)
real_part = rng2.standard_normal(n_entries)
imag_part = rng2.standard_normal(n_entries)
data_c = (real_part + 1j * imag_part).astype(np.complex64)
rows_c = rng2.integers(0, n, n_entries, dtype=np.int32)
cols_c = rng2.integers(0, n, n_entries, dtype=np.int32)

# Assemble in COO form, then convert to canonical CSR.
# NOTE(review): presumably repeated (row, col) pairs are merged by
# canonical=True, so nnz can be smaller than n_entries; confirm.
coo_c = ms.coo_array(
    (mx.array(data_c), (mx.array(rows_c), mx.array(cols_c))),
    shape=(n, n),
)
A_c = coo_c.tocsr(canonical=True)
print("A_c:", A_c)

A_c_H = A_c.H
print("A_c.H shape:", A_c_H.shape)

# AᴴA should equal its own conjugate transpose, i.e. be Hermitian.
AHA = ms.csr_matmat(A_c_H, A_c)
dense_AHA = np.array(AHA.todense())
is_hermitian = np.allclose(dense_AHA, dense_AHA.conj().T, atol=1e-5)
print(f"\nAᴴA is Hermitian: {is_hermitian}")

A_c: CSRArray(shape=(8, 8), nnz=13, dtype=mlx.core.complex64, index_dtype=mlx.core.int32, sorted_indices=True, has_canonical_format=True)
A_c.H shape: (8, 8)

AᴴA is Hermitian: True