Tutorial: Use case with NGSolve#
This notebook shows how to use the GPU Krylov solvers in NGSolve’s ngscuda module:
DevCGSolver— symmetric positive-definite problemsDevTFQMRSolver— non-symmetric problems
GPU required: In Colab, go to Runtime → Change runtime type and select T4 GPU (free) or any GPU partition available for you.
Check if GPU is available#
import subprocess
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'],
capture_output=True, text=True)
if result.returncode == 0:
print('GPU found', result.stdout.strip())
else:
print('No GPU found')
import ngsolve
from ngsolve import *
from ngsolve import la
from ngsolve.solvers import TFQMR
from time import time
try:
from ngsolve.ngscuda import *
except:
print("no CUDA library or device available, using replacement types on host")
print("NGSolve version:", ngsolve.__version__)
!nvcc --version
!nvidia-smi
print('On CPU, numthreads =', ngsglobals.numthreads)
Part 1: Symmetric problem — Poisson equation#
The stiffness matrix is symmetric positive definite - example case for Conjugate Gradient.
This extends the setup from 5.5.1 Solving the Poisson Equation on devices with the new CUDA graph execution mode.
mesh = Mesh(unit_square.GenerateMesh(maxh=0.1))
for l in range(5): mesh.Refine()
fes = H1(mesh, order=2, dirichlet='.*')
u, v = fes.TnT()
with TaskManager():
a = BilinearForm(grad(u)*grad(v)*dx + u*v*dx).Assemble()
f = LinearForm(x*v*dx).Assemble()
gfu = GridFunction(fes)
jac = a.mat.CreateSmoother(fes.FreeDofs())
print(f'ndof = {fes.ndof}')
Solve on CPU for reference timing and solution#
with TaskManager():
cpu_solver = CGSolver(a.mat, jac, maxiter=2000)
gfu.vec.data = cpu_solver * f.vec
ts = time()
for _ in range(5): gfu.vec.data = cpu_solver * f.vec
cpu_ms = (time() - ts) / 5 * 1000
ref_vec = gfu.vec.CreateVector()
ref_vec.data = gfu.vec
print(f'CPU CG: {cpu_ms:.0f} ms')
print(f' |sol| = {Norm(gfu.vec):.6e}')
print(f' steps = {cpu_solver.GetSteps()}')
adev = a.mat.CreateDeviceMatrix()
jacdev = jac.CreateDeviceMatrix()
fdev = f.vec.CreateDeviceVector(copy=True)
inv = CGSolver(adev, jacdev, maxiter=2000)
gfu.vec.data = (inv * fdev).Evaluate() # warm-up
ts = time()
for _ in range(5): gfu.vec.data = (inv * fdev).Evaluate()
gpu_std_ms = (time() - ts) / 5 * 1000
print(f'GPU CG (no graph): {gpu_std_ms:.0f} ms')
print(f' diff = {Norm(gfu.vec - ref_vec):.1e}')
print(f' speedup = {cpu_ms/gpu_std_ms:.2f}x')
GPU acceleration#
gpu_solver = DevCGSolver(
mat=adev, pre=jacdev,
adev_raw=adev, cdev_raw=jacdev,
maxsteps=2000, printrates=False
)
gfu.vec.data = (gpu_solver * fdev).Evaluate()
ts = time()
for _ in range(5): gfu.vec.data = (gpu_solver * fdev).Evaluate()
gpu_ms = (time() - ts) / 5 * 1000
print(f'GPU CG (Conditional While Graph): {gpu_ms:.0f} ms')
print(f' diff = {Norm(gfu.vec - ref_vec):.1e}')
print(f' speedup = {cpu_ms/gpu_ms:.2f}x')
Scaling problem size#
print(f"{'ndof':>8} {'CPU (ms)':>10} {'C++ dev (ms)':>13} {'Cuda Conditional While Graph (ms)':>17} {'speedup':>8}")
print('-' * 68)
# for maxh in [0.05, 0.03, 0.01, 0.007, 0.005]:
for maxh in [0.05, 0.03, 0.01]:
m = Mesh(unit_square.GenerateMesh(maxh=maxh))
fs = H1(m, order=2, dirichlet='.*')
uu, vv = fs.TnT()
with TaskManager():
aa = BilinearForm(grad(uu)*grad(vv)*dx + uu*vv*dx).Assemble()
ff = LinearForm(x*vv*dx).Assemble()
jj = aa.mat.CreateSmoother(fs.FreeDofs())
gf = GridFunction(fs)
with TaskManager():
cs = CGSolver(aa.mat, jj, maxsteps=2000, precision=1e-12)
gf.vec.data = cs * ff.vec
ts = time()
for _ in range(3): gf.vec.data = cs * ff.vec
c_ms = (time() - ts) / 3 * 1000
ad = aa.mat.CreateDeviceMatrix()
jd = jj.CreateDeviceMatrix()
fd = ff.vec.CreateDeviceVector(copy=True)
ng = CGSolver(ad, jd, maxsteps=2000, precision=1e-12)
gf.vec.data = (ng * fd).Evaluate()
ts = time()
for _ in range(3): gf.vec.data = (ng * fd).Evaluate()
ng_ms = (time() - ts) / 3 * 1000
gs = DevCGSolver(mat=ad, pre=jd, adev_raw=ad, cdev_raw=jd,
maxsteps=2000, precision=1e-12, printrates=False)
gf.vec.data = (gs * fd).Evaluate()
ts = time()
for _ in range(3): gf.vec.data = (gs * fd).Evaluate()
g_ms = (time() - ts) / 3 * 1000
print(f'{fs.ndof:>8} {c_ms:>10.0f} {ng_ms:>13.0f} {g_ms:>17.0f} {ng_ms/g_ms:>7.2f}x')
H100 reference results — Part 1 (Musica, June 2026)
Single run, ndof = 460,033 (5 mesh refinements, maxh=0.1):
ndof |
CPU CG (ms) |
C++ dev (ms) |
WHILE graph (ms) |
speedup |
|---|---|---|---|---|
1,969 |
176 |
7 |
3 |
2.12× |
5,233 |
311 |
11 |
5 |
2.15× |
46,749 |
1,353 |
36 |
17 |
2.15× |
95,017 |
2,069 |
52 |
26 |
1.97× |
185,857 |
5,086 |
83 |
49 |
1.68× |
Part 2: Non-symmetric problem — 3D convection#
The same DG convection operator used in the NGSolve CUDA benchmark: a transport-dominated 3D problem on an unstructured mesh with upwind flux. The stiffness matrix is non-symmetric — CG does not apply.
TFQMR (Transpose-Free QMR) handles non-symmetric systems without storing a growing Krylov basis — also very relevant solver for Navier–Stokes equations.
mesh2 = Mesh(unit_cube.GenerateMesh(maxh=0.05))
fes2 = L2(mesh2, order=2, dgjumps=True)
u2, v2 = fes2.TnT()
wind = CF((1, 0.2, 0.3))
n = specialcf.normal(mesh2.dim)
dS = dx(element_boundary=True)
with TaskManager():
a2 = BilinearForm(fes2)
a2 += -20*u2*v2*dx
a2 += u2*wind*grad(v2)*dx
a2 += -(wind*n)*IfPos(wind*n, u2, u2.Other(bnd=0))*v2*dS
a2.Assemble()
blocks = fes2.CreateSmoothingBlocks(blocktype='element')
pre2 = a2.mat.CreateBlockSmoother(blocks)
pre2p = Projector(fes2.FreeDofs(), True)
f2 = LinearForm(1*v2*dx).Assemble()
gfu2 = GridFunction(fes2)
print(f'ndof = {fes2.ndof}')
Solve on CPU for reference timing and solution#
with TaskManager():
gfu2.vec.data = TFQMR(mat=pre2@a2.mat, pre=pre2p,
rhs=(pre2*f2.vec).Evaluate(),
maxsteps=400, printrates=False, tol=1e-12)
ts = time()
for _ in range(3):
gfu2.vec.data = TFQMR(mat=pre2@a2.mat, pre=pre2p,
rhs=(pre2*f2.vec).Evaluate(),
maxsteps=400, printrates=False, tol=1e-12)
cpu2_ms = (time() - ts) / 3 * 1000
ref_norm2 = Norm(gfu2.vec)
print(f'CPU TFQMR: {cpu2_ms:.0f} ms |sol| = {ref_norm2:.6e}')
GPU acceleration#
fdev2 = f2.vec.CreateDeviceVector(copy=True)
adev2 = a2.mat.CreateDeviceMatrix()
pdev2 = pre2.CreateDeviceMatrix()
pdev2p = pre2p.CreateDeviceMatrix()
rhs_dev2 = (pdev2 * fdev2).Evaluate()
pa_dev2 = pdev2 @ adev2
gfu2.vec.data = TFQMR(mat=pa_dev2, pre=pdev2p, rhs=rhs_dev2,
maxsteps=400, printrates=False, tol=1e-12)
ts = time()
for _ in range(3):
gfu2.vec.data = TFQMR(mat=pa_dev2, pre=pdev2p, rhs=rhs_dev2,
maxsteps=400, printrates=False, tol=1e-12)
gpu2_std_ms = (time() - ts) / 3 * 1000
print(f'Python TFQMR (device matrices): {gpu2_std_ms:.0f} ms')
print(f' diff = {abs(Norm(gfu2.vec) - ref_norm2):.1e}')
print(f' speedup = {cpu2_ms/gpu2_std_ms:.2f}x')
GPU acceleration Conditional While Graph#
tfqmr_solver = DevTFQMRSolver(
mat=pre2 @ a2.mat, pre=pre2p, # CPU operators
adev_raw=pa_dev2, cdev_raw=pdev2p, # GPU operators
maxsteps=400, printrates=False, precision=1e-12
)
tfqmr_solver.Mult(rhs_dev2, gfu2.vec)
ts = time()
for _ in range(3): tfqmr_solver.Mult(rhs_dev2, gfu2.vec)
gpu2_ms = (time() - ts) / 3 * 1000
err2 = abs(Norm(gfu2.vec) - ref_norm2)
print(f'GPU TFQMR (Conditional While Graph): {gpu2_ms:.0f} ms')
print(f' diff = {err2:.1e}')
print(f' speedup = {cpu2_ms/gpu2_ms:.1f}x')
Scaling problem size#
print(f"{'ndof':>8} {'CPU (ms)':>10} {'no graph (ms)':>14} {'Conditional While Graph (ms)':>25} {'speedup':>8}")
print('-' * 66)
# for maxh in [0.2, 0.1, 0.07, 0.05]:
for maxh in [0.2, 0.1, 0.07]:
m2 = Mesh(unit_cube.GenerateMesh(maxh=maxh))
fs2 = L2(m2, order=2, dgjumps=True)
uu2, vv2 = fs2.TnT()
wind2 = CF((1, 0.2, 0.3))
n2 = specialcf.normal(m2.dim)
dS2 = dx(element_boundary=True)
with TaskManager():
aa2 = BilinearForm(fs2)
aa2 += -20*uu2*vv2*dx
aa2 += uu2*wind2*grad(vv2)*dx
aa2 += -(wind2*n2)*IfPos(wind2*n2, uu2, uu2.Other(bnd=0))*vv2*dS2
aa2.Assemble()
blks2 = fs2.CreateSmoothingBlocks(blocktype='element')
pp2 = aa2.mat.CreateBlockSmoother(blks2)
pp2p = Projector(fs2.FreeDofs(), True)
ff2 = LinearForm(1*vv2*dx).Assemble()
gg2 = GridFunction(fs2)
with TaskManager():
gg2.vec.data = TFQMR(mat=pp2@aa2.mat, pre=pp2p,
rhs=(pp2*ff2.vec).Evaluate(),
maxsteps=400, printrates=False, tol=1e-12)
ts = time()
for _ in range(3):
gg2.vec.data = TFQMR(mat=pp2@aa2.mat, pre=pp2p,
rhs=(pp2*ff2.vec).Evaluate(),
maxsteps=400, printrates=False, tol=1e-12)
c2_ms = (time() - ts) / 3 * 1000
fd2 = ff2.vec.CreateDeviceVector(copy=True)
ad2 = aa2.mat.CreateDeviceMatrix()
pd2 = pp2.CreateDeviceMatrix()
pd2p = pp2p.CreateDeviceMatrix()
rd2 = (pd2 * fd2).Evaluate()
pad2 = pd2 @ ad2
gg2.vec.data = TFQMR(mat=pad2, pre=pd2p, rhs=rd2,
maxsteps=400, printrates=False, tol=1e-12)
ts = time()
for _ in range(3):
gg2.vec.data = TFQMR(mat=pad2, pre=pd2p, rhs=rd2,
maxsteps=400, printrates=False, tol=1e-12)
ng2_ms = (time() - ts) / 3 * 1000
gs2 = DevTFQMRSolver(mat=pp2@aa2.mat, pre=pp2p,
adev_raw=pad2, cdev_raw=pd2p,
maxsteps=400, printrates=False, precision=1e-12)
gs2.Mult(rd2, gg2.vec)
ts = time()
for _ in range(3): gs2.Mult(rd2, gg2.vec)
g2_ms = (time() - ts) / 3 * 1000
print(f'{fs2.ndof:>8} {c2_ms:>10.0f} {ng2_ms:>14.0f} {g2_ms:>17.0f} {ng2_ms/g2_ms:>7.2f}x')
H100 reference results — Part 2 (Musica, June 2026)
Single run, ndof = 340,370:
ndof |
Python TFQMR (ms) |
WHILE graph (ms) |
speedup |
|---|---|---|---|
6,520 |
5 |
2 |
2.78× |
47,810 |
8 |
4 |
2.36× |
149,650 |
15 |
8 |
1.94× |
340,370 |
29 |
19 |
1.54× |