Tutorial: Use case with NGSolve#

This notebook shows how to use the GPU Krylov solvers in NGSolve’s ngscuda module:

  • DevCGSolver — symmetric positive-definite problems

  • DevTFQMRSolver — non-symmetric problems

GPU required: In Colab, go to Runtime → Change runtime type and select T4 GPU (free) or any GPU partition available for you.

Hide code cell content

import sys
if 'google.colab' in sys.modules:
    !pip install --upgrade --pre ngsolve anywidget -q

Open In Colab

Check if GPU is available#

import subprocess
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'],
                        capture_output=True, text=True)
if result.returncode == 0:
    print('GPU found', result.stdout.strip())
else:
    print('No GPU found')
import ngsolve
from ngsolve import *
from ngsolve import la
from ngsolve.solvers import TFQMR
from time import time

try:
    from ngsolve.ngscuda import *
except:
    print("no CUDA library or device available, using replacement types on host")

print("NGSolve version:", ngsolve.__version__)
!nvcc --version
!nvidia-smi
print('On CPU, numthreads =', ngsglobals.numthreads)

Part 1: Symmetric problem — Poisson equation#

\[-\Delta u + u = f \quad \text{in } \Omega, \qquad u = 0 \quad \text{on } \partial\Omega\]

The stiffness matrix is symmetric positive definite - example case for Conjugate Gradient.

This extends the setup from 5.5.1 Solving the Poisson Equation on devices with the new CUDA graph execution mode.

mesh = Mesh(unit_square.GenerateMesh(maxh=0.1))
for l in range(5): mesh.Refine()

fes = H1(mesh, order=2, dirichlet='.*')
u, v = fes.TnT()

with TaskManager():
    a = BilinearForm(grad(u)*grad(v)*dx + u*v*dx).Assemble()
    f = LinearForm(x*v*dx).Assemble()

gfu = GridFunction(fes)
jac = a.mat.CreateSmoother(fes.FreeDofs())

print(f'ndof = {fes.ndof}')

Solve on CPU for reference timing and solution#

with TaskManager():
    cpu_solver = CGSolver(a.mat, jac, maxiter=2000)
    gfu.vec.data = cpu_solver * f.vec   
    ts = time()
    for _ in range(5): gfu.vec.data = cpu_solver * f.vec
    cpu_ms = (time() - ts) / 5 * 1000

ref_vec = gfu.vec.CreateVector()
ref_vec.data = gfu.vec

print(f'CPU CG:  {cpu_ms:.0f} ms')
print(f'  |sol| = {Norm(gfu.vec):.6e}')
print(f'  steps = {cpu_solver.GetSteps()}')
adev   = a.mat.CreateDeviceMatrix()
jacdev = jac.CreateDeviceMatrix()
fdev   = f.vec.CreateDeviceVector(copy=True)

inv = CGSolver(adev, jacdev, maxiter=2000)
gfu.vec.data = (inv * fdev).Evaluate()   # warm-up
ts = time()
for _ in range(5): gfu.vec.data = (inv * fdev).Evaluate()
gpu_std_ms = (time() - ts) / 5 * 1000

print(f'GPU CG (no graph):  {gpu_std_ms:.0f} ms')
print(f'  diff    = {Norm(gfu.vec - ref_vec):.1e}')
print(f'  speedup = {cpu_ms/gpu_std_ms:.2f}x')

GPU acceleration#

gpu_solver = DevCGSolver(
    mat=adev, pre=jacdev,
    adev_raw=adev, cdev_raw=jacdev,
    maxsteps=2000, printrates=False
)

gfu.vec.data = (gpu_solver * fdev).Evaluate()  
ts = time()
for _ in range(5): gfu.vec.data = (gpu_solver * fdev).Evaluate()
gpu_ms = (time() - ts) / 5 * 1000

print(f'GPU CG (Conditional While Graph):  {gpu_ms:.0f} ms')
print(f'  diff    = {Norm(gfu.vec - ref_vec):.1e}')
print(f'  speedup = {cpu_ms/gpu_ms:.2f}x')

Scaling problem size#

print(f"{'ndof':>8}  {'CPU (ms)':>10}  {'C++ dev (ms)':>13}  {'Cuda Conditional While Graph (ms)':>17}  {'speedup':>8}")
print('-' * 68)

# for maxh in [0.05, 0.03, 0.01, 0.007, 0.005]:
for maxh in [0.05, 0.03, 0.01]:

    m = Mesh(unit_square.GenerateMesh(maxh=maxh))
    fs = H1(m, order=2, dirichlet='.*')
    uu, vv = fs.TnT()
    with TaskManager():
        aa = BilinearForm(grad(uu)*grad(vv)*dx + uu*vv*dx).Assemble()
        ff = LinearForm(x*vv*dx).Assemble()
    jj = aa.mat.CreateSmoother(fs.FreeDofs())
    gf = GridFunction(fs)

    with TaskManager():
        cs = CGSolver(aa.mat, jj, maxsteps=2000, precision=1e-12)
        gf.vec.data = cs * ff.vec
        ts = time()
        for _ in range(3): gf.vec.data = cs * ff.vec
        c_ms = (time() - ts) / 3 * 1000

    ad = aa.mat.CreateDeviceMatrix()
    jd = jj.CreateDeviceMatrix()
    fd = ff.vec.CreateDeviceVector(copy=True)

    ng = CGSolver(ad, jd, maxsteps=2000, precision=1e-12)
    gf.vec.data = (ng * fd).Evaluate()
    ts = time()
    for _ in range(3): gf.vec.data = (ng * fd).Evaluate()
    ng_ms = (time() - ts) / 3 * 1000

    gs = DevCGSolver(mat=ad, pre=jd, adev_raw=ad, cdev_raw=jd,
                     maxsteps=2000, precision=1e-12, printrates=False)
    gf.vec.data = (gs * fd).Evaluate()
    ts = time()
    for _ in range(3): gf.vec.data = (gs * fd).Evaluate()
    g_ms = (time() - ts) / 3 * 1000

    print(f'{fs.ndof:>8}  {c_ms:>10.0f}  {ng_ms:>13.0f}  {g_ms:>17.0f}  {ng_ms/g_ms:>7.2f}x')
H100 reference results — Part 1 (Musica, June 2026)

Single run, ndof = 460,033 (5 mesh refinements, maxh=0.1):

ndof

CPU CG (ms)

C++ dev (ms)

WHILE graph (ms)

speedup

1,969

176

7

3

2.12×

5,233

311

11

5

2.15×

46,749

1,353

36

17

2.15×

95,017

2,069

52

26

1.97×

185,857

5,086

83

49

1.68×


Part 2: Non-symmetric problem — 3D convection#

The same DG convection operator used in the NGSolve CUDA benchmark: a transport-dominated 3D problem on an unstructured mesh with upwind flux. The stiffness matrix is non-symmetric — CG does not apply.

TFQMR (Transpose-Free QMR) handles non-symmetric systems without storing a growing Krylov basis — also very relevant solver for Navier–Stokes equations.

mesh2 = Mesh(unit_cube.GenerateMesh(maxh=0.05))
fes2  = L2(mesh2, order=2, dgjumps=True)
u2, v2 = fes2.TnT()

wind = CF((1, 0.2, 0.3))
n    = specialcf.normal(mesh2.dim)
dS   = dx(element_boundary=True)

with TaskManager():
    a2 = BilinearForm(fes2)
    a2 += -20*u2*v2*dx
    a2 += u2*wind*grad(v2)*dx
    a2 += -(wind*n)*IfPos(wind*n, u2, u2.Other(bnd=0))*v2*dS
    a2.Assemble()

blocks = fes2.CreateSmoothingBlocks(blocktype='element')
pre2   = a2.mat.CreateBlockSmoother(blocks) 
pre2p  = Projector(fes2.FreeDofs(), True)
f2     = LinearForm(1*v2*dx).Assemble()
gfu2   = GridFunction(fes2)

print(f'ndof = {fes2.ndof}')

Solve on CPU for reference timing and solution#

with TaskManager():
    gfu2.vec.data = TFQMR(mat=pre2@a2.mat, pre=pre2p,
                           rhs=(pre2*f2.vec).Evaluate(),
                           maxsteps=400, printrates=False, tol=1e-12)
    ts = time()
    for _ in range(3):
        gfu2.vec.data = TFQMR(mat=pre2@a2.mat, pre=pre2p,
                               rhs=(pre2*f2.vec).Evaluate(),
                               maxsteps=400, printrates=False, tol=1e-12)
    cpu2_ms = (time() - ts) / 3 * 1000

ref_norm2 = Norm(gfu2.vec)
print(f'CPU TFQMR: {cpu2_ms:.0f} ms   |sol| = {ref_norm2:.6e}')

GPU acceleration#

fdev2    = f2.vec.CreateDeviceVector(copy=True)
adev2    = a2.mat.CreateDeviceMatrix()
pdev2    = pre2.CreateDeviceMatrix()
pdev2p   = pre2p.CreateDeviceMatrix()
rhs_dev2 = (pdev2 * fdev2).Evaluate()
pa_dev2  = pdev2 @ adev2

gfu2.vec.data = TFQMR(mat=pa_dev2, pre=pdev2p, rhs=rhs_dev2,
                       maxsteps=400, printrates=False, tol=1e-12)   
ts = time()
for _ in range(3):
    gfu2.vec.data = TFQMR(mat=pa_dev2, pre=pdev2p, rhs=rhs_dev2,
                           maxsteps=400, printrates=False, tol=1e-12)
gpu2_std_ms = (time() - ts) / 3 * 1000

print(f'Python TFQMR (device matrices):  {gpu2_std_ms:.0f} ms')
print(f'  diff    = {abs(Norm(gfu2.vec) - ref_norm2):.1e}')
print(f'  speedup = {cpu2_ms/gpu2_std_ms:.2f}x')

GPU acceleration Conditional While Graph#

tfqmr_solver = DevTFQMRSolver(
    mat=pre2 @ a2.mat, pre=pre2p,        # CPU operators 
    adev_raw=pa_dev2, cdev_raw=pdev2p,   # GPU operators
    maxsteps=400, printrates=False, precision=1e-12
)

tfqmr_solver.Mult(rhs_dev2, gfu2.vec)  
ts = time()
for _ in range(3): tfqmr_solver.Mult(rhs_dev2, gfu2.vec)
gpu2_ms = (time() - ts) / 3 * 1000

err2 = abs(Norm(gfu2.vec) - ref_norm2)
print(f'GPU TFQMR (Conditional While Graph):  {gpu2_ms:.0f} ms')
print(f'  diff    = {err2:.1e}')
print(f'  speedup = {cpu2_ms/gpu2_ms:.1f}x')

Scaling problem size#

print(f"{'ndof':>8}  {'CPU (ms)':>10}  {'no graph (ms)':>14}  {'Conditional While Graph (ms)':>25}  {'speedup':>8}")
print('-' * 66)

# for maxh in [0.2, 0.1, 0.07, 0.05]:
for maxh in [0.2, 0.1, 0.07]:

    m2 = Mesh(unit_cube.GenerateMesh(maxh=maxh))
    fs2 = L2(m2, order=2, dgjumps=True)
    uu2, vv2 = fs2.TnT()
    wind2 = CF((1, 0.2, 0.3))
    n2    = specialcf.normal(m2.dim)
    dS2   = dx(element_boundary=True)
    with TaskManager():
        aa2 = BilinearForm(fs2)
        aa2 += -20*uu2*vv2*dx
        aa2 += uu2*wind2*grad(vv2)*dx
        aa2 += -(wind2*n2)*IfPos(wind2*n2, uu2, uu2.Other(bnd=0))*vv2*dS2
        aa2.Assemble()
    blks2 = fs2.CreateSmoothingBlocks(blocktype='element')
    pp2   = aa2.mat.CreateBlockSmoother(blks2)
    pp2p  = Projector(fs2.FreeDofs(), True)
    ff2   = LinearForm(1*vv2*dx).Assemble()
    gg2   = GridFunction(fs2)

    with TaskManager():
        gg2.vec.data = TFQMR(mat=pp2@aa2.mat, pre=pp2p,
                              rhs=(pp2*ff2.vec).Evaluate(),
                              maxsteps=400, printrates=False, tol=1e-12)
        ts = time()
        for _ in range(3):
            gg2.vec.data = TFQMR(mat=pp2@aa2.mat, pre=pp2p,
                                  rhs=(pp2*ff2.vec).Evaluate(),
                                  maxsteps=400, printrates=False, tol=1e-12)
        c2_ms = (time() - ts) / 3 * 1000

    fd2  = ff2.vec.CreateDeviceVector(copy=True)
    ad2  = aa2.mat.CreateDeviceMatrix()
    pd2  = pp2.CreateDeviceMatrix()
    pd2p = pp2p.CreateDeviceMatrix()
    rd2  = (pd2 * fd2).Evaluate()
    pad2 = pd2 @ ad2

    gg2.vec.data = TFQMR(mat=pad2, pre=pd2p, rhs=rd2,
                          maxsteps=400, printrates=False, tol=1e-12)
    ts = time()
    for _ in range(3):
        gg2.vec.data = TFQMR(mat=pad2, pre=pd2p, rhs=rd2,
                              maxsteps=400, printrates=False, tol=1e-12)
    ng2_ms = (time() - ts) / 3 * 1000

    gs2 = DevTFQMRSolver(mat=pp2@aa2.mat, pre=pp2p,
                          adev_raw=pad2, cdev_raw=pd2p,
                          maxsteps=400, printrates=False, precision=1e-12)
    gs2.Mult(rd2, gg2.vec)
    ts = time()
    for _ in range(3): gs2.Mult(rd2, gg2.vec)
    g2_ms = (time() - ts) / 3 * 1000

    print(f'{fs2.ndof:>8}  {c2_ms:>10.0f}  {ng2_ms:>14.0f}  {g2_ms:>17.0f}  {ng2_ms/g2_ms:>7.2f}x')
H100 reference results — Part 2 (Musica, June 2026)

Single run, ndof = 340,370:

ndof

Python TFQMR (ms)

WHILE graph (ms)

speedup

6,520

5

2

2.78×

47,810

8

4

2.36×

149,650

15

8

1.94×

340,370

29

19

1.54×