Two types
Independent calculations
Node 1: $a+b$
Node 2: $x/y$
Everything between trivial and not possible
Can parallelize: $a + b + c + d + e$
Can not parallelize: $f(f(f(f(x_0))))$
$a + b + c + d + e + f + g + h = y$
$2 + 3 + 5 + 1 + 7 + 6 + 8 + 4 = y$
$2 + 3 + 5 + 1 + 7 + 6 + 8 + 4 = y$
$2 + 3 + 5 + 1 + 7 + 6 + 8 + 4 = 36$
This is called a Binomial tree
Not quite $n$ speedup, rather
Any associative binary operator $T$ works!
$$ (x T y) T z = x T (y T z) \;\forall\; x,y,z $$We can also sometimes cheat
$y = f(f(f(f(x_0))))$
A model of your program execution time
$$ T = \sum_{i=1}^{N} T_i $$Parallelization speedup $s_i(n)$
$n$ is core/node count
So what is speedup?
This is what is called Amdahl's law: total speedup $S$ is
$$ S(n) = \frac{1}{1 - p + \frac{p}{s(n)}} $$What if problem size scales?
This is what is called Gustafson's law: total speedup $S$ is
$$ S(n) = 1 + (n - 1) p $$So how to use this?
(all scripts for plots and methods in repository)
Practical example
Practical example
Practical example
Data!
Best trade-off: 700 000 particles at 500 cores ~20 hours
Languages
Python
Multi-lingual
HPC frameworks
More stuff on:
trevor-vincent/awesome-high-performance-computing
No highlighting yet so lets go to: chapel-lang.org
Single node - same interpreter (and GIL) - shared memory
from threading import Thread
def worker(index):
print(f"Worker doing {index}")
th = []
for ind in range(6):
t = Thread(target=worker, args=(ind,))
th.append(t)
t.start()
for t in th:
t.join()
Single node - multiple cores - shared memory possible
from multiprocessing import Process
def worker(index):
print(f"Worker doing {index}")
ps = []
for ind in range(6):
p = Process(target=worker, args=(ind,))
ps.append(p)
p.start()
for p in ps:
p.join()
Single node - multiple cores - shared memory possible
from multiprocessing import Pool
def f(x):
return x * x
with Pool(5) as p:
par_computed_map = p.map(f, range(100))
print(par_computed_map)
Single node - multiple cores - shared memory possible
import multiprocessing as mp
import numpy as np
import time
def worker(q, r, pid):
num = 0
while True:
data = q.get()
if data is None:
print("Stopping worker now...")
break
print(f"{pid=} doing task={num}")
for ind in range(1_000):
res = np.mean(data)
r.put(res)
q.task_done()
num += 1
tasks = mp.JoinableQueue()
results = mp.Queue()
# create workers
ps = []
for i in range(6):
p = mp.Process(target=worker, args=(tasks, results, i))
ps.append(p)
p.start()
task_n = 500
t0 = time.time()
# queue tasks
for i in range(task_n):
tasks.put(np.random.randn(10_000) + i**2)
tasks.join() # complete tasks
# get results
means = np.empty(task_n)
for i in range(task_n):
means[i] = results.get()
t1 = time.time() - t0
print(f"EXECUTION: {t1:.4f}")
# stop workers
for i in range(len(ps)):
tasks.put(None)
for p in ps:
p.join()
Launches other programs - single node - multiple cores
import subprocess
addrs = "192.167.0.{:d}"
prog = "ping "
opts = "-c 1 "
processes = []
for ind in range(256):
cmd = " ".join(["ping", f"192.167.0.{ind:d}", "-c", "1"])
print(f"starting '{cmd}'")
p = subprocess.Popen(cmd, cwd="/", shell=True)
processes.append(p)
for p in processes:
p.wait()
Multi-node / local / multi-core - inter-process communication
pacman -S openmpipip install mpi4pympirun -np 6 python code/python-scripts/mpi_hello.py
#!/usr/bin/env python
from mpi4py import MPI
comm = MPI.COMM_WORLD
name = MPI.Get_processor_name()
print(f"hello! name: {name}, my rank is {comm.rank}")
Multi-node / local / multi-core - inter-process communication
#!/usr/bin/env python
import time
from mpi4py import MPI
import numpy as np
comm = MPI.COMM_WORLD
t0_all = time.time()
print(f"rank-{comm.rank}/{comm.size} starting")
if comm.rank == 0:
items = [np.random.random(1000) + x for x in range(10000)]
else:
items = None
items = comm.bcast(items, root=0)
t0 = time.time()
for ind in range(comm.rank, len(items), comm.size):
items[ind] = items[ind] - np.mean(items[ind])
dt = time.time() - t0
if comm.rank == 0:
for rank in range(1, comm.size):
for ind in range(rank, len(items), comm.size):
items[ind] = comm.recv(source=rank, tag=ind)
else:
for ind in range(comm.rank, len(items), comm.size):
comm.send(items[ind], dest=0, tag=ind)
if comm.rank == 0:
su = 0
for dat in items:
su += np.mean(dat)
dt_all = time.time() - t0_all
print(f"rank-{comm.rank}/{comm.size}: work time {dt} s (total {dt_all} s)")
What about this Global Interpreter Lock (GIL)?
import time
from threading import Thread
from multiprocessing import Pool
iter_num = 60000000
def counter(num):
while num != 0:
num -= 1
t1 = Thread(target=counter, args=(iter_num // 2,))
t2 = Thread(target=counter, args=(iter_num // 2,))
start = time.time()
t1.start()
t2.start()
t1.join()
t2.join()
end = time.time()
print("threading exec time -", end - start)
start = time.time()
counter(iter_num)
end = time.time()
print("linear exec time -", end - start)
pool = Pool(2)
start = time.time()
r1 = pool.apply_async(counter, [iter_num // 2])
r2 = pool.apply_async(counter, [iter_num // 2])
pool.close()
pool.join()
end = time.time()
print("multiprocessing exec time -", end - start)
Wikimedia:User:Cburnett CC-BY-SA-3.0
Wikimedia:User:Cburnett CC-BY-SA-3.0
Wikimedia:User:Cburnett CC-BY-SA-3.0
You can also do it in software! For example ZFS
Special file systems for HPC's
What they do:
distribute large amounts of data over
distributed storage and makes sure your code and data is available at the
correct compute node
Special managers for HPC's
What they do:
Makes sure everyone gets their job
run when the cluster is available and with the correct setup
You need to learn your system! How do you submit jobs, how do you make batches, is MPI supported, which catalogues are distributed, etc.
HPC2N (High Performance Computing Center North)
A system by which a programming language can call functions from a library/package/compiled binary from another programming language
For example: pretty much every language has to have a C FFI
(blog post in
handout)
Python developers want to implement the open a file
on disk function... how do they do that?
Lets look at the Linux System Calls Manual
open(2) System Calls Manual open(2)
NAME
open, openat, creat - open and possibly create a file
LIBRARY
Standard C library (libc, -lc)
SYNOPSIS
#include
int open(const char *pathname, int flags, ...
/* mode_t mode */ );
int creat(const char *pathname, mode_t mode);
int openat(int dirfd, const char *pathname, int flags, ...
/* mode_t mode */ );
/* Documented separately, in openat2(2): */
int openat2(int dirfd, const char *pathname,
const struct open_how *how, size_t size);
DESCRIPTION
The open() system call opens the file specified by pathname. If
the specified file does not exist, it may optionally (if O_CREAT is
specified in flags) be created by open().
The return value of open() is a file descriptor, a small, nonnega‐
tive integer that is an index to an entry in the process's table of
open file descriptors. The file descriptor is used in subsequent
system calls (read(2), write(2), lseek(2), fcntl(2), etc.) to re‐
fer to the open file. The file descriptor returned by a successful
call will be the lowest-numbered file descriptor not currently open
for the process.
...
open(2) System Calls Manual open(2)
NAME
open, openat, creat - open and possibly create a file
LIBRARY
Standard C library (libc, -lc)
SYNOPSIS
#include
int open(const char *pathname, int flags, ...
/* mode_t mode */ );
int creat(const char *pathname, mode_t mode);
int openat(int dirfd, const char *pathname, int flags, ...
/* mode_t mode */ );
/* Documented separately, in openat2(2): */
int openat2(int dirfd, const char *pathname,
const struct open_how *how, size_t size);
DESCRIPTION
The open() system call opens the file specified by pathname. If
the specified file does not exist, it may optionally (if O_CREAT is
specified in flags) be created by open().
The return value of open() is a file descriptor, a small, nonnega‐
tive integer that is an index to an entry in the process's table of
open file descriptors. The file descriptor is used in subsequent
system calls (read(2), write(2), lseek(2), fcntl(2), etc.) to re‐
fer to the open file. The file descriptor returned by a successful
call will be the lowest-numbered file descriptor not currently open
for the process.
...
The language is Python not C - so cannot include
Either code and maintain a direct Linux kernel interface
(setting
correct registers ect)
... or
Somehow call the open symbol from libc
Somehow call the open symbol from libc
int func(int a, int b) {
return a + b;
}
int main(void)
{
return func(1, 2);
}
_Z4funcii:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi
mov DWORD PTR [rbp-8], esi
mov edx, DWORD PTR [rbp-4]
mov eax, DWORD PTR [rbp-8]
add eax, edx
pop rbp
ret
main:
push rbp
mov rbp, rsp
mov esi, 2
mov edi, 1
call _Z4funcii
nop
pop rbp
ret
int func(int a, int b) {
return a + b;
}
int main(void)
{
return func(1, 2);
}
_Z4funcii:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi
mov DWORD PTR [rbp-8], esi
mov edx, DWORD PTR [rbp-4]
mov eax, DWORD PTR [rbp-8]
add eax, edx
pop rbp
ret
main:
push rbp
mov rbp, rsp
mov esi, 2
mov edi, 1
call _Z4funcii
nop
pop rbp
ret
Somehow call the open symbol from libc
int func(int a, int b) {
return a + b;
}
int main(void)
{
return func(1, 2);
}
_Z4funcii:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi
mov DWORD PTR [rbp-8], esi
mov edx, DWORD PTR [rbp-4]
mov eax, DWORD PTR [rbp-8]
add eax, edx
pop rbp
ret
main:
push rbp
mov rbp, rsp
mov esi, 2
mov edi, 1
call _Z4funcii
nop
pop rbp
ret
If you want to know more, have a look at the handout
For now: be glad all this is behind the Python implementation and...
For now: be glad all this is behind the Python implementation and...
# The standard Python c-FFI!
import ctypes
Now - if anything can talk C - we can talk to it!
package main
import "core:c"
import "core:fmt"
import "base:runtime"
@export
logistic :: proc "c" (r, x: f64) -> f64 {
return r * x * (1 - x)
}
void twice(double* in_array, double* out_array, int length) {
int i;
for(i = 0; i<length; i++) {
out_array[i] = in_array[i]*2;
}
}
gcc main.c -fPIC -shared -o main.so
void twice(double* in_array, double* out_array, int length) {
int i;
for(i = 0; i<length; i++) {
out_array[i] = in_array[i]*2;
}
}
import ctypes
import numpy as np
import numpy.typing as npt
import numpy.ctypeslib as npct
lib = ctypes.cdll.LoadLibrary("./main.so")
# Define the data types we need
ro_f8_vec = npct.ndpointer(np.float64, ndim=1, flags="aligned, contiguous")
rw_f8_vec = npct.ndpointer(np.float64, ndim=1, flags="aligned, contiguous, writeable")
# Define the C-interface
lib.twice.restype = None
lib.twice.argtypes = [
ro_f8_vec,
rw_f8_vec,
ctypes.c_int,
]
def twice(values: npt.NDArray[np.float64]):
result = np.empty_like(values)
length = ctypes.c_int(len(values))
lib.twice(values, result, length)
return result
if __name__ == "__main__":
x = np.arange(10, dtype=np.float64)
print("running c-twice")
y = twice(x)
print(f"INPUT : {x}")
print(f"OUTPUT: {y}")
Let's say I wanted something like
import bar
print("bar content: ", dir(bar))
print(bar.hello("Daniel"))
But without ctypes while interfacing to a complicated C/C++ library
#define PY_SSIZE_T_CLEAN
#include
#include
static PyObject* say_hello(PyObject *self, PyObject *args)
{
const char *name;
if (!PyArg_ParseTuple(args, "s", &name))
{
return NULL;
}
size_t n = strlen(name);
size_t response_len = 7 + n + 1;
char *response = (char *)malloc(response_len);
if (!response) {
return PyErr_NoMemory();
}
strcpy(response, "Hello, ");
strcat(response, name);
PyObject *ret = Py_BuildValue("s", response);
free(response);
return ret;
}
/* Table of what is in this modeule*/
static PyMethodDef BarMethods[] =
{
{"hello", say_hello, METH_VARARGS,
"Greets you!"},
{NULL, NULL, 0, NULL} /* Sentinel */
};
/* Definition of the module */
static struct PyModuleDef barmodule = {
PyModuleDef_HEAD_INIT,
"bar", /* name of module */
NULL, /* module documentation, may be NULL */
-1, /* size of per-interpreter state of the module,
or -1 if the module keeps state in global variables. */
BarMethods
};
/* This actually creates the module using the module definition */
PyMODINIT_FUNC PyInit_bar(void)
{
return PyModule_Create(&barmodule);
}
This can be directly compiled versus Python
CFLAGS=$(shell python-config --cflags) -fPIC -shared
INCLUDES=$(shell python-config --includes)
build:
gcc $(INCLUDES) $(CFLAGS) bar.c -o bar.so
def calc(vec, mat, repeat):
n = len(vec)
m = len(mat)
ret = [0] * m
tmp = [0] * m
for row in range(m):
ret[row] = vec[row]
for ind in range(repeat):
for row in range(m):
val = 0
for col in range(n):
val += ret[col] * mat[row][col]
tmp[row] = val
for row in range(m):
ret[row] = tmp[row]
return ret
def calc(vec, mat, repeat):
n = len(vec)
m = len(mat)
ret = [0] * m
tmp = [0] * m
for row in range(m):
ret[row] = vec[row]
for ind in range(repeat):
for row in range(m):
val = 0
for col in range(n):
val += ret[col] * mat[row][col]
tmp[row] = val
for row in range(m):
ret[row] = tmp[row]
return ret
def calc(vec, mat, repeat):
cdef int n, m
cdef float val
n = len(vec)
m = len(mat)
ret = [0] * m
tmp = [0] * m
for row in range(m):
ret[row] = vec[row]
for ind in range(repeat):
for row in range(m):
val = 0
for col in range(n):
val += ret[col] * mat[row][col]
tmp[row] = val
for row in range(m):
ret[row] = tmp[row]
return ret
def calc(vec, mat, repeat):
n = len(vec)
m = len(mat)
ret = [0] * m
tmp = [0] * m
for row in range(m):
ret[row] = vec[row]
for ind in range(repeat):
for row in range(m):
val = 0
for col in range(n):
val += ret[col] * mat[row][col]
tmp[row] = val
for row in range(m):
ret[row] = tmp[row]
return ret
py_calc.py, compiled_py_calc.py, cy_calc.pyx
from setuptools import setup
from Cython.Build import cythonize
sources = [
"compiled_py_calc.py",
"cy_calc.pyx",
]
setup(ext_modules=cythonize(sources))
subroutine fib(a,n)
integer, intent(in) :: n
integer, intent(out), dimension(1:n) :: a
do i=1,n
if (i.eq.1) then
a(i) = 0
elseif (i.eq.2) then
a(i) = 1
else
a(i) = a(i-1) + a(i-2)
endif
enddo
end
subroutine fib(a,n)
integer, intent(in) :: n
integer, intent(out), dimension(1:n) :: a
do i=1,n
if (i.eq.1) then
a(i) = 0
elseif (i.eq.2) then
a(i) = 1
else
a(i) = a(i-1) + a(i-2)
endif
enddo
end
sudo pacman -S gcc-fortran meson
python -m numpy.f2py -c fib.f90 -m fiblib
python -m numpy.f2py -c fib.f90 -m fiblib
creates fiblib.cpython-313-x86_64-linux-gnu.so
import fiblib
print(fiblib.fib.__doc__)
n = fiblib.fib(10)
print(f"{n=}")
Let's check out the handout!