# %pip install numpy scipy numba line_profiler
# multiprocessing & progress
import joblib
import tqdm
import numpy as np
import scipy.stats as st
import numba
# plotting
import matplotlib.pyplot as plt
figsize = (5,4)
# profiler
%load_ext line_profiler
Increasing Performance
This chapter covers
- optimization for performance
At the example of the Gillespie simulation algorithm, we will discuss ways to increase performance of an implementation.
We start with the usual imports:
For convenience you can find the previous section’s implementation in src/gillespie.py. Let us import from there and define the example CRN and initial conditions.
from src.gillespie import CRN
species = ["mRNA", "protein"]
r_vec = np.array([
[0, 0], # DNA -> DNA + mRNA
[1, 0], # mRNA -> 0
[1, 0], # mRNA -> mRNA + protein
[0, 1], # protein -> 0
], dtype=int)
state_update_vec: np.ndarray = np.array(
[
[ 1, 0], # DNA -> DNA + mRNA
[-1, 0], # mRNA -> 0
[ 0, 1], # mRNA -> mRNA + protein
[ 0, -1], # protein -> 0
],
dtype=int,
)
rate_constants = np.array([0.5, 0.1, 0.2, 0.05], dtype=float)
example_crn: CRN = {
"species": species,
"r_vec": r_vec,
"state_update_vec": state_update_vec,
"rate_constants": rate_constants,
}
Profiling
We use %lprun to profile line by line. We use the -f flag to specify which function to profile linewise.
from src.gillespie import gillespie_ssa
# seed random number generator for reproducibility
np.random.seed(42)
# simulation parameters
time_points = np.linspace(0, 200, 101) # output times
initial_state = np.array([0, 0], dtype=int) # initial counts
%lprun \
-T lp_results.txt \
-f gillespie_ssa \
gillespie_ssa(example_crn, initial_state, time_points)
*** Profile printout saved to text file 'lp_results.txt'.
Timer unit: 1e-09 s
Total time: 0.010123 s
File: /Users/mfuegger/Github/Biodisco/computational_bioengineering_tuwien2026/03-modeling/src/gillespie.py
Function: gillespie_ssa at line 65
Line # Hits Time Per Hit % Time Line Contents
==============================================================
65 def gillespie_ssa(
66 crn: CRN,
67 initial_state: np.ndarray,
68 output_times: np.ndarray,
69 ) -> np.ndarray:
70 1 0.0 0.0 0.0 update = crn["state_update_vec"]
71
72 # allocate arrays
73 1 4000.0 4000.0 0.0 output_states = np.empty((len(output_times), update.shape[1]), dtype=int)
74 1 2000.0 2000.0 0.0 propensities = np.zeros(update.shape[0])
75
76 1 0.0 0.0 0.0 i_time: int = 1
77 1 0.0 0.0 0.0 i: int = 0
78
79 # initial time, state, and output
80 1 1000.0 1000.0 0.0 t = output_times[0]
81 1 3000.0 3000.0 0.0 state = initial_state.copy()
82 1 0.0 0.0 0.0 state_prev = state.copy()
83 1 1000.0 1000.0 0.0 output_states[0, :] = state
84
85 # event loop
86 100 20000.0 200.0 0.2 while i < len(output_times):
87 655 172000.0 262.6 1.7 while t < output_times[i_time]:
88 # 2 random draws
89 556 8532000.0 15345.3 84.3 event, dt = gillespie_draw(state, propensities, crn)
90
91 # update
92 556 208000.0 374.1 2.1 state_prev = state.copy()
93 556 307000.0 552.2 3.0 state += update[event, :]
94 556 132000.0 237.4 1.3 t += dt
95
96 # passed an output time
97 99 633000.0 6393.9 6.3 i = np.sum(output_times <= t)
98 99 93000.0 939.4 0.9 output_states[i_time : min(i, len(output_times))] = state_prev
99 99 15000.0 151.5 0.1 i_time = i
100 1 0.0 0.0 0.0 return output_states
About 80% of the time is spent doing draws. Let us see how we can improve our draw speed by profiling the gillespie_draw function.
from src.gillespie import gillespie_draw
propensities_test = np.zeros(len(example_crn["rate_constants"]))
%lprun \
-T lp_results.txt \
-f gillespie_draw \
[gillespie_draw(initial_state, propensities_test, example_crn) \
for _ in range(10_000)]
*** Profile printout saved to text file 'lp_results.txt'.
Timer unit: 1e-09 s
Total time: 0.12609 s
File: /Users/mfuegger/Github/Biodisco/computational_bioengineering_tuwien2026/03-modeling/src/gillespie.py
Function: gillespie_draw at line 53
Line # Hits Time Per Hit % Time Line Contents
==============================================================
53 def gillespie_draw(
54 state: np.ndarray,
55 propensities: np.ndarray,
56 crn: CRN,
57 ) -> tuple[int, float]:
58 10000 77052000.0 7705.2 61.1 update_propensity(propensities, state, crn["r_vec"], crn["rate_constants"])
59 10000 13667000.0 1366.7 10.8 props_sum = propensities.sum()
60 10000 7317000.0 731.7 5.8 delta = np.random.exponential(1.0 / props_sum)
61 10000 25198000.0 2519.8 20.0 rxn = sample_discrete(propensities / props_sum)
62 10000 2856000.0 285.6 2.3 return rxn, delta
Clearly, update_propensity is taking the most time. We will focus on speeding that up.
Just-in-time compilation
A significant speed boost is achieved by just-in-time compilation using Numba. To utilize this feature, you need to just-in-time compile (JIT) your propensity function. You can insist that everything is compiled (and therefore skip the comparably slow Python interpreter) by using the @numba.njit decorator. In many cases, that is all you have to do.
Speed boost by JIT compilation with Numba
Numba is a package that does LLVM optimized just-in-time compilation of Python code. The speed-ups can be substantial. We will use numba to compile the parts of the code that we can. For many functions, we just need to decorate the function with
@numba.jit()
If possible, we can use the nopython=True keyword argument to get more speed because the compiler can assume that all of the code is JIT-able. Using
@numba.njit
is equivalent to using @numba.jit(nopython=True).
First, recall that we used the following numpy implementation.
def update_propensity(
propensities: np.ndarray,
state: np.ndarray,
r_vec: np.ndarray,
rate_constants: np.ndarray) -> None:
propensities[:] = rate_constants * (
np.where(r_vec >= 1, state, 1) * np.where(r_vec == 2, state - 1, 1)
).prod(axis=1)
The implementation is already making good use of vectorization and the natively compiled numpy library, but the number of reactions is probably too small (in our case 4) for vectorization to pay off. We will thus step back to a Python implementation and let the numba compilation do its job. We also include the pure Python function for comparison with both.
# pure-Python reference implementation (baseline for the numba comparison)
def update_propensity_python(
    propensities: np.ndarray,
    state: np.ndarray,
    r_vec: np.ndarray,
    rate_constants: np.ndarray) -> None:
    """Overwrite ``propensities`` in place with each reaction's mass-action propensity.

    A reactant with stoichiometric coefficient r contributes a factor
    ``state`` when r >= 1 and an additional factor ``state - 1`` when
    r >= 2; the rate constant enters multiplicatively.  Returns nothing.
    """
    n_reactions = len(propensities)
    n_species = len(state)
    for rxn in range(n_reactions):
        acc = rate_constants[rxn]
        for sp in range(n_species):
            order = r_vec[rxn, sp]
            if order >= 1:
                acc *= state[sp]
            if order >= 2:
                acc *= state[sp] - 1
        propensities[rxn] = acc
# jit of pure Python
@numba.njit
def update_propensity_numba(
propensities: np.ndarray,
state: np.ndarray,
r_vec: np.ndarray,
rate_constants: np.ndarray) -> None:
for i in range(len(propensities)):
p = rate_constants[i]
for s in range(len(state)):
r = r_vec[i, s]
if r >= 1:
p *= state[s]
if r >= 2:
p *= state[s] - 1
propensities[i] = p
from src.gillespie import update_propensity
propensities_test = np.zeros(len(example_crn["rate_constants"]))
state_test = np.array([10, 100], dtype=int)
print('numpy:')
%timeit \
update_propensity(propensities_test, state_test, example_crn["r_vec"], example_crn["rate_constants"])
print('Python:')
%timeit \
update_propensity_python(propensities_test, state_test, example_crn["r_vec"], example_crn["rate_constants"])
print('numba:')
%timeit \
update_propensity_numba(propensities_test, state_test, example_crn["r_vec"], example_crn["rate_constants"])
numpy:
4.91 μs ± 72.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
Python:
1.79 μs ± 31 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
numba:
335 ns ± 3 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
We got a clear speed-up. Now let us also speed up the sum and the discrete sampling.
@numba.njit
def sum_numba(ar):
    """JIT-compiled sum of an array.

    Avoids np.sum's Python-level dispatch overhead; the measured win is
    size-dependent (numba optimizes sums of certain sizes, see below).
    """
    return ar.sum()
# Make dummy array for testing
ar = np.array([0.3, 0.4, 0.3, 0.2, 0.15])
print('numpy:')
%timeit ar.sum()
print('numba:')
%timeit sum_numba(ar)
numpy:
610 ns ± 2.59 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
numba:
119 ns ± 0.638 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
We get another speed boost, though we should note that this speed boost in the sum is due to numba optimizing sums of a certain size.
Finally, we will speed up the sampling of the discrete distribution. We will do this in two ways. First, we notice that the division operation on the propensities took a fair amount of time when we profiled the code. We do not need it; we can instead sample from an unnormalized discrete distribution. Secondly, we can use numba to accelerate the while loop in the sampling.
from src.gillespie import sample_discrete
@numba.njit
def sample_discrete_numba(probs, probs_sum):
    """Draw an index with probability proportional to ``probs``.

    ``probs`` need not be normalized; ``probs_sum`` is its precomputed
    total, passed in so callers can reuse it and skip the normalizing
    division.  Returns the sampled index.
    """
    # uniform draw scaled to the total weight
    q: float = np.random.rand() * probs_sum
    i: int = 0
    p_sum: float = 0.0
    # walk the cumulative sum until it first exceeds q
    while p_sum < q:
        p_sum += probs[i]
        i += 1
    # NOTE(review): if q == 0.0 (np.random.rand() can return 0.0) the loop
    # body never runs and this returns -1 — confirm that is acceptable.
    return i - 1
# Make dummy unnormalized probs
probs = np.array([0.1, 0.3, 0.4, 0.05, 0.15, 0.6])
probs_sum = sum_numba(probs)
print('Python:')
%timeit sample_discrete(probs)
print("numba:")
%timeit sample_discrete_numba(probs, probs_sum)
Python:
531 ns ± 1.18 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
numba:
153 ns ± 0.198 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
We get a speed-up of about a factor of three. Let’s now make a new gillespie_draw() function that is faster by using all the previous fast functions as well as adding @numba.njit to the draw itself.
@numba.njit
def gillespie_draw_numba(
    state: np.ndarray,
    propensities: np.ndarray,
    r_vec: np.ndarray,
    rate_constants: np.ndarray) -> tuple[int, float]:
    """Draw the next reaction and its waiting time for the current state.

    Mutates ``propensities`` in place (scratch buffer reused across calls).
    Returns (reaction index, exponentially distributed time increment).
    """
    # use fastest one:
    update_propensity_numba(propensities, state, r_vec, rate_constants)
    # use fastest one:
    props_sum = sum_numba(propensities)
    # reuse props_sum here and in next function call
    delta = np.random.exponential(1.0 / props_sum)
    # use fastest one:
    rxn = sample_discrete_numba(propensities, props_sum)
    return rxn, delta
propensities_test = np.zeros(len(example_crn["rate_constants"]))
state_test = np.array([10, 100], dtype=int)
print('gillespie_draw:')
%timeit \
gillespie_draw(state_test, propensities_test, example_crn)
print('gillespie_draw_numba:')
%timeit \
gillespie_draw_numba(state_test, propensities_test, example_crn["r_vec"], example_crn["rate_constants"])
gillespie_draw:
7.45 μs ± 18.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
gillespie_draw_numba:
409 ns ± 1.44 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Let’s adjust our SSA function to include the numba version and again decorate the whole function.
@numba.njit
def gillespie_ssa_numba(
crn: CRN,
initial_state: np.ndarray,
output_times: np.ndarray,
) -> np.ndarray:
update = crn["state_update_vec"]
# allocate arrays
output_states = np.empty((len(output_times), update.shape[1]), dtype=int)
propensities = np.zeros(update.shape[0])
r_vec = example_crn["r_vec"]
rate_constants = example_crn["rate_constants"]
i_time: int = 1
i: int = 0
# initial time, state, and output
t = output_times[0]
state = initial_state.copy()
state_prev = state.copy()
output_states[0, :] = state
# event loop
while i < len(output_times):
while t < output_times[i_time]:
# 2 random draws
event, dt = gillespie_draw_numba(state, propensities, r_vec, rate_constants)
# update
state_prev = state.copy()
state += update[event, :]
t += dt
# passed an output time
i = np.sum(output_times <= t)
output_states[i_time : min(i, len(output_times))] = state_prev
i_time = i
return output_states
Now we can test the speed of the two SSAs.
print('gillespie_ssa:')
%timeit gillespie_ssa(example_crn, initial_state, time_points)
print('gillespie_ssa_numba:')
%timeit gillespie_ssa_numba(example_crn, initial_state, time_points)
gillespie_ssa:
4.87 ms ± 87.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
gillespie_ssa_numba:
--------------------------------------------------------------------------- TypingError Traceback (most recent call last) Cell In[12], line 5 1 print('gillespie_ssa:') 2 get_ipython().run_line_magic('timeit', 'gillespie_ssa(example_crn, initial_state, time_points)') 3 4 print('gillespie_ssa_numba:') ----> 5 get_ipython().run_line_magic('timeit', 'gillespie_ssa_numba(example_crn, initial_state, time_points)') File <magic-timeit>:1, in inner(_it, _timer) ----> 1 'Could not get source, probably due dynamically evaluated source code.' File ~/Github/Biodisco/computational_bioengineering_tuwien2026/venv/lib/python3.12/site-packages/numba/core/dispatcher.py:424, in _DispatcherBase._compile_for_args(self, *args, **kws) 420 msg = (f"{str(e).rstrip()} \n\nThis error may have been caused " 421 f"by the following argument(s):\n{args_str}\n") 422 e.patch_message(msg) --> 424 error_rewrite(e, 'typing') 425 except errors.UnsupportedError as e: 426 # Something unsupported is present in the user code, add help info 427 error_rewrite(e, 'unsupported_error') File ~/Github/Biodisco/computational_bioengineering_tuwien2026/venv/lib/python3.12/site-packages/numba/core/dispatcher.py:365, in _DispatcherBase._compile_for_args.<locals>.error_rewrite(e, issue_type) 363 raise e 364 else: --> 365 raise e.with_traceback(None) TypingError: Failed in nopython mode pipeline (step: nopython frontend) Untyped global name 'example_crn': Cannot determine Numba type of <class 'dict'> File "../../../../../../var/folders/hd/g16tvgfd6pddg2qvrkrypykr0000gp/T/ipykernel_63246/603545015.py", line 13: <source missing, REPL/exec in use?> During: Pass nopython_type_inference This error may have been caused by the following argument(s): - argument 0: Cannot determine Numba type of <class 'dict'> This error may have been caused by the following argument(s): - argument 0: Cannot determine Numba type of <class 'dict'>
Oh - this results in an error for our intended new function.
We cannot simply use it on the function gillespie_ssa_numba: Numba cannot type Python dicts, so passing crn: CRN directly fails at compile time (as shown above). The fix is an inner/outer split: a @numba.njit-decorated _gillespie_ssa_numba_inner receives the individual arrays, while the thin outer gillespie_ssa_numba unpacks the CRN dict and delegates.
# here decoration works
@numba.njit
def _gillespie_ssa_numba_inner(
    r_vec: np.ndarray,
    state_update_vec: np.ndarray,
    rate_constants: np.ndarray,
    population_0: np.ndarray,
    time_points: np.ndarray) -> np.ndarray:
    """JIT-compiled SSA core; takes plain arrays because numba cannot
    type the CRN dict.

    Returns a (len(time_points), n_species) int64 array holding the state
    at each output time.  Assumes ``time_points`` is increasing and that
    time_points[0] is the initial time — TODO confirm with callers.
    """
    pop_out = np.empty((len(time_points), state_update_vec.shape[1]), dtype=np.int64)
    i_time = 1  # next output slot to fill
    i = 0       # number of output times already passed by the simulation
    t = time_points[0]
    state = population_0.copy()
    pop_out[0, :] = state
    # scratch buffer reused by every draw to avoid per-step allocation
    propensities = np.zeros(r_vec.shape[0])
    while i < len(time_points):
        # fire events until we step past the next output time
        while t < time_points[i_time]:
            event, dt = gillespie_draw_numba(
                state, propensities, r_vec, rate_constants
            )
            state_prev = state.copy()  # state just before the jump past t
            state += state_update_vec[event, :]
            t += dt
        # first index with time_points[i] > t; searchsorted on the 0/1 mask
        # is the numba-friendly equivalent of np.sum(time_points <= t)
        i = np.searchsorted((time_points > t).astype(np.int64), 1)
        # every output time inside the last jump interval sees the pre-jump state
        for j in range(i_time, min(i, len(time_points))):
            pop_out[j, :] = state_prev
        i_time = i
    return pop_out
# function that just unpacks
def gillespie_ssa_numba(
crn: CRN,
population_0: np.ndarray,
time_points: np.ndarray) -> np.ndarray:
return _gillespie_ssa_numba_inner(
crn["r_vec"], crn["state_update_vec"], crn["rate_constants"],
population_0, time_points,
)
Let us try again.
print('gillespie_ssa:')
%timeit gillespie_ssa(example_crn, initial_state, time_points)
print('gillespie_ssa_numba:')
%timeit gillespie_ssa_numba(example_crn, initial_state, time_points)
gillespie_ssa:
4.78 ms ± 92.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
gillespie_ssa_numba:
67.7 μs ± 7.15 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
The speed-up is significant and Numba clearly paid off here.
Parallel Gillespie simulations
Sampling by the Gillespie algorithm is trivially parallelizable.
We first try the joblib module to parallelize the computation. It can be used as an easy drop-in replacement for for-loops. We also tell it to use threads instead of processes in order to decrease overhead in argument passing (it would otherwise pickle them).
def gillespie_parallel(
crn: CRN,
population_0: np.ndarray,
time_points: np.ndarray,
n_simulations: int,
n_jobs: int) -> np.ndarray:
populations = joblib.Parallel(n_jobs=n_jobs, prefer="threads")(
joblib.delayed(gillespie_ssa_numba)(crn, population_0, time_points)
for _ in range(n_simulations)
)
return np.array(populations)
We are paying some overhead in setting up the parallelization. Let’s time it to see how we do with parallelization.
n_simulations = 1000
time_points = np.linspace(0, 200, 101)
print('sequential:')
%timeit [gillespie_ssa_numba(example_crn, initial_state, time_points) for _ in range(n_simulations)]
cores = joblib.cpu_count()
print(f'parallel ({cores-1} cores):')
%timeit gillespie_parallel(example_crn, initial_state, time_points, n_simulations, cores-1)
sequential:
67.8 ms ± 384 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
parallel (7 cores):
90.7 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Unfortunately, the result is disappointing: the overhead is larger than the win.
A fix is to batch all simulations inside a single @numba.njit(parallel=True) function using numba.prange. Each iteration of the parallel loop runs one full simulation. Numba maps these onto OS threads internally, so that there is no Python-level scheduling overhead and the famous Python GIL is irrelevant.
@numba.njit(parallel=True)
def _gillespie_ssa_batch_inner(
    r_vec: np.ndarray,
    state_update_vec: np.ndarray,
    rate_constants: np.ndarray,
    population_0: np.ndarray,
    time_points: np.ndarray,
    n: int) -> np.ndarray:
    """Run ``n`` independent SSA trajectories, one per prange iteration.

    Returns an (n, len(time_points), n_species) int64 array.
    NOTE(review): numba threads use per-thread RNG streams, so results are
    presumably not reproducible via np.random.seed alone — confirm if
    reproducibility across runs is required.
    """
    # allocate memory
    out = np.empty((n, len(time_points), state_update_vec.shape[1]), dtype=np.int64)
    # run in parallel threads
    for i in numba.prange(n):
        out[i] = _gillespie_ssa_numba_inner(
            r_vec, state_update_vec, rate_constants, population_0, time_points,
        )
    return out
# thin wrapper: numba cannot type the CRN dict, so unpack it here
def gillespie_ssa_batch_numba(
    crn: CRN,
    population_0: np.ndarray,
    time_points: np.ndarray,
    n: int) -> np.ndarray:
    """Run n parallel SSA trajectories for ``crn`` and stack the results."""
    reactant_orders = crn["r_vec"]
    updates = crn["state_update_vec"]
    rates = crn["rate_constants"]
    return _gillespie_ssa_batch_inner(
        reactant_orders, updates, rates, population_0, time_points, n
    )
print(f'numba prange ({cores - 1} threads):')
%timeit gillespie_ssa_batch_numba(example_crn, initial_state, time_points, n_simulations)
numba prange (7 threads):
15.3 ms ± 735 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
The result is a clear improvement.
Approximations: Tau-leaping
We can further speed-up the computational time of the algorithms by approximating the solution.
One such method is called Tau-leaping and the interested reader is referred to take a look at this method.
License & Attribution: This page is from material by Michael Elowitz and Justin Bois (© 2021–2025), licensed under CC BY-NC-SA 4.0, with minor modifications.