Note
Go to the end to download the full example code
Benchmark of histogram routines¶
A direct comparison of the performance of the tttrlib
and the Numpy
histogram routines (Numpy version 1.13.3, Linux) demonstrates that, except for
arbitrarily spaced histograms, the tttrlib
histogram routines outperform
Numpy by at least a factor of 2 (1D log10 histograms and 2D histograms) or by a
factor of ~40 (1D linear histograms).
#.. plot:: ../examples/miscellaneous/histogram_benchmark.py
The current histogram implementation in tttrlib
is not particularly
optimized for speed, e.g., by making use of multiple cores. Nevertheless, in
special cases it outperforms Numpy. This comparison demonstrates that Numpy is
optimized for general use cases.
Note
As already stressed above, the histogram routines are (except for the
rarely used case of arbitrarily spaced histograms) primarily optimized for performance (with room for improvements). The routines are used internally.
For purposes other than the applications tested for in tttrlib,
other libraries, e.g.,
Boost histogram, are a better
choice.
Testing linear histograms
-------------------------
time(numpy) = 0.2195449167241653
time(tttrlib) = 0.00727986063187321
tttrlib speedup: 30.16
time(numpy) = 0.21970951867600283
time(tttrlib) = 0.05089626337091128
tttrlib speedup: 4.32
time(numpy) = 0.2040427263515691
time(tttrlib) = 0.32410164798299473
tttrlib speedup: 0.63
Testing 2D Histogram linear spacing
---------------------------------------
time(numpy) (ms) = 0.9090297808870673
time(tttrlib) (ms) = 0.16880432004109025
tttrlib speedup: 5.39
import tttrlib
import numpy as np
import pylab as p
import timeit
# ---------------------------------------------------------------------------
# 1D histogram on linearly spaced bins: numpy vs. tttrlib
# ---------------------------------------------------------------------------
print("\n\nTesting linear histograms")
print("-------------------------")

# Two million normally distributed samples, binned on a uniform grid of
# 32000 points between 0 and 20. Every sample carries a weight of one.
data = np.random.normal(10, 2, int(2e6))
bins = np.linspace(0, 20, 32000, dtype=np.float64)
hist = np.zeros_like(bins)
weights = np.ones(data.shape, dtype=data.dtype)

# Run the tttrlib routine once outside of the timed region.
tttrlib.histogram1D_double(data, weights, bins, hist, 'lin', True)


def _run_numpy_lin():
    """Histogram the current samples with numpy."""
    return np.histogram(data, bins=bins)


def _run_tttrlib_lin():
    """Histogram the current samples with tttrlib in 'lin' mode."""
    return tttrlib.histogram1D_double(data, weights, bins, hist, 'lin', True)


# timeit.timeit returns the total time of all repetitions, so the printed
# values are divided by the number of repetitions.
n_test_runs = 3
time_np_hist_lin = timeit.timeit(_run_numpy_lin, number=n_test_runs)
time_tttrlib_hist_lin = timeit.timeit(_run_tttrlib_lin, number=n_test_runs)

print("time(numpy) = %s" % (time_np_hist_lin / n_test_runs))
print("time(tttrlib) = %s" % (time_tttrlib_hist_lin / n_test_runs))
print("tttrlib speedup: %.2f" % (time_np_hist_lin / time_tttrlib_hist_lin))
# ---------------------------------------------------------------------------
# 1D histogram on logarithmically spaced bins: numpy vs. tttrlib
# ---------------------------------------------------------------------------
# 32000 bin positions spaced evenly in log10 between 10**0 and 10**3.5, and
# two million log-normally distributed samples with unit weights.
bins = np.logspace(0, 3.5, 32000, dtype=np.float64)
data = np.random.lognormal(3.0, 1, int(2e6))
hist = np.zeros(len(bins), dtype=np.float64)
weights = np.ones_like(data)

# Run the tttrlib routine once outside of the timed region. The axis mode
# must match the timed call below ('log10'); an empty mode string is what the
# arbitrary-spacing benchmark passes and would exercise a different code path.
tttrlib.histogram1D_double(data, weights, bins, hist, 'log10', True)

# timeit.timeit returns the total time of all repetitions, so the printed
# values are divided by the number of repetitions.
n_test_runs = 3
time_np_hist_log = timeit.timeit(
    lambda: np.histogram(data, bins=bins),
    number=n_test_runs
)
time_tttrlib_hist_log = timeit.timeit(
    lambda: tttrlib.histogram1D_double(data, weights, bins, hist, 'log10', True),
    number=n_test_runs
)
print("time(numpy) = %s" % (time_np_hist_log / n_test_runs))
print("time(tttrlib) = %s" % (time_tttrlib_hist_log / n_test_runs))
print("tttrlib speedup: %.2f" % (time_np_hist_log / time_tttrlib_hist_log))
# ---------------------------------------------------------------------------
# 1D histogram on arbitrarily spaced bins: numpy vs. tttrlib
# ---------------------------------------------------------------------------
# The bin grid is a linear segment (1 .. 600) followed by a logarithmic
# segment that starts just above the last linear value and ends at 10**3.
# The samples in `data` are whatever is currently defined at this point of
# the script; no new samples are drawn here.
bins1 = np.linspace(1, 600, 16000, dtype=np.float64)
log_start = np.log10(bins1[-1] + 0.1)
bins2 = np.logspace(log_start, 3.0, 16000, dtype=np.float64)
bins = np.concatenate((bins1, bins2))
hist = np.zeros_like(bins)
weights = np.ones(data.shape, dtype=data.dtype)

# Run the tttrlib routine once outside of the timed region; the empty mode
# string is the same one used in the timed call.
tttrlib.histogram1D_double(data, weights, bins, hist, '', True)


def _run_numpy_arb():
    """Histogram the current samples with numpy."""
    return np.histogram(data, bins=bins)


def _run_tttrlib_arb():
    """Histogram the current samples with tttrlib using an empty mode string."""
    return tttrlib.histogram1D_double(data, weights, bins, hist, '', True)


# timeit.timeit returns the total time of all repetitions, so the printed
# values are divided by the number of repetitions.
n_test_runs = 3
time_np_hist_arb = timeit.timeit(_run_numpy_arb, number=n_test_runs)
time_tttrlib_hist_arb = timeit.timeit(_run_tttrlib_arb, number=n_test_runs)

print("time(numpy) = %s" % (time_np_hist_arb / n_test_runs))
print("time(tttrlib) = %s" % (time_tttrlib_hist_arb / n_test_runs))
print("tttrlib speedup: %.2f" % (time_np_hist_arb / time_tttrlib_hist_arb))
def histogram2d(data, bins):
    """Histogram two-dimensional samples with tttrlib on a square grid.

    Both axes of a ``tttrlib.doubleHistogram`` are configured identically:
    mode ``'lin'`` with the values ``-3``, ``3`` and ``bins`` (presumably the
    lower bound, upper bound and number of bins -- confirm against the
    tttrlib API).

    :param data: array whose transpose is passed to the histogram's
        ``update``; the benchmark in this script passes a ``(2, N)`` array.
    :param bins: integer used for both axes and for the output shape.
    :return: the result of ``get_histogram()`` reshaped to ``(bins, bins)``.
    """
    histogram = tttrlib.doubleHistogram()
    for axis_index, axis_name in enumerate(("x", "y")):
        histogram.set_axis(axis_index, axis_name, -3, 3, bins, 'lin')
    histogram.update(data.T)
    counts = histogram.get_histogram()
    return counts.reshape((bins, bins))
# ---------------------------------------------------------------------------
# 2D histogram on linearly spaced bins: numpy vs. tttrlib
# ---------------------------------------------------------------------------
print("\n\nTesting 2D Histogram linear spacing")
print("---------------------------------------")

# 5000 points of a 2D normal distribution; the y coordinate is scaled by 0.2.
# `data` stacks both coordinates as rows, giving a (2, 5000) array.
x = np.random.randn(5000)
y = 0.2 * np.random.randn(5000)
data = np.stack((x, y), axis=0)
bins = 100

# Run the tttrlib wrapper once outside of the timed region.
hist2d = histogram2d(data, bins)


def _run_numpy_2d():
    """Histogram the current points with numpy."""
    return np.histogram2d(x, y, bins=bins)


def _run_tttrlib_2d():
    """Histogram the current points with the tttrlib wrapper."""
    return histogram2d(data, bins)


# A single call is fast, so many repetitions are timed. timeit.timeit returns
# the total in seconds; the printed values are per call in milliseconds.
n_test_runs = 2000
time_np_2dhist_lin = timeit.timeit(_run_numpy_2d, number=n_test_runs)
time_tttrlib_2dhist_lin = timeit.timeit(_run_tttrlib_2d, number=n_test_runs)

print("time(numpy) (ms) = %s" % (time_np_2dhist_lin / n_test_runs * 1000.0))
print("time(tttrlib) (ms) = %s" % (time_tttrlib_2dhist_lin / n_test_runs * 1000.0))
print("tttrlib speedup: %.2f" % (time_np_2dhist_lin / time_tttrlib_2dhist_lin))
# ---------------------------------------------------------------------------
# Summary bar chart: time per call for every benchmark, numpy next to tttrlib
# ---------------------------------------------------------------------------
N = 4
width = 0.35

# Totals as returned by timeit.timeit: seconds summed over all repetitions.
time_tttrlib = (
    time_tttrlib_hist_lin,
    time_tttrlib_hist_log,
    time_tttrlib_hist_arb,
    time_tttrlib_2dhist_lin
)
time_numpy = (
    time_np_hist_lin,
    time_np_hist_log,
    time_np_hist_arb,
    time_np_2dhist_lin
)

# The totals cannot be compared directly because the benchmarks use different
# repetition counts: each 1D benchmark times 3 repetitions, and n_test_runs
# still holds the repetition count of the 2D benchmark, which ran last.
# Convert every total into milliseconds per call so that the values match the
# axis label.
n_runs = np.array([3, 3, 3, n_test_runs], dtype=np.float64)
time_numpy_ms = np.asarray(time_numpy, dtype=np.float64) / n_runs * 1000.0
time_tttrlib_ms = np.asarray(time_tttrlib, dtype=np.float64) / n_runs * 1000.0

ind = np.arange(N)  # the x locations for the groups
labels = ('1D lin', '1D log', '1D arb.', '2D lin hist')

fig, ax = p.subplots()
rects1 = ax.bar(ind, time_numpy_ms, width, color='y')
rects2 = ax.bar(ind + width, time_tttrlib_ms, width, color='r')

# add some text for labels, title and axes ticks
ax.set_ylabel('time per call / ms')
ax.set_title('Histogram performance')
# Per-call times of the 1D and 2D benchmarks differ by orders of magnitude;
# a logarithmic axis keeps all bars visible.
ax.set_yscale('log')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(labels)
# Use the first bar of each series as the legend handle.
ax.legend((rects1[0], rects2[0]), ('Numpy', 'tttrlib'))
p.show()
Total running time of the script: (0 minutes 6.264 seconds)