Benchmarks
Speed and memory comparisons.
[2]:
import time
import numpy as np
from spector import indices, matrix, vector
def timed(func, *args):
    """Return elapsed wall-clock seconds of calling ``func(*args)``.

    Uses ``time.perf_counter`` rather than ``time.time``: it is monotonic
    and has the highest available resolution, which matters for the short
    micro-benchmarks in this notebook.
    """
    start = time.perf_counter()
    _ = func(*args)  # noqa
    return time.perf_counter() - start
def dok(size):
    """Build a dense dictionary-of-keys: every (row, col) pair maps to 1.0."""
    cells = ((row, col) for row in range(size) for col in range(size))
    return dict.fromkeys(cells, 1.0)
def vecs(size):
    """Build a spector matrix with `size` rows, each row the same dense vector 0..size-1."""
    arr = np.arange(size)  # np.arange is the idiomatic (and faster) form of np.array(range(size))
    return matrix((i, vector(arr)) for i in range(size))
# Benchmark fixtures: 2**18 consecutive integer keys and matching unit weights.
keys = np.arange(2 ** 18)  # np.arange avoids materializing a Python range via np.array
values = np.ones(len(keys))
indices vs. set
[3]:
# construction from an array: spector indices vs. builtin set
elapsed_indices = timed(indices, keys)
elapsed_set = timed(set, keys)
elapsed_indices / elapsed_set
[3]:
0.9866720014226016
[4]:
# conversion back to an array: indices -> np.array vs. np.fromiter
to_array = timed(np.array, indices(keys))
from_iter = timed(np.fromiter, keys, keys.dtype, len(keys))
to_array / from_iter
[4]:
0.11233916473858548
[5]:
# set difference: indices.__sub__ vs. set.__sub__
spector_sub = timed(indices(keys).__sub__, indices(keys))
builtin_sub = timed(set(keys).__sub__, set(keys))
spector_sub / builtin_sub
[5]:
0.9267794301282779
vector vs. dict
[6]:
# construction from key/value arrays: vector vs. dict(zip(...))
from_arrays = timed(vector, keys, values)
from_pairs = timed(dict, zip(keys, values))
from_arrays / from_pairs
[6]:
0.4839097874550885
[7]:
vec = vector(keys, values)
d = dict(zip(keys, values))
# extracting keys: vector.keys() vs. np.fromiter over dict keys
timed(vec.keys) / timed(np.fromiter, d.keys(), keys.dtype, len(d))
[7]:
0.13282985339848957
[8]:
# extracting values: vector.values() vs. np.fromiter over dict values
vec_values = timed(vec.values)
dict_values = timed(np.fromiter, d.values(), values.dtype, len(d))
vec_values / dict_values
[8]:
0.25586455790255813
[9]:
# summation: np.sum over the vector vs. builtin sum over dict values
vec_sum = timed(np.sum, vec)
dict_sum = timed(sum, d.values())
vec_sum / dict_sum
[9]:
0.038633134958725286
[10]:
# dot product: vector @ vector vs. a generator of per-key products over the dict
vec_dot = timed(vec.__matmul__, vec)
dict_dot = timed(sum, (d[k] * d[k] for k in d))
vec_dot / dict_dot
[10]:
0.031708814827220454
groupby
Matrices rely on an optimized groupby
implementation which is much faster than pandas, especially for integer keys.
[11]:
import collections
import math
import random
import pandas as pd
from spector import groupby
def measure(size, base=10):
    """Time spector's groupby (hashed and sorted code paths) against pandas.

    For bucket counts base**0, base**1, ... each grouped iterator is fully
    consumed via a zero-length deque, and the elapsed seconds are collected
    into a DataFrame indexed by bucket count.
    """
    exponents = range(round(math.log(size, base)) + 1)
    buckets = [base ** exponent for exponent in exponents]
    data = np.array([random.randint(0, size) for _ in range(size)])
    values = np.arange(len(data))
    timings = []
    for bucket_count in buckets:
        keys = data % bucket_count
        frame = pd.DataFrame({'keys': keys, 'values': values})
        row = {
            'hashed': timed(collections.deque, groupby(keys, values), 0),
            'sorted': timed(collections.deque, groupby(keys.astype('u8'), values), 0),
            'pandas': timed(collections.deque, frame.groupby('keys', sort=False)['values'], 0),
        }
        timings.append(row)
    return pd.DataFrame(timings, index=buckets)
# run the benchmark and label the index; rename_axis sets the index name
df = measure(10 ** 5, 10)[['hashed', 'sorted', 'pandas']].rename_axis('buckets')
df
[11]:
hashed | sorted | pandas | |
---|---|---|---|
buckets | |||
1 | 0.002739 | 0.002964 | 0.003388 |
10 | 0.002769 | 0.007193 | 0.004308 |
100 | 0.002948 | 0.011410 | 0.011873 |
1000 | 0.004762 | 0.017796 | 0.088965 |
10000 | 0.019525 | 0.050821 | 0.770915 |
100000 | 0.104926 | 0.232213 | 4.791982 |
[12]:
# Normalize each row by its fastest (minimum) time so the winner reads 1.0.
# Vectorized row-wise division replaces the per-row Python loop over df.loc.
df = df.div(df.min(axis=1), axis=0)
df
[12]:
hashed | sorted | pandas | |
---|---|---|---|
buckets | |||
1 | 1.0 | 1.082166 | 1.236835 |
10 | 1.0 | 2.597589 | 1.555575 |
100 | 1.0 | 3.870673 | 4.027904 |
1000 | 1.0 | 3.736908 | 18.681686 |
10000 | 1.0 | 2.602896 | 39.483851 |
100000 | 1.0 | 2.213122 | 45.670267 |