Benchmarks¶
Speed and memory comparisons.
[2]:
import resource
import time
from concurrent import futures
from functools import partial
import numpy as np
from spector import indices, matrix, vector
def memory(unit=1e6):
    """Return this process's peak resident set size, scaled by ``unit``.

    NOTE(review): ``ru_maxrss`` is reported in kilobytes on Linux but in
    bytes on macOS, so the "megabytes" scaling assumes Linux — confirm
    before relying on absolute numbers (ratios are unaffected).
    """
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return usage.ru_maxrss / unit
def diff(metric, func, *args):
    """Return how much ``metric()`` changes across a call to ``func(*args)``.

    ``metric`` is a zero-argument callable (e.g. a clock or a memory
    gauge); the return value of ``func`` is discarded.
    """
    before = metric()
    func(*args)
    return metric() - before
# Pool used so allocation benchmarks run in a child process: memory growth
# is measured there and does not accumulate in the parent kernel.
executor = futures.ProcessPoolExecutor()

def sized(func, *args):
    """Return the memory delta (in MB) of calling ``func(*args)`` in a worker process."""
    future = executor.submit(diff, memory, func, *args)
    return future.result()
timed = partial(diff, time.time)
def dok(size):
    """Build a dense ``size`` x ``size`` dictionary-of-keys matrix of 1.0 values."""
    coords = ((row, col) for row in range(size) for col in range(size))
    return dict.fromkeys(coords, 1.0)
def vecs(size):
    """Build a spector matrix with ``size`` rows, each a vector over 0..size-1."""
    dense = np.array(range(size))
    return matrix((row, vector(dense)) for row in range(size))
# Benchmark fixtures: 2 ** 18 distinct integer keys, each mapped to value 1.0.
keys = np.array(range(2 ** 18))
values = np.ones(len(keys))
indices vs. set¶
[3]:
# memory: peak-RSS cost of building indices vs. a built-in set (< 1 favors spector)
sized(indices, keys) / sized(set, keys)
[3]:
0.6575404973762264
[4]:
# from array: construction time of indices vs. set from the same numpy keys
timed(indices, keys) / timed(set, keys)
[4]:
0.9379571787068183
[5]:
# to array: exporting indices back to numpy vs. np.fromiter over the keys
timed(np.array, indices(keys)) / timed(np.fromiter, keys, keys.dtype, len(keys))
[5]:
0.09806606195447544
[6]:
# set op: set difference on indices vs. on built-in sets (self - self here)
timed(indices(keys).__sub__, indices(keys)) / timed(set(keys).__sub__, set(keys))
[6]:
0.7381768552021837
vector vs. dict¶
[7]:
# memory: peak-RSS cost of a vector vs. an equivalent key->value dict
sized(vector, keys, values) / sized(dict, zip(keys, values))
[7]:
0.44884702825592737
[8]:
# from arrays: construction time of a vector vs. a dict from the same data
timed(vector, keys, values) / timed(dict, zip(keys, values))
[8]:
0.44178317671327066
[9]:
# shared fixtures for the remaining vector benchmarks
vec, d = vector(keys, values), dict(zip(keys, values))
# keys: extracting all keys as a numpy array
timed(vec.keys) / timed(np.fromiter, d.keys(), keys.dtype, len(d))
[9]:
0.3060828060828061
[10]:
# values: extracting all values as a numpy array
timed(vec.values) / timed(np.fromiter, d.values(), values.dtype, len(d))
[10]:
0.27414184975814443
[11]:
# sum: reducing over the vector's values vs. summing the dict's values
timed(np.sum, vec) / timed(sum, d.values())
[11]:
0.02977646562631801
[12]:
# dot: vector dot product vs. a hand-rolled generator over dict entries
timed(vec.dot, vec) / timed(sum, (d[k] * d[k] for k in d))
[12]:
0.03351056327493578
matrix vs. dict¶
[13]:
# use sqrt(len(keys)) so the dense size x size matrix has ~len(keys) entries
size = int(len(keys) ** 0.5)
# memory: matrix of spector vectors vs. a dict-of-keys dict
sized(vecs, size) / sized(dok, size)
[13]:
0.0
groupby¶
Matrices rely on an optimized groupby
implementation which is much faster than pandas, especially for integer keys.
[14]:
import collections
import math
import random
import pandas as pd
from spector import groupby
def measure(size, base=10):
    """Time spector's groupby (hashed and sorted key paths) against pandas.

    The number of distinct buckets grows in powers of ``base`` up to
    ``size``; each timing fully consumes the grouped iterator via a
    zero-length deque. Returns a DataFrame of timings indexed by bucket
    count.
    """
    exponents = range(round(math.log(size, base)) + 1)
    buckets = [base ** exp for exp in exponents]
    data = np.array([random.randint(0, size) for _ in range(size)])
    values = np.arange(len(data))
    rows = []
    for num in buckets:
        keys = data % num  # fold the random data into `num` distinct buckets
        df = pd.DataFrame({'keys': keys, 'values': values})
        record = {
            'hashed': timed(collections.deque, groupby(keys, values), 0),
            'sorted': timed(collections.deque, groupby(keys.astype('u8'), values), 0),
            'pandas': timed(collections.deque, df.groupby('keys', sort=False)['values'], 0),
        }
        rows.append(record)
    return pd.DataFrame(rows, index=buckets)
# Raw timings (seconds): spector's hashed and sorted groupby vs. pandas,
# one row per bucket count.
df = measure(10 ** 5, 10)[['hashed', 'sorted', 'pandas']]
df.index.name = 'buckets'
df  # display the timing table
[14]:
hashed | sorted | pandas | |
---|---|---|---|
buckets | |||
1 | 0.002075 | 0.002632 | 0.003111 |
10 | 0.002581 | 0.005743 | 0.003611 |
100 | 0.002686 | 0.006646 | 0.012685 |
1000 | 0.003882 | 0.010128 | 0.089451 |
10000 | 0.020858 | 0.030449 | 0.936619 |
100000 | 0.112520 | 0.125473 | 5.786675 |
[15]:
# Normalize each row by its fastest implementation so every cell reads as a
# slowdown factor relative to the per-bucket winner (1.0 = fastest).
# Vectorized row-wise division replaces the original per-row Python loop —
# identical result, idiomatic pandas.
df = df.div(df.min(axis=1), axis=0)
df
[15]:
hashed | sorted | pandas | |
---|---|---|---|
buckets | |||
1 | 1.0 | 1.268298 | 1.499138 |
10 | 1.0 | 2.225219 | 1.399261 |
100 | 1.0 | 2.474567 | 4.722947 |
1000 | 1.0 | 2.609016 | 23.042992 |
10000 | 1.0 | 1.459861 | 44.905456 |
100000 | 1.0 | 1.115111 | 51.427764 |