Benchmarks

Speed and memory comparisons.

[2]:
import time
import numpy as np
from spector import indices, matrix, vector

def timed(func, *args, **kwargs):
    """Return elapsed time (in seconds) of calling ``func(*args, **kwargs)``.

    Uses ``time.perf_counter`` — a monotonic, high-resolution clock intended
    for benchmarking — rather than ``time.time``, whose wall-clock value can
    jump (NTP adjustments) and has coarser resolution on some platforms.
    """
    start = time.perf_counter()
    _ = func(*args, **kwargs)  # noqa
    return time.perf_counter() - start

def dok(size):
    """Build a dense dict-of-keys matrix: every (row, col) pair maps to 1.0."""
    result = {}
    for row in range(size):
        for col in range(size):
            result[(row, col)] = 1.0
    return result

def vecs(size):
    """Build a size x size spector matrix whose rows are identical vectors.

    Each of the ``size`` rows maps to a vector over the same index array.
    """
    arr = np.arange(size)  # idiomatic and faster than np.array(range(size))
    return matrix((i, vector(arr)) for i in range(size))

# benchmark fixtures: 2**18 integer keys, each paired with a weight of 1.0
keys = np.arange(2 ** 18)  # idiomatic and faster than np.array(range(...))
values = np.ones(len(keys))

indices vs. set

[3]:
# from array: build an indices set vs. a builtin set from the same key array
t_indices = timed(indices, keys)
t_set = timed(set, keys)
t_indices / t_set
[3]:
0.9866720014226016
[4]:
# to array: convert each container back into a numpy array
t_indices = timed(np.array, indices(keys))
t_set = timed(np.fromiter, keys, keys.dtype, len(keys))
t_indices / t_set
[4]:
0.11233916473858548
[5]:
# set op: difference of two equal sets in each representation
t_indices = timed(indices(keys).__sub__, indices(keys))
t_set = timed(set(keys).__sub__, set(keys))
t_indices / t_set
[5]:
0.9267794301282779

vector vs. dict

[6]:
# from arrays: construct a vector vs. a dict from the same key/value pairs
t_vector = timed(vector, keys, values)
t_dict = timed(dict, zip(keys, values))
t_vector / t_dict
[6]:
0.4839097874550885
[7]:
vec = vector(keys, values)
d = dict(zip(keys, values))
# keys: export the key array from each container
timed(vec.keys) / timed(np.fromiter, d.keys(), keys.dtype, len(d))
[7]:
0.13282985339848957
[8]:
# values: export the value array from each container
t_vector = timed(vec.values)
t_dict = timed(np.fromiter, d.values(), values.dtype, len(d))
t_vector / t_dict
[8]:
0.25586455790255813
[9]:
# sum: reduce all values in each container
t_vector = timed(np.sum, vec)
t_dict = timed(sum, d.values())
t_vector / t_dict
[9]:
0.038633134958725286
[10]:
# matmul: dot product of each container with itself
t_vector = timed(vec.__matmul__, vec)
t_dict = timed(sum, (d[k] * d[k] for k in d))
t_vector / t_dict
[10]:
0.031708814827220454

groupby

Matrices rely on an optimized groupby implementation, which is much faster than pandas' groupby, especially for integer keys.

[11]:
import collections
import math
import random
import pandas as pd
from spector import groupby

def measure(size, base=10):
    """Benchmark spector's groupby against pandas across bucket counts.

    Parameters:
        size: number of random keys (and values) to group.
        base: log base of the geometric progression of bucket counts
            (1, base, base**2, ... up to roughly ``size``).

    Returns:
        a pandas DataFrame of elapsed times, one row per bucket count,
        with columns 'hashed' (default int keys), 'sorted' (uint64 keys,
        which selects spector's sorted code path), and 'pandas'.
    """
    buckets = [base ** exp for exp in range(round(math.log(size, base)) + 1)]
    # vectorized equivalent of [random.randint(0, size) for _ in range(size)];
    # np.random.randint excludes the high bound, hence size + 1
    data = np.random.randint(0, size + 1, size)
    groups = np.arange(len(data))  # renamed to avoid shadowing module-level `values`
    rows = []
    for num in buckets:
        keys = data % num
        df = pd.DataFrame({'keys': keys, 'values': groups})
        # deque(iterable, 0) consumes the whole iterator while storing
        # nothing (maxlen=0), so only iteration cost is timed
        rows.append({
            'hashed': timed(collections.deque, groupby(keys, groups), 0),
            'sorted': timed(collections.deque, groupby(keys.astype('u8'), groups), 0),
            'pandas': timed(collections.deque, df.groupby('keys', sort=False)['values'], 0),
        })
    return pd.DataFrame(rows, index=buckets)

# run the benchmark, fixing the column order for display and naming the index
df = measure(10 ** 5, 10)[['hashed', 'sorted', 'pandas']].rename_axis('buckets')
df
[11]:
hashed sorted pandas
buckets
1 0.002739 0.002964 0.003388
10 0.002769 0.007193 0.004308
100 0.002948 0.011410 0.011873
1000 0.004762 0.017796 0.088965
10000 0.019525 0.050821 0.770915
100000 0.104926 0.232213 4.791982
[12]:
# normalize each row by its fastest implementation, so every entry reads as
# a slowdown factor relative to that row's winner; vectorized division
# replaces the accidental row-by-row Python loop over `.loc`
df = df.div(df.min(axis=1), axis=0)
df
[12]:
hashed sorted pandas
buckets
1 1.0 1.082166 1.236835
10 1.0 2.597589 1.555575
100 1.0 3.870673 4.027904
1000 1.0 3.736908 18.681686
10000 1.0 2.602896 39.483851
100000 1.0 2.213122 45.670267