Benchmarks

Speed and memory comparisons.

[2]:
import resource
import time
from concurrent import futures
from functools import partial
import numpy as np
from spector import indices, matrix, vector

def memory(unit=1e6):
    """Return memory usage in megabytes."""
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / unit

def diff(metric, func, *args):
    """Return metric difference before and after function call."""
    start = metric()
    _ = func(*args)  # noqa
    return metric() - start

executor = futures.ProcessPoolExecutor()
def sized(func, *args):
    """Measure memory in a separate process."""
    return executor.submit(diff, memory, func, *args).result()

timed = partial(diff, time.time)

def dok(size):
    """Return a size x size dict-of-keys matrix."""
    return {(i, j): 1.0 for i in range(size) for j in range(size)}

def vecs(size):
    """Return an equivalent matrix of row vectors."""
    arr = np.array(range(size))
    return matrix((i, vector(arr)) for i in range(size))

keys = np.array(range(2 ** 18))
values = np.ones(len(keys))

indices vs. set
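
As context, a minimal sketch of the indices operations timed below (construction from an integer array, set difference, and conversion back to an array); illustrative only, exercising nothing beyond what the cells below measure.

[ ]:
small = indices(np.array([0, 1, 2, 3]))
np.array(small - indices(np.array([1, 3])))  # set difference, back to an ndarray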

[3]:
# memory
sized(indices, keys) / sized(set, keys)
[3]:
0.6575404973762264
[4]:
# from array
timed(indices, keys) / timed(set, keys)
[4]:
0.9379571787068183
[5]:
# to array
timed(np.array, indices(keys)) / timed(np.fromiter, keys, keys.dtype, len(keys))
[5]:
0.09806606195447544
[6]:
# set op
timed(indices(keys).__sub__, indices(keys)) / timed(set(keys).__sub__, set(keys))
[6]:
0.7381768552021837

vector vs. dict
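
A minimal sketch of the vector calls timed below: construction from parallel key and value arrays, extraction of those arrays, and the np.sum and dot reductions; illustrative only.

[ ]:
v = vector(np.array([0, 2, 4]), np.array([1.0, 2.0, 3.0]))
v.keys(), v.values()  # parallel key and value arrays
np.sum(v), v.dot(v)   # reductions exercised below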

[7]:
# memory
sized(vector, keys, values) / sized(dict, zip(keys, values))
[7]:
0.44884702825592737
[8]:
# from arrays
timed(vector, keys, values) / timed(dict, zip(keys, values))
[8]:
0.44178317671327066
[9]:
vec, d = vector(keys, values), dict(zip(keys, values))
# keys
timed(vec.keys) / timed(np.fromiter, d.keys(), keys.dtype, len(d))
[9]:
0.3060828060828061
[10]:
# values
timed(vec.values) / timed(np.fromiter, d.values(), values.dtype, len(d))
[10]:
0.27414184975814443
[11]:
# sum
timed(np.sum, vec) / timed(sum, d.values())
[11]:
0.02977646562631801
[12]:
# dot
timed(vec.dot, vec) / timed(sum, (d[k] * d[k] for k in d))
[12]:
0.03351056327493578

matrix vs. dict
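
A matrix behaves here as a mapping of row keys to vectors, which is how vecs above builds one; a minimal sketch mirroring that constructor, illustrative only.

[ ]:
m = matrix((i, vector(np.arange(3))) for i in range(3))  # row key -> vector, as in vecs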

[13]:
size = int(len(keys) ** 0.5)
# memory
sized(vecs, size) / sized(dok, size)
[13]:
0.0

groupby

Matrices rely on an optimized groupby implementation, which is much faster than pandas' groupby, especially for integer keys; as the tables below show, the advantage grows with the number of groups.
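
For reference, groupby is consumed below as a plain iterator; a minimal sketch of that usage, assuming each iteration yields a unique key paired with its grouped values.

[ ]:
k = np.array([0, 1, 0, 2, 1])
v = np.arange(5.0)
for key, grouped in groupby(k, v):  # assumed: (key, values) pairs per unique key
    print(key, grouped)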

[14]:
import collections
import math
import random
import pandas as pd
from spector import groupby

def measure(size, base=10):
    buckets = [base ** exp for exp in range(round(math.log(size, base)) + 1)]
    data = np.array([random.randint(0, size) for _ in range(size)])
    rows = []
    values = np.arange(len(data))
    for num in buckets:
        keys = data % num
        df = pd.DataFrame({'keys': keys, 'values': values})
        # deque with maxlen 0 exhausts each iterator without storing results
        rows.append({
            'hashed': timed(collections.deque, groupby(keys, values), 0),
            'sorted': timed(collections.deque, groupby(keys.astype('u8'), values), 0),
            'pandas': timed(collections.deque, df.groupby('keys', sort=False)['values'], 0),
        })
    return pd.DataFrame(rows, index=buckets)

df = measure(10 ** 5, 10)[['hashed', 'sorted', 'pandas']]
df.index.name = 'buckets'
df
[14]:
          hashed    sorted    pandas
buckets
1       0.002075  0.002632  0.003111
10      0.002581  0.005743  0.003611
100     0.002686  0.006646  0.012685
1000    0.003882  0.010128  0.089451
10000   0.020858  0.030449  0.936619
100000  0.112520  0.125473  5.786675
[15]:
# normalize each row by its fastest implementation
for i in df.index:
    df.loc[i] = df.loc[i] / df.loc[i].min()
df
[15]:
        hashed    sorted     pandas
buckets
1          1.0  1.268298   1.499138
10         1.0  2.225219   1.399261
100        1.0  2.474567   4.722947
1000       1.0  2.609016  23.042992
10000      1.0  1.459861  44.905456
100000     1.0  1.115111  51.427764