from IPython.parallel import Client, require, interactive
rc = Client()
dv = rc.direct_view()
lv = rc.load_balanced_view()
with dv.sync_imports():
    import numpy
mat = numpy.random.random_sample((800, 800))
mat = numpy.asfortranarray(mat)
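Fortran (column-major) order matters because the work below walks the matrix column by column: in that layout each column slice is a contiguous view. A quick local check (a sketch):
# In Fortran order a column's elements are adjacent in memory, so the
# slice is contiguous; in C order the same slice would be strided.
print mat[:, 0].flags['C_CONTIGUOUS']                            # True
print numpy.ascontiguousarray(mat)[:, 0].flags['C_CONTIGUOUS']   # False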
def simple_inner(i):
    column = mat[:, i]
    # have to use a list comprehension to prevent closure
    return sum([numpy.inner(column, mat[:, j]) for j in xrange(i + 1, mat.shape[1])])
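A quick local smoke test (a sketch): for the second-to-last column only one pairing remains, so the sum reduces to a single inner product.
# column n-2 pairs only with column n-1
expected = numpy.inner(mat[:, -2], mat[:, -1])
assert abs(simple_inner(mat.shape[1] - 2) - expected) < 1e-9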
Local, serial performance.
%timeit sum(simple_inner(i) for i in xrange(mat.shape[1] - 1))
dv.push(dict(mat=mat), block=True);
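To confirm the push landed, we can use the interactive import from above, which makes a function resolve names in the engines' user namespace (a sketch; check_mat is a made-up helper):
@interactive
def check_mat():
    # executes on each engine; mat resolves in the engine's namespace
    return mat.shape
print dv.apply_sync(check_mat)   # one (800, 800) tuple per engine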
Parallel implementation using a DirectView.
%timeit sum(dv.map(simple_inner, range(mat.shape[1] - 1), block=False))
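With block=False the map returns an AsyncMapResult, and summing it consumes results as they arrive, overlapping the reduction with engine-side work. The blocking equivalent would be (a sketch):
# wait for every result, then reduce the returned list
results = dv.map(simple_inner, range(mat.shape[1] - 1), block=True)
print sum(results)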
Parallel implementation using a LoadBalancedView with a large chunksize and unordered results.
%timeit sum(lv.map(simple_inner, range(mat.shape[1] - 1), ordered=False, chunksize=(mat.shape[1] - 1) // len(lv), block=False))
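The chunksize expression batches the 799 tasks into roughly one chunk per engine, so scheduler round-trips are paid per chunk rather than per index. For example, assuming four engines (a made-up count):
# 799 tasks // 4 engines == 199 indices per message instead of 1
print (mat.shape[1] - 1) // len(lv)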
But the transfer forced the array back to C-contiguous order, which explains the slowdown.
If we re-apply the Fortran-contiguous transformation on the engines,
we should get our performance back.
%px mat = numpy.asfortranarray(mat)
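A quick way to confirm the layout on every engine (a sketch):
%px print mat.flags['F_CONTIGUOUS']   # should now report True on each engine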
And re-run the timings.
%timeit sum(dv.map(simple_inner, range(mat.shape[1] - 1), block=False))
%timeit sum(lv.map(simple_inner, range(mat.shape[1] - 1), ordered=False, chunksize=(mat.shape[1] - 1) // len(lv), block=False))
Using two indices takes even more time due to additional communication.
def inner(i, j):
    return numpy.inner(mat[:, i], mat[:, j])
first = [i for i in xrange(mat.shape[1] - 1) for j in xrange(i + 1, mat.shape[1])]
second = [j for i in xrange(mat.shape[1] - 1) for j in xrange(i + 1, mat.shape[1])]
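A quick count shows the scale of the difference (a sketch): the single-index map ships one task per column, the double-index map one task per (i, j) pair.
n = mat.shape[1]
print n - 1               # single-index tasks: 799
print n * (n - 1) // 2    # double-index tasks: 319600
assert len(first) == n * (n - 1) // 2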
%timeit sum(dv.map(inner, first, second, block=False))
%timeit sum(lv.map(inner, first, second, ordered=False, chunksize=len(first) // len(lv), block=False))
%timeit sum(map(inner, first, second))
So in every case the double-index version is slower (it gives up column locality and sends far more task messages),
but it is still faster in parallel than in serial.