from IPython.parallel import Client, require, interactive
rc = Client()
dv = rc.direct_view()
lv = rc.load_balanced_view()
with dv.sync_imports():
    import numpy
mat = numpy.random.random_sample((800, 800))
mat = numpy.asfortranarray(mat)
def simple_inner(i):
    column = mat[:, i]
    # have to use a list comprehension to prevent closure
    return sum([numpy.inner(column, mat[:, j]) for j in xrange(i + 1, mat.shape[1])])
Local, serial performance.
%timeit sum(simple_inner(i) for i in xrange(mat.shape[1] - 1))
dv.push(dict(mat=mat), block=True);
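The push copies mat into each engine's namespace. As a quick sanity check, here is a small sketch that uses the interactive decorator imported above, so the function body resolves mat in each engine's namespace instead of pulling the whole array back:
@interactive
def engine_mat_shape():
    # runs on each engine; mat was placed there by dv.push above
    return mat.shape

print dv.apply_sync(engine_mat_shape)  # expect one (800, 800) tuple per engine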
Parallel implementation using a DirectView.
%timeit sum(dv.map(simple_inner, range(mat.shape[1] - 1), block=False))
Parallel implementation using a LoadBalancedView with a large chunksize and unordered results.
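The chunksize used below is just the number of tasks divided by the number of engines, so the 799 column tasks are shipped as roughly one large chunk per engine instead of 799 tiny messages. A quick sketch of the arithmetic (the engine count is whatever your cluster happens to provide):
n_tasks = mat.shape[1] - 1  # 799 column tasks
print "engines: %d, chunksize: %d" % (len(lv), n_tasks // len(lv))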
%timeit sum(lv.map(simple_inner, range(mat.shape[1] - 1), ordered=False, chunksize=(mat.shape[1] - 1) // len(lv), block=False))
But those are super slow! Why?
amr = dv.map(simple_inner, range(mat.shape[1] - 1), block=False)
amr.get()
s = sum(amr)
print "serial time: %.3f" % amr.serial_time
print " wall time: %.3f" % amr.wall_time
But that’s weird, the total computation time was over ten seconds.
That suggests the computation itself is somehow slow on the engines.
Let’s try running the local code exactly on one of the engines.
e0 = rc[0]
e0.block = True
e0.activate('0') # for %px0 magic
e0.push(dict(simple_inner=simple_inner));
# execute the timeit line on engine zero, *exactly* as we typed it above
%px0 %timeit sum(simple_inner(i) for i in xrange(mat.shape[1] - 1))
Now that’s super slow, even though the code is identical to the first run!
IPython.parallel isn’t getting in the way at all here,
so something else must be up.
The only optimization we have made is the asfortranarray call, so let’s check mat.flags.
print 'local:'
print mat.flags
print 'engine 0:'
%px0 print mat.flags
Aha! mat on the engines is somehow not Fortran-contiguous.
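Why does the layout matter so much here? simple_inner slices mat column by column, and columns of a Fortran-ordered array are contiguous in memory while columns of a C-ordered array are strided across rows. A hedged local sketch of the gap, re-using the 800x800 size (column_sweep is a made-up helper that mimics the access pattern):
import numpy
from timeit import timeit

c_mat = numpy.random.random_sample((800, 800))  # C-ordered: column slices are strided
f_mat = numpy.asfortranarray(c_mat)             # Fortran-ordered: column slices are contiguous

def column_sweep(m):
    # same access pattern as simple_inner: repeated inner products over columns
    return sum(numpy.inner(m[:, 0], m[:, j]) for j in xrange(1, m.shape[1]))

print "C-ordered:       %.4f s" % timeit(lambda: column_sweep(c_mat), number=10)
print "Fortran-ordered: %.4f s" % timeit(lambda: column_sweep(f_mat), number=10)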
Maybe we will get our performance back if we re-apply the transformation on the engines after the push.
%px mat = numpy.asfortranarray(mat)
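Before re-running the timings, a quick check (a sketch) that the conversion actually took effect on every engine; numpy.isfortran should now report True everywhere:
%px print numpy.isfortran(mat)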
And re-run the timings, to check:
%timeit sum(dv.map(simple_inner, range(mat.shape[1] - 1), block=False))
%timeit sum(lv.map(simple_inner, range(mat.shape[1] - 1), ordered=False, chunksize=(mat.shape[1] - 1) // len(lv), block=False))
Yes, that’s much more sensible than eleven seconds.
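The likely culprit, as an assumption rather than something verified here, is that push ships the array as a raw data buffer plus dtype and shape, and the array rebuilt on the engine defaults to C order, silently dropping the Fortran layout. A minimal local sketch of that effect:
f_arr = numpy.asfortranarray(numpy.random.random_sample((4, 4)))
rebuilt = numpy.frombuffer(f_arr.tostring(), dtype=f_arr.dtype).reshape(f_arr.shape)
print numpy.allclose(f_arr, rebuilt)                              # True: the values survive
print f_arr.flags['F_CONTIGUOUS'], rebuilt.flags['F_CONTIGUOUS']  # True False: the ordering does not
So whenever the layout matters, re-apply asfortranarray (or ascontiguousarray) on the receiving side after a push.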