From c69b6b979e3d6bd01ec40e75b92b21f7a391f0ca Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 5 Jun 2010 15:56:14 +0200 Subject: Added basic channel implementation including test restructured odb tests, they are now in an own module to keep the modules small --- lib/git/odb/channel.py | 106 +++++++++++++++ lib/git/odb/db.py | 11 +- test/git/odb/__init__.py | 1 + test/git/odb/lib.py | 60 +++++++++ test/git/odb/test_channel.py | 61 +++++++++ test/git/odb/test_db.py | 90 +++++++++++++ test/git/odb/test_stream.py | 172 ++++++++++++++++++++++++ test/git/odb/test_utils.py | 15 +++ test/git/test_odb.py | 307 ------------------------------------------- 9 files changed, 510 insertions(+), 313 deletions(-) create mode 100644 lib/git/odb/channel.py create mode 100644 test/git/odb/__init__.py create mode 100644 test/git/odb/lib.py create mode 100644 test/git/odb/test_channel.py create mode 100644 test/git/odb/test_db.py create mode 100644 test/git/odb/test_stream.py create mode 100644 test/git/odb/test_utils.py delete mode 100644 test/git/test_odb.py diff --git a/lib/git/odb/channel.py b/lib/git/odb/channel.py new file mode 100644 index 00000000..f6469d42 --- /dev/null +++ b/lib/git/odb/channel.py @@ -0,0 +1,106 @@ +"""Contains a queue based channel implementation""" +from Queue import ( + Queue, + Empty, + Full + ) + +#{ Classes +class Channel(object): + """A channel is similar to a system pipe. It has a write end as well as one or + more read ends. If Data is in the channel, it can be read, if not the read operation + will block until data becomes available. + If the channel is closed, any read operation will result in an exception + + This base class is not instantiated directly, but instead serves as constructor + for RWChannel pairs. + + Create a new channel """ + __slots__ = tuple() + def __new__(cls, *args): + if cls is Channel: + max_items = 0 + if len(args) == 1: + max_items = args[0] + if len(args) > 1: + raise ValueError("Specify not more than the number of items the channel should take") + wc = WChannel(max_items) + rc = RChannel(wc) + return wc, rc + # END constructor mode + return object.__new__(cls) + +class WChannel(Channel): + """The write end of a channel""" + __slots__ = ('_closed', '_queue') + + def __init__(self, max_items=0): + """initialize this instance, able to hold max_items at once + Write calls will block if the channel is full, until someone reads from it""" + self._closed = False + self._queue = Queue(max_items) + + + #{ Interface + def write(self, item, block=True, timeout=None): + """Send an item into the channel, it can be read from the read end of the + channel accordingly + :param item: Item to send + :param block: If True, the call will block until there is free space in the + channel + :param timeout: timeout in seconds for blocking calls. + :raise IOError: when writing into closed file or when writing into a non-blocking + full channel + :note: may block if the channel has a limited capacity""" + if self._closed: + raise IOError("Cannot write to a closed channel") + + try: + self._queue.put(item, block, timeout) + except Full: + raise IOError("Capacity of the channel was exeeded") + # END exception handling + + def close(self): + """Close the channel. Multiple close calls on a closed channel are no + an error""" + self._closed = True + + @property + def closed(self): + """:return: True if the channel was closed""" + return self._closed + #} END interface + + +class RChannel(Channel): + """The read-end of a corresponding write channel""" + __slots__ = '_wc' + + def __init__(self, wchannel): + """Initialize this instance from its parent write channel""" + self._wc = wchannel + + + #{ Interface + + def read(self, block=True, timeout=None): + """:return: an item read from the channel + :param block: if True, the call will block until an item is available + :param timeout: if positive and block is True, it will block only for the + given amount of seconds. + :raise IOError: When reading from an empty channel ( if non-blocking, or + if the channel is still empty after the timeout""" + # if the channel is closed for writing, we never block + if self._wc.closed: + block = False + + try: + return self._wc._queue.get(block, timeout) + except Empty: + raise IOError("Error reading from an empty channel") + # END handle reading + + #} END interface + +#} END classes diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index d970b0bf..5d3cc6a3 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -142,11 +142,10 @@ class FileDBBase(object): """Initialize this instance to look for its files at the given root path All subsequent operations will be relative to this path :raise InvalidDBRoot: - :note: The base will perform basic checking for accessability, but the subclass - is required to verify that the root_path contains the database structure it needs""" + :note: The base will not perform any accessablity checking as the base + might not yet be accessible, but become accessible before the first + access.""" super(FileDBBase, self).__init__() - if not os.path.isdir(root_path): - raise InvalidDBRoot(root_path) self._root_path = root_path @@ -333,10 +332,10 @@ class GitObjectDB(LooseObjectDB): def info(self, sha): t = self._git.get_object_header(sha) - return OInfo(t[0], t[1], t[2]) + return OInfo(*t) def stream(self, sha): """For now, all lookup is done by git itself""" t = self._git.stream_object_data(sha) - return OStream(t[0], t[1], t[2], t[3]) + return OStream(*t) diff --git a/test/git/odb/__init__.py b/test/git/odb/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/test/git/odb/__init__.py @@ -0,0 +1 @@ + diff --git a/test/git/odb/lib.py b/test/git/odb/lib.py new file mode 100644 index 00000000..d5199748 --- /dev/null +++ b/test/git/odb/lib.py @@ -0,0 +1,60 @@ +"""Utilities used in ODB testing""" +from git.odb import ( + OStream, + ) +from git.odb.stream import Sha1Writer + +import zlib +from cStringIO import StringIO + +#{ Stream Utilities + +class DummyStream(object): + def __init__(self): + self.was_read = False + self.bytes = 0 + self.closed = False + + def read(self, size): + self.was_read = True + self.bytes = size + + def close(self): + self.closed = True + + def _assert(self): + assert self.was_read + + +class DeriveTest(OStream): + def __init__(self, sha, type, size, stream, *args, **kwargs): + self.myarg = kwargs.pop('myarg') + self.args = args + + def _assert(self): + assert self.args + assert self.myarg + + +class ZippedStoreShaWriter(Sha1Writer): + """Remembers everything someone writes to it""" + __slots__ = ('buf', 'zip') + def __init__(self): + Sha1Writer.__init__(self) + self.buf = StringIO() + self.zip = zlib.compressobj(1) # fastest + + def __getattr__(self, attr): + return getattr(self.buf, attr) + + def write(self, data): + alen = Sha1Writer.write(self, data) + self.buf.write(self.zip.compress(data)) + return alen + + def close(self): + self.buf.write(self.zip.flush()) + + +#} END stream utilitiess + diff --git a/test/git/odb/test_channel.py b/test/git/odb/test_channel.py new file mode 100644 index 00000000..89b26582 --- /dev/null +++ b/test/git/odb/test_channel.py @@ -0,0 +1,61 @@ +"""Channel testing""" +from test.testlib import * +from git.odb.channel import * + +import time + +class TestDB(TestBase): + + def test_base(self): + # creating channel yields a write and a read channal + wc, rc = Channel() + assert isinstance(wc, WChannel) + assert isinstance(rc, RChannel) + + # everything else fails + self.failUnlessRaises(ValueError, Channel, 1, "too many args") + + # TEST UNLIMITED SIZE CHANNEL - writing+reading is FIFO + item = 1 + item2 = 2 + wc.write(item) + wc.write(item2) + assert rc.read() == item + assert rc.read() == item2 + + # next read blocks, then raises - it waits a second + st = time.time() + self.failUnlessRaises(IOError, rc.read, True, 1) + assert time.time() - st >= 1.0 + + # writing to a closed channel raises + assert not wc.closed + wc.close() + assert wc.closed + wc.close() # fine + assert wc.closed + + self.failUnlessRaises(IOError, wc.write, 1) + + # reading from a closed channel never blocks + self.failUnlessRaises(IOError, rc.read) + + + + # TEST LIMITED SIZE CHANNEL + # channel with max-items set + wc, rc = Channel(1) + wc.write(item) # fine + + # blocks for a second, its full + st = time.time() + self.failUnlessRaises(IOError, wc.write, item, True, 1) + assert time.time() - st >= 1.0 + + # get one + assert rc.read() == item + + # its empty,can put one again + wc.write(item2) + assert rc.read() == item2 + wc.close() diff --git a/test/git/odb/test_db.py b/test/git/odb/test_db.py new file mode 100644 index 00000000..35ba8680 --- /dev/null +++ b/test/git/odb/test_db.py @@ -0,0 +1,90 @@ +"""Test for object db""" +from test.testlib import * +from lib import ZippedStoreShaWriter + +from git.odb import * +from git.odb.stream import Sha1Writer +from git import Blob +from git.errors import BadObject + + +from cStringIO import StringIO +import os + +class TestDB(TestBase): + """Test the different db class implementations""" + + # data + two_lines = "1234\nhello world" + + all_data = (two_lines, ) + + def _assert_object_writing(self, db): + """General tests to verify object writing, compatible to ObjectDBW + :note: requires write access to the database""" + # start in 'dry-run' mode, using a simple sha1 writer + ostreams = (ZippedStoreShaWriter, None) + for ostreamcls in ostreams: + for data in self.all_data: + dry_run = ostreamcls is not None + ostream = None + if ostreamcls is not None: + ostream = ostreamcls() + assert isinstance(ostream, Sha1Writer) + # END create ostream + + prev_ostream = db.set_ostream(ostream) + assert type(prev_ostream) in ostreams or prev_ostream in ostreams + + istream = IStream(Blob.type, len(data), StringIO(data)) + + # store returns same istream instance, with new sha set + my_istream = db.store(istream) + sha = istream.sha + assert my_istream is istream + assert db.has_object(sha) != dry_run + assert len(sha) == 40 # for now we require 40 byte shas as default + + # verify data - the slow way, we want to run code + if not dry_run: + info = db.info(sha) + assert Blob.type == info.type + assert info.size == len(data) + + ostream = db.stream(sha) + assert ostream.read() == data + assert ostream.type == Blob.type + assert ostream.size == len(data) + else: + self.failUnlessRaises(BadObject, db.info, sha) + self.failUnlessRaises(BadObject, db.stream, sha) + + # DIRECT STREAM COPY + # our data hase been written in object format to the StringIO + # we pasesd as output stream. No physical database representation + # was created. + # Test direct stream copy of object streams, the result must be + # identical to what we fed in + ostream.seek(0) + istream.stream = ostream + assert istream.sha is not None + prev_sha = istream.sha + + db.set_ostream(ZippedStoreShaWriter()) + db.store(istream) + assert istream.sha == prev_sha + new_ostream = db.ostream() + + # note: only works as long our store write uses the same compression + # level, which is zip + assert ostream.getvalue() == new_ostream.getvalue() + # END for each data set + # END for each dry_run mode + + @with_bare_rw_repo + def test_writing(self, rwrepo): + ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects')) + + # write data + self._assert_object_writing(ldb) + diff --git a/test/git/odb/test_stream.py b/test/git/odb/test_stream.py new file mode 100644 index 00000000..020fe6bd --- /dev/null +++ b/test/git/odb/test_stream.py @@ -0,0 +1,172 @@ +"""Test for object db""" +from test.testlib import * +from lib import ( + DummyStream, + DeriveTest, + Sha1Writer + ) + +from git.odb import * +from git import Blob +from cStringIO import StringIO +import tempfile +import os +import zlib + + + + +class TestStream(TestBase): + """Test stream classes""" + + data_sizes = (15, 10000, 1000*1024+512) + + def test_streams(self): + # test info + sha = Blob.NULL_HEX_SHA + s = 20 + info = OInfo(sha, Blob.type, s) + assert info.sha == sha + assert info.type == Blob.type + assert info.size == s + + # test ostream + stream = DummyStream() + ostream = OStream(*(info + (stream, ))) + ostream.read(15) + stream._assert() + assert stream.bytes == 15 + ostream.read(20) + assert stream.bytes == 20 + + # derive with own args + DeriveTest(sha, Blob.type, s, stream, 'mine',myarg = 3)._assert() + + # test istream + istream = IStream(Blob.type, s, stream) + assert istream.sha == None + istream.sha = sha + assert istream.sha == sha + + assert len(istream.binsha) == 20 + assert len(istream.hexsha) == 40 + + assert istream.size == s + istream.size = s * 2 + istream.size == s * 2 + assert istream.type == Blob.type + istream.type = "something" + assert istream.type == "something" + assert istream.stream is stream + istream.stream = None + assert istream.stream is None + + assert istream.error is None + istream.error = Exception() + assert isinstance(istream.error, Exception) + + def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None): + """Make stream tests - the orig_stream is seekable, allowing it to be + rewound and reused + :param cdata: the data we expect to read from stream, the contents + :param rewind_stream: function called to rewind the stream to make it ready + for reuse""" + ns = 10 + assert len(cdata) > ns-1, "Data must be larger than %i, was %i" % (ns, len(cdata)) + + # read in small steps + ss = len(cdata) / ns + for i in range(ns): + data = stream.read(ss) + chunk = cdata[i*ss:(i+1)*ss] + assert data == chunk + # END for each step + rest = stream.read() + if rest: + assert rest == cdata[-len(rest):] + # END handle rest + + rewind_stream(stream) + + # read everything + rdata = stream.read() + assert rdata == cdata + + def test_decompress_reader(self): + for close_on_deletion in range(2): + for with_size in range(2): + for ds in self.data_sizes: + cdata = make_bytes(ds, randomize=False) + + # zdata = zipped actual data + # cdata = original content data + + # create reader + if with_size: + # need object data + zdata = zlib.compress(make_object(Blob.type, cdata)) + type, size, reader = DecompressMemMapReader.new(zdata, close_on_deletion) + assert size == len(cdata) + assert type == Blob.type + else: + # here we need content data + zdata = zlib.compress(cdata) + reader = DecompressMemMapReader(zdata, close_on_deletion, len(cdata)) + assert reader._s == len(cdata) + # END get reader + + def rewind(r): + r._zip = zlib.decompressobj() + r._br = r._cws = r._cwe = 0 + if with_size: + r._parse_header_info() + # END skip header + # END make rewind func + + self._assert_stream_reader(reader, cdata, rewind) + + # put in a dummy stream for closing + dummy = DummyStream() + reader._m = dummy + + assert not dummy.closed + del(reader) + assert dummy.closed == close_on_deletion + #zdi# + # END for each datasize + # END whether size should be used + # END whether stream should be closed when deleted + + def test_sha_writer(self): + writer = Sha1Writer() + assert 2 == writer.write("hi") + assert len(writer.sha(as_hex=1)) == 40 + assert len(writer.sha(as_hex=0)) == 20 + + # make sure it does something ;) + prev_sha = writer.sha() + writer.write("hi again") + assert writer.sha() != prev_sha + + def test_compressed_writer(self): + for ds in self.data_sizes: + fd, path = tempfile.mkstemp() + ostream = FDCompressedSha1Writer(fd) + data = make_bytes(ds, randomize=False) + + # for now, just a single write, code doesn't care about chunking + assert len(data) == ostream.write(data) + ostream.close() + # its closed already + self.failUnlessRaises(OSError, os.close, fd) + + # read everything back, compare to data we zip + fd = os.open(path, os.O_RDONLY) + written_data = os.read(fd, os.path.getsize(path)) + os.close(fd) + assert written_data == zlib.compress(data, 1) # best speed + + os.remove(path) + # END for each os + + diff --git a/test/git/odb/test_utils.py b/test/git/odb/test_utils.py new file mode 100644 index 00000000..34572b37 --- /dev/null +++ b/test/git/odb/test_utils.py @@ -0,0 +1,15 @@ +"""Test for object db""" +from test.testlib import * +from git import Blob +from git.odb.utils import ( + to_hex_sha, + to_bin_sha + ) + + +class TestUtils(TestBase): + def test_basics(self): + assert to_hex_sha(Blob.NULL_HEX_SHA) == Blob.NULL_HEX_SHA + assert len(to_bin_sha(Blob.NULL_HEX_SHA)) == 20 + assert to_hex_sha(to_bin_sha(Blob.NULL_HEX_SHA)) == Blob.NULL_HEX_SHA + diff --git a/test/git/test_odb.py b/test/git/test_odb.py deleted file mode 100644 index 5c8268cd..00000000 --- a/test/git/test_odb.py +++ /dev/null @@ -1,307 +0,0 @@ -"""Test for object db""" -from test.testlib import * -from git.odb import * -from git.odb.utils import ( - to_hex_sha, - to_bin_sha - ) -from git.odb.stream import Sha1Writer -from git import Blob -from git.errors import BadObject -from cStringIO import StringIO -import tempfile -import os -import zlib - - -#{ Stream Utilities - -class DummyStream(object): - def __init__(self): - self.was_read = False - self.bytes = 0 - self.closed = False - - def read(self, size): - self.was_read = True - self.bytes = size - - def close(self): - self.closed = True - - def _assert(self): - assert self.was_read - - -class DeriveTest(OStream): - def __init__(self, sha, type, size, stream, *args, **kwargs): - self.myarg = kwargs.pop('myarg') - self.args = args - - def _assert(self): - assert self.args - assert self.myarg - - -class ZippedStoreShaWriter(Sha1Writer): - """Remembers everything someone writes to it""" - __slots__ = ('buf', 'zip') - def __init__(self): - Sha1Writer.__init__(self) - self.buf = StringIO() - self.zip = zlib.compressobj(1) # fastest - - def __getattr__(self, attr): - return getattr(self.buf, attr) - - def write(self, data): - alen = Sha1Writer.write(self, data) - self.buf.write(self.zip.compress(data)) - return alen - - def close(self): - self.buf.write(self.zip.flush()) - - -#} END stream utilitiess - - - -class TestStream(TestBase): - """Test stream classes""" - - data_sizes = (15, 10000, 1000*1024+512) - - def test_streams(self): - # test info - sha = Blob.NULL_HEX_SHA - s = 20 - info = OInfo(sha, Blob.type, s) - assert info.sha == sha - assert info.type == Blob.type - assert info.size == s - - # test ostream - stream = DummyStream() - ostream = OStream(*(info + (stream, ))) - ostream.read(15) - stream._assert() - assert stream.bytes == 15 - ostream.read(20) - assert stream.bytes == 20 - - # derive with own args - DeriveTest(sha, Blob.type, s, stream, 'mine',myarg = 3)._assert() - - # test istream - istream = IStream(Blob.type, s, stream) - assert istream.sha == None - istream.sha = sha - assert istream.sha == sha - - assert len(istream.binsha) == 20 - assert len(istream.hexsha) == 40 - - assert istream.size == s - istream.size = s * 2 - istream.size == s * 2 - assert istream.type == Blob.type - istream.type = "something" - assert istream.type == "something" - assert istream.stream is stream - istream.stream = None - assert istream.stream is None - - assert istream.error is None - istream.error = Exception() - assert isinstance(istream.error, Exception) - - def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None): - """Make stream tests - the orig_stream is seekable, allowing it to be - rewound and reused - :param cdata: the data we expect to read from stream, the contents - :param rewind_stream: function called to rewind the stream to make it ready - for reuse""" - ns = 10 - assert len(cdata) > ns-1, "Data must be larger than %i, was %i" % (ns, len(cdata)) - - # read in small steps - ss = len(cdata) / ns - for i in range(ns): - data = stream.read(ss) - chunk = cdata[i*ss:(i+1)*ss] - assert data == chunk - # END for each step - rest = stream.read() - if rest: - assert rest == cdata[-len(rest):] - # END handle rest - - rewind_stream(stream) - - # read everything - rdata = stream.read() - assert rdata == cdata - - def test_decompress_reader(self): - for close_on_deletion in range(2): - for with_size in range(2): - for ds in self.data_sizes: - cdata = make_bytes(ds, randomize=False) - - # zdata = zipped actual data - # cdata = original content data - - # create reader - if with_size: - # need object data - zdata = zlib.compress(make_object(Blob.type, cdata)) - type, size, reader = DecompressMemMapReader.new(zdata, close_on_deletion) - assert size == len(cdata) - assert type == Blob.type - else: - # here we need content data - zdata = zlib.compress(cdata) - reader = DecompressMemMapReader(zdata, close_on_deletion, len(cdata)) - assert reader._s == len(cdata) - # END get reader - - def rewind(r): - r._zip = zlib.decompressobj() - r._br = r._cws = r._cwe = 0 - if with_size: - r._parse_header_info() - # END skip header - # END make rewind func - - self._assert_stream_reader(reader, cdata, rewind) - - # put in a dummy stream for closing - dummy = DummyStream() - reader._m = dummy - - assert not dummy.closed - del(reader) - assert dummy.closed == close_on_deletion - #zdi# - # END for each datasize - # END whether size should be used - # END whether stream should be closed when deleted - - def test_sha_writer(self): - writer = Sha1Writer() - assert 2 == writer.write("hi") - assert len(writer.sha(as_hex=1)) == 40 - assert len(writer.sha(as_hex=0)) == 20 - - # make sure it does something ;) - prev_sha = writer.sha() - writer.write("hi again") - assert writer.sha() != prev_sha - - def test_compressed_writer(self): - for ds in self.data_sizes: - fd, path = tempfile.mkstemp() - ostream = FDCompressedSha1Writer(fd) - data = make_bytes(ds, randomize=False) - - # for now, just a single write, code doesn't care about chunking - assert len(data) == ostream.write(data) - ostream.close() - # its closed already - self.failUnlessRaises(OSError, os.close, fd) - - # read everything back, compare to data we zip - fd = os.open(path, os.O_RDONLY) - written_data = os.read(fd, os.path.getsize(path)) - os.close(fd) - assert written_data == zlib.compress(data, 1) # best speed - - os.remove(path) - # END for each os - - -class TestUtils(TestBase): - def test_basics(self): - assert to_hex_sha(Blob.NULL_HEX_SHA) == Blob.NULL_HEX_SHA - assert len(to_bin_sha(Blob.NULL_HEX_SHA)) == 20 - assert to_hex_sha(to_bin_sha(Blob.NULL_HEX_SHA)) == Blob.NULL_HEX_SHA - - -class TestDB(TestBase): - """Test the different db class implementations""" - - # data - two_lines = "1234\nhello world" - - all_data = (two_lines, ) - - def _assert_object_writing(self, db): - """General tests to verify object writing, compatible to ObjectDBW - :note: requires write access to the database""" - # start in 'dry-run' mode, using a simple sha1 writer - ostreams = (ZippedStoreShaWriter, None) - for ostreamcls in ostreams: - for data in self.all_data: - dry_run = ostreamcls is not None - ostream = None - if ostreamcls is not None: - ostream = ostreamcls() - assert isinstance(ostream, Sha1Writer) - # END create ostream - - prev_ostream = db.set_ostream(ostream) - assert type(prev_ostream) in ostreams or prev_ostream in ostreams - - istream = IStream(Blob.type, len(data), StringIO(data)) - - # store returns same istream instance, with new sha set - my_istream = db.store(istream) - sha = istream.sha - assert my_istream is istream - assert db.has_object(sha) != dry_run - assert len(sha) == 40 # for now we require 40 byte shas as default - - # verify data - the slow way, we want to run code - if not dry_run: - info = db.info(sha) - assert Blob.type == info.type - assert info.size == len(data) - - ostream = db.stream(sha) - assert ostream.read() == data - assert ostream.type == Blob.type - assert ostream.size == len(data) - else: - self.failUnlessRaises(BadObject, db.info, sha) - self.failUnlessRaises(BadObject, db.stream, sha) - - # DIRECT STREAM COPY - # our data hase been written in object format to the StringIO - # we pasesd as output stream. No physical database representation - # was created. - # Test direct stream copy of object streams, the result must be - # identical to what we fed in - ostream.seek(0) - istream.stream = ostream - assert istream.sha is not None - prev_sha = istream.sha - - db.set_ostream(ZippedStoreShaWriter()) - db.store(istream) - assert istream.sha == prev_sha - new_ostream = db.ostream() - - # note: only works as long our store write uses the same compression - # level, which is zip - assert ostream.getvalue() == new_ostream.getvalue() - # END for each data set - # END for each dry_run mode - - @with_bare_rw_repo - def test_writing(self, rwrepo): - ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects')) - - # write data - self._assert_object_writing(ldb) - -- cgit v1.2.3 From 65c9fe0baa579173afa5a2d463ac198d06ef4993 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 5 Jun 2010 16:07:19 +0200 Subject: A code donation: Donating a worker thread implementation inclduding tests to Git-Python. I have the feeling it can do much good here :) --- lib/git/odb/channel.py | 1 + lib/git/odb/thread.py | 203 ++++++++++++++++++++++++++++++++++++++++++++ test/git/odb/test_thread.py | 47 ++++++++++ 3 files changed, 251 insertions(+) create mode 100644 lib/git/odb/thread.py create mode 100644 test/git/odb/test_thread.py diff --git a/lib/git/odb/channel.py b/lib/git/odb/channel.py index f6469d42..32eef6e1 100644 --- a/lib/git/odb/channel.py +++ b/lib/git/odb/channel.py @@ -30,6 +30,7 @@ class Channel(object): # END constructor mode return object.__new__(cls) + class WChannel(Channel): """The write end of a channel""" __slots__ = ('_closed', '_queue') diff --git a/lib/git/odb/thread.py b/lib/git/odb/thread.py new file mode 100644 index 00000000..3938666a --- /dev/null +++ b/lib/git/odb/thread.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +"""Module with threading utilities""" +__docformat__ = "restructuredtext" +import threading +import inspect +import Queue + +#{ Decorators + +def do_terminate_threads(whitelist=list()): + """Simple function which terminates all of our threads + :param whitelist: If whitelist is given, only the given threads will be terminated""" + for t in threading.enumerate(): + if not isinstance(t, TerminatableThread): + continue + if whitelist and t not in whitelist: + continue + if isinstance(t, WorkerThread): + t.inq.put(t.quit) + # END worker special handling + t.stop_and_join() + # END for each thread + +def terminate_threads( func ): + """Kills all worker threads the method has created by sending the quit signal. + This takes over in case of an error in the main function""" + def wrapper(*args, **kwargs): + cur_threads = set(threading.enumerate()) + try: + return func(*args, **kwargs) + finally: + do_terminate_threads(set(threading.enumerate()) - cur_threads) + # END finally shutdown threads + # END wrapper + wrapper.__name__ = func.__name__ + return wrapper + +#} END decorators + +#{ Classes + +class TerminatableThread(threading.Thread): + """A simple thread able to terminate itself on behalf of the user. + + Terminate a thread as follows: + + t.stop_and_join() + + Derived classes call _should_terminate() to determine whether they should + abort gracefully + """ + __slots__ = '_terminate' + + def __init__(self): + super(TerminatableThread, self).__init__() + self._terminate = False + + + #{ Subclass Interface + def _should_terminate(self): + """:return: True if this thread should terminate its operation immediately""" + return self._terminate + + def _terminated(self): + """Called once the thread terminated. Its called in the main thread + and may perform cleanup operations""" + pass + + def start(self): + """Start the thread and return self""" + super(TerminatableThread, self).start() + return self + + #} END subclass interface + + #{ Interface + + def stop_and_join(self): + """Ask the thread to stop its operation and wait for it to terminate + :note: Depending on the implenetation, this might block a moment""" + self._terminate = True + self.join() + self._terminated() + #} END interface + + +class WorkerThread(TerminatableThread): + """ + This base allows to call functions on class instances natively and retrieve + their results asynchronously using a queue. + The thread runs forever unless it receives the terminate signal using + its task queue. + + Tasks could be anything, but should usually be class methods and arguments to + allow the following: + + inq = Queue() + outq = Queue() + w = WorkerThread(inq, outq) + w.start() + inq.put((WorkerThread., args, kwargs)) + res = outq.get() + + finally we call quit to terminate asap. + + alternatively, you can make a call more intuitively - the output is the output queue + allowing you to get the result right away or later + w.call(arg, kwarg='value').get() + + inq.put(WorkerThread.quit) + w.join() + + You may provide the following tuples as task: + t[0] = class method, function or instance method + t[1] = optional, tuple or list of arguments to pass to the routine + t[2] = optional, dictionary of keyword arguments to pass to the routine + """ + __slots__ = ('inq', 'outq') + + class InvalidRoutineError(Exception): + """Class sent as return value in case of an error""" + + def __init__(self, inq = None, outq = None): + super(WorkerThread, self).__init__() + self.inq = inq or Queue.Queue() + self.outq = outq or Queue.Queue() + + def call(self, function, *args, **kwargs): + """Method that makes the call to the worker using the input queue, + returning our output queue + + :param funciton: can be a standalone function unrelated to this class, + a class method of this class or any instance method. + If it is a string, it will be considered a function residing on this instance + :param args: arguments to pass to function + :parma **kwargs: kwargs to pass to function""" + self.inq.put((function, args, kwargs)) + return self.outq + + def wait_until_idle(self): + """wait until the input queue is empty, in the meanwhile, take all + results off the output queue.""" + while not self.inq.empty(): + try: + self.outq.get(False) + except Queue.Empty: + continue + # END while there are tasks on the queue + + def run(self): + """Process input tasks until we receive the quit signal""" + while True: + if self._should_terminate(): + break + # END check for stop request + routine = self.__class__.quit + args = tuple() + kwargs = dict() + tasktuple = self.inq.get() + + if isinstance(tasktuple, (tuple, list)): + if len(tasktuple) == 3: + routine, args, kwargs = tasktuple + elif len(tasktuple) == 2: + routine, args = tasktuple + elif len(tasktuple) == 1: + routine = tasktuple[0] + # END tasktuple length check + elif inspect.isroutine(tasktuple): + routine = tasktuple + # END tasktuple handling + + try: + rval = None + if inspect.ismethod(routine): + if routine.im_self is None: + rval = routine(self, *args, **kwargs) + else: + rval = routine(*args, **kwargs) + elif inspect.isroutine(routine): + rval = routine(*args, **kwargs) + elif isinstance(routine, basestring) and hasattr(self, routine): + rval = getattr(self, routine)(*args, **kwargs) + else: + # ignore unknown items + print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) + self.outq.put(self.InvalidRoutineError(routine)) + break + # END make routine call + self.outq.put(rval) + except StopIteration: + break + except Exception,e: + print "%s: Task %s raised unhandled exception: %s" % (self.getName(), str(tasktuple), str(e)) + self.outq.put(e) + # END routine exception handling + # END endless loop + + def quit(self): + raise StopIteration + + +#} END classes diff --git a/test/git/odb/test_thread.py b/test/git/odb/test_thread.py new file mode 100644 index 00000000..3b7f749b --- /dev/null +++ b/test/git/odb/test_thread.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" Test thead classes and functions""" +from test.testlib import * +from git.odb.thread import * +from Queue import Queue + +class TestWorker(WorkerThread): + def __init__(self, *args, **kwargs): + super(TestWorker, self).__init__(*args, **kwargs) + self.reset() + + def fun(self, *args, **kwargs): + self.called = True + self.args = args + self.kwargs = kwargs + return True + + def make_assertion(self): + assert self.called + assert self.args + assert self.kwargs + self.reset() + + def reset(self): + self.called = False + self.args = None + self.kwargs = None + + +class TestCase( TestCase ): + + @terminate_threads + def test_worker_thread(self): + worker = TestWorker() + assert isinstance(worker.start(), WorkerThread) + + # test different method types + standalone_func = lambda *args, **kwargs: worker.fun(*args, **kwargs) + for function in ("fun", TestWorker.fun, worker.fun, standalone_func): + rval = worker.call(function, 1, this='that') + assert isinstance(rval, Queue) + assert rval.get() is True + worker.make_assertion() + # END for each function type + + worker.call('quit') + -- cgit v1.2.3 From 50e469109eed3a752d9a1b0297f16466ad92f8d2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 5 Jun 2010 18:26:26 +0200 Subject: Initial pool design added, allowing for lazy channel based evaluation of inter-dependent tasks --- lib/git/odb/channel.py | 1 + lib/git/odb/pool.py | 104 +++++++++++++++++++++++++++++++++++++++++++ test/git/odb/test_channel.py | 2 +- test/git/odb/test_pool.py | 10 +++++ test/git/odb/test_thread.py | 2 +- 5 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 lib/git/odb/pool.py create mode 100644 test/git/odb/test_pool.py diff --git a/lib/git/odb/channel.py b/lib/git/odb/channel.py index 32eef6e1..c9cbfb87 100644 --- a/lib/git/odb/channel.py +++ b/lib/git/odb/channel.py @@ -17,6 +17,7 @@ class Channel(object): Create a new channel """ __slots__ = tuple() + def __new__(cls, *args): if cls is Channel: max_items = 0 diff --git a/lib/git/odb/pool.py b/lib/git/odb/pool.py new file mode 100644 index 00000000..5c3a7ead --- /dev/null +++ b/lib/git/odb/pool.py @@ -0,0 +1,104 @@ +"""Implementation of a thread-pool working with channels""" +from thread import TerminatableThread +from channel import ( + Channel, + WChannel, + RChannel + ) + +class Node(object): + """A quick and dirty to the point implementation of a simple, and slow ascyclic graph. + Its not designed to support big graphs, and sports only the functionality + we need""" + __slots__('in_nodes', 'out_nodes') + + +class Graph(object): + """A simple graph implementation, keeping nodes and providing basic access and + editing functions""" + __slots__ = "nodes" + + def add_node(self, node): + pass + + def del_node(self, node): + pass + + def visit_input_depth_first(self, node, visitor=lambda n: True ): + """Visit all input nodes of the given node, depth first, calling visitor + for each node on our way. If the function returns False, the traversal + will not go any deeper, but continue at the next branch""" + pass + + +class TaskNode(Node): + """Couples an input channel, an output channel, as well as a processing function + together. + It may contain additional information on how to handel read-errors from the + input channel""" + __slots__ = ('in_rc', 'out_wc', 'fun') + + def is_done(self): + """:return: True if we are finished processing""" + return self.out_wc.closed + + +class PoolChannel(Channel): + """Base class for read and write channels which trigger the pool to evaluate + its tasks, causing the evaluation of the task list effectively assure a read + from actual output channel will not block forever due to task dependencies. + """ + __slots__ = tuple() + + +class RPoolChannel(PoolChannel): + """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call + before and after an item is to be read""" + __slots__ = ('_task', '_pool', '_pre_cb', '_post_cb') + + def set_post_cb(self, fun = lambda item: item): + """Install a callback to call after the item has been read. The function + returns a possibly changed item. If it raises, the exception will be propagated + in an IOError, indicating read-failure + If a function is not provided, the call is effectively uninstalled.""" + + def set_pre_cb(self, fun = lambda : None): + """Install a callback to call before an item is read from the channel. + If it fails, the read will fail with an IOError + If a function is not provided, the call is effectively uninstalled.""" + + +class PoolWorker(WorkerThread): + """A worker thread which gets called to deal with Tasks. Tasks provide channls + with actual work, whose result will be send to the tasks output channel""" + + @classmethod + def perform_task(cls, task): + pass + + +class ThreadPool(Graph): + """A thread pool maintains a set of one or more worker threads, but supports + a fully serial mode in which case the amount of threads is zero. + + Work is distributed via Channels, which form a dependency graph. The evaluation + is lazy, as work will only be done once an output is requested.""" + __slots__ = ( '_workers', # list of worker threads + '_queue', # master queue for tasks + '_ordered_tasks_cache' # tasks in order of evaluation, mapped by read channel + ) + + def del_node(self, task): + """Delete the node ( being a task ), but delete the entries in our output channel + cache as well""" + + + def set_pool_size(self, size=0): + """Set the amount of workers to use in this pool. + :param size: if 0, the pool will do all work itself in the calling thread, + otherwise the work will be distributed among the given amount of threads""" + + def add_task(self, task): + """Add a new task to be processed. + :return: your task instance with its output channel set. It can be used + to retrieve processed items""" diff --git a/test/git/odb/test_channel.py b/test/git/odb/test_channel.py index 89b26582..d845a6ec 100644 --- a/test/git/odb/test_channel.py +++ b/test/git/odb/test_channel.py @@ -4,7 +4,7 @@ from git.odb.channel import * import time -class TestDB(TestBase): +class TestChannels(TestBase): def test_base(self): # creating channel yields a write and a read channal diff --git a/test/git/odb/test_pool.py b/test/git/odb/test_pool.py new file mode 100644 index 00000000..6656c69d --- /dev/null +++ b/test/git/odb/test_pool.py @@ -0,0 +1,10 @@ +"""Channel testing""" +from test.testlib import * +from git.odb.pool import * + +import time + +class TestThreadPool(TestBase): + + def test_base(self): + pass diff --git a/test/git/odb/test_thread.py b/test/git/odb/test_thread.py index 3b7f749b..674ecc1d 100644 --- a/test/git/odb/test_thread.py +++ b/test/git/odb/test_thread.py @@ -27,7 +27,7 @@ class TestWorker(WorkerThread): self.kwargs = None -class TestCase( TestCase ): +class TestThreads( TestCase ): @terminate_threads def test_worker_thread(self): -- cgit v1.2.3 From 61138f2ece0cb864b933698174315c34a78835d1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 5 Jun 2010 19:59:17 +0200 Subject: Moved multiprocessing modules into own package, as they in fact have nothing to do with the object db. If that really works the way I want, it will become an own project, called async --- lib/git/mp/__init__.py | 1 + lib/git/mp/channel.py | 108 +++++++++++++++++++++++ lib/git/mp/pool.py | 116 +++++++++++++++++++++++++ lib/git/mp/thread.py | 203 +++++++++++++++++++++++++++++++++++++++++++ lib/git/odb/channel.py | 108 ----------------------- lib/git/odb/pool.py | 104 ---------------------- lib/git/odb/thread.py | 203 ------------------------------------------- test/git/mp/__init__.py | 0 test/git/mp/test_channel.py | 61 +++++++++++++ test/git/mp/test_pool.py | 10 +++ test/git/mp/test_thread.py | 47 ++++++++++ test/git/odb/test_channel.py | 61 ------------- test/git/odb/test_pool.py | 10 --- test/git/odb/test_thread.py | 47 ---------- 14 files changed, 546 insertions(+), 533 deletions(-) create mode 100644 lib/git/mp/__init__.py create mode 100644 lib/git/mp/channel.py create mode 100644 lib/git/mp/pool.py create mode 100644 lib/git/mp/thread.py delete mode 100644 lib/git/odb/channel.py delete mode 100644 lib/git/odb/pool.py delete mode 100644 lib/git/odb/thread.py create mode 100644 test/git/mp/__init__.py create mode 100644 test/git/mp/test_channel.py create mode 100644 test/git/mp/test_pool.py create mode 100644 test/git/mp/test_thread.py delete mode 100644 test/git/odb/test_channel.py delete mode 100644 test/git/odb/test_pool.py delete mode 100644 test/git/odb/test_thread.py diff --git a/lib/git/mp/__init__.py b/lib/git/mp/__init__.py new file mode 100644 index 00000000..89b9eb47 --- /dev/null +++ b/lib/git/mp/__init__.py @@ -0,0 +1 @@ +"""Initialize the multi-processing package""" diff --git a/lib/git/mp/channel.py b/lib/git/mp/channel.py new file mode 100644 index 00000000..c9cbfb87 --- /dev/null +++ b/lib/git/mp/channel.py @@ -0,0 +1,108 @@ +"""Contains a queue based channel implementation""" +from Queue import ( + Queue, + Empty, + Full + ) + +#{ Classes +class Channel(object): + """A channel is similar to a system pipe. It has a write end as well as one or + more read ends. If Data is in the channel, it can be read, if not the read operation + will block until data becomes available. + If the channel is closed, any read operation will result in an exception + + This base class is not instantiated directly, but instead serves as constructor + for RWChannel pairs. + + Create a new channel """ + __slots__ = tuple() + + def __new__(cls, *args): + if cls is Channel: + max_items = 0 + if len(args) == 1: + max_items = args[0] + if len(args) > 1: + raise ValueError("Specify not more than the number of items the channel should take") + wc = WChannel(max_items) + rc = RChannel(wc) + return wc, rc + # END constructor mode + return object.__new__(cls) + + +class WChannel(Channel): + """The write end of a channel""" + __slots__ = ('_closed', '_queue') + + def __init__(self, max_items=0): + """initialize this instance, able to hold max_items at once + Write calls will block if the channel is full, until someone reads from it""" + self._closed = False + self._queue = Queue(max_items) + + + #{ Interface + def write(self, item, block=True, timeout=None): + """Send an item into the channel, it can be read from the read end of the + channel accordingly + :param item: Item to send + :param block: If True, the call will block until there is free space in the + channel + :param timeout: timeout in seconds for blocking calls. + :raise IOError: when writing into closed file or when writing into a non-blocking + full channel + :note: may block if the channel has a limited capacity""" + if self._closed: + raise IOError("Cannot write to a closed channel") + + try: + self._queue.put(item, block, timeout) + except Full: + raise IOError("Capacity of the channel was exeeded") + # END exception handling + + def close(self): + """Close the channel. Multiple close calls on a closed channel are no + an error""" + self._closed = True + + @property + def closed(self): + """:return: True if the channel was closed""" + return self._closed + #} END interface + + +class RChannel(Channel): + """The read-end of a corresponding write channel""" + __slots__ = '_wc' + + def __init__(self, wchannel): + """Initialize this instance from its parent write channel""" + self._wc = wchannel + + + #{ Interface + + def read(self, block=True, timeout=None): + """:return: an item read from the channel + :param block: if True, the call will block until an item is available + :param timeout: if positive and block is True, it will block only for the + given amount of seconds. + :raise IOError: When reading from an empty channel ( if non-blocking, or + if the channel is still empty after the timeout""" + # if the channel is closed for writing, we never block + if self._wc.closed: + block = False + + try: + return self._wc._queue.get(block, timeout) + except Empty: + raise IOError("Error reading from an empty channel") + # END handle reading + + #} END interface + +#} END classes diff --git a/lib/git/mp/pool.py b/lib/git/mp/pool.py new file mode 100644 index 00000000..f9f7880b --- /dev/null +++ b/lib/git/mp/pool.py @@ -0,0 +1,116 @@ +"""Implementation of a thread-pool working with channels""" +from thread import WorkerThread +from channel import ( + Channel, + WChannel, + RChannel + ) + +class Node(object): + """A quick and dirty to the point implementation of a simple, and slow ascyclic graph. + Its not designed to support big graphs, and sports only the functionality + we need""" + __slots__ = ('in_nodes', 'out_nodes') + + +class Graph(object): + """A simple graph implementation, keeping nodes and providing basic access and + editing functions""" + __slots__ = "nodes" + + def add_node(self, node): + pass + + def del_node(self, node): + pass + + def visit_input_depth_first(self, node, visitor=lambda n: True ): + """Visit all input nodes of the given node, depth first, calling visitor + for each node on our way. If the function returns False, the traversal + will not go any deeper, but continue at the next branch""" + pass + + +class TaskNode(Node): + """Couples an input channel, an output channel, as well as a processing function + together. + It may contain additional information on how to handel read-errors from the + input channel""" + __slots__ = ('in_rc', 'out_wc', 'fun') + + def is_done(self): + """:return: True if we are finished processing""" + return self.out_wc.closed + + +class RPoolChannel(RChannel): + """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call + before and after an item is to be read. + + It acts like a handle to the underlying task""" + __slots__ = ('_task', '_pool', '_pre_cb', '_post_cb') + + def set_post_cb(self, fun = lambda item: item): + """Install a callback to call after the item has been read. The function + returns a possibly changed item. If it raises, the exception will be propagated + in an IOError, indicating read-failure + If a function is not provided, the call is effectively uninstalled.""" + + def set_pre_cb(self, fun = lambda : None): + """Install a callback to call before an item is read from the channel. + If it fails, the read will fail with an IOError + If a function is not provided, the call is effectively uninstalled.""" + + def read(block=False, timeout=None): + """Read an item that was processed by one of our threads + :note: Triggers task dependency handling needed to provide the necessary + input""" + + #{ Internal + def _read(self, block=False, timeout=None): + """Calls the underlying channel's read directly, without triggering + the pool""" + return RChannel.read(self, block, timeout) + + #} END internal + + +class PoolWorker(WorkerThread): + """A worker thread which gets called to deal with Tasks. Tasks provide channls + with actual work, whose result will be send to the tasks output channel""" + + @classmethod + def perform_task(cls, task): + # note : when getting the input channel, be sure not to trigger + # RPoolChannel + pass + + +class ThreadPool(Graph): + """A thread pool maintains a set of one or more worker threads, but supports + a fully serial mode in which case the amount of threads is zero. + + Work is distributed via Channels, which form a dependency graph. The evaluation + is lazy, as work will only be done once an output is requested. + + :note: the current implementation returns channels which are meant to be + used only from the main thread""" + __slots__ = ( '_workers', # list of worker threads + '_queue', # master queue for tasks + '_ordered_tasks_cache' # tasks in order of evaluation, mapped by read channel + ) + + def del_node(self, task): + """Delete the node ( being a task ), but delete the entries in our output channel + cache as well""" + + + def set_pool_size(self, size=0): + """Set the amount of workers to use in this pool. + :param size: if 0, the pool will do all work itself in the calling thread, + otherwise the work will be distributed among the given amount of threads""" + + def add_task(self, task): + """Add a new task to be processed. + :return: your task instance with its output channel set. It can be used + to retrieve processed items""" diff --git a/lib/git/mp/thread.py b/lib/git/mp/thread.py new file mode 100644 index 00000000..3938666a --- /dev/null +++ b/lib/git/mp/thread.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +"""Module with threading utilities""" +__docformat__ = "restructuredtext" +import threading +import inspect +import Queue + +#{ Decorators + +def do_terminate_threads(whitelist=list()): + """Simple function which terminates all of our threads + :param whitelist: If whitelist is given, only the given threads will be terminated""" + for t in threading.enumerate(): + if not isinstance(t, TerminatableThread): + continue + if whitelist and t not in whitelist: + continue + if isinstance(t, WorkerThread): + t.inq.put(t.quit) + # END worker special handling + t.stop_and_join() + # END for each thread + +def terminate_threads( func ): + """Kills all worker threads the method has created by sending the quit signal. + This takes over in case of an error in the main function""" + def wrapper(*args, **kwargs): + cur_threads = set(threading.enumerate()) + try: + return func(*args, **kwargs) + finally: + do_terminate_threads(set(threading.enumerate()) - cur_threads) + # END finally shutdown threads + # END wrapper + wrapper.__name__ = func.__name__ + return wrapper + +#} END decorators + +#{ Classes + +class TerminatableThread(threading.Thread): + """A simple thread able to terminate itself on behalf of the user. + + Terminate a thread as follows: + + t.stop_and_join() + + Derived classes call _should_terminate() to determine whether they should + abort gracefully + """ + __slots__ = '_terminate' + + def __init__(self): + super(TerminatableThread, self).__init__() + self._terminate = False + + + #{ Subclass Interface + def _should_terminate(self): + """:return: True if this thread should terminate its operation immediately""" + return self._terminate + + def _terminated(self): + """Called once the thread terminated. Its called in the main thread + and may perform cleanup operations""" + pass + + def start(self): + """Start the thread and return self""" + super(TerminatableThread, self).start() + return self + + #} END subclass interface + + #{ Interface + + def stop_and_join(self): + """Ask the thread to stop its operation and wait for it to terminate + :note: Depending on the implenetation, this might block a moment""" + self._terminate = True + self.join() + self._terminated() + #} END interface + + +class WorkerThread(TerminatableThread): + """ + This base allows to call functions on class instances natively and retrieve + their results asynchronously using a queue. + The thread runs forever unless it receives the terminate signal using + its task queue. + + Tasks could be anything, but should usually be class methods and arguments to + allow the following: + + inq = Queue() + outq = Queue() + w = WorkerThread(inq, outq) + w.start() + inq.put((WorkerThread., args, kwargs)) + res = outq.get() + + finally we call quit to terminate asap. + + alternatively, you can make a call more intuitively - the output is the output queue + allowing you to get the result right away or later + w.call(arg, kwarg='value').get() + + inq.put(WorkerThread.quit) + w.join() + + You may provide the following tuples as task: + t[0] = class method, function or instance method + t[1] = optional, tuple or list of arguments to pass to the routine + t[2] = optional, dictionary of keyword arguments to pass to the routine + """ + __slots__ = ('inq', 'outq') + + class InvalidRoutineError(Exception): + """Class sent as return value in case of an error""" + + def __init__(self, inq = None, outq = None): + super(WorkerThread, self).__init__() + self.inq = inq or Queue.Queue() + self.outq = outq or Queue.Queue() + + def call(self, function, *args, **kwargs): + """Method that makes the call to the worker using the input queue, + returning our output queue + + :param funciton: can be a standalone function unrelated to this class, + a class method of this class or any instance method. + If it is a string, it will be considered a function residing on this instance + :param args: arguments to pass to function + :parma **kwargs: kwargs to pass to function""" + self.inq.put((function, args, kwargs)) + return self.outq + + def wait_until_idle(self): + """wait until the input queue is empty, in the meanwhile, take all + results off the output queue.""" + while not self.inq.empty(): + try: + self.outq.get(False) + except Queue.Empty: + continue + # END while there are tasks on the queue + + def run(self): + """Process input tasks until we receive the quit signal""" + while True: + if self._should_terminate(): + break + # END check for stop request + routine = self.__class__.quit + args = tuple() + kwargs = dict() + tasktuple = self.inq.get() + + if isinstance(tasktuple, (tuple, list)): + if len(tasktuple) == 3: + routine, args, kwargs = tasktuple + elif len(tasktuple) == 2: + routine, args = tasktuple + elif len(tasktuple) == 1: + routine = tasktuple[0] + # END tasktuple length check + elif inspect.isroutine(tasktuple): + routine = tasktuple + # END tasktuple handling + + try: + rval = None + if inspect.ismethod(routine): + if routine.im_self is None: + rval = routine(self, *args, **kwargs) + else: + rval = routine(*args, **kwargs) + elif inspect.isroutine(routine): + rval = routine(*args, **kwargs) + elif isinstance(routine, basestring) and hasattr(self, routine): + rval = getattr(self, routine)(*args, **kwargs) + else: + # ignore unknown items + print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) + self.outq.put(self.InvalidRoutineError(routine)) + break + # END make routine call + self.outq.put(rval) + except StopIteration: + break + except Exception,e: + print "%s: Task %s raised unhandled exception: %s" % (self.getName(), str(tasktuple), str(e)) + self.outq.put(e) + # END routine exception handling + # END endless loop + + def quit(self): + raise StopIteration + + +#} END classes diff --git a/lib/git/odb/channel.py b/lib/git/odb/channel.py deleted file mode 100644 index c9cbfb87..00000000 --- a/lib/git/odb/channel.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Contains a queue based channel implementation""" -from Queue import ( - Queue, - Empty, - Full - ) - -#{ Classes -class Channel(object): - """A channel is similar to a system pipe. It has a write end as well as one or - more read ends. If Data is in the channel, it can be read, if not the read operation - will block until data becomes available. - If the channel is closed, any read operation will result in an exception - - This base class is not instantiated directly, but instead serves as constructor - for RWChannel pairs. - - Create a new channel """ - __slots__ = tuple() - - def __new__(cls, *args): - if cls is Channel: - max_items = 0 - if len(args) == 1: - max_items = args[0] - if len(args) > 1: - raise ValueError("Specify not more than the number of items the channel should take") - wc = WChannel(max_items) - rc = RChannel(wc) - return wc, rc - # END constructor mode - return object.__new__(cls) - - -class WChannel(Channel): - """The write end of a channel""" - __slots__ = ('_closed', '_queue') - - def __init__(self, max_items=0): - """initialize this instance, able to hold max_items at once - Write calls will block if the channel is full, until someone reads from it""" - self._closed = False - self._queue = Queue(max_items) - - - #{ Interface - def write(self, item, block=True, timeout=None): - """Send an item into the channel, it can be read from the read end of the - channel accordingly - :param item: Item to send - :param block: If True, the call will block until there is free space in the - channel - :param timeout: timeout in seconds for blocking calls. - :raise IOError: when writing into closed file or when writing into a non-blocking - full channel - :note: may block if the channel has a limited capacity""" - if self._closed: - raise IOError("Cannot write to a closed channel") - - try: - self._queue.put(item, block, timeout) - except Full: - raise IOError("Capacity of the channel was exeeded") - # END exception handling - - def close(self): - """Close the channel. Multiple close calls on a closed channel are no - an error""" - self._closed = True - - @property - def closed(self): - """:return: True if the channel was closed""" - return self._closed - #} END interface - - -class RChannel(Channel): - """The read-end of a corresponding write channel""" - __slots__ = '_wc' - - def __init__(self, wchannel): - """Initialize this instance from its parent write channel""" - self._wc = wchannel - - - #{ Interface - - def read(self, block=True, timeout=None): - """:return: an item read from the channel - :param block: if True, the call will block until an item is available - :param timeout: if positive and block is True, it will block only for the - given amount of seconds. - :raise IOError: When reading from an empty channel ( if non-blocking, or - if the channel is still empty after the timeout""" - # if the channel is closed for writing, we never block - if self._wc.closed: - block = False - - try: - return self._wc._queue.get(block, timeout) - except Empty: - raise IOError("Error reading from an empty channel") - # END handle reading - - #} END interface - -#} END classes diff --git a/lib/git/odb/pool.py b/lib/git/odb/pool.py deleted file mode 100644 index 5c3a7ead..00000000 --- a/lib/git/odb/pool.py +++ /dev/null @@ -1,104 +0,0 @@ -"""Implementation of a thread-pool working with channels""" -from thread import TerminatableThread -from channel import ( - Channel, - WChannel, - RChannel - ) - -class Node(object): - """A quick and dirty to the point implementation of a simple, and slow ascyclic graph. - Its not designed to support big graphs, and sports only the functionality - we need""" - __slots__('in_nodes', 'out_nodes') - - -class Graph(object): - """A simple graph implementation, keeping nodes and providing basic access and - editing functions""" - __slots__ = "nodes" - - def add_node(self, node): - pass - - def del_node(self, node): - pass - - def visit_input_depth_first(self, node, visitor=lambda n: True ): - """Visit all input nodes of the given node, depth first, calling visitor - for each node on our way. If the function returns False, the traversal - will not go any deeper, but continue at the next branch""" - pass - - -class TaskNode(Node): - """Couples an input channel, an output channel, as well as a processing function - together. - It may contain additional information on how to handel read-errors from the - input channel""" - __slots__ = ('in_rc', 'out_wc', 'fun') - - def is_done(self): - """:return: True if we are finished processing""" - return self.out_wc.closed - - -class PoolChannel(Channel): - """Base class for read and write channels which trigger the pool to evaluate - its tasks, causing the evaluation of the task list effectively assure a read - from actual output channel will not block forever due to task dependencies. - """ - __slots__ = tuple() - - -class RPoolChannel(PoolChannel): - """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call - before and after an item is to be read""" - __slots__ = ('_task', '_pool', '_pre_cb', '_post_cb') - - def set_post_cb(self, fun = lambda item: item): - """Install a callback to call after the item has been read. The function - returns a possibly changed item. If it raises, the exception will be propagated - in an IOError, indicating read-failure - If a function is not provided, the call is effectively uninstalled.""" - - def set_pre_cb(self, fun = lambda : None): - """Install a callback to call before an item is read from the channel. - If it fails, the read will fail with an IOError - If a function is not provided, the call is effectively uninstalled.""" - - -class PoolWorker(WorkerThread): - """A worker thread which gets called to deal with Tasks. Tasks provide channls - with actual work, whose result will be send to the tasks output channel""" - - @classmethod - def perform_task(cls, task): - pass - - -class ThreadPool(Graph): - """A thread pool maintains a set of one or more worker threads, but supports - a fully serial mode in which case the amount of threads is zero. - - Work is distributed via Channels, which form a dependency graph. The evaluation - is lazy, as work will only be done once an output is requested.""" - __slots__ = ( '_workers', # list of worker threads - '_queue', # master queue for tasks - '_ordered_tasks_cache' # tasks in order of evaluation, mapped by read channel - ) - - def del_node(self, task): - """Delete the node ( being a task ), but delete the entries in our output channel - cache as well""" - - - def set_pool_size(self, size=0): - """Set the amount of workers to use in this pool. - :param size: if 0, the pool will do all work itself in the calling thread, - otherwise the work will be distributed among the given amount of threads""" - - def add_task(self, task): - """Add a new task to be processed. - :return: your task instance with its output channel set. It can be used - to retrieve processed items""" diff --git a/lib/git/odb/thread.py b/lib/git/odb/thread.py deleted file mode 100644 index 3938666a..00000000 --- a/lib/git/odb/thread.py +++ /dev/null @@ -1,203 +0,0 @@ -# -*- coding: utf-8 -*- -"""Module with threading utilities""" -__docformat__ = "restructuredtext" -import threading -import inspect -import Queue - -#{ Decorators - -def do_terminate_threads(whitelist=list()): - """Simple function which terminates all of our threads - :param whitelist: If whitelist is given, only the given threads will be terminated""" - for t in threading.enumerate(): - if not isinstance(t, TerminatableThread): - continue - if whitelist and t not in whitelist: - continue - if isinstance(t, WorkerThread): - t.inq.put(t.quit) - # END worker special handling - t.stop_and_join() - # END for each thread - -def terminate_threads( func ): - """Kills all worker threads the method has created by sending the quit signal. - This takes over in case of an error in the main function""" - def wrapper(*args, **kwargs): - cur_threads = set(threading.enumerate()) - try: - return func(*args, **kwargs) - finally: - do_terminate_threads(set(threading.enumerate()) - cur_threads) - # END finally shutdown threads - # END wrapper - wrapper.__name__ = func.__name__ - return wrapper - -#} END decorators - -#{ Classes - -class TerminatableThread(threading.Thread): - """A simple thread able to terminate itself on behalf of the user. - - Terminate a thread as follows: - - t.stop_and_join() - - Derived classes call _should_terminate() to determine whether they should - abort gracefully - """ - __slots__ = '_terminate' - - def __init__(self): - super(TerminatableThread, self).__init__() - self._terminate = False - - - #{ Subclass Interface - def _should_terminate(self): - """:return: True if this thread should terminate its operation immediately""" - return self._terminate - - def _terminated(self): - """Called once the thread terminated. Its called in the main thread - and may perform cleanup operations""" - pass - - def start(self): - """Start the thread and return self""" - super(TerminatableThread, self).start() - return self - - #} END subclass interface - - #{ Interface - - def stop_and_join(self): - """Ask the thread to stop its operation and wait for it to terminate - :note: Depending on the implenetation, this might block a moment""" - self._terminate = True - self.join() - self._terminated() - #} END interface - - -class WorkerThread(TerminatableThread): - """ - This base allows to call functions on class instances natively and retrieve - their results asynchronously using a queue. - The thread runs forever unless it receives the terminate signal using - its task queue. - - Tasks could be anything, but should usually be class methods and arguments to - allow the following: - - inq = Queue() - outq = Queue() - w = WorkerThread(inq, outq) - w.start() - inq.put((WorkerThread., args, kwargs)) - res = outq.get() - - finally we call quit to terminate asap. - - alternatively, you can make a call more intuitively - the output is the output queue - allowing you to get the result right away or later - w.call(arg, kwarg='value').get() - - inq.put(WorkerThread.quit) - w.join() - - You may provide the following tuples as task: - t[0] = class method, function or instance method - t[1] = optional, tuple or list of arguments to pass to the routine - t[2] = optional, dictionary of keyword arguments to pass to the routine - """ - __slots__ = ('inq', 'outq') - - class InvalidRoutineError(Exception): - """Class sent as return value in case of an error""" - - def __init__(self, inq = None, outq = None): - super(WorkerThread, self).__init__() - self.inq = inq or Queue.Queue() - self.outq = outq or Queue.Queue() - - def call(self, function, *args, **kwargs): - """Method that makes the call to the worker using the input queue, - returning our output queue - - :param funciton: can be a standalone function unrelated to this class, - a class method of this class or any instance method. - If it is a string, it will be considered a function residing on this instance - :param args: arguments to pass to function - :parma **kwargs: kwargs to pass to function""" - self.inq.put((function, args, kwargs)) - return self.outq - - def wait_until_idle(self): - """wait until the input queue is empty, in the meanwhile, take all - results off the output queue.""" - while not self.inq.empty(): - try: - self.outq.get(False) - except Queue.Empty: - continue - # END while there are tasks on the queue - - def run(self): - """Process input tasks until we receive the quit signal""" - while True: - if self._should_terminate(): - break - # END check for stop request - routine = self.__class__.quit - args = tuple() - kwargs = dict() - tasktuple = self.inq.get() - - if isinstance(tasktuple, (tuple, list)): - if len(tasktuple) == 3: - routine, args, kwargs = tasktuple - elif len(tasktuple) == 2: - routine, args = tasktuple - elif len(tasktuple) == 1: - routine = tasktuple[0] - # END tasktuple length check - elif inspect.isroutine(tasktuple): - routine = tasktuple - # END tasktuple handling - - try: - rval = None - if inspect.ismethod(routine): - if routine.im_self is None: - rval = routine(self, *args, **kwargs) - else: - rval = routine(*args, **kwargs) - elif inspect.isroutine(routine): - rval = routine(*args, **kwargs) - elif isinstance(routine, basestring) and hasattr(self, routine): - rval = getattr(self, routine)(*args, **kwargs) - else: - # ignore unknown items - print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) - self.outq.put(self.InvalidRoutineError(routine)) - break - # END make routine call - self.outq.put(rval) - except StopIteration: - break - except Exception,e: - print "%s: Task %s raised unhandled exception: %s" % (self.getName(), str(tasktuple), str(e)) - self.outq.put(e) - # END routine exception handling - # END endless loop - - def quit(self): - raise StopIteration - - -#} END classes diff --git a/test/git/mp/__init__.py b/test/git/mp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/git/mp/test_channel.py b/test/git/mp/test_channel.py new file mode 100644 index 00000000..9b667372 --- /dev/null +++ b/test/git/mp/test_channel.py @@ -0,0 +1,61 @@ +"""Channel testing""" +from test.testlib import * +from git.mp.channel import * + +import time + +class TestChannels(TestBase): + + def test_base(self): + # creating channel yields a write and a read channal + wc, rc = Channel() + assert isinstance(wc, WChannel) + assert isinstance(rc, RChannel) + + # everything else fails + self.failUnlessRaises(ValueError, Channel, 1, "too many args") + + # TEST UNLIMITED SIZE CHANNEL - writing+reading is FIFO + item = 1 + item2 = 2 + wc.write(item) + wc.write(item2) + assert rc.read() == item + assert rc.read() == item2 + + # next read blocks, then raises - it waits a second + st = time.time() + self.failUnlessRaises(IOError, rc.read, True, 1) + assert time.time() - st >= 1.0 + + # writing to a closed channel raises + assert not wc.closed + wc.close() + assert wc.closed + wc.close() # fine + assert wc.closed + + self.failUnlessRaises(IOError, wc.write, 1) + + # reading from a closed channel never blocks + self.failUnlessRaises(IOError, rc.read) + + + + # TEST LIMITED SIZE CHANNEL + # channel with max-items set + wc, rc = Channel(1) + wc.write(item) # fine + + # blocks for a second, its full + st = time.time() + self.failUnlessRaises(IOError, wc.write, item, True, 1) + assert time.time() - st >= 1.0 + + # get one + assert rc.read() == item + + # its empty,can put one again + wc.write(item2) + assert rc.read() == item2 + wc.close() diff --git a/test/git/mp/test_pool.py b/test/git/mp/test_pool.py new file mode 100644 index 00000000..7c4a366f --- /dev/null +++ b/test/git/mp/test_pool.py @@ -0,0 +1,10 @@ +"""Channel testing""" +from test.testlib import * +from git.mp.pool import * + +import time + +class TestThreadPool(TestBase): + + def test_base(self): + pass diff --git a/test/git/mp/test_thread.py b/test/git/mp/test_thread.py new file mode 100644 index 00000000..9625aabb --- /dev/null +++ b/test/git/mp/test_thread.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" Test thead classes and functions""" +from test.testlib import * +from git.mp.thread import * +from Queue import Queue + +class TestWorker(WorkerThread): + def __init__(self, *args, **kwargs): + super(TestWorker, self).__init__(*args, **kwargs) + self.reset() + + def fun(self, *args, **kwargs): + self.called = True + self.args = args + self.kwargs = kwargs + return True + + def make_assertion(self): + assert self.called + assert self.args + assert self.kwargs + self.reset() + + def reset(self): + self.called = False + self.args = None + self.kwargs = None + + +class TestThreads( TestCase ): + + @terminate_threads + def test_worker_thread(self): + worker = TestWorker() + assert isinstance(worker.start(), WorkerThread) + + # test different method types + standalone_func = lambda *args, **kwargs: worker.fun(*args, **kwargs) + for function in ("fun", TestWorker.fun, worker.fun, standalone_func): + rval = worker.call(function, 1, this='that') + assert isinstance(rval, Queue) + assert rval.get() is True + worker.make_assertion() + # END for each function type + + worker.call('quit') + diff --git a/test/git/odb/test_channel.py b/test/git/odb/test_channel.py deleted file mode 100644 index d845a6ec..00000000 --- a/test/git/odb/test_channel.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Channel testing""" -from test.testlib import * -from git.odb.channel import * - -import time - -class TestChannels(TestBase): - - def test_base(self): - # creating channel yields a write and a read channal - wc, rc = Channel() - assert isinstance(wc, WChannel) - assert isinstance(rc, RChannel) - - # everything else fails - self.failUnlessRaises(ValueError, Channel, 1, "too many args") - - # TEST UNLIMITED SIZE CHANNEL - writing+reading is FIFO - item = 1 - item2 = 2 - wc.write(item) - wc.write(item2) - assert rc.read() == item - assert rc.read() == item2 - - # next read blocks, then raises - it waits a second - st = time.time() - self.failUnlessRaises(IOError, rc.read, True, 1) - assert time.time() - st >= 1.0 - - # writing to a closed channel raises - assert not wc.closed - wc.close() - assert wc.closed - wc.close() # fine - assert wc.closed - - self.failUnlessRaises(IOError, wc.write, 1) - - # reading from a closed channel never blocks - self.failUnlessRaises(IOError, rc.read) - - - - # TEST LIMITED SIZE CHANNEL - # channel with max-items set - wc, rc = Channel(1) - wc.write(item) # fine - - # blocks for a second, its full - st = time.time() - self.failUnlessRaises(IOError, wc.write, item, True, 1) - assert time.time() - st >= 1.0 - - # get one - assert rc.read() == item - - # its empty,can put one again - wc.write(item2) - assert rc.read() == item2 - wc.close() diff --git a/test/git/odb/test_pool.py b/test/git/odb/test_pool.py deleted file mode 100644 index 6656c69d..00000000 --- a/test/git/odb/test_pool.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Channel testing""" -from test.testlib import * -from git.odb.pool import * - -import time - -class TestThreadPool(TestBase): - - def test_base(self): - pass diff --git a/test/git/odb/test_thread.py b/test/git/odb/test_thread.py deleted file mode 100644 index 674ecc1d..00000000 --- a/test/git/odb/test_thread.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -""" Test thead classes and functions""" -from test.testlib import * -from git.odb.thread import * -from Queue import Queue - -class TestWorker(WorkerThread): - def __init__(self, *args, **kwargs): - super(TestWorker, self).__init__(*args, **kwargs) - self.reset() - - def fun(self, *args, **kwargs): - self.called = True - self.args = args - self.kwargs = kwargs - return True - - def make_assertion(self): - assert self.called - assert self.args - assert self.kwargs - self.reset() - - def reset(self): - self.called = False - self.args = None - self.kwargs = None - - -class TestThreads( TestCase ): - - @terminate_threads - def test_worker_thread(self): - worker = TestWorker() - assert isinstance(worker.start(), WorkerThread) - - # test different method types - standalone_func = lambda *args, **kwargs: worker.fun(*args, **kwargs) - for function in ("fun", TestWorker.fun, worker.fun, standalone_func): - rval = worker.call(function, 1, this='that') - assert isinstance(rval, Queue) - assert rval.get() is True - worker.make_assertion() - # END for each function type - - worker.call('quit') - -- cgit v1.2.3 From ab59f78341f1dd188aaf4c30526f6295c63438b1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 5 Jun 2010 20:03:09 +0200 Subject: Renamed mp to async, as this is a much better name for what is actually going on. The default implementation uses threads, which ends up being nothing more than async, as they are all locked down by internal and the global interpreter lock --- lib/git/async/__init__.py | 1 + lib/git/async/channel.py | 108 ++++++++++++++++++++++ lib/git/async/pool.py | 116 +++++++++++++++++++++++ lib/git/async/thread.py | 203 +++++++++++++++++++++++++++++++++++++++++ lib/git/mp/__init__.py | 1 - lib/git/mp/channel.py | 108 ---------------------- lib/git/mp/pool.py | 116 ----------------------- lib/git/mp/thread.py | 203 ----------------------------------------- test/git/async/__init__.py | 0 test/git/async/test_channel.py | 61 +++++++++++++ test/git/async/test_pool.py | 10 ++ test/git/async/test_thread.py | 47 ++++++++++ test/git/mp/__init__.py | 0 test/git/mp/test_channel.py | 61 ------------- test/git/mp/test_pool.py | 10 -- test/git/mp/test_thread.py | 47 ---------- 16 files changed, 546 insertions(+), 546 deletions(-) create mode 100644 lib/git/async/__init__.py create mode 100644 lib/git/async/channel.py create mode 100644 lib/git/async/pool.py create mode 100644 lib/git/async/thread.py delete mode 100644 lib/git/mp/__init__.py delete mode 100644 lib/git/mp/channel.py delete mode 100644 lib/git/mp/pool.py delete mode 100644 lib/git/mp/thread.py create mode 100644 test/git/async/__init__.py create mode 100644 test/git/async/test_channel.py create mode 100644 test/git/async/test_pool.py create mode 100644 test/git/async/test_thread.py delete mode 100644 test/git/mp/__init__.py delete mode 100644 test/git/mp/test_channel.py delete mode 100644 test/git/mp/test_pool.py delete mode 100644 test/git/mp/test_thread.py diff --git a/lib/git/async/__init__.py b/lib/git/async/__init__.py new file mode 100644 index 00000000..89b9eb47 --- /dev/null +++ b/lib/git/async/__init__.py @@ -0,0 +1 @@ +"""Initialize the multi-processing package""" diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py new file mode 100644 index 00000000..c9cbfb87 --- /dev/null +++ b/lib/git/async/channel.py @@ -0,0 +1,108 @@ +"""Contains a queue based channel implementation""" +from Queue import ( + Queue, + Empty, + Full + ) + +#{ Classes +class Channel(object): + """A channel is similar to a system pipe. It has a write end as well as one or + more read ends. If Data is in the channel, it can be read, if not the read operation + will block until data becomes available. + If the channel is closed, any read operation will result in an exception + + This base class is not instantiated directly, but instead serves as constructor + for RWChannel pairs. + + Create a new channel """ + __slots__ = tuple() + + def __new__(cls, *args): + if cls is Channel: + max_items = 0 + if len(args) == 1: + max_items = args[0] + if len(args) > 1: + raise ValueError("Specify not more than the number of items the channel should take") + wc = WChannel(max_items) + rc = RChannel(wc) + return wc, rc + # END constructor mode + return object.__new__(cls) + + +class WChannel(Channel): + """The write end of a channel""" + __slots__ = ('_closed', '_queue') + + def __init__(self, max_items=0): + """initialize this instance, able to hold max_items at once + Write calls will block if the channel is full, until someone reads from it""" + self._closed = False + self._queue = Queue(max_items) + + + #{ Interface + def write(self, item, block=True, timeout=None): + """Send an item into the channel, it can be read from the read end of the + channel accordingly + :param item: Item to send + :param block: If True, the call will block until there is free space in the + channel + :param timeout: timeout in seconds for blocking calls. + :raise IOError: when writing into closed file or when writing into a non-blocking + full channel + :note: may block if the channel has a limited capacity""" + if self._closed: + raise IOError("Cannot write to a closed channel") + + try: + self._queue.put(item, block, timeout) + except Full: + raise IOError("Capacity of the channel was exeeded") + # END exception handling + + def close(self): + """Close the channel. Multiple close calls on a closed channel are no + an error""" + self._closed = True + + @property + def closed(self): + """:return: True if the channel was closed""" + return self._closed + #} END interface + + +class RChannel(Channel): + """The read-end of a corresponding write channel""" + __slots__ = '_wc' + + def __init__(self, wchannel): + """Initialize this instance from its parent write channel""" + self._wc = wchannel + + + #{ Interface + + def read(self, block=True, timeout=None): + """:return: an item read from the channel + :param block: if True, the call will block until an item is available + :param timeout: if positive and block is True, it will block only for the + given amount of seconds. + :raise IOError: When reading from an empty channel ( if non-blocking, or + if the channel is still empty after the timeout""" + # if the channel is closed for writing, we never block + if self._wc.closed: + block = False + + try: + return self._wc._queue.get(block, timeout) + except Empty: + raise IOError("Error reading from an empty channel") + # END handle reading + + #} END interface + +#} END classes diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py new file mode 100644 index 00000000..f9f7880b --- /dev/null +++ b/lib/git/async/pool.py @@ -0,0 +1,116 @@ +"""Implementation of a thread-pool working with channels""" +from thread import WorkerThread +from channel import ( + Channel, + WChannel, + RChannel + ) + +class Node(object): + """A quick and dirty to the point implementation of a simple, and slow ascyclic graph. + Its not designed to support big graphs, and sports only the functionality + we need""" + __slots__ = ('in_nodes', 'out_nodes') + + +class Graph(object): + """A simple graph implementation, keeping nodes and providing basic access and + editing functions""" + __slots__ = "nodes" + + def add_node(self, node): + pass + + def del_node(self, node): + pass + + def visit_input_depth_first(self, node, visitor=lambda n: True ): + """Visit all input nodes of the given node, depth first, calling visitor + for each node on our way. If the function returns False, the traversal + will not go any deeper, but continue at the next branch""" + pass + + +class TaskNode(Node): + """Couples an input channel, an output channel, as well as a processing function + together. + It may contain additional information on how to handel read-errors from the + input channel""" + __slots__ = ('in_rc', 'out_wc', 'fun') + + def is_done(self): + """:return: True if we are finished processing""" + return self.out_wc.closed + + +class RPoolChannel(RChannel): + """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call + before and after an item is to be read. + + It acts like a handle to the underlying task""" + __slots__ = ('_task', '_pool', '_pre_cb', '_post_cb') + + def set_post_cb(self, fun = lambda item: item): + """Install a callback to call after the item has been read. The function + returns a possibly changed item. If it raises, the exception will be propagated + in an IOError, indicating read-failure + If a function is not provided, the call is effectively uninstalled.""" + + def set_pre_cb(self, fun = lambda : None): + """Install a callback to call before an item is read from the channel. + If it fails, the read will fail with an IOError + If a function is not provided, the call is effectively uninstalled.""" + + def read(block=False, timeout=None): + """Read an item that was processed by one of our threads + :note: Triggers task dependency handling needed to provide the necessary + input""" + + #{ Internal + def _read(self, block=False, timeout=None): + """Calls the underlying channel's read directly, without triggering + the pool""" + return RChannel.read(self, block, timeout) + + #} END internal + + +class PoolWorker(WorkerThread): + """A worker thread which gets called to deal with Tasks. Tasks provide channls + with actual work, whose result will be send to the tasks output channel""" + + @classmethod + def perform_task(cls, task): + # note : when getting the input channel, be sure not to trigger + # RPoolChannel + pass + + +class ThreadPool(Graph): + """A thread pool maintains a set of one or more worker threads, but supports + a fully serial mode in which case the amount of threads is zero. + + Work is distributed via Channels, which form a dependency graph. The evaluation + is lazy, as work will only be done once an output is requested. + + :note: the current implementation returns channels which are meant to be + used only from the main thread""" + __slots__ = ( '_workers', # list of worker threads + '_queue', # master queue for tasks + '_ordered_tasks_cache' # tasks in order of evaluation, mapped by read channel + ) + + def del_node(self, task): + """Delete the node ( being a task ), but delete the entries in our output channel + cache as well""" + + + def set_pool_size(self, size=0): + """Set the amount of workers to use in this pool. + :param size: if 0, the pool will do all work itself in the calling thread, + otherwise the work will be distributed among the given amount of threads""" + + def add_task(self, task): + """Add a new task to be processed. + :return: your task instance with its output channel set. It can be used + to retrieve processed items""" diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py new file mode 100644 index 00000000..3938666a --- /dev/null +++ b/lib/git/async/thread.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +"""Module with threading utilities""" +__docformat__ = "restructuredtext" +import threading +import inspect +import Queue + +#{ Decorators + +def do_terminate_threads(whitelist=list()): + """Simple function which terminates all of our threads + :param whitelist: If whitelist is given, only the given threads will be terminated""" + for t in threading.enumerate(): + if not isinstance(t, TerminatableThread): + continue + if whitelist and t not in whitelist: + continue + if isinstance(t, WorkerThread): + t.inq.put(t.quit) + # END worker special handling + t.stop_and_join() + # END for each thread + +def terminate_threads( func ): + """Kills all worker threads the method has created by sending the quit signal. + This takes over in case of an error in the main function""" + def wrapper(*args, **kwargs): + cur_threads = set(threading.enumerate()) + try: + return func(*args, **kwargs) + finally: + do_terminate_threads(set(threading.enumerate()) - cur_threads) + # END finally shutdown threads + # END wrapper + wrapper.__name__ = func.__name__ + return wrapper + +#} END decorators + +#{ Classes + +class TerminatableThread(threading.Thread): + """A simple thread able to terminate itself on behalf of the user. + + Terminate a thread as follows: + + t.stop_and_join() + + Derived classes call _should_terminate() to determine whether they should + abort gracefully + """ + __slots__ = '_terminate' + + def __init__(self): + super(TerminatableThread, self).__init__() + self._terminate = False + + + #{ Subclass Interface + def _should_terminate(self): + """:return: True if this thread should terminate its operation immediately""" + return self._terminate + + def _terminated(self): + """Called once the thread terminated. Its called in the main thread + and may perform cleanup operations""" + pass + + def start(self): + """Start the thread and return self""" + super(TerminatableThread, self).start() + return self + + #} END subclass interface + + #{ Interface + + def stop_and_join(self): + """Ask the thread to stop its operation and wait for it to terminate + :note: Depending on the implenetation, this might block a moment""" + self._terminate = True + self.join() + self._terminated() + #} END interface + + +class WorkerThread(TerminatableThread): + """ + This base allows to call functions on class instances natively and retrieve + their results asynchronously using a queue. + The thread runs forever unless it receives the terminate signal using + its task queue. + + Tasks could be anything, but should usually be class methods and arguments to + allow the following: + + inq = Queue() + outq = Queue() + w = WorkerThread(inq, outq) + w.start() + inq.put((WorkerThread., args, kwargs)) + res = outq.get() + + finally we call quit to terminate asap. + + alternatively, you can make a call more intuitively - the output is the output queue + allowing you to get the result right away or later + w.call(arg, kwarg='value').get() + + inq.put(WorkerThread.quit) + w.join() + + You may provide the following tuples as task: + t[0] = class method, function or instance method + t[1] = optional, tuple or list of arguments to pass to the routine + t[2] = optional, dictionary of keyword arguments to pass to the routine + """ + __slots__ = ('inq', 'outq') + + class InvalidRoutineError(Exception): + """Class sent as return value in case of an error""" + + def __init__(self, inq = None, outq = None): + super(WorkerThread, self).__init__() + self.inq = inq or Queue.Queue() + self.outq = outq or Queue.Queue() + + def call(self, function, *args, **kwargs): + """Method that makes the call to the worker using the input queue, + returning our output queue + + :param funciton: can be a standalone function unrelated to this class, + a class method of this class or any instance method. + If it is a string, it will be considered a function residing on this instance + :param args: arguments to pass to function + :parma **kwargs: kwargs to pass to function""" + self.inq.put((function, args, kwargs)) + return self.outq + + def wait_until_idle(self): + """wait until the input queue is empty, in the meanwhile, take all + results off the output queue.""" + while not self.inq.empty(): + try: + self.outq.get(False) + except Queue.Empty: + continue + # END while there are tasks on the queue + + def run(self): + """Process input tasks until we receive the quit signal""" + while True: + if self._should_terminate(): + break + # END check for stop request + routine = self.__class__.quit + args = tuple() + kwargs = dict() + tasktuple = self.inq.get() + + if isinstance(tasktuple, (tuple, list)): + if len(tasktuple) == 3: + routine, args, kwargs = tasktuple + elif len(tasktuple) == 2: + routine, args = tasktuple + elif len(tasktuple) == 1: + routine = tasktuple[0] + # END tasktuple length check + elif inspect.isroutine(tasktuple): + routine = tasktuple + # END tasktuple handling + + try: + rval = None + if inspect.ismethod(routine): + if routine.im_self is None: + rval = routine(self, *args, **kwargs) + else: + rval = routine(*args, **kwargs) + elif inspect.isroutine(routine): + rval = routine(*args, **kwargs) + elif isinstance(routine, basestring) and hasattr(self, routine): + rval = getattr(self, routine)(*args, **kwargs) + else: + # ignore unknown items + print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) + self.outq.put(self.InvalidRoutineError(routine)) + break + # END make routine call + self.outq.put(rval) + except StopIteration: + break + except Exception,e: + print "%s: Task %s raised unhandled exception: %s" % (self.getName(), str(tasktuple), str(e)) + self.outq.put(e) + # END routine exception handling + # END endless loop + + def quit(self): + raise StopIteration + + +#} END classes diff --git a/lib/git/mp/__init__.py b/lib/git/mp/__init__.py deleted file mode 100644 index 89b9eb47..00000000 --- a/lib/git/mp/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Initialize the multi-processing package""" diff --git a/lib/git/mp/channel.py b/lib/git/mp/channel.py deleted file mode 100644 index c9cbfb87..00000000 --- a/lib/git/mp/channel.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Contains a queue based channel implementation""" -from Queue import ( - Queue, - Empty, - Full - ) - -#{ Classes -class Channel(object): - """A channel is similar to a system pipe. It has a write end as well as one or - more read ends. If Data is in the channel, it can be read, if not the read operation - will block until data becomes available. - If the channel is closed, any read operation will result in an exception - - This base class is not instantiated directly, but instead serves as constructor - for RWChannel pairs. - - Create a new channel """ - __slots__ = tuple() - - def __new__(cls, *args): - if cls is Channel: - max_items = 0 - if len(args) == 1: - max_items = args[0] - if len(args) > 1: - raise ValueError("Specify not more than the number of items the channel should take") - wc = WChannel(max_items) - rc = RChannel(wc) - return wc, rc - # END constructor mode - return object.__new__(cls) - - -class WChannel(Channel): - """The write end of a channel""" - __slots__ = ('_closed', '_queue') - - def __init__(self, max_items=0): - """initialize this instance, able to hold max_items at once - Write calls will block if the channel is full, until someone reads from it""" - self._closed = False - self._queue = Queue(max_items) - - - #{ Interface - def write(self, item, block=True, timeout=None): - """Send an item into the channel, it can be read from the read end of the - channel accordingly - :param item: Item to send - :param block: If True, the call will block until there is free space in the - channel - :param timeout: timeout in seconds for blocking calls. - :raise IOError: when writing into closed file or when writing into a non-blocking - full channel - :note: may block if the channel has a limited capacity""" - if self._closed: - raise IOError("Cannot write to a closed channel") - - try: - self._queue.put(item, block, timeout) - except Full: - raise IOError("Capacity of the channel was exeeded") - # END exception handling - - def close(self): - """Close the channel. Multiple close calls on a closed channel are no - an error""" - self._closed = True - - @property - def closed(self): - """:return: True if the channel was closed""" - return self._closed - #} END interface - - -class RChannel(Channel): - """The read-end of a corresponding write channel""" - __slots__ = '_wc' - - def __init__(self, wchannel): - """Initialize this instance from its parent write channel""" - self._wc = wchannel - - - #{ Interface - - def read(self, block=True, timeout=None): - """:return: an item read from the channel - :param block: if True, the call will block until an item is available - :param timeout: if positive and block is True, it will block only for the - given amount of seconds. - :raise IOError: When reading from an empty channel ( if non-blocking, or - if the channel is still empty after the timeout""" - # if the channel is closed for writing, we never block - if self._wc.closed: - block = False - - try: - return self._wc._queue.get(block, timeout) - except Empty: - raise IOError("Error reading from an empty channel") - # END handle reading - - #} END interface - -#} END classes diff --git a/lib/git/mp/pool.py b/lib/git/mp/pool.py deleted file mode 100644 index f9f7880b..00000000 --- a/lib/git/mp/pool.py +++ /dev/null @@ -1,116 +0,0 @@ -"""Implementation of a thread-pool working with channels""" -from thread import WorkerThread -from channel import ( - Channel, - WChannel, - RChannel - ) - -class Node(object): - """A quick and dirty to the point implementation of a simple, and slow ascyclic graph. - Its not designed to support big graphs, and sports only the functionality - we need""" - __slots__ = ('in_nodes', 'out_nodes') - - -class Graph(object): - """A simple graph implementation, keeping nodes and providing basic access and - editing functions""" - __slots__ = "nodes" - - def add_node(self, node): - pass - - def del_node(self, node): - pass - - def visit_input_depth_first(self, node, visitor=lambda n: True ): - """Visit all input nodes of the given node, depth first, calling visitor - for each node on our way. If the function returns False, the traversal - will not go any deeper, but continue at the next branch""" - pass - - -class TaskNode(Node): - """Couples an input channel, an output channel, as well as a processing function - together. - It may contain additional information on how to handel read-errors from the - input channel""" - __slots__ = ('in_rc', 'out_wc', 'fun') - - def is_done(self): - """:return: True if we are finished processing""" - return self.out_wc.closed - - -class RPoolChannel(RChannel): - """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call - before and after an item is to be read. - - It acts like a handle to the underlying task""" - __slots__ = ('_task', '_pool', '_pre_cb', '_post_cb') - - def set_post_cb(self, fun = lambda item: item): - """Install a callback to call after the item has been read. The function - returns a possibly changed item. If it raises, the exception will be propagated - in an IOError, indicating read-failure - If a function is not provided, the call is effectively uninstalled.""" - - def set_pre_cb(self, fun = lambda : None): - """Install a callback to call before an item is read from the channel. - If it fails, the read will fail with an IOError - If a function is not provided, the call is effectively uninstalled.""" - - def read(block=False, timeout=None): - """Read an item that was processed by one of our threads - :note: Triggers task dependency handling needed to provide the necessary - input""" - - #{ Internal - def _read(self, block=False, timeout=None): - """Calls the underlying channel's read directly, without triggering - the pool""" - return RChannel.read(self, block, timeout) - - #} END internal - - -class PoolWorker(WorkerThread): - """A worker thread which gets called to deal with Tasks. Tasks provide channls - with actual work, whose result will be send to the tasks output channel""" - - @classmethod - def perform_task(cls, task): - # note : when getting the input channel, be sure not to trigger - # RPoolChannel - pass - - -class ThreadPool(Graph): - """A thread pool maintains a set of one or more worker threads, but supports - a fully serial mode in which case the amount of threads is zero. - - Work is distributed via Channels, which form a dependency graph. The evaluation - is lazy, as work will only be done once an output is requested. - - :note: the current implementation returns channels which are meant to be - used only from the main thread""" - __slots__ = ( '_workers', # list of worker threads - '_queue', # master queue for tasks - '_ordered_tasks_cache' # tasks in order of evaluation, mapped by read channel - ) - - def del_node(self, task): - """Delete the node ( being a task ), but delete the entries in our output channel - cache as well""" - - - def set_pool_size(self, size=0): - """Set the amount of workers to use in this pool. - :param size: if 0, the pool will do all work itself in the calling thread, - otherwise the work will be distributed among the given amount of threads""" - - def add_task(self, task): - """Add a new task to be processed. - :return: your task instance with its output channel set. It can be used - to retrieve processed items""" diff --git a/lib/git/mp/thread.py b/lib/git/mp/thread.py deleted file mode 100644 index 3938666a..00000000 --- a/lib/git/mp/thread.py +++ /dev/null @@ -1,203 +0,0 @@ -# -*- coding: utf-8 -*- -"""Module with threading utilities""" -__docformat__ = "restructuredtext" -import threading -import inspect -import Queue - -#{ Decorators - -def do_terminate_threads(whitelist=list()): - """Simple function which terminates all of our threads - :param whitelist: If whitelist is given, only the given threads will be terminated""" - for t in threading.enumerate(): - if not isinstance(t, TerminatableThread): - continue - if whitelist and t not in whitelist: - continue - if isinstance(t, WorkerThread): - t.inq.put(t.quit) - # END worker special handling - t.stop_and_join() - # END for each thread - -def terminate_threads( func ): - """Kills all worker threads the method has created by sending the quit signal. - This takes over in case of an error in the main function""" - def wrapper(*args, **kwargs): - cur_threads = set(threading.enumerate()) - try: - return func(*args, **kwargs) - finally: - do_terminate_threads(set(threading.enumerate()) - cur_threads) - # END finally shutdown threads - # END wrapper - wrapper.__name__ = func.__name__ - return wrapper - -#} END decorators - -#{ Classes - -class TerminatableThread(threading.Thread): - """A simple thread able to terminate itself on behalf of the user. - - Terminate a thread as follows: - - t.stop_and_join() - - Derived classes call _should_terminate() to determine whether they should - abort gracefully - """ - __slots__ = '_terminate' - - def __init__(self): - super(TerminatableThread, self).__init__() - self._terminate = False - - - #{ Subclass Interface - def _should_terminate(self): - """:return: True if this thread should terminate its operation immediately""" - return self._terminate - - def _terminated(self): - """Called once the thread terminated. Its called in the main thread - and may perform cleanup operations""" - pass - - def start(self): - """Start the thread and return self""" - super(TerminatableThread, self).start() - return self - - #} END subclass interface - - #{ Interface - - def stop_and_join(self): - """Ask the thread to stop its operation and wait for it to terminate - :note: Depending on the implenetation, this might block a moment""" - self._terminate = True - self.join() - self._terminated() - #} END interface - - -class WorkerThread(TerminatableThread): - """ - This base allows to call functions on class instances natively and retrieve - their results asynchronously using a queue. - The thread runs forever unless it receives the terminate signal using - its task queue. - - Tasks could be anything, but should usually be class methods and arguments to - allow the following: - - inq = Queue() - outq = Queue() - w = WorkerThread(inq, outq) - w.start() - inq.put((WorkerThread., args, kwargs)) - res = outq.get() - - finally we call quit to terminate asap. - - alternatively, you can make a call more intuitively - the output is the output queue - allowing you to get the result right away or later - w.call(arg, kwarg='value').get() - - inq.put(WorkerThread.quit) - w.join() - - You may provide the following tuples as task: - t[0] = class method, function or instance method - t[1] = optional, tuple or list of arguments to pass to the routine - t[2] = optional, dictionary of keyword arguments to pass to the routine - """ - __slots__ = ('inq', 'outq') - - class InvalidRoutineError(Exception): - """Class sent as return value in case of an error""" - - def __init__(self, inq = None, outq = None): - super(WorkerThread, self).__init__() - self.inq = inq or Queue.Queue() - self.outq = outq or Queue.Queue() - - def call(self, function, *args, **kwargs): - """Method that makes the call to the worker using the input queue, - returning our output queue - - :param funciton: can be a standalone function unrelated to this class, - a class method of this class or any instance method. - If it is a string, it will be considered a function residing on this instance - :param args: arguments to pass to function - :parma **kwargs: kwargs to pass to function""" - self.inq.put((function, args, kwargs)) - return self.outq - - def wait_until_idle(self): - """wait until the input queue is empty, in the meanwhile, take all - results off the output queue.""" - while not self.inq.empty(): - try: - self.outq.get(False) - except Queue.Empty: - continue - # END while there are tasks on the queue - - def run(self): - """Process input tasks until we receive the quit signal""" - while True: - if self._should_terminate(): - break - # END check for stop request - routine = self.__class__.quit - args = tuple() - kwargs = dict() - tasktuple = self.inq.get() - - if isinstance(tasktuple, (tuple, list)): - if len(tasktuple) == 3: - routine, args, kwargs = tasktuple - elif len(tasktuple) == 2: - routine, args = tasktuple - elif len(tasktuple) == 1: - routine = tasktuple[0] - # END tasktuple length check - elif inspect.isroutine(tasktuple): - routine = tasktuple - # END tasktuple handling - - try: - rval = None - if inspect.ismethod(routine): - if routine.im_self is None: - rval = routine(self, *args, **kwargs) - else: - rval = routine(*args, **kwargs) - elif inspect.isroutine(routine): - rval = routine(*args, **kwargs) - elif isinstance(routine, basestring) and hasattr(self, routine): - rval = getattr(self, routine)(*args, **kwargs) - else: - # ignore unknown items - print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) - self.outq.put(self.InvalidRoutineError(routine)) - break - # END make routine call - self.outq.put(rval) - except StopIteration: - break - except Exception,e: - print "%s: Task %s raised unhandled exception: %s" % (self.getName(), str(tasktuple), str(e)) - self.outq.put(e) - # END routine exception handling - # END endless loop - - def quit(self): - raise StopIteration - - -#} END classes diff --git a/test/git/async/__init__.py b/test/git/async/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py new file mode 100644 index 00000000..ad68a8d5 --- /dev/null +++ b/test/git/async/test_channel.py @@ -0,0 +1,61 @@ +"""Channel testing""" +from test.testlib import * +from git.async.channel import * + +import time + +class TestChannels(TestBase): + + def test_base(self): + # creating channel yields a write and a read channal + wc, rc = Channel() + assert isinstance(wc, WChannel) + assert isinstance(rc, RChannel) + + # everything else fails + self.failUnlessRaises(ValueError, Channel, 1, "too many args") + + # TEST UNLIMITED SIZE CHANNEL - writing+reading is FIFO + item = 1 + item2 = 2 + wc.write(item) + wc.write(item2) + assert rc.read() == item + assert rc.read() == item2 + + # next read blocks, then raises - it waits a second + st = time.time() + self.failUnlessRaises(IOError, rc.read, True, 1) + assert time.time() - st >= 1.0 + + # writing to a closed channel raises + assert not wc.closed + wc.close() + assert wc.closed + wc.close() # fine + assert wc.closed + + self.failUnlessRaises(IOError, wc.write, 1) + + # reading from a closed channel never blocks + self.failUnlessRaises(IOError, rc.read) + + + + # TEST LIMITED SIZE CHANNEL + # channel with max-items set + wc, rc = Channel(1) + wc.write(item) # fine + + # blocks for a second, its full + st = time.time() + self.failUnlessRaises(IOError, wc.write, item, True, 1) + assert time.time() - st >= 1.0 + + # get one + assert rc.read() == item + + # its empty,can put one again + wc.write(item2) + assert rc.read() == item2 + wc.close() diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py new file mode 100644 index 00000000..3a9ef8a1 --- /dev/null +++ b/test/git/async/test_pool.py @@ -0,0 +1,10 @@ +"""Channel testing""" +from test.testlib import * +from git.async.pool import * + +import time + +class TestThreadPool(TestBase): + + def test_base(self): + pass diff --git a/test/git/async/test_thread.py b/test/git/async/test_thread.py new file mode 100644 index 00000000..ca306cc0 --- /dev/null +++ b/test/git/async/test_thread.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" Test thead classes and functions""" +from test.testlib import * +from git.async.thread import * +from Queue import Queue + +class TestWorker(WorkerThread): + def __init__(self, *args, **kwargs): + super(TestWorker, self).__init__(*args, **kwargs) + self.reset() + + def fun(self, *args, **kwargs): + self.called = True + self.args = args + self.kwargs = kwargs + return True + + def make_assertion(self): + assert self.called + assert self.args + assert self.kwargs + self.reset() + + def reset(self): + self.called = False + self.args = None + self.kwargs = None + + +class TestThreads( TestCase ): + + @terminate_threads + def test_worker_thread(self): + worker = TestWorker() + assert isinstance(worker.start(), WorkerThread) + + # test different method types + standalone_func = lambda *args, **kwargs: worker.fun(*args, **kwargs) + for function in ("fun", TestWorker.fun, worker.fun, standalone_func): + rval = worker.call(function, 1, this='that') + assert isinstance(rval, Queue) + assert rval.get() is True + worker.make_assertion() + # END for each function type + + worker.call('quit') + diff --git a/test/git/mp/__init__.py b/test/git/mp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/git/mp/test_channel.py b/test/git/mp/test_channel.py deleted file mode 100644 index 9b667372..00000000 --- a/test/git/mp/test_channel.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Channel testing""" -from test.testlib import * -from git.mp.channel import * - -import time - -class TestChannels(TestBase): - - def test_base(self): - # creating channel yields a write and a read channal - wc, rc = Channel() - assert isinstance(wc, WChannel) - assert isinstance(rc, RChannel) - - # everything else fails - self.failUnlessRaises(ValueError, Channel, 1, "too many args") - - # TEST UNLIMITED SIZE CHANNEL - writing+reading is FIFO - item = 1 - item2 = 2 - wc.write(item) - wc.write(item2) - assert rc.read() == item - assert rc.read() == item2 - - # next read blocks, then raises - it waits a second - st = time.time() - self.failUnlessRaises(IOError, rc.read, True, 1) - assert time.time() - st >= 1.0 - - # writing to a closed channel raises - assert not wc.closed - wc.close() - assert wc.closed - wc.close() # fine - assert wc.closed - - self.failUnlessRaises(IOError, wc.write, 1) - - # reading from a closed channel never blocks - self.failUnlessRaises(IOError, rc.read) - - - - # TEST LIMITED SIZE CHANNEL - # channel with max-items set - wc, rc = Channel(1) - wc.write(item) # fine - - # blocks for a second, its full - st = time.time() - self.failUnlessRaises(IOError, wc.write, item, True, 1) - assert time.time() - st >= 1.0 - - # get one - assert rc.read() == item - - # its empty,can put one again - wc.write(item2) - assert rc.read() == item2 - wc.close() diff --git a/test/git/mp/test_pool.py b/test/git/mp/test_pool.py deleted file mode 100644 index 7c4a366f..00000000 --- a/test/git/mp/test_pool.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Channel testing""" -from test.testlib import * -from git.mp.pool import * - -import time - -class TestThreadPool(TestBase): - - def test_base(self): - pass diff --git a/test/git/mp/test_thread.py b/test/git/mp/test_thread.py deleted file mode 100644 index 9625aabb..00000000 --- a/test/git/mp/test_thread.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -""" Test thead classes and functions""" -from test.testlib import * -from git.mp.thread import * -from Queue import Queue - -class TestWorker(WorkerThread): - def __init__(self, *args, **kwargs): - super(TestWorker, self).__init__(*args, **kwargs) - self.reset() - - def fun(self, *args, **kwargs): - self.called = True - self.args = args - self.kwargs = kwargs - return True - - def make_assertion(self): - assert self.called - assert self.args - assert self.kwargs - self.reset() - - def reset(self): - self.called = False - self.args = None - self.kwargs = None - - -class TestThreads( TestCase ): - - @terminate_threads - def test_worker_thread(self): - worker = TestWorker() - assert isinstance(worker.start(), WorkerThread) - - # test different method types - standalone_func = lambda *args, **kwargs: worker.fun(*args, **kwargs) - for function in ("fun", TestWorker.fun, worker.fun, standalone_func): - rval = worker.call(function, 1, this='that') - assert isinstance(rval, Queue) - assert rval.get() is True - worker.make_assertion() - # END for each function type - - worker.call('quit') - -- cgit v1.2.3 From b72e2704022d889f116e49abf3e1e5d3e3192d3b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 6 Jun 2010 01:00:12 +0200 Subject: Improved pool design and started rough implementation, top down to learn while going. Tests will be written soon for verification, its still quite theoretical --- lib/git/async/channel.py | 49 ++++++-- lib/git/async/graph.py | 36 ++++++ lib/git/async/pool.py | 273 ++++++++++++++++++++++++++++++++--------- test/git/async/test_channel.py | 22 ++-- test/git/async/test_graph.py | 10 ++ 5 files changed, 314 insertions(+), 76 deletions(-) create mode 100644 lib/git/async/graph.py create mode 100644 test/git/async/test_graph.py diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index c9cbfb87..70daed24 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -7,7 +7,7 @@ from Queue import ( #{ Classes class Channel(object): - """A channel is similar to a system pipe. It has a write end as well as one or + """A channel is similar to a file like object. It has a write end as well as one or more read ends. If Data is in the channel, it can be read, if not the read operation will block until data becomes available. If the channel is closed, any read operation will result in an exception @@ -51,8 +51,8 @@ class WChannel(Channel): :param block: If True, the call will block until there is free space in the channel :param timeout: timeout in seconds for blocking calls. - :raise IOError: when writing into closed file or when writing into a non-blocking - full channel + :raise IOError: when writing into closed file + :raise EOFError: when writing into a non-blocking full channel :note: may block if the channel has a limited capacity""" if self._closed: raise IOError("Cannot write to a closed channel") @@ -60,9 +60,14 @@ class WChannel(Channel): try: self._queue.put(item, block, timeout) except Full: - raise IOError("Capacity of the channel was exeeded") + raise EOFError("Capacity of the channel was exeeded") # END exception handling + def size(self): + """:return: approximate number of items that could be read from the read-ends + of this channel""" + return self._queue.qsize() + def close(self): """Close the channel. Multiple close calls on a closed channel are no an error""" @@ -86,22 +91,42 @@ class RChannel(Channel): #{ Interface - def read(self, block=True, timeout=None): - """:return: an item read from the channel + def read(self, count=0, block=True, timeout=None): + """read a list of items read from the channel. The list, as a sequence + of items, is similar to the string of characters returned when reading from + file like objects. + :param count: given amount of items to read. If < 1, all items will be read :param block: if True, the call will block until an item is available :param timeout: if positive and block is True, it will block only for the given amount of seconds. - :raise IOError: When reading from an empty channel ( if non-blocking, or - if the channel is still empty after the timeout""" + :return: single item in a list if count is 1, or a list of count items. + If the channel was empty and count was 1, an empty list will be returned. + If count was greater 1, a list with less than count items will be + returned. + If count was < 1, a list with all items that could be read will be + returned.""" # if the channel is closed for writing, we never block if self._wc.closed: block = False - + + out = list() try: - return self._wc._queue.get(block, timeout) + if count == 1: + out.append(self._wc._queue.get(block, timeout)) + elif count < 1: + while True: + out.append(self._wc._queue.get(block, timeout)) + # END for each item + return out + else: + for i in xrange(count): + out.append(self._wc._queue.get(block, timeout)) + # END for each item + # END handle count except Empty: - raise IOError("Error reading from an empty channel") - # END handle reading + pass + # END handle exceptions + return out #} END interface diff --git a/lib/git/async/graph.py b/lib/git/async/graph.py new file mode 100644 index 00000000..0c0a2137 --- /dev/null +++ b/lib/git/async/graph.py @@ -0,0 +1,36 @@ +"""Simplistic implementation of a graph""" + +class Node(object): + """A quick and dirty to the point implementation of a simple, and slow ascyclic graph. + Its not designed to support big graphs, and sports only the functionality + we need""" + __slots__ = ('in_nodes', 'out_nodes') + + +class Graph(object): + """A simple graph implementation, keeping nodes and providing basic access and + editing functions""" + __slots__ = "nodes" + + def __init__(self): + self.nodes = list() + + def add_node(self, node): + """Add a new node to the graph""" + raise NotImplementedError() + + def del_node(self, node): + """Delete a node from the graph""" + raise NotImplementedError() + + def add_edge(self, u, v): + """Add an undirected edge between the given nodes u and v. + :raise ValueError: If the new edge would create a cycle""" + raise NotImplementedError() + + def visit_input_depth_first(self, node, visitor=lambda n: True ): + """Visit all input nodes of the given node, depth first, calling visitor + for each node on our way. If the function returns False, the traversal + will not go any deeper, but continue at the next branch""" + raise NotImplementedError() + diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index f9f7880b..7798d3d4 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -1,92 +1,146 @@ """Implementation of a thread-pool working with channels""" from thread import WorkerThread +from Queue import Queue + +from graph import ( + Graph, + Node + ) + from channel import ( Channel, WChannel, RChannel ) -class Node(object): - """A quick and dirty to the point implementation of a simple, and slow ascyclic graph. - Its not designed to support big graphs, and sports only the functionality - we need""" - __slots__ = ('in_nodes', 'out_nodes') - - -class Graph(object): - """A simple graph implementation, keeping nodes and providing basic access and - editing functions""" - __slots__ = "nodes" - - def add_node(self, node): - pass - - def del_node(self, node): - pass - - def visit_input_depth_first(self, node, visitor=lambda n: True ): - """Visit all input nodes of the given node, depth first, calling visitor - for each node on our way. If the function returns False, the traversal - will not go any deeper, but continue at the next branch""" - pass - +import weakref +import sys class TaskNode(Node): """Couples an input channel, an output channel, as well as a processing function together. It may contain additional information on how to handel read-errors from the input channel""" - __slots__ = ('in_rc', 'out_wc', 'fun') + __slots__ = ( 'in_rc', # input read channel + '_out_wc', # output write channel + '_pool_ref', # ref to our pool + '_exc', # exception caught + 'fun', # function to call with items read from in_rc + 'max_chunksize', # maximium amount of items to process per process call + 'apply_single' # apply single items even if multiple where read + ) + + def __init__(self, in_rc, fun, apply_single=True): + self.in_rc = in_rc + self._out_wc = None + self._pool_ref = None + self._exc = None + self.fun = fun + self.max_chunksize = 0 # note set + self.apply_single = apply_single def is_done(self): """:return: True if we are finished processing""" - return self.out_wc.closed + return self._out_wc.closed + + def set_done(self): + """Set ourselves to being done, has we have completed the processing""" + self._out_wc.close() + + def error(self): + """:return: Exception caught during last processing or None""" + return self._exc + def process(self, count=1): + """Process count items and send the result individually to the output channel""" + if self._out_wc is None: + raise IOError("Cannot work in uninitialized task") + + read = self.in_rc.read + if isinstance(self.in_rc, RPoolChannel) and self.in_rc._pool is self._pool_ref(): + read = self.in_rc._read + items = read(count) + + try: + if self.apply_single: + for item in items: + self._out_wc.write(self.fun(item)) + # END for each item + else: + self._out_wc.write(self.fun(items)) + # END handle single apply + except Exception, e: + self._exc = e + self.set_done() + # END exception handling + + # if we didn't get all demanded items, which is also the case if count is 0 + # we have depleted the input channel and are done + if len(items) != count: + self.set_done() + # END handle done state + #{ Configuration + class RPoolChannel(RChannel): """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call before and after an item is to be read. - It acts like a handle to the underlying task""" + It acts like a handle to the underlying task in the pool.""" __slots__ = ('_task', '_pool', '_pre_cb', '_post_cb') - def set_post_cb(self, fun = lambda item: item): - """Install a callback to call after the item has been read. The function - returns a possibly changed item. If it raises, the exception will be propagated - in an IOError, indicating read-failure - If a function is not provided, the call is effectively uninstalled.""" + def __init__(self, wchannel, task, pool): + RChannel.__init__(self, wchannel) + self._task = task + self._pool = pool + self._pre_cb = None + self._post_cb = None - def set_pre_cb(self, fun = lambda : None): - """Install a callback to call before an item is read from the channel. + def __del__(self): + """Assures that our task will be deleted if we were the last reader""" + del(self._wc) # decrement ref-count + self._pool._del_task_if_orphaned(self._task) + + def set_pre_cb(self, fun = lambda count: None): + """Install a callback to call with the item count to be read before any + item is actually read from the channel. If it fails, the read will fail with an IOError If a function is not provided, the call is effectively uninstalled.""" + self._pre_cb = fun + + def set_post_cb(self, fun = lambda item: item): + """Install a callback to call after the items were read. The function + returns a possibly changed item list. If it raises, the exception will be propagated. + If a function is not provided, the call is effectively uninstalled.""" + self._post_cb = fun - def read(block=False, timeout=None): + def read(self, count=1, block=False, timeout=None): """Read an item that was processed by one of our threads :note: Triggers task dependency handling needed to provide the necessary input""" + if self._pre_cb: + self._pre_cb() + # END pre callback + + ################################################## + self._pool._prepare_processing(self._task, count) + ################################################## + + items = RChannel.read(self, count, block, timeout) + if self._post_cb: + items = self._post_cb(items) #{ Internal - def _read(self, block=False, timeout=None): + def _read(self, count=1, block=False, timeout=None): """Calls the underlying channel's read directly, without triggering the pool""" - return RChannel.read(self, block, timeout) + return RChannel.read(self, count, block, timeout) #} END internal - -class PoolWorker(WorkerThread): - """A worker thread which gets called to deal with Tasks. Tasks provide channls - with actual work, whose result will be send to the tasks output channel""" - - @classmethod - def perform_task(cls, task): - # note : when getting the input channel, be sure not to trigger - # RPoolChannel - pass -class ThreadPool(Graph): +class ThreadPool(object): """A thread pool maintains a set of one or more worker threads, but supports a fully serial mode in which case the amount of threads is zero. @@ -94,23 +148,130 @@ class ThreadPool(Graph): is lazy, as work will only be done once an output is requested. :note: the current implementation returns channels which are meant to be - used only from the main thread""" - __slots__ = ( '_workers', # list of worker threads + used only from the main thread, hence you cannot consume their results + from multiple threads unless you use a task for it.""" + __slots__ = ( '_tasks', # a graph of tasks + '_consumed_tasks', # a list with tasks that are done or had an error + '_workers', # list of worker threads '_queue', # master queue for tasks - '_ordered_tasks_cache' # tasks in order of evaluation, mapped by read channel + '_ordered_tasks_cache' # tasks in order of evaluation, mapped from task -> task list ) - def del_node(self, task): - """Delete the node ( being a task ), but delete the entries in our output channel - cache as well""" + def __init__(self, size=0): + self._tasks = Graph() + self._consumed_tasks = list() + self._workers = list() + self._queue = Queue() + self._ordered_tasks_cache = dict() + + def __del__(self): + raise NotImplementedError("TODO: Proper cleanup") + + #{ Internal + def _queue_feeder_visitor(self, task, count): + """Walk the graph and find tasks that are done for later cleanup, and + queue all others for processing by our worker threads ( if available ).""" + if task.error() or task.is_done(): + self._consumed_tasks.append(task) + + # if the task does not have the required output on its queue, schedule + # it for processing. If we should process all, we don't care about the + # amount as it should process until its all done. + if self._workers: + if count < 1 or task._out_wc.size() < count: + # respect the chunk size, and split the task up if we want + # to process too much. This can be defined per task + queue = self._queue + if task.max_chunksize: + chunksize = count / task.max_chunksize + remainder = count - (chunksize * task.max_chunksize) + for i in xrange(chunksize): + queue.put((task.process, chunksize)) + if remainder: + queue.put((task.process, remainder)) + else: + self._queue.put((task.process, count)) + # END handle chunksize + # END handle queuing + else: + # no workers, so we have to do the work ourselves + task.process(count) + # END handle serial mode + + # always walk the whole graph, we want to find consumed tasks + return True + + def _prepare_processing(self, task, count): + """Process the tasks which depend on the given one to be sure the input + channels are filled with data once we process the actual task + + Tasks have two important states: either they are done, or they are done + and have an error, so they are likely not to have finished all their work. + + Either way, we will put them onto a list of tasks to delete them, providng + information about the failed ones. + + Tasks which are not done will be put onto the queue for processing, which + is fine as we walked them depth-first.""" + self._tasks.visit_input_depth_first(task, lambda n: self._queue_feeder_visitor(n, count)) + + # delete consumed tasks to cleanup + for task in self._consumed_tasks: + self.del_task(task) + # END for each task to delete + del(self._consumed_tasks[:]) + + def _del_task_if_orphaned(self, task): + """Check the task, and delete it if it is orphaned""" + if sys.getrefcount(task._out_wc) < 3: + self.del_task(task) + #} END internal + + #{ Interface + + def del_task(self, task): + """Delete the task + Additionally we will remove orphaned tasks, which can be identified if their + output channel is only held by themselves, so no one will ever consume + its items.""" + # now delete our actual node - must set it done os it closes its channels. + # Otherwise further reads of output tasks will block. + # Actually they may still block if anyone wants to read all ... without + # a timeout + # keep its input nodes as we check whether they were orphaned + in_tasks = task.in_nodes + task.set_done() + self._tasks.del_node(task) + for t in in_tasks + self._del_task_if_orphaned(t) + # END handle orphans recursively def set_pool_size(self, size=0): """Set the amount of workers to use in this pool. :param size: if 0, the pool will do all work itself in the calling thread, otherwise the work will be distributed among the given amount of threads""" + raise NotImplementedError() def add_task(self, task): """Add a new task to be processed. - :return: your task instance with its output channel set. It can be used - to retrieve processed items""" + :return: a read channel to retrieve processed items. If that handle is lost, + the task will be considered orphaned and will be deleted on the next + occasion.""" + # create a write channel for it + wc, rc = Channel() + rc = RPoolChannel(wc, task, self) + task._out_wc = wc + task._pool_ref = weakref.ref(self) + + self._tasks.add_node(task) + + # If the input channel is one of our read channels, we add the relation + ic = task.in_rc + if isinstance(ic, RPoolChannel) and ic._pool is self: + self._tasks.add_edge(ic._task, task) + # END add task relation + + return rc + + #} END interface diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index ad68a8d5..2a3c1585 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -20,12 +20,15 @@ class TestChannels(TestBase): item2 = 2 wc.write(item) wc.write(item2) - assert rc.read() == item - assert rc.read() == item2 + + # read all - it blocks as its still open for writing + st = time.time() + assert rc.read(timeout=1) == [item, item2] + assert time.time() - st >= 1.0 # next read blocks, then raises - it waits a second st = time.time() - self.failUnlessRaises(IOError, rc.read, True, 1) + assert len(rc.read(1, True, 1)) == 0 assert time.time() - st >= 1.0 # writing to a closed channel raises @@ -38,7 +41,7 @@ class TestChannels(TestBase): self.failUnlessRaises(IOError, wc.write, 1) # reading from a closed channel never blocks - self.failUnlessRaises(IOError, rc.read) + assert len(rc.read()) == 0 @@ -49,13 +52,16 @@ class TestChannels(TestBase): # blocks for a second, its full st = time.time() - self.failUnlessRaises(IOError, wc.write, item, True, 1) + self.failUnlessRaises(EOFError, wc.write, item, True, 1) assert time.time() - st >= 1.0 - # get one - assert rc.read() == item + # get our only one + assert rc.read(1)[0] == item # its empty,can put one again wc.write(item2) - assert rc.read() == item2 wc.close() + + # reading 10 will only yield one, it will not block as its closed + assert rc.read(10, timeout=1)[0] == item2 + diff --git a/test/git/async/test_graph.py b/test/git/async/test_graph.py new file mode 100644 index 00000000..18d6997c --- /dev/null +++ b/test/git/async/test_graph.py @@ -0,0 +1,10 @@ +"""Channel testing""" +from test.testlib import * +from git.async.graph import * + +import time + +class TestGraph(TestBase): + + def test_base(self): + pass -- cgit v1.2.3 From ec28ad575ce1d7bb6a616ffc404f32bbb1af67b2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 6 Jun 2010 12:48:25 +0200 Subject: thread: adjusted worker thread not to provide an output queue anymore - this is handled by the task system graph: implemented it including test according to the pools requirements pool: implemented set_pool_size --- lib/git/async/graph.py | 84 +++++++++++++++++++++++++++++++++++++------ lib/git/async/pool.py | 53 ++++++++++++++++++++++++--- lib/git/async/thread.py | 27 +++----------- test/git/async/test_graph.py | 82 +++++++++++++++++++++++++++++++++++++++++- test/git/async/test_thread.py | 4 +-- 5 files changed, 209 insertions(+), 41 deletions(-) diff --git a/lib/git/async/graph.py b/lib/git/async/graph.py index 0c0a2137..b4d6aa00 100644 --- a/lib/git/async/graph.py +++ b/lib/git/async/graph.py @@ -6,31 +6,95 @@ class Node(object): we need""" __slots__ = ('in_nodes', 'out_nodes') + def __init__(self): + self.in_nodes = list() + self.out_nodes = list() + class Graph(object): """A simple graph implementation, keeping nodes and providing basic access and - editing functions""" + editing functions. The performance is only suitable for small graphs of not + more than 10 nodes !""" __slots__ = "nodes" def __init__(self): self.nodes = list() def add_node(self, node): - """Add a new node to the graph""" - raise NotImplementedError() + """Add a new node to the graph + :return: the newly added node""" + self.nodes.append(node) + return node def del_node(self, node): - """Delete a node from the graph""" - raise NotImplementedError() + """Delete a node from the graph + :return: self""" + # clear connections + for outn in node.out_nodes: + del(outn.in_nodes[outn.in_nodes.index(node)]) + for inn in node.in_nodes: + del(inn.out_nodes[inn.out_nodes.index(node)]) + del(self.nodes[self.nodes.index(node)]) + return self def add_edge(self, u, v): """Add an undirected edge between the given nodes u and v. + + return: self :raise ValueError: If the new edge would create a cycle""" - raise NotImplementedError() + if u is v: + raise ValueError("Cannot connect a node with itself") + + # are they already connected ? + if u in v.in_nodes and v in u.out_nodes or \ + v in u.in_nodes and u in v.out_nodes: + return self + # END handle connection exists + + # cycle check - if we can reach any of the two by following either ones + # history, its a cycle + for start, end in ((u, v), (v,u)): + if not start.in_nodes: + continue + nodes = start.in_nodes[:] + seen = set() + # depth first search - its faster + while nodes: + n = nodes.pop() + if n in seen: + continue + seen.add(n) + if n is end: + raise ValueError("Connecting u with v would create a cycle") + nodes.extend(n.in_nodes) + # END while we are searching + # END for each direction to look + + # connection is valid, set it up + u.out_nodes.append(v) + v.in_nodes.append(u) + + return self - def visit_input_depth_first(self, node, visitor=lambda n: True ): + def visit_input_inclusive_depth_first(self, node, visitor=lambda n: True ): """Visit all input nodes of the given node, depth first, calling visitor for each node on our way. If the function returns False, the traversal - will not go any deeper, but continue at the next branch""" - raise NotImplementedError() - + will not go any deeper, but continue at the next branch + It will return the actual input node in the end !""" + nodes = node.in_nodes[:] + seen = set() + + # depth first + while nodes: + n = nodes.pop() + if n in seen: + continue + seen.add(n) + + # only proceed in that direction if visitor is fine with it + if visitor(n): + nodes.extend(n.in_nodes) + # END call visitor + # END while walking + visitor(node) + diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 7798d3d4..9a24cbc5 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -25,7 +25,8 @@ class TaskNode(Node): '_out_wc', # output write channel '_pool_ref', # ref to our pool '_exc', # exception caught - 'fun', # function to call with items read from in_rc + 'fun', # function to call with items read from in_rc + 'min_count', # minimum amount of items to produce, None means no override 'max_chunksize', # maximium amount of items to process per process call 'apply_single' # apply single items even if multiple where read ) @@ -36,6 +37,7 @@ class TaskNode(Node): self._pool_ref = None self._exc = None self.fun = fun + self.min_count = None self.max_chunksize = 0 # note set self.apply_single = apply_single @@ -174,6 +176,12 @@ class ThreadPool(object): if task.error() or task.is_done(): self._consumed_tasks.append(task) + # allow min-count override. This makes sure we take at least min-count + # items off the input queue ( later ) + if task.min_count is not None: + count = task.min_count + # END handle min-count + # if the task does not have the required output on its queue, schedule # it for processing. If we should process all, we don't care about the # amount as it should process until its all done. @@ -213,7 +221,7 @@ class ThreadPool(object): Tasks which are not done will be put onto the queue for processing, which is fine as we walked them depth-first.""" - self._tasks.visit_input_depth_first(task, lambda n: self._queue_feeder_visitor(n, count)) + self._tasks.visit_input_inclusive_depth_first(task, lambda n: self._queue_feeder_visitor(n, count)) # delete consumed tasks to cleanup for task in self._consumed_tasks: @@ -233,7 +241,9 @@ class ThreadPool(object): """Delete the task Additionally we will remove orphaned tasks, which can be identified if their output channel is only held by themselves, so no one will ever consume - its items.""" + its items. + + :return: self""" # now delete our actual node - must set it done os it closes its channels. # Otherwise further reads of output tasks will block. # Actually they may still block if anyone wants to read all ... without @@ -246,12 +256,45 @@ class ThreadPool(object): for t in in_tasks self._del_task_if_orphaned(t) # END handle orphans recursively + + return self def set_pool_size(self, size=0): - """Set the amount of workers to use in this pool. + """Set the amount of workers to use in this pool. When reducing the size, + the call may block as it waits for threads to finish. + When reducing the size to zero, this thread will process all remaining + items on the queue. + + :return: self :param size: if 0, the pool will do all work itself in the calling thread, otherwise the work will be distributed among the given amount of threads""" - raise NotImplementedError() + # either start new threads, or kill existing ones. + # If we end up with no threads, we process the remaining chunks on the queue + # ourselves + cur_count = len(self._workers) + if cur_count < size: + for i in range(size - cur_count): + worker = WorkerThread(self._queue) + self._workers.append(worker) + # END for each new worker to create + elif cur_count > size: + del_count = cur_count - size + for i in range(del_count): + self._workers[i].stop_and_join() + # END for each thread to stop + del(self._workers[:del_count]) + # END handle count + + if size == 0: + while not self._queue.empty(): + try: + taskmethod, count = self._queue.get(False) + taskmethod(count) + except Queue.Empty: + continue + # END while there are tasks on the queue + # END process queue + return self def add_task(self, task): """Add a new task to be processed. diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 3938666a..7ca93c86 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -85,9 +85,9 @@ class TerminatableThread(threading.Thread): class WorkerThread(TerminatableThread): - """ - This base allows to call functions on class instances natively and retrieve - their results asynchronously using a queue. + """ This base allows to call functions on class instances natively. + As it is meant to work with a pool, the result of the call must be + handled by the callee. The thread runs forever unless it receives the terminate signal using its task queue. @@ -95,11 +95,9 @@ class WorkerThread(TerminatableThread): allow the following: inq = Queue() - outq = Queue() - w = WorkerThread(inq, outq) + w = WorkerThread(inq) w.start() inq.put((WorkerThread., args, kwargs)) - res = outq.get() finally we call quit to terminate asap. @@ -120,10 +118,9 @@ class WorkerThread(TerminatableThread): class InvalidRoutineError(Exception): """Class sent as return value in case of an error""" - def __init__(self, inq = None, outq = None): + def __init__(self, inq = None): super(WorkerThread, self).__init__() self.inq = inq or Queue.Queue() - self.outq = outq or Queue.Queue() def call(self, function, *args, **kwargs): """Method that makes the call to the worker using the input queue, @@ -135,17 +132,6 @@ class WorkerThread(TerminatableThread): :param args: arguments to pass to function :parma **kwargs: kwargs to pass to function""" self.inq.put((function, args, kwargs)) - return self.outq - - def wait_until_idle(self): - """wait until the input queue is empty, in the meanwhile, take all - results off the output queue.""" - while not self.inq.empty(): - try: - self.outq.get(False) - except Queue.Empty: - continue - # END while there are tasks on the queue def run(self): """Process input tasks until we receive the quit signal""" @@ -184,15 +170,12 @@ class WorkerThread(TerminatableThread): else: # ignore unknown items print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) - self.outq.put(self.InvalidRoutineError(routine)) break # END make routine call - self.outq.put(rval) except StopIteration: break except Exception,e: print "%s: Task %s raised unhandled exception: %s" % (self.getName(), str(tasktuple), str(e)) - self.outq.put(e) # END routine exception handling # END endless loop diff --git a/test/git/async/test_graph.py b/test/git/async/test_graph.py index 18d6997c..400e92cd 100644 --- a/test/git/async/test_graph.py +++ b/test/git/async/test_graph.py @@ -7,4 +7,84 @@ import time class TestGraph(TestBase): def test_base(self): - pass + g = Graph() + nn = 10 + assert nn > 2, "need at least 3 nodes" + + # add unconnected nodes + for i in range(nn): + assert isinstance(g.add_node(Node()), Node) + # END add nodes + assert len(g.nodes) == nn + + # delete unconnected nodes + for n in g.nodes[:]: + g.del_node(n) + # END del nodes + + # add a chain of connected nodes + last = None + for i in range(nn): + n = g.add_node(Node()) + if last: + assert not last.out_nodes + assert not n.in_nodes + assert g.add_edge(last, n) is g + assert last.out_nodes[0] is n + assert n.in_nodes[0] is last + last = n + # END for each node to connect + + # try to connect a node with itself + self.failUnlessRaises(ValueError, g.add_edge, last, last) + + # try to create a cycle + self.failUnlessRaises(ValueError, g.add_edge, g.nodes[0], g.nodes[-1]) + self.failUnlessRaises(ValueError, g.add_edge, g.nodes[-1], g.nodes[0]) + + # we have undirected edges, readding the same edge, but the other way + # around does not change anything + n1, n2, n3 = g.nodes[0], g.nodes[1], g.nodes[2] + g.add_edge(n1, n2) # already connected + g.add_edge(n2, n1) # same thing + assert len(n1.out_nodes) == 1 + assert len(n1.in_nodes) == 0 + assert len(n2.in_nodes) == 1 + assert len(n2.out_nodes) == 1 + + # deleting a connected node clears its neighbour connections + assert n3.in_nodes[0] is n2 + g.del_node(n2) + assert len(g.nodes) == nn - 1 + assert len(n3.in_nodes) == 0 + assert len(n1.out_nodes) == 0 + + # check the history from the last node + last = g.nodes[-1] + class Visitor(object): + def __init__(self, origin): + self.origin_seen = False + self.origin = origin + self.num_seen = 0 + + def __call__(self, n): + if n is self.origin: + self.origin_seen = True + else: + assert not self.origin_seen, "should see origin last" + # END check origin + self.num_seen += 1 + return True + + def _assert(self, num_expected): + assert self.origin_seen + assert self.num_seen == num_expected + # END visitor helper + + end = g.nodes[-1] + visitor = Visitor(end) + g.visit_input_inclusive_depth_first(end, visitor) + + num_nodes_seen = nn - 2 # deleted second, which leaves first one disconnected + visitor._assert(num_nodes_seen) + diff --git a/test/git/async/test_thread.py b/test/git/async/test_thread.py index ca306cc0..2ea8d1ff 100644 --- a/test/git/async/test_thread.py +++ b/test/git/async/test_thread.py @@ -37,9 +37,7 @@ class TestThreads( TestCase ): # test different method types standalone_func = lambda *args, **kwargs: worker.fun(*args, **kwargs) for function in ("fun", TestWorker.fun, worker.fun, standalone_func): - rval = worker.call(function, 1, this='that') - assert isinstance(rval, Queue) - assert rval.get() is True + worker.call(function, 1, this='that') worker.make_assertion() # END for each function type -- cgit v1.2.3 From b3cde0ee162b8f0cb67da981311c8f9c16050a62 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 6 Jun 2010 18:13:21 +0200 Subject: First step of testing the pool - tasks have been separated into a new module including own tests, their design improved to prepare them for some specifics that would be needed for multiprocessing support --- lib/git/async/graph.py | 16 +++-- lib/git/async/pool.py | 164 ++++++++++++++++--------------------------- lib/git/async/task.py | 144 +++++++++++++++++++++++++++++++++++++ lib/git/async/thread.py | 9 ++- lib/git/async/util.py | 24 +++++++ test/git/async/test_graph.py | 2 +- test/git/async/test_pool.py | 71 ++++++++++++++++++- test/git/async/test_task.py | 12 ++++ 8 files changed, 329 insertions(+), 113 deletions(-) create mode 100644 lib/git/async/task.py create mode 100644 lib/git/async/util.py create mode 100644 test/git/async/test_task.py diff --git a/lib/git/async/graph.py b/lib/git/async/graph.py index b4d6aa00..d817eeb4 100644 --- a/lib/git/async/graph.py +++ b/lib/git/async/graph.py @@ -1,14 +1,20 @@ """Simplistic implementation of a graph""" class Node(object): - """A quick and dirty to the point implementation of a simple, and slow ascyclic graph. - Its not designed to support big graphs, and sports only the functionality - we need""" - __slots__ = ('in_nodes', 'out_nodes') + """A Node in the graph. They know their neighbours, and have an id which should + resolve into a string""" + __slots__ = ('in_nodes', 'out_nodes', 'id') - def __init__(self): + def __init__(self, id=None): + self.id = id self.in_nodes = list() self.out_nodes = list() + + def __str__(self): + return str(self.id) + + def __repr__(self): + return "%s(%s)" % (type(self).__name__, self.id) class Graph(object): diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 9a24cbc5..2efc862b 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -1,10 +1,10 @@ """Implementation of a thread-pool working with channels""" from thread import WorkerThread +from task import InputChannelTask from Queue import Queue from graph import ( Graph, - Node ) from channel import ( @@ -16,73 +16,6 @@ from channel import ( import weakref import sys -class TaskNode(Node): - """Couples an input channel, an output channel, as well as a processing function - together. - It may contain additional information on how to handel read-errors from the - input channel""" - __slots__ = ( 'in_rc', # input read channel - '_out_wc', # output write channel - '_pool_ref', # ref to our pool - '_exc', # exception caught - 'fun', # function to call with items read from in_rc - 'min_count', # minimum amount of items to produce, None means no override - 'max_chunksize', # maximium amount of items to process per process call - 'apply_single' # apply single items even if multiple where read - ) - - def __init__(self, in_rc, fun, apply_single=True): - self.in_rc = in_rc - self._out_wc = None - self._pool_ref = None - self._exc = None - self.fun = fun - self.min_count = None - self.max_chunksize = 0 # note set - self.apply_single = apply_single - - def is_done(self): - """:return: True if we are finished processing""" - return self._out_wc.closed - - def set_done(self): - """Set ourselves to being done, has we have completed the processing""" - self._out_wc.close() - - def error(self): - """:return: Exception caught during last processing or None""" - return self._exc - - def process(self, count=1): - """Process count items and send the result individually to the output channel""" - if self._out_wc is None: - raise IOError("Cannot work in uninitialized task") - - read = self.in_rc.read - if isinstance(self.in_rc, RPoolChannel) and self.in_rc._pool is self._pool_ref(): - read = self.in_rc._read - items = read(count) - - try: - if self.apply_single: - for item in items: - self._out_wc.write(self.fun(item)) - # END for each item - else: - self._out_wc.write(self.fun(items)) - # END handle single apply - except Exception, e: - self._exc = e - self.set_done() - # END exception handling - - # if we didn't get all demanded items, which is also the case if count is 0 - # we have depleted the input channel and are done - if len(items) != count: - self.set_done() - # END handle done state - #{ Configuration - class RPoolChannel(RChannel): """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call @@ -116,7 +49,7 @@ class RPoolChannel(RChannel): If a function is not provided, the call is effectively uninstalled.""" self._post_cb = fun - def read(self, count=1, block=False, timeout=None): + def read(self, count=0, block=False, timeout=None): """Read an item that was processed by one of our threads :note: Triggers task dependency handling needed to provide the necessary input""" @@ -131,9 +64,11 @@ class RPoolChannel(RChannel): items = RChannel.read(self, count, block, timeout) if self._post_cb: items = self._post_cb(items) + + return items #{ Internal - def _read(self, count=1, block=False, timeout=None): + def _read(self, count=0, block=False, timeout=None): """Calls the underlying channel's read directly, without triggering the pool""" return RChannel.read(self, count, block, timeout) @@ -141,7 +76,6 @@ class RPoolChannel(RChannel): #} END internal - class ThreadPool(object): """A thread pool maintains a set of one or more worker threads, but supports a fully serial mode in which case the amount of threads is zero. @@ -149,6 +83,15 @@ class ThreadPool(object): Work is distributed via Channels, which form a dependency graph. The evaluation is lazy, as work will only be done once an output is requested. + The thread pools inherent issue is the global interpreter lock that it will hit, + which gets worse considering a few c extensions specifically lock their part + globally as well. The only way this will improve is if custom c extensions + are written which do some bulk work, but release the GIL once they have acquired + their resources. + + Due to the nature of having multiple objects in git, its easy to distribute + that work cleanly among threads. + :note: the current implementation returns channels which are meant to be used only from the main thread, hence you cannot consume their results from multiple threads unless you use a task for it.""" @@ -156,7 +99,6 @@ class ThreadPool(object): '_consumed_tasks', # a list with tasks that are done or had an error '_workers', # list of worker threads '_queue', # master queue for tasks - '_ordered_tasks_cache' # tasks in order of evaluation, mapped from task -> task list ) def __init__(self, size=0): @@ -164,10 +106,10 @@ class ThreadPool(object): self._consumed_tasks = list() self._workers = list() self._queue = Queue() - self._ordered_tasks_cache = dict() + self.set_size(size) def __del__(self): - raise NotImplementedError("TODO: Proper cleanup") + self.set_size(0) #{ Internal def _queue_feeder_visitor(self, task, count): @@ -175,7 +117,7 @@ class ThreadPool(object): queue all others for processing by our worker threads ( if available ).""" if task.error() or task.is_done(): self._consumed_tasks.append(task) - + # allow min-count override. This makes sure we take at least min-count # items off the input queue ( later ) if task.min_count is not None: @@ -236,30 +178,11 @@ class ThreadPool(object): #} END internal #{ Interface + def size(self): + """:return: amount of workers in the pool""" + return len(self._workers) - def del_task(self, task): - """Delete the task - Additionally we will remove orphaned tasks, which can be identified if their - output channel is only held by themselves, so no one will ever consume - its items. - - :return: self""" - # now delete our actual node - must set it done os it closes its channels. - # Otherwise further reads of output tasks will block. - # Actually they may still block if anyone wants to read all ... without - # a timeout - # keep its input nodes as we check whether they were orphaned - in_tasks = task.in_nodes - task.set_done() - self._tasks.del_node(task) - - for t in in_tasks - self._del_task_if_orphaned(t) - # END handle orphans recursively - - return self - - def set_pool_size(self, size=0): + def set_size(self, size=0): """Set the amount of workers to use in this pool. When reducing the size, the call may block as it waits for threads to finish. When reducing the size to zero, this thread will process all remaining @@ -275,6 +198,7 @@ class ThreadPool(object): if cur_count < size: for i in range(size - cur_count): worker = WorkerThread(self._queue) + worker.start() self._workers.append(worker) # END for each new worker to create elif cur_count > size: @@ -295,7 +219,33 @@ class ThreadPool(object): # END while there are tasks on the queue # END process queue return self - + + def num_tasks(self): + """:return: amount of tasks""" + return len(self._tasks.nodes) + + def del_task(self, task): + """Delete the task + Additionally we will remove orphaned tasks, which can be identified if their + output channel is only held by themselves, so no one will ever consume + its items. + + :return: self""" + # now delete our actual node - must set it done os it closes its channels. + # Otherwise further reads of output tasks will block. + # Actually they may still block if anyone wants to read all ... without + # a timeout + # keep its input nodes as we check whether they were orphaned + in_tasks = task.in_nodes + task.set_done() + self._tasks.del_node(task) + + for t in in_tasks: + self._del_task_if_orphaned(t) + # END handle orphans recursively + + return self + def add_task(self, task): """Add a new task to be processed. :return: a read channel to retrieve processed items. If that handle is lost, @@ -305,15 +255,21 @@ class ThreadPool(object): wc, rc = Channel() rc = RPoolChannel(wc, task, self) task._out_wc = wc - task._pool_ref = weakref.ref(self) + + has_input_channel = isinstance(task, InputChannelTask) + if has_input_channel: + task._pool_ref = weakref.ref(self) + # END init input channel task self._tasks.add_node(task) # If the input channel is one of our read channels, we add the relation - ic = task.in_rc - if isinstance(ic, RPoolChannel) and ic._pool is self: - self._tasks.add_edge(ic._task, task) - # END add task relation + if has_input_channel: + ic = task.in_rc + if isinstance(ic, RPoolChannel) and ic._pool is self: + self._tasks.add_edge(ic._task, task) + # END add task relation + # END handle input channels for connections return rc diff --git a/lib/git/async/task.py b/lib/git/async/task.py new file mode 100644 index 00000000..d2422773 --- /dev/null +++ b/lib/git/async/task.py @@ -0,0 +1,144 @@ +from graph import Node +import threading +import new + +class OutputChannelTask(Node): + """Abstracts a named task as part of a set of interdependent tasks, which contains + additional information on how the task should be queued and processed. + + Results of the item processing are sent to an output channel, which is to be + set by the creator""" + __slots__ = ( '_read', # method to yield items to process + '_out_wc', # output write channel + '_exc', # exception caught + 'fun', # function to call with items read + 'min_count', # minimum amount of items to produce, None means no override + 'max_chunksize', # maximium amount of items to process per process call + 'apply_single' # apply single items even if multiple where read + ) + + def __init__(self, id, fun, apply_single=True, min_count=None, max_chunksize=0): + Node.__init__(self, id) + self._read = None # to be set by subclasss + self._out_wc = None # to be set later + self._exc = None + self.fun = fun + self.min_count = None + self.max_chunksize = 0 # note set + self.apply_single = apply_single + + def is_done(self): + """:return: True if we are finished processing""" + return self._out_wc.closed + + def set_done(self): + """Set ourselves to being done, has we have completed the processing""" + self._out_wc.close() + + def error(self): + """:return: Exception caught during last processing or None""" + return self._exc + + def process(self, count=0): + """Process count items and send the result individually to the output channel""" + items = self._read(count) + + try: + if self.apply_single: + for item in items: + self._out_wc.write(self.fun(item)) + # END for each item + else: + self._out_wc.write(self.fun(items)) + # END handle single apply + except Exception, e: + self._exc = e + self.set_done() + # END exception handling + + # if we didn't get all demanded items, which is also the case if count is 0 + # we have depleted the input channel and are done + if len(items) != count: + self.set_done() + # END handle done state + #{ Configuration + + +class ThreadTaskBase(object): + """Describes tasks which can be used with theaded pools""" + pass + + +class InputIteratorTaskBase(OutputChannelTask): + """Implements a task which processes items from an iterable in a multi-processing + safe manner""" + __slots__ = ('_iterator', '_lock') + # the type of the lock to use when reading from the iterator + lock_type = None + + def __init__(self, iterator, *args, **kwargs): + OutputChannelTask.__init__(self, *args, **kwargs) + if not hasattr(iterator, 'next'): + raise ValueError("Iterator %r needs a next() function" % iterator) + self._iterator = iterator + self._lock = self.lock_type() + self._read = self.__read + + def __read(self, count=0): + """Read count items from the iterator, and return them""" + self._lock.acquire() + try: + if count == 0: + return list(self._iterator) + else: + out = list() + it = self._iterator + for i in xrange(count): + try: + out.append(it.next()) + except StopIteration: + break + # END handle empty iterator + # END for each item to take + return out + # END handle count + finally: + self._lock.release() + # END handle locking + + +class InputIteratorThreadTask(InputIteratorTaskBase, ThreadTaskBase): + """An input iterator for threaded pools""" + lock_type = threading.Lock + + +class InputChannelTask(OutputChannelTask): + """Uses an input channel as source for reading items + For instantiation, it takes all arguments of its base, the first one needs + to be the input channel to read from though.""" + __slots__ = ( + 'in_rc', # channel to read items from + '_pool_ref' # to be set by Pool + ) + + def __init__(self, in_rc, *args, **kwargs): + OutputChannelTask.__init__(self, *args, **kwargs) + self._in_rc = in_rc + + def process(self, count=1): + """Verify our setup, and do some additional checking, before the + base implementation can permanently perform all operations""" + self._read = self._in_rc.read + # make sure we don't trigger the pool if we read from a pool channel which + # belongs to our own pool. Channels from different pools are fine though, + # there we want to trigger its computation + if isinstance(self._in_rc, RPoolChannel) and self._in_rc._pool is self._pool_ref(): + self._read = self._in_rc._read + + # permanently install our base for processing + self.process = new.instancemethod(OutputChannelTask.__dict__['process'], self, type(self)) + + # and call it + return OutputChannelTask.process(self, count) + #{ Configuration + diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 7ca93c86..82acbd8f 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -139,10 +139,15 @@ class WorkerThread(TerminatableThread): if self._should_terminate(): break # END check for stop request - routine = self.__class__.quit + routine = None args = tuple() kwargs = dict() - tasktuple = self.inq.get() + # don't wait too long, instead check for the termination request more often + try: + tasktuple = self.inq.get(True, 1) + except Queue.Empty: + continue + # END get task with timeout if isinstance(tasktuple, (tuple, list)): if len(tasktuple) == 3: diff --git a/lib/git/async/util.py b/lib/git/async/util.py new file mode 100644 index 00000000..dabd8a42 --- /dev/null +++ b/lib/git/async/util.py @@ -0,0 +1,24 @@ +"""Module with utilities related to async operations""" + +import sys +import os + +def cpu_count(): + """:return:number of CPUs in the system + :note: inspired by multiprocessing""" + num = 0 + try: + if sys.platform == 'win32': + num = int(os.environ['NUMBER_OF_PROCESSORS']) + elif 'bsd' in sys.platform or sys.platform == 'darwin': + num = int(os.popen('sysctl -n hw.ncpu').read()) + else: + num = os.sysconf('SC_NPROCESSORS_ONLN') + except (ValueError, KeyError, OSError, AttributeError): + pass + # END exception handling + + if num == 0: + raise NotImplementedError('cannot determine number of cpus') + + return num diff --git a/test/git/async/test_graph.py b/test/git/async/test_graph.py index 400e92cd..ca17d6e6 100644 --- a/test/git/async/test_graph.py +++ b/test/git/async/test_graph.py @@ -25,7 +25,7 @@ class TestGraph(TestBase): # add a chain of connected nodes last = None for i in range(nn): - n = g.add_node(Node()) + n = g.add_node(Node(i)) if last: assert not last.out_nodes assert not n.in_nodes diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 3a9ef8a1..05943c8b 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -1,10 +1,79 @@ """Channel testing""" from test.testlib import * from git.async.pool import * +from git.async.task import * +from git.async.util import cpu_count import time +class TestThreadTaskNode(InputIteratorThreadTask): + def __init__(self, *args, **kwargs): + super(TestThreadTaskNode, self).__init__(*args, **kwargs) + self.reset() + + def do_fun(self, item): + self.item_count += 1 + return item + + def reset(self): + self.process_count = 0 + self.item_count = 0 + + def process(self, count=1): + super(TestThreadTaskNode, self).process(count) + self.process_count += 1 + + def _assert(self, pc, fc): + """Assert for num process counts (pc) and num function counts (fc) + :return: self""" + assert self.process_count == pc + assert self.item_count == fc + + return self + + class TestThreadPool(TestBase): + max_threads = cpu_count() + def test_base(self): - pass + p = ThreadPool() + + # default pools have no workers + assert p.size() == 0 + + # increase and decrease the size + for i in range(self.max_threads): + p.set_size(i) + assert p.size() == i + for i in range(self.max_threads, -1, -1): + p.set_size(i) + assert p.size() == i + + # currently in serial mode ! + + # add a simple task + # it iterates n items + ni = 20 + task = TestThreadTaskNode(iter(range(ni)), 'iterator', None) + task.fun = task.do_fun + + assert p.num_tasks() == 0 + rc = p.add_task(task) + assert p.num_tasks() == 1 + assert isinstance(rc, RPoolChannel) + assert task._out_wc is not None + + # pull the result completely - we should get one task, which calls its + # function once. In serial mode, the order matches + items = rc.read() + task._assert(1, ni).reset() + assert len(items) == ni + assert items[0] == 0 and items[-1] == ni-1 + + + # switch to threaded mode - just one thread for now + + # two threads to compete for tasks + + diff --git a/test/git/async/test_task.py b/test/git/async/test_task.py new file mode 100644 index 00000000..91ac4dc3 --- /dev/null +++ b/test/git/async/test_task.py @@ -0,0 +1,12 @@ +"""Channel testing""" +from test.testlib import * +from git.async.task import * + +import time + +class TestTask(TestBase): + + max_threads = cpu_count() + + def test_iterator_task(self): + self.fail("test iterator task") -- cgit v1.2.3 From 1b27292936c81637f6b9a7141dafaad1126f268e Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 6 Jun 2010 21:15:13 +0200 Subject: Plenty of fixes in the chunking routine, made possible by a serialized chunking test. Next up, actual async processing --- lib/git/async/graph.py | 7 +- lib/git/async/pool.py | 66 +++++++++++++---- lib/git/async/task.py | 2 +- test/git/async/test_graph.py | 3 +- test/git/async/test_pool.py | 169 +++++++++++++++++++++++++++++++++++++------ 5 files changed, 207 insertions(+), 40 deletions(-) diff --git a/lib/git/async/graph.py b/lib/git/async/graph.py index d817eeb4..6386cbaa 100644 --- a/lib/git/async/graph.py +++ b/lib/git/async/graph.py @@ -35,12 +35,17 @@ class Graph(object): def del_node(self, node): """Delete a node from the graph :return: self""" + try: + del(self.nodes[self.nodes.index(node)]) + except ValueError: + return self + # END ignore if it doesn't exist + # clear connections for outn in node.out_nodes: del(outn.in_nodes[outn.in_nodes.index(node)]) for inn in node.in_nodes: del(inn.out_nodes[inn.out_nodes.index(node)]) - del(self.nodes[self.nodes.index(node)]) return self def add_edge(self, u, v): diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 2efc862b..620e2258 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -117,36 +117,72 @@ class ThreadPool(object): queue all others for processing by our worker threads ( if available ).""" if task.error() or task.is_done(): self._consumed_tasks.append(task) + return True + # END stop processing # allow min-count override. This makes sure we take at least min-count # items off the input queue ( later ) - if task.min_count is not None: + if task.min_count is not None and count != 0 and count < task.min_count: count = task.min_count # END handle min-count # if the task does not have the required output on its queue, schedule # it for processing. If we should process all, we don't care about the # amount as it should process until its all done. - if self._workers: - if count < 1 or task._out_wc.size() < count: + if count < 1 or task._out_wc.size() < count: + numchunks = 1 + chunksize = count + remainder = 0 + + # we need the count set for this - can't chunk up unlimited items + # In serial mode we could do this by checking for empty input channels, + # but in dispatch mode its impossible ( == not easily possible ) + # Only try it if we have enough demand + if task.max_chunksize and count > task.max_chunksize: + numchunks = count / task.max_chunksize + chunksize = task.max_chunksize + remainder = count - (numchunks * chunksize) + # END handle chunking + + print count, numchunks, chunksize, remainder + # the following loops are kind of unrolled - code duplication + # should make things execute faster. Putting the if statements + # into the loop would be less code, but ... slower + if self._workers: # respect the chunk size, and split the task up if we want # to process too much. This can be defined per task queue = self._queue - if task.max_chunksize: - chunksize = count / task.max_chunksize - remainder = count - (chunksize * task.max_chunksize) - for i in xrange(chunksize): + if numchunks > 1: + for i in xrange(numchunks): queue.put((task.process, chunksize)) - if remainder: - queue.put((task.process, remainder)) + # END for each chunk to put + else: + queue.put((task.process, chunksize)) + # END try efficient looping + + if remainder: + queue.put((task.process, remainder)) + # END handle chunksize + else: + # no workers, so we have to do the work ourselves + if numchunks > 1: + for i in xrange(numchunks): + task.process(chunksize) + # END for each chunk to put else: - self._queue.put((task.process, count)) + task.process(chunksize) + # END try efficient looping + + if remainder: + task.process(remainder) # END handle chunksize - # END handle queuing - else: - # no workers, so we have to do the work ourselves - task.process(count) - # END handle serial mode + + # as we are serial, we can check for consumption right away + if task.error() or task.is_done(): + self._consumed_tasks.append(task) + # END handle consumption + # END handle serial mode + # END handle queuing # always walk the whole graph, we want to find consumed tasks return True diff --git a/lib/git/async/task.py b/lib/git/async/task.py index d2422773..ec650237 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -58,7 +58,7 @@ class OutputChannelTask(Node): # if we didn't get all demanded items, which is also the case if count is 0 # we have depleted the input channel and are done - if len(items) != count: + if not items or len(items) != count: self.set_done() # END handle done state #{ Configuration diff --git a/test/git/async/test_graph.py b/test/git/async/test_graph.py index ca17d6e6..1a153e2d 100644 --- a/test/git/async/test_graph.py +++ b/test/git/async/test_graph.py @@ -54,7 +54,8 @@ class TestGraph(TestBase): # deleting a connected node clears its neighbour connections assert n3.in_nodes[0] is n2 - g.del_node(n2) + assert g.del_node(n2) is g + assert g.del_node(n2) is g # multi-deletion okay assert len(g.nodes) == nn - 1 assert len(n3.in_nodes) == 0 assert len(n1.out_nodes) == 0 diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 05943c8b..65b2d228 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -3,21 +3,22 @@ from test.testlib import * from git.async.pool import * from git.async.task import * from git.async.util import cpu_count - +import threading import time class TestThreadTaskNode(InputIteratorThreadTask): def __init__(self, *args, **kwargs): super(TestThreadTaskNode, self).__init__(*args, **kwargs) - self.reset() + self.reset(self._iterator) def do_fun(self, item): self.item_count += 1 return item - def reset(self): + def reset(self, iterator): self.process_count = 0 self.item_count = 0 + self._iterator = iterator def process(self, count=1): super(TestThreadTaskNode, self).process(count) @@ -36,6 +37,111 @@ class TestThreadPool(TestBase): max_threads = cpu_count() + def _assert_sync_single_task(self, p): + """Performs testing in a synchronized environment""" + null_tasks = p.num_tasks() # in case we had some before + + # add a simple task + # it iterates n items + ni = 20 + assert ni % 2 == 0, "ni needs to be dividable by 2" + + def make_iter(): + return iter(range(ni)) + # END utility + + task = TestThreadTaskNode(make_iter(), 'iterator', None) + task.fun = task.do_fun + + assert p.num_tasks() == null_tasks + rc = p.add_task(task) + assert p.num_tasks() == 1 + null_tasks + assert isinstance(rc, RPoolChannel) + assert task._out_wc is not None + + # pull the result completely - we should get one task, which calls its + # function once. In serial mode, the order matches + items = rc.read() + task._assert(1, ni).reset(make_iter()) + assert len(items) == ni + assert items[0] == 0 and items[-1] == ni-1 + + # as the task is done, it should have been removed - we have read everything + assert task.is_done() + assert p.num_tasks() == null_tasks + + # pull individual items + rc = p.add_task(task) + assert p.num_tasks() == 1 + null_tasks + for i in range(ni): + items = rc.read(1) + assert len(items) == 1 + assert i == items[0] + # END for each item + # it couldn't yet notice that the input is depleted as we pulled exaclty + # ni items - the next one would remove it. Instead, we delete our channel + # which triggers orphan handling + assert p.num_tasks() == 1 + null_tasks + del(rc) + assert p.num_tasks() == null_tasks + + task.reset(make_iter()) + + # test min count + # if we query 1 item, it will prepare ni / 2 + task.min_count = ni / 2 + rc = p.add_task(task) + assert len(rc.read(1)) == 1 # 1 + assert len(rc.read(1)) == 1 + assert len(rc.read(ni-2)) == ni - 2 # rest - it has ni/2 - 2 on the queue, and pulls ni-2 + task._assert(2, ni) # two chunks, 20 calls ( all items ) + assert p.num_tasks() == 1 + null_tasks # it still doesn't know, didn't read too much + assert len(rc.read()) == 0 # now we read too much and its done + assert p.num_tasks() == null_tasks + + # test chunking + # we always want 4 chunks, these could go to individual nodes + task.reset(make_iter()) + task.max_chunksize = ni / 4 # 4 chunks + rc = p.add_task(task) + # must read a specific item count + # count is still at ni / 2 - here we want more than that + assert len(rc.read(ni / 2 + 2)) == ni / 2 + 2 # make sure its uneven ;) + assert len(rc.read(ni / 2 - 2)) == ni / 2 - 2 + + # END read chunks + task._assert(ni / 4, ni) # read two times, got 4 processing steps + assert p.num_tasks() == null_tasks # depleted + + # but this only hits if we want too many items, if we want less, it could + # still do too much - hence we set the min_count to the same number to enforce + # at least ni / 4 items to be preocessed, no matter what we request + task.reset(make_iter()) + task.min_count = None + rc = p.add_task(task) + for i in range(ni): + assert rc.read(1)[0] == i + # END pull individual items + # too many processing counts ;) + task._assert(ni, ni) + assert p.num_tasks() == 1 + null_tasks + assert p.del_task(task) is p # del manually this time + assert p.num_tasks() == null_tasks + + # now with we set the minimum count to reduce the number of processing counts + task.reset(make_iter()) + task.min_count = ni / 4 + rc = p.add_task(task) + for i in range(ni): + assert rc.read(1)[0] == i + # END for each item + task._assert(ni / 4, ni) + del(rc) + assert p.num_tasks() == null_tasks + + def _assert_async_dependent_tasks(self, p): + pass + def test_base(self): p = ThreadPool() @@ -50,30 +156,49 @@ class TestThreadPool(TestBase): p.set_size(i) assert p.size() == i - # currently in serial mode ! + # SINGLE TASK SERIAL SYNC MODE + ############################## + # put a few unrelated tasks that we forget about + urc1 = p.add_task(TestThreadTaskNode(iter(list()), "nothing", None)) + urc2 = p.add_task(TestThreadTaskNode(iter(list()), "nothing", None)) + assert p.num_tasks() == 2 + self._assert_sync_single_task(p) + assert p.num_tasks() == 2 + del(urc1) + del(urc2) + assert p.num_tasks() == 0 - # add a simple task - # it iterates n items - ni = 20 - task = TestThreadTaskNode(iter(range(ni)), 'iterator', None) - task.fun = task.do_fun - assert p.num_tasks() == 0 - rc = p.add_task(task) - assert p.num_tasks() == 1 - assert isinstance(rc, RPoolChannel) - assert task._out_wc is not None + # DEPENDENT TASKS SERIAL + ######################## + self._assert_async_dependent_tasks(p) + + + # SINGLE TASK THREADED SYNC MODE + ################################ + # step one gear up - just one thread for now. + num_threads = len(threading.enumerate()) + p.set_size(1) + assert len(threading.enumerate()) == num_threads + 1 + # deleting the pool stops its threads - just to be sure ;) + del(p) + assert len(threading.enumerate()) == num_threads + + p = ThreadPool(1) + assert len(threading.enumerate()) == num_threads + 1 + + # here we go + self._assert_sync_single_task(p) + - # pull the result completely - we should get one task, which calls its - # function once. In serial mode, the order matches - items = rc.read() - task._assert(1, ni).reset() - assert len(items) == ni - assert items[0] == 0 and items[-1] == ni-1 + # SINGLE TASK ASYNC MODE + ######################## + # two threads to compete for a single task - # switch to threaded mode - just one thread for now - # two threads to compete for tasks + # DEPENDENT TASK ASYNC MODE + ########################### + # self._assert_async_dependent_tasks(p) -- cgit v1.2.3 From 867129e2950458ab75523b920a5e227e3efa8bbc Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 6 Jun 2010 23:08:06 +0200 Subject: channel.read: enhanced to be sure we don't run into non-atomicity issues related to our channel closed flag, which is the only way not to block forever on read(0) channels which were closed by a thread 'in the meanwhile' --- lib/git/async/channel.py | 89 ++++++++++++++++++++++++++++++++++-------- test/git/async/test_channel.py | 2 +- 2 files changed, 73 insertions(+), 18 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 70daed24..0a1db26b 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -5,6 +5,9 @@ from Queue import ( Full ) +from time import time +import sys + #{ Classes class Channel(object): """A channel is similar to a file like object. It has a write end as well as one or @@ -106,26 +109,78 @@ class RChannel(Channel): If count was < 1, a list with all items that could be read will be returned.""" # if the channel is closed for writing, we never block - if self._wc.closed: + if self._wc.closed or timeout == 0: block = False - + + # in non-blocking mode, its all not a problem out = list() - try: - if count == 1: - out.append(self._wc._queue.get(block, timeout)) - elif count < 1: - while True: - out.append(self._wc._queue.get(block, timeout)) - # END for each item - return out - else: - for i in xrange(count): - out.append(self._wc._queue.get(block, timeout)) - # END for each item + queue = self._wc._queue + if not block: + # be as fast as possible in non-blocking mode, hence + # its a bit 'unrolled' + try: + if count == 1: + out.append(queue.get(False)) + elif count < 1: + while True: + out.append(queue.get(False)) + # END for each item + else: + for i in xrange(count): + out.append(queue.get(False)) + # END for each item + # END handle count + except Empty: + pass + # END handle exceptions + else: + # if we have really bad timing, the source of the channel + # marks itself closed, but before setting it, the thread + # switches to us. We read it, read False, and try to fetch + # something, and never return. The whole closed channel thing + # is not atomic ( of course ) + # This is why we never block for long, to get a chance to recheck + # for closed channels. + # We blend this into the timeout of the user + ourtimeout = 0.25 # the smaller, the more responsive, but the slower + wc = self._wc + timeout = (timeout is None and sys.maxint) or timeout # make sure we can compute with it + assert timeout != 0.0, "shouldn't block if timeout is 0" # okay safe + if timeout and ourtimeout > timeout: + ourtimeout = timeout + # END setup timeout + + # to get everything into one loop, we set the count accordingly + if count == 0: + count = sys.maxint # END handle count - except Empty: - pass - # END handle exceptions + + for i in xrange(count): + have_timeout = False + st = time() + while True: + try: + if wc.closed: + have_timeout = True + break + # END don't continue on closed channels + + # END abort reading if it was closed ( in the meanwhile ) + out.append(queue.get(block, ourtimeout)) + break # breakout right away + except Empty: + if timeout - (time() - st) <= 0: + # hitting timeout + have_timeout = True + break + # END abort if the user wants no more time spent here + # END handle timeout + # END endless timer loop + if have_timeout: + break + # END stop on timeout + # END for each item + # END handle blocking return out #} END interface diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index 2a3c1585..6472b5b5 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -26,7 +26,7 @@ class TestChannels(TestBase): assert rc.read(timeout=1) == [item, item2] assert time.time() - st >= 1.0 - # next read blocks, then raises - it waits a second + # next read blocks. it waits a second st = time.time() assert len(rc.read(1, True, 1)) == 0 assert time.time() - st >= 1.0 -- cgit v1.2.3 From 6a252661c3bf4202a4d571f9c41d2afa48d9d75f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 6 Jun 2010 23:41:20 +0200 Subject: pool: First version which works as expected in async mode. Its just using a single task for now, but next up are dependent tasks --- lib/git/async/channel.py | 10 +++++++++- lib/git/async/pool.py | 44 ++++++++++++++++++++++++++----------------- lib/git/async/task.py | 12 ++++++++++-- lib/git/async/thread.py | 43 +++++++++--------------------------------- test/git/async/test_pool.py | 43 +++++++++++++++++++++++++++++++----------- test/git/async/test_thread.py | 19 +++++++++---------- 6 files changed, 96 insertions(+), 75 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 0a1db26b..2add9478 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -162,7 +162,15 @@ class RChannel(Channel): try: if wc.closed: have_timeout = True - break + # its about the 'in the meanwhile' :) - get everything + # we can in non-blocking mode. This will raise + try: + while True: + out.append(queue.get(False)) + # END until it raises Empty + except Empty: + break + # END finally, out of here # END don't continue on closed channels # END abort reading if it was closed ( in the meanwhile ) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 620e2258..fcb0f442 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -49,7 +49,7 @@ class RPoolChannel(RChannel): If a function is not provided, the call is effectively uninstalled.""" self._post_cb = fun - def read(self, count=0, block=False, timeout=None): + def read(self, count=0, block=True, timeout=None): """Read an item that was processed by one of our threads :note: Triggers task dependency handling needed to provide the necessary input""" @@ -57,14 +57,21 @@ class RPoolChannel(RChannel): self._pre_cb() # END pre callback - ################################################## - self._pool._prepare_processing(self._task, count) - ################################################## + ########## prepare ############################## + self._pool._prepare_channel_read(self._task, count) + + ######### read data ###### + # read actual items, tasks were setup to put their output into our channel ( as well ) items = RChannel.read(self, count, block, timeout) + if self._post_cb: items = self._post_cb(items) + + ####### Finalize ######## + self._pool._post_channel_read(self._task) + return items #{ Internal @@ -119,17 +126,17 @@ class ThreadPool(object): self._consumed_tasks.append(task) return True # END stop processing - - # allow min-count override. This makes sure we take at least min-count - # items off the input queue ( later ) - if task.min_count is not None and count != 0 and count < task.min_count: - count = task.min_count - # END handle min-count # if the task does not have the required output on its queue, schedule # it for processing. If we should process all, we don't care about the # amount as it should process until its all done. if count < 1 or task._out_wc.size() < count: + # allow min-count override. This makes sure we take at least min-count + # items off the input queue ( later ) + if task.min_count is not None and 0 < count < task.min_count: + count = task.min_count + # END handle min-count + numchunks = 1 chunksize = count remainder = 0 @@ -144,10 +151,10 @@ class ThreadPool(object): remainder = count - (numchunks * chunksize) # END handle chunking - print count, numchunks, chunksize, remainder # the following loops are kind of unrolled - code duplication # should make things execute faster. Putting the if statements # into the loop would be less code, but ... slower + print count, numchunks, chunksize, remainder, task._out_wc.size() if self._workers: # respect the chunk size, and split the task up if we want # to process too much. This can be defined per task @@ -176,18 +183,13 @@ class ThreadPool(object): if remainder: task.process(remainder) # END handle chunksize - - # as we are serial, we can check for consumption right away - if task.error() or task.is_done(): - self._consumed_tasks.append(task) - # END handle consumption # END handle serial mode # END handle queuing # always walk the whole graph, we want to find consumed tasks return True - def _prepare_processing(self, task, count): + def _prepare_channel_read(self, task, count): """Process the tasks which depend on the given one to be sure the input channels are filled with data once we process the actual task @@ -201,10 +203,18 @@ class ThreadPool(object): is fine as we walked them depth-first.""" self._tasks.visit_input_inclusive_depth_first(task, lambda n: self._queue_feeder_visitor(n, count)) + def _post_channel_read(self, task): + """Called after we processed a read to cleanup""" + # check whether we consumed the task, and schedule it for deletion + if task.error() or task.is_done(): + self._consumed_tasks.append(task) + # END handle consumption + # delete consumed tasks to cleanup for task in self._consumed_tasks: self.del_task(task) # END for each task to delete + del(self._consumed_tasks[:]) def _del_task_if_orphaned(self, task): diff --git a/lib/git/async/task.py b/lib/git/async/task.py index ec650237..3137746c 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -7,7 +7,13 @@ class OutputChannelTask(Node): additional information on how the task should be queued and processed. Results of the item processing are sent to an output channel, which is to be - set by the creator""" + set by the creator + + * **min_count** assures that not less than min_count items will be processed per call. + * **max_chunksize** assures that multi-threading is happening in smaller chunks. If + someone wants all items to be processed, using read(0), the whole task would go to + one worker, as well as dependent tasks. If you want finer granularity , you can + specify this here, causing chunks to be no larger than max_chunksize""" __slots__ = ( '_read', # method to yield items to process '_out_wc', # output write channel '_exc', # exception caught @@ -42,7 +48,6 @@ class OutputChannelTask(Node): def process(self, count=0): """Process count items and send the result individually to the output channel""" items = self._read(count) - try: if self.apply_single: for item in items: @@ -58,6 +63,9 @@ class OutputChannelTask(Node): # if we didn't get all demanded items, which is also the case if count is 0 # we have depleted the input channel and are done + # We could check our output channel for how many items we have and put that + # into the equation, but whats important is that we were asked to produce + # count items. if not items or len(items) != count: self.set_done() # END handle done state diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 82acbd8f..0292289d 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -115,33 +115,17 @@ class WorkerThread(TerminatableThread): """ __slots__ = ('inq', 'outq') - class InvalidRoutineError(Exception): - """Class sent as return value in case of an error""" - def __init__(self, inq = None): super(WorkerThread, self).__init__() self.inq = inq or Queue.Queue() - def call(self, function, *args, **kwargs): - """Method that makes the call to the worker using the input queue, - returning our output queue - - :param funciton: can be a standalone function unrelated to this class, - a class method of this class or any instance method. - If it is a string, it will be considered a function residing on this instance - :param args: arguments to pass to function - :parma **kwargs: kwargs to pass to function""" - self.inq.put((function, args, kwargs)) - def run(self): """Process input tasks until we receive the quit signal""" while True: if self._should_terminate(): break # END check for stop request - routine = None - args = tuple() - kwargs = dict() + # don't wait too long, instead check for the termination request more often try: tasktuple = self.inq.get(True, 1) @@ -149,29 +133,19 @@ class WorkerThread(TerminatableThread): continue # END get task with timeout - if isinstance(tasktuple, (tuple, list)): - if len(tasktuple) == 3: - routine, args, kwargs = tasktuple - elif len(tasktuple) == 2: - routine, args = tasktuple - elif len(tasktuple) == 1: - routine = tasktuple[0] - # END tasktuple length check - elif inspect.isroutine(tasktuple): - routine = tasktuple - # END tasktuple handling + # needing exactly one function, and one arg + assert len(tasktuple) == 2, "Need tuple of function, arg - it could be more flexible, but its reduced to what we need" + routine, arg = tasktuple try: rval = None if inspect.ismethod(routine): if routine.im_self is None: - rval = routine(self, *args, **kwargs) + rval = routine(self, arg) else: - rval = routine(*args, **kwargs) + rval = routine(arg) elif inspect.isroutine(routine): - rval = routine(*args, **kwargs) - elif isinstance(routine, basestring) and hasattr(self, routine): - rval = getattr(self, routine)(*args, **kwargs) + rval = routine(arg) else: # ignore unknown items print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) @@ -180,7 +154,8 @@ class WorkerThread(TerminatableThread): except StopIteration: break except Exception,e: - print "%s: Task %s raised unhandled exception: %s" % (self.getName(), str(tasktuple), str(e)) + print "%s: Task %s raised unhandled exception: %s - this really shouldn't happen !" % (self.getName(), str(tasktuple), str(e)) + break # abort ... # END routine exception handling # END endless loop diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 65b2d228..628e2a93 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -10,9 +10,12 @@ class TestThreadTaskNode(InputIteratorThreadTask): def __init__(self, *args, **kwargs): super(TestThreadTaskNode, self).__init__(*args, **kwargs) self.reset(self._iterator) + self.should_fail = False def do_fun(self, item): self.item_count += 1 + if self.should_fail: + raise AssertionError("I am failing just for the fun of it") return item def reset(self, iterator): @@ -29,7 +32,7 @@ class TestThreadTaskNode(InputIteratorThreadTask): :return: self""" assert self.process_count == pc assert self.item_count == fc - + assert not self.error() return self @@ -60,10 +63,10 @@ class TestThreadPool(TestBase): assert task._out_wc is not None # pull the result completely - we should get one task, which calls its - # function once. In serial mode, the order matches + # function once. In sync mode, the order matches items = rc.read() - task._assert(1, ni).reset(make_iter()) assert len(items) == ni + task._assert(1, ni).reset(make_iter()) assert items[0] == 0 and items[-1] == ni-1 # as the task is done, it should have been removed - we have read everything @@ -91,13 +94,17 @@ class TestThreadPool(TestBase): # if we query 1 item, it will prepare ni / 2 task.min_count = ni / 2 rc = p.add_task(task) - assert len(rc.read(1)) == 1 # 1 - assert len(rc.read(1)) == 1 - assert len(rc.read(ni-2)) == ni - 2 # rest - it has ni/2 - 2 on the queue, and pulls ni-2 - task._assert(2, ni) # two chunks, 20 calls ( all items ) - assert p.num_tasks() == 1 + null_tasks # it still doesn't know, didn't read too much - assert len(rc.read()) == 0 # now we read too much and its done + assert len(rc.read(1)) == 1 # processes ni / 2 + assert len(rc.read(1)) == 1 # processes nothing + # rest - it has ni/2 - 2 on the queue, and pulls ni-2 + # It wants too much, so the task realizes its done. The task + # doesn't care about the items in its output channel + assert len(rc.read(ni-2)) == ni - 2 assert p.num_tasks() == null_tasks + task._assert(2, ni) # two chunks, 20 calls ( all items ) + + # its already done, gives us no more + assert len(rc.read()) == 0 # test chunking # we always want 4 chunks, these could go to individual nodes @@ -135,11 +142,25 @@ class TestThreadPool(TestBase): for i in range(ni): assert rc.read(1)[0] == i # END for each item - task._assert(ni / 4, ni) + task._assert(ni / task.min_count, ni) del(rc) assert p.num_tasks() == null_tasks + # test failure + # on failure, the processing stops and the task is finished, keeping + # his error for later + task.reset(make_iter()) + task.should_fail = True + rc = p.add_task(task) + assert len(rc.read()) == 0 # failure on first item + assert isinstance(task.error(), AssertionError) + assert p.num_tasks() == null_tasks + def _assert_async_dependent_tasks(self, p): + # includes failure in center task, 'recursive' orphan cleanup + # This will also verify that the channel-close mechanism works + # t1 -> t2 -> t3 + # t1 -> x -> t3 pass def test_base(self): @@ -199,6 +220,6 @@ class TestThreadPool(TestBase): # DEPENDENT TASK ASYNC MODE ########################### - # self._assert_async_dependent_tasks(p) + self._assert_async_dependent_tasks(p) diff --git a/test/git/async/test_thread.py b/test/git/async/test_thread.py index 2ea8d1ff..a08c1dc7 100644 --- a/test/git/async/test_thread.py +++ b/test/git/async/test_thread.py @@ -3,28 +3,26 @@ from test.testlib import * from git.async.thread import * from Queue import Queue +import time class TestWorker(WorkerThread): def __init__(self, *args, **kwargs): super(TestWorker, self).__init__(*args, **kwargs) self.reset() - def fun(self, *args, **kwargs): + def fun(self, arg): self.called = True - self.args = args - self.kwargs = kwargs + self.arg = arg return True def make_assertion(self): assert self.called - assert self.args - assert self.kwargs + assert self.arg self.reset() def reset(self): self.called = False - self.args = None - self.kwargs = None + self.arg = None class TestThreads( TestCase ): @@ -36,10 +34,11 @@ class TestThreads( TestCase ): # test different method types standalone_func = lambda *args, **kwargs: worker.fun(*args, **kwargs) - for function in ("fun", TestWorker.fun, worker.fun, standalone_func): - worker.call(function, 1, this='that') + for function in (TestWorker.fun, worker.fun, standalone_func): + worker.inq.put((function, 1)) + time.sleep(0.01) worker.make_assertion() # END for each function type - worker.call('quit') + worker.stop_and_join() -- cgit v1.2.3 From a8a448b7864e21db46184eab0f0a21d7725d074f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 10:38:22 +0200 Subject: pool.consumed_tasks: is now a queue to be thread safe, in preparation for multiple connected pools Reduced waiting time in tests to make them complete faster --- lib/git/async/pool.py | 29 ++++++++++++++++++----------- lib/git/async/thread.py | 5 +++++ test/git/async/test_channel.py | 15 ++++++++------- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index fcb0f442..5518e37e 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -1,7 +1,7 @@ """Implementation of a thread-pool working with channels""" from thread import WorkerThread from task import InputChannelTask -from Queue import Queue +from Queue import Queue, Empty from graph import ( Graph, @@ -103,14 +103,14 @@ class ThreadPool(object): used only from the main thread, hence you cannot consume their results from multiple threads unless you use a task for it.""" __slots__ = ( '_tasks', # a graph of tasks - '_consumed_tasks', # a list with tasks that are done or had an error + '_consumed_tasks', # a queue with tasks that are done or had an error '_workers', # list of worker threads '_queue', # master queue for tasks ) def __init__(self, size=0): self._tasks = Graph() - self._consumed_tasks = list() + self._consumed_tasks = Queue() # make sure its threadsafe self._workers = list() self._queue = Queue() self.set_size(size) @@ -123,7 +123,7 @@ class ThreadPool(object): """Walk the graph and find tasks that are done for later cleanup, and queue all others for processing by our worker threads ( if available ).""" if task.error() or task.is_done(): - self._consumed_tasks.append(task) + self._consumed_tasks.put(task) return True # END stop processing @@ -206,16 +206,21 @@ class ThreadPool(object): def _post_channel_read(self, task): """Called after we processed a read to cleanup""" # check whether we consumed the task, and schedule it for deletion + # This could have happend after the read returned ( even though the pre-read + # checks it as well ) if task.error() or task.is_done(): - self._consumed_tasks.append(task) + self._consumed_tasks.put(task) # END handle consumption # delete consumed tasks to cleanup - for task in self._consumed_tasks: - self.del_task(task) - # END for each task to delete - - del(self._consumed_tasks[:]) + try: + while True: + ct = self._consumed_tasks.get(False) + self.del_task(ct) + # END for each task to delete + except Empty: + pass + # END pop queue empty def _del_task_if_orphaned(self, task): """Check the task, and delete it if it is orphaned""" @@ -236,7 +241,9 @@ class ThreadPool(object): :return: self :param size: if 0, the pool will do all work itself in the calling thread, - otherwise the work will be distributed among the given amount of threads""" + otherwise the work will be distributed among the given amount of threads + + :note: currently NOT threadsafe !""" # either start new threads, or kill existing ones. # If we end up with no threads, we process the remaining chunks on the queue # ourselves diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 0292289d..2ed002e9 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -115,6 +115,11 @@ class WorkerThread(TerminatableThread): """ __slots__ = ('inq', 'outq') + + # define how often we should check for a shutdown request in case our + # taskqueue is empty + shutdown_check_time_s = 0.5 + def __init__(self, inq = None): super(WorkerThread, self).__init__() self.inq = inq or Queue.Queue() diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index 6472b5b5..acfbd15e 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -22,14 +22,15 @@ class TestChannels(TestBase): wc.write(item2) # read all - it blocks as its still open for writing + to = 0.2 st = time.time() - assert rc.read(timeout=1) == [item, item2] - assert time.time() - st >= 1.0 + assert rc.read(timeout=to) == [item, item2] + assert time.time() - st >= to # next read blocks. it waits a second st = time.time() - assert len(rc.read(1, True, 1)) == 0 - assert time.time() - st >= 1.0 + assert len(rc.read(1, True, to)) == 0 + assert time.time() - st >= to # writing to a closed channel raises assert not wc.closed @@ -50,10 +51,10 @@ class TestChannels(TestBase): wc, rc = Channel(1) wc.write(item) # fine - # blocks for a second, its full + # blocks for a a moment, its full st = time.time() - self.failUnlessRaises(EOFError, wc.write, item, True, 1) - assert time.time() - st >= 1.0 + self.failUnlessRaises(EOFError, wc.write, item, True, to) + assert time.time() - st >= to # get our only one assert rc.read(1)[0] == item -- cgit v1.2.3 From 619662a9138fd78df02c52cae6dc89db1d70a0e5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 12:10:56 +0200 Subject: changed scheduling and chunksize calculation in respect to the task.min_count, to fix theoretical option for a deadlock in serial mode, and unnecessary blocking in async mode --- lib/git/async/pool.py | 213 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 137 insertions(+), 76 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 5518e37e..009096f2 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -1,5 +1,6 @@ """Implementation of a thread-pool working with channels""" from thread import WorkerThread +from threading import Lock from task import InputChannelTask from Queue import Queue, Empty @@ -83,7 +84,7 @@ class RPoolChannel(RChannel): #} END internal -class ThreadPool(object): +class Pool(object): """A thread pool maintains a set of one or more worker threads, but supports a fully serial mode in which case the amount of threads is zero. @@ -106,88 +107,35 @@ class ThreadPool(object): '_consumed_tasks', # a queue with tasks that are done or had an error '_workers', # list of worker threads '_queue', # master queue for tasks + '_taskgraph_lock', # lock for accessing the task graph ) + # CONFIGURATION + # The type of worker to create - its expected to provide the Thread interface, + # taking the taskqueue as only init argument + # as well as a method called stop_and_join() to terminate it + WorkerCls = None + + # The type of lock to use to protect critical sections, providing the + # threading.Lock interface + LockCls = None + + # the type of the task queue to use - it must provide the Queue interface + TaskQueueCls = None + + def __init__(self, size=0): self._tasks = Graph() self._consumed_tasks = Queue() # make sure its threadsafe self._workers = list() - self._queue = Queue() + self._queue = self.TaskQueueCls() + self._taskgraph_lock = self.LockCls() self.set_size(size) def __del__(self): self.set_size(0) #{ Internal - def _queue_feeder_visitor(self, task, count): - """Walk the graph and find tasks that are done for later cleanup, and - queue all others for processing by our worker threads ( if available ).""" - if task.error() or task.is_done(): - self._consumed_tasks.put(task) - return True - # END stop processing - - # if the task does not have the required output on its queue, schedule - # it for processing. If we should process all, we don't care about the - # amount as it should process until its all done. - if count < 1 or task._out_wc.size() < count: - # allow min-count override. This makes sure we take at least min-count - # items off the input queue ( later ) - if task.min_count is not None and 0 < count < task.min_count: - count = task.min_count - # END handle min-count - - numchunks = 1 - chunksize = count - remainder = 0 - - # we need the count set for this - can't chunk up unlimited items - # In serial mode we could do this by checking for empty input channels, - # but in dispatch mode its impossible ( == not easily possible ) - # Only try it if we have enough demand - if task.max_chunksize and count > task.max_chunksize: - numchunks = count / task.max_chunksize - chunksize = task.max_chunksize - remainder = count - (numchunks * chunksize) - # END handle chunking - - # the following loops are kind of unrolled - code duplication - # should make things execute faster. Putting the if statements - # into the loop would be less code, but ... slower - print count, numchunks, chunksize, remainder, task._out_wc.size() - if self._workers: - # respect the chunk size, and split the task up if we want - # to process too much. This can be defined per task - queue = self._queue - if numchunks > 1: - for i in xrange(numchunks): - queue.put((task.process, chunksize)) - # END for each chunk to put - else: - queue.put((task.process, chunksize)) - # END try efficient looping - - if remainder: - queue.put((task.process, remainder)) - # END handle chunksize - else: - # no workers, so we have to do the work ourselves - if numchunks > 1: - for i in xrange(numchunks): - task.process(chunksize) - # END for each chunk to put - else: - task.process(chunksize) - # END try efficient looping - - if remainder: - task.process(remainder) - # END handle chunksize - # END handle serial mode - # END handle queuing - - # always walk the whole graph, we want to find consumed tasks - return True def _prepare_channel_read(self, task, count): """Process the tasks which depend on the given one to be sure the input @@ -201,7 +149,98 @@ class ThreadPool(object): Tasks which are not done will be put onto the queue for processing, which is fine as we walked them depth-first.""" - self._tasks.visit_input_inclusive_depth_first(task, lambda n: self._queue_feeder_visitor(n, count)) + dfirst_tasks = list() + # for the walk, we must make sure the ordering does not change + # Note: the result of this could be cached + self._tasks.visit_input_inclusive_depth_first(task, lambda n: dfirst_tasks.append(n)) + + # check the min count on all involved tasks, and be sure that we don't + # have any task which produces less than the maximum min-count of all tasks + # The actual_count is used when chunking tasks up for the queue, whereas + # the count is usued to determine whether we still have enough output + # on the queue, checking qsize ( ->revise ) + # ABTRACT: If T depends on T-1, and the client wants 1 item, T produces + # at least 10, T-1 goes with 1, then T will block after 1 item, which + # is read by the client. On the next read of 1 item, we would find T's + # queue empty and put in another 10, which could put another thread into + # blocking state. T-1 produces one more item, which is consumed right away + # by the two threads running T. Although this works in the end, it leaves + # many threads blocking and waiting for input, which is not desired. + # Setting the min-count to the max of the mincount of all tasks assures + # we have enough items for all. + # Addition: in serial mode, we would enter a deadlock if one task would + # ever wait for items ! + actual_count = count + min_counts = (((t.min_count is not None and t.min_count) or count) for t in dfirst_tasks) + min_count = reduce(lambda m1, m2: max(m1, m2), min_counts) + if 0 < count < min_count: + actual_count = min_count + # END set actual count + + # the list includes our tasks - the first one to evaluate first, the + # requested one last + for task in dfirst_tasks: + if task.error() or task.is_done(): + self._consumed_tasks.put(task) + continue + # END skip processing + + # if the task does not have the required output on its queue, schedule + # it for processing. If we should process all, we don't care about the + # amount as it should process until its all done. + # NOTE: revise this for multi-tasking - checking qsize doesnt work there ! + if count < 1 or task._out_wc.size() < count: + # but we continue to use the actual count to produce the output + numchunks = 1 + chunksize = actual_count + remainder = 0 + + # we need the count set for this - can't chunk up unlimited items + # In serial mode we could do this by checking for empty input channels, + # but in dispatch mode its impossible ( == not easily possible ) + # Only try it if we have enough demand + if task.max_chunksize and actual_count > task.max_chunksize: + numchunks = actual_count / task.max_chunksize + chunksize = task.max_chunksize + remainder = actual_count - (numchunks * chunksize) + # END handle chunking + + # the following loops are kind of unrolled - code duplication + # should make things execute faster. Putting the if statements + # into the loop would be less code, but ... slower + print actual_count, numchunks, chunksize, remainder, task._out_wc.size() + if self._workers: + # respect the chunk size, and split the task up if we want + # to process too much. This can be defined per task + queue = self._queue + if numchunks > 1: + for i in xrange(numchunks): + queue.put((task.process, chunksize)) + # END for each chunk to put + else: + queue.put((task.process, chunksize)) + # END try efficient looping + + if remainder: + queue.put((task.process, remainder)) + # END handle chunksize + else: + # no workers, so we have to do the work ourselves + if numchunks > 1: + for i in xrange(numchunks): + task.process(chunksize) + # END for each chunk to put + else: + task.process(chunksize) + # END try efficient looping + + if remainder: + task.process(remainder) + # END handle chunksize + # END handle serial mode + # END handle queuing + # END for each task to process + def _post_channel_read(self, task): """Called after we processed a read to cleanup""" @@ -250,7 +289,7 @@ class ThreadPool(object): cur_count = len(self._workers) if cur_count < size: for i in range(size - cur_count): - worker = WorkerThread(self._queue) + worker = self.WorkerCls(self._queue) worker.start() self._workers.append(worker) # END for each new worker to create @@ -291,7 +330,12 @@ class ThreadPool(object): # keep its input nodes as we check whether they were orphaned in_tasks = task.in_nodes task.set_done() - self._tasks.del_node(task) + self._taskgraph_lock.acquire() + try: + self._tasks.del_node(task) + finally: + self._taskgraph_lock.release() + # END locked deletion for t in in_tasks: self._del_task_if_orphaned(t) @@ -314,16 +358,33 @@ class ThreadPool(object): task._pool_ref = weakref.ref(self) # END init input channel task - self._tasks.add_node(task) + self._taskgraph_lock.acquire() + try: + self._tasks.add_node(task) + finally: + self._taskgraph_lock.release() + # END sync task addition # If the input channel is one of our read channels, we add the relation if has_input_channel: ic = task.in_rc if isinstance(ic, RPoolChannel) and ic._pool is self: - self._tasks.add_edge(ic._task, task) + self._taskgraph_lock.acquire() + try: + self._tasks.add_edge(ic._task, task) + finally: + self._taskgraph_lock.release() + # END handle edge-adding # END add task relation # END handle input channels for connections return rc #} END interface + + +class ThreadPool(Pool): + """A pool using threads as worker""" + WorkerCls = WorkerThread + LockCls = Lock + TaskQueueCls = Queue -- cgit v1.2.3 From 8c3c271b0d6b5f56b86e3f177caf3e916b509b52 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 13:05:35 +0200 Subject: Added task order cache, and a lock to prevent us walking the graph while changing tasks Now processing more items to test performance, in dual-threaded mode as well, and its rather bad, have to figure out the reason for this, probably gil, but queues could help --- lib/git/async/pool.py | 26 +++++++++++++++++++++----- lib/git/async/thread.py | 3 ++- test/git/async/test_pool.py | 40 +++++++++++++++++++++++++++++++--------- 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 009096f2..26a6a182 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -107,6 +107,7 @@ class Pool(object): '_consumed_tasks', # a queue with tasks that are done or had an error '_workers', # list of worker threads '_queue', # master queue for tasks + '_taskorder_cache', # map task id -> ordered dependent tasks '_taskgraph_lock', # lock for accessing the task graph ) @@ -130,6 +131,7 @@ class Pool(object): self._workers = list() self._queue = self.TaskQueueCls() self._taskgraph_lock = self.LockCls() + self._taskorder_cache = dict() self.set_size(size) def __del__(self): @@ -149,10 +151,21 @@ class Pool(object): Tasks which are not done will be put onto the queue for processing, which is fine as we walked them depth-first.""" - dfirst_tasks = list() - # for the walk, we must make sure the ordering does not change - # Note: the result of this could be cached - self._tasks.visit_input_inclusive_depth_first(task, lambda n: dfirst_tasks.append(n)) + # for the walk, we must make sure the ordering does not change. Even + # when accessing the cache, as it is related to graph changes + self._taskgraph_lock.acquire() + try: + try: + dfirst_tasks = self._taskorder_cache[id(task)] + except KeyError: + # have to retrieve the list from the graph + dfirst_tasks = list() + self._tasks.visit_input_inclusive_depth_first(task, lambda n: dfirst_tasks.append(n)) + self._taskorder_cache[id(task)] = dfirst_tasks + # END handle cached order retrieval + finally: + self._taskgraph_lock.release() + # END handle locking # check the min count on all involved tasks, and be sure that we don't # have any task which produces less than the maximum min-count of all tasks @@ -208,7 +221,8 @@ class Pool(object): # the following loops are kind of unrolled - code duplication # should make things execute faster. Putting the if statements # into the loop would be less code, but ... slower - print actual_count, numchunks, chunksize, remainder, task._out_wc.size() + # DEBUG + # print actual_count, numchunks, chunksize, remainder, task._out_wc.size() if self._workers: # respect the chunk size, and split the task up if we want # to process too much. This can be defined per task @@ -332,6 +346,7 @@ class Pool(object): task.set_done() self._taskgraph_lock.acquire() try: + self._taskorder_cache.clear() self._tasks.del_node(task) finally: self._taskgraph_lock.release() @@ -360,6 +375,7 @@ class Pool(object): self._taskgraph_lock.acquire() try: + self._taskorder_cache.clear() self._tasks.add_node(task) finally: self._taskgraph_lock.release() diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 2ed002e9..f875f094 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -141,7 +141,8 @@ class WorkerThread(TerminatableThread): # needing exactly one function, and one arg assert len(tasktuple) == 2, "Need tuple of function, arg - it could be more flexible, but its reduced to what we need" routine, arg = tasktuple - + # DEBUG + # print "%s: picked up: %s(%s)" % (self.name, routine, arg) try: rval = None if inspect.ismethod(routine): diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 628e2a93..df3eaf11 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -40,14 +40,15 @@ class TestThreadPool(TestBase): max_threads = cpu_count() - def _assert_sync_single_task(self, p): + def _assert_single_task(self, p, async=False): """Performs testing in a synchronized environment""" null_tasks = p.num_tasks() # in case we had some before # add a simple task # it iterates n items - ni = 20 + ni = 1000 assert ni % 2 == 0, "ni needs to be dividable by 2" + assert ni % 4 == 0, "ni needs to be dividable by 4" def make_iter(): return iter(range(ni)) @@ -76,11 +77,18 @@ class TestThreadPool(TestBase): # pull individual items rc = p.add_task(task) assert p.num_tasks() == 1 + null_tasks + st = time.time() for i in range(ni): items = rc.read(1) assert len(items) == 1 - assert i == items[0] + + # can't assert order in async mode + if not async: + assert i == items[0] # END for each item + elapsed = time.time() - st + print >> sys.stderr, "Threadpool: processed %i individual items, with %i threads, one at a time, in %f s ( %f items / s )" % (ni, p.size(), elapsed, ni / elapsed) + # it couldn't yet notice that the input is depleted as we pulled exaclty # ni items - the next one would remove it. Instead, we delete our channel # which triggers orphan handling @@ -113,11 +121,13 @@ class TestThreadPool(TestBase): rc = p.add_task(task) # must read a specific item count # count is still at ni / 2 - here we want more than that - assert len(rc.read(ni / 2 + 2)) == ni / 2 + 2 # make sure its uneven ;) + # 2 steps with n / 4 items, + 1 step with n/4 items to get + 2 + assert len(rc.read(ni / 2 + 2)) == ni / 2 + 2 + # have n / 4 - 2 items on queue, want n / 4 in first chunk, cause 1 processing + # ( 4 in total ). Still want n / 4 - 2 in second chunk, causing another processing assert len(rc.read(ni / 2 - 2)) == ni / 2 - 2 - # END read chunks - task._assert(ni / 4, ni) # read two times, got 4 processing steps + task._assert( 5, ni) assert p.num_tasks() == null_tasks # depleted # but this only hits if we want too many items, if we want less, it could @@ -126,10 +136,18 @@ class TestThreadPool(TestBase): task.reset(make_iter()) task.min_count = None rc = p.add_task(task) + st = time.time() for i in range(ni): - assert rc.read(1)[0] == i + if async: + assert len(rc.read(1)) == 1 + else: + assert rc.read(1)[0] == i + # END handle async mode # END pull individual items # too many processing counts ;) + elapsed = time.time() - st + print >> sys.stderr, "Threadpool: processed %i individual items in chunks of %i, with %i threads, one at a time, in %f s ( %f items / s )" % (ni, ni/4, p.size(), elapsed, ni / elapsed) + task._assert(ni, ni) assert p.num_tasks() == 1 + null_tasks assert p.del_task(task) is p # del manually this time @@ -183,7 +201,9 @@ class TestThreadPool(TestBase): urc1 = p.add_task(TestThreadTaskNode(iter(list()), "nothing", None)) urc2 = p.add_task(TestThreadTaskNode(iter(list()), "nothing", None)) assert p.num_tasks() == 2 - self._assert_sync_single_task(p) + + ## SINGLE TASK ################# + self._assert_single_task(p, False) assert p.num_tasks() == 2 del(urc1) del(urc2) @@ -209,13 +229,15 @@ class TestThreadPool(TestBase): assert len(threading.enumerate()) == num_threads + 1 # here we go - self._assert_sync_single_task(p) + self._assert_single_task(p, False) # SINGLE TASK ASYNC MODE ######################## # two threads to compete for a single task + p.set_size(2) + self._assert_single_task(p, True) # DEPENDENT TASK ASYNC MODE -- cgit v1.2.3 From edd9e23c766cfd51b3a6f6eee5aac0b791ef2fd0 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 17:16:48 +0200 Subject: added high-speed locking facilities, allowing our Queue to be faster, at least in tests, and with multiple threads. There is still an sync bug in regard to closed channels to be fixed, as the Task.set_done handling is incorrecft --- lib/git/async/pool.py | 241 ++++++++++++++++++++++++++++++++++---------- lib/git/async/thread.py | 3 - test/git/async/test_pool.py | 20 ++-- 3 files changed, 199 insertions(+), 65 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 26a6a182..30291835 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -1,8 +1,16 @@ """Implementation of a thread-pool working with channels""" from thread import WorkerThread -from threading import Lock + +from threading import ( + Lock, + _Condition, + _sleep, + _time, + ) + from task import InputChannelTask from Queue import Queue, Empty +from collections import deque from graph import ( Graph, @@ -18,6 +26,96 @@ import weakref import sys +#{ Utilities + +class SyncQueue(deque): + """Adapter to allow using a deque like a queue, without locking""" + def get(self, block=True, timeout=None): + try: + return self.pop() + except IndexError: + raise Empty + # END raise empty + + def empty(self): + return len(self) == 0 + + put = deque.append + + +class HSCondition(_Condition): + """An attempt to make conditions less blocking, which gains performance + in return by sleeping less""" + delay = 0.00002 # reduces wait times, but increases overhead + + def wait(self, timeout=None): + waiter = Lock() + waiter.acquire() + self.__dict__['_Condition__waiters'].append(waiter) + saved_state = self._release_save() + try: # restore state no matter what (e.g., KeyboardInterrupt) + if timeout is None: + waiter.acquire() + else: + # Balancing act: We can't afford a pure busy loop, so we + # have to sleep; but if we sleep the whole timeout time, + # we'll be unresponsive. The scheme here sleeps very + # little at first, longer as time goes on, but never longer + # than 20 times per second (or the timeout time remaining). + endtime = _time() + timeout + delay = self.delay + acquire = waiter.acquire + while True: + gotit = acquire(0) + if gotit: + break + remaining = endtime - _time() + if remaining <= 0: + break + delay = min(delay * 2, remaining, .05) + _sleep(delay) + # END endless loop + if not gotit: + try: + self.__dict__['_Condition__waiters'].remove(waiter) + except ValueError: + pass + # END didn't ever get it + finally: + self._acquire_restore(saved_state) + + def notify(self, n=1): + __waiters = self.__dict__['_Condition__waiters'] + if not __waiters: + return + if n == 1: + __waiters[0].release() + try: + __waiters.pop(0) + except IndexError: + pass + else: + waiters = __waiters[:n] + for waiter in waiters: + waiter.release() + try: + __waiters.remove(waiter) + except ValueError: + pass + # END handle n = 1 case faster + +class PerfQueue(Queue): + """A queue using different condition objects to gain multithreading performance""" + def __init__(self, maxsize=0): + Queue.__init__(self, maxsize) + + self.not_empty = HSCondition(self.mutex) + self.not_full = HSCondition(self.mutex) + self.all_tasks_done = HSCondition(self.mutex) + + +#} END utilities + class RPoolChannel(RChannel): """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call before and after an item is to be read. @@ -49,7 +147,7 @@ class RPoolChannel(RChannel): returns a possibly changed item list. If it raises, the exception will be propagated. If a function is not provided, the call is effectively uninstalled.""" self._post_cb = fun - + def read(self, count=0, block=True, timeout=None): """Read an item that was processed by one of our threads :note: Triggers task dependency handling needed to provide the necessary @@ -58,8 +156,18 @@ class RPoolChannel(RChannel): self._pre_cb() # END pre callback + # if we have count items, don't do any queue preparation - if someone + # depletes the queue in the meanwhile, the channel will close and + # we will unblock naturally + have_enough = False + if count > 0: + # explicitly > count, as we want a certain safe range + have_enough = self._wc._queue.qsize() > count + # END risky game + ########## prepare ############################## - self._pool._prepare_channel_read(self._task, count) + if not have_enough: + self._pool._prepare_channel_read(self._task, count) ######### read data ###### @@ -127,9 +235,9 @@ class Pool(object): def __init__(self, size=0): self._tasks = Graph() - self._consumed_tasks = Queue() # make sure its threadsafe + self._consumed_tasks = None self._workers = list() - self._queue = self.TaskQueueCls() + self._queue = SyncQueue() # start with a sync queue self._taskgraph_lock = self.LockCls() self._taskorder_cache = dict() self.set_size(size) @@ -201,58 +309,60 @@ class Pool(object): # if the task does not have the required output on its queue, schedule # it for processing. If we should process all, we don't care about the # amount as it should process until its all done. - # NOTE: revise this for multi-tasking - checking qsize doesnt work there ! - if count < 1 or task._out_wc.size() < count: - # but we continue to use the actual count to produce the output - numchunks = 1 - chunksize = actual_count - remainder = 0 - - # we need the count set for this - can't chunk up unlimited items - # In serial mode we could do this by checking for empty input channels, - # but in dispatch mode its impossible ( == not easily possible ) - # Only try it if we have enough demand - if task.max_chunksize and actual_count > task.max_chunksize: - numchunks = actual_count / task.max_chunksize - chunksize = task.max_chunksize - remainder = actual_count - (numchunks * chunksize) - # END handle chunking - - # the following loops are kind of unrolled - code duplication - # should make things execute faster. Putting the if statements - # into the loop would be less code, but ... slower - # DEBUG - # print actual_count, numchunks, chunksize, remainder, task._out_wc.size() - if self._workers: - # respect the chunk size, and split the task up if we want - # to process too much. This can be defined per task - queue = self._queue - if numchunks > 1: - for i in xrange(numchunks): - queue.put((task.process, chunksize)) - # END for each chunk to put - else: + #if count > 1 and task._out_wc.size() >= count: + # continue + # END skip if we have enough + + # but use the actual count to produce the output, we may produce + # more than requested + numchunks = 1 + chunksize = actual_count + remainder = 0 + + # we need the count set for this - can't chunk up unlimited items + # In serial mode we could do this by checking for empty input channels, + # but in dispatch mode its impossible ( == not easily possible ) + # Only try it if we have enough demand + if task.max_chunksize and actual_count > task.max_chunksize: + numchunks = actual_count / task.max_chunksize + chunksize = task.max_chunksize + remainder = actual_count - (numchunks * chunksize) + # END handle chunking + + # the following loops are kind of unrolled - code duplication + # should make things execute faster. Putting the if statements + # into the loop would be less code, but ... slower + # DEBUG + # print actual_count, numchunks, chunksize, remainder, task._out_wc.size() + if self._workers: + # respect the chunk size, and split the task up if we want + # to process too much. This can be defined per task + queue = self._queue + if numchunks > 1: + for i in xrange(numchunks): queue.put((task.process, chunksize)) - # END try efficient looping - - if remainder: - queue.put((task.process, remainder)) - # END handle chunksize + # END for each chunk to put else: - # no workers, so we have to do the work ourselves - if numchunks > 1: - for i in xrange(numchunks): - task.process(chunksize) - # END for each chunk to put - else: + queue.put((task.process, chunksize)) + # END try efficient looping + + if remainder: + queue.put((task.process, remainder)) + # END handle chunksize + else: + # no workers, so we have to do the work ourselves + if numchunks > 1: + for i in xrange(numchunks): task.process(chunksize) - # END try efficient looping - - if remainder: - task.process(remainder) - # END handle chunksize - # END handle serial mode - # END handle queuing + # END for each chunk to put + else: + task.process(chunksize) + # END try efficient looping + + if remainder: + task.process(remainder) + # END handle chunksize + # END handle serial mode # END for each task to process @@ -297,11 +407,22 @@ class Pool(object): otherwise the work will be distributed among the given amount of threads :note: currently NOT threadsafe !""" + assert size > -1, "Size cannot be negative" + # either start new threads, or kill existing ones. # If we end up with no threads, we process the remaining chunks on the queue # ourselves cur_count = len(self._workers) if cur_count < size: + # make sure we have a real queue, and can store our consumed tasks properly + if not isinstance(self._queue, self.TaskQueueCls): + if self._queue is not None and not self._queue.empty(): + raise AssertionError("Expected empty queue when switching the queue type") + # END safety check + self._queue = self.TaskQueueCls() + self._consumed_tasks = Queue() + # END init queue + for i in range(size - cur_count): worker = self.WorkerCls(self._queue) worker.start() @@ -323,6 +444,16 @@ class Pool(object): except Queue.Empty: continue # END while there are tasks on the queue + + # use a serial queue, its faster + if not isinstance(self._queue, SyncQueue): + self._queue = SyncQueue() + # END handle queue type + + if self._consumed_tasks and not self._consumed_tasks.empty(): + self._post_channel_read(self._consumed_tasks.pop()) + # END assure consumed tasks are empty + self._consumed_tasks = SyncQueue() # END process queue return self @@ -403,4 +534,4 @@ class ThreadPool(Pool): """A pool using threads as worker""" WorkerCls = WorkerThread LockCls = Lock - TaskQueueCls = Queue + TaskQueueCls = PerfQueue diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index f875f094..f7f0c978 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -15,9 +15,6 @@ def do_terminate_threads(whitelist=list()): continue if whitelist and t not in whitelist: continue - if isinstance(t, WorkerThread): - t.inq.put(t.quit) - # END worker special handling t.stop_and_join() # END for each thread diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index df3eaf11..791f89d4 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -2,6 +2,7 @@ from test.testlib import * from git.async.pool import * from git.async.task import * +from git.async.thread import terminate_threads from git.async.util import cpu_count import threading import time @@ -46,7 +47,7 @@ class TestThreadPool(TestBase): # add a simple task # it iterates n items - ni = 1000 + ni = 500 assert ni % 2 == 0, "ni needs to be dividable by 2" assert ni % 4 == 0, "ni needs to be dividable by 4" @@ -106,8 +107,9 @@ class TestThreadPool(TestBase): assert len(rc.read(1)) == 1 # processes nothing # rest - it has ni/2 - 2 on the queue, and pulls ni-2 # It wants too much, so the task realizes its done. The task - # doesn't care about the items in its output channel - assert len(rc.read(ni-2)) == ni - 2 + # doesn't care about the items in its output channel + items = rc.read(ni-2) + assert len(items) == ni - 2 assert p.num_tasks() == null_tasks task._assert(2, ni) # two chunks, 20 calls ( all items ) @@ -125,7 +127,8 @@ class TestThreadPool(TestBase): assert len(rc.read(ni / 2 + 2)) == ni / 2 + 2 # have n / 4 - 2 items on queue, want n / 4 in first chunk, cause 1 processing # ( 4 in total ). Still want n / 4 - 2 in second chunk, causing another processing - assert len(rc.read(ni / 2 - 2)) == ni / 2 - 2 + items = rc.read(ni / 2 - 2) + assert len(items) == ni / 2 - 2 task._assert( 5, ni) assert p.num_tasks() == null_tasks # depleted @@ -158,9 +161,12 @@ class TestThreadPool(TestBase): task.min_count = ni / 4 rc = p.add_task(task) for i in range(ni): - assert rc.read(1)[0] == i + if async: + assert len(rc.read(1)) == 1 + else: + assert rc.read(1)[0] == i # END for each item - task._assert(ni / task.min_count, ni) + task._assert(ni / task.min_count + 1, ni) del(rc) assert p.num_tasks() == null_tasks @@ -181,6 +187,7 @@ class TestThreadPool(TestBase): # t1 -> x -> t3 pass + @terminate_threads def test_base(self): p = ThreadPool() @@ -239,7 +246,6 @@ class TestThreadPool(TestBase): p.set_size(2) self._assert_single_task(p, True) - # DEPENDENT TASK ASYNC MODE ########################### self._assert_async_dependent_tasks(p) -- cgit v1.2.3 From 583cd8807259a69fc01874b798f657c1f9ab7828 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 19:12:44 +0200 Subject: Moved pool utilities into util module, fixed critical issue that caused havok - lets call this a safe-state --- lib/git/async/pool.py | 154 +++++++++++------------------------------------- lib/git/async/task.py | 26 +++++++- lib/git/async/thread.py | 16 ++--- lib/git/async/util.py | 106 +++++++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+), 126 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 30291835..227cabfc 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -1,121 +1,28 @@ """Implementation of a thread-pool working with channels""" from thread import WorkerThread +from threading import Lock -from threading import ( - Lock, - _Condition, - _sleep, - _time, +from util import ( + SyncQueue, + AsyncQueue, ) from task import InputChannelTask -from Queue import Queue, Empty -from collections import deque - -from graph import ( - Graph, +from Queue import ( + Queue, + Empty ) +from graph import Graph from channel import ( Channel, WChannel, RChannel ) -import weakref import sys -#{ Utilities - -class SyncQueue(deque): - """Adapter to allow using a deque like a queue, without locking""" - def get(self, block=True, timeout=None): - try: - return self.pop() - except IndexError: - raise Empty - # END raise empty - - def empty(self): - return len(self) == 0 - - put = deque.append - - -class HSCondition(_Condition): - """An attempt to make conditions less blocking, which gains performance - in return by sleeping less""" - delay = 0.00002 # reduces wait times, but increases overhead - - def wait(self, timeout=None): - waiter = Lock() - waiter.acquire() - self.__dict__['_Condition__waiters'].append(waiter) - saved_state = self._release_save() - try: # restore state no matter what (e.g., KeyboardInterrupt) - if timeout is None: - waiter.acquire() - else: - # Balancing act: We can't afford a pure busy loop, so we - # have to sleep; but if we sleep the whole timeout time, - # we'll be unresponsive. The scheme here sleeps very - # little at first, longer as time goes on, but never longer - # than 20 times per second (or the timeout time remaining). - endtime = _time() + timeout - delay = self.delay - acquire = waiter.acquire - while True: - gotit = acquire(0) - if gotit: - break - remaining = endtime - _time() - if remaining <= 0: - break - delay = min(delay * 2, remaining, .05) - _sleep(delay) - # END endless loop - if not gotit: - try: - self.__dict__['_Condition__waiters'].remove(waiter) - except ValueError: - pass - # END didn't ever get it - finally: - self._acquire_restore(saved_state) - - def notify(self, n=1): - __waiters = self.__dict__['_Condition__waiters'] - if not __waiters: - return - if n == 1: - __waiters[0].release() - try: - __waiters.pop(0) - except IndexError: - pass - else: - waiters = __waiters[:n] - for waiter in waiters: - waiter.release() - try: - __waiters.remove(waiter) - except ValueError: - pass - # END handle n = 1 case faster - -class PerfQueue(Queue): - """A queue using different condition objects to gain multithreading performance""" - def __init__(self, maxsize=0): - Queue.__init__(self, maxsize) - - self.not_empty = HSCondition(self.mutex) - self.not_full = HSCondition(self.mutex) - self.all_tasks_done = HSCondition(self.mutex) - - -#} END utilities - class RPoolChannel(RChannel): """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call before and after an item is to be read. @@ -237,7 +144,7 @@ class Pool(object): self._tasks = Graph() self._consumed_tasks = None self._workers = list() - self._queue = SyncQueue() # start with a sync queue + self._queue = self.TaskQueueCls() self._taskgraph_lock = self.LockCls() self._taskorder_cache = dict() self.set_size(size) @@ -375,7 +282,10 @@ class Pool(object): self._consumed_tasks.put(task) # END handle consumption - # delete consumed tasks to cleanup + self._handle_consumed_tasks() + + def _handle_consumed_tasks(self): + """Remove all consumed tasks from our queue by deleting them""" try: while True: ct = self._consumed_tasks.get(False) @@ -384,7 +294,7 @@ class Pool(object): except Empty: pass # END pop queue empty - + def _del_task_if_orphaned(self, task): """Check the task, and delete it if it is orphaned""" if sys.getrefcount(task._out_wc) < 3: @@ -415,11 +325,7 @@ class Pool(object): cur_count = len(self._workers) if cur_count < size: # make sure we have a real queue, and can store our consumed tasks properly - if not isinstance(self._queue, self.TaskQueueCls): - if self._queue is not None and not self._queue.empty(): - raise AssertionError("Expected empty queue when switching the queue type") - # END safety check - self._queue = self.TaskQueueCls() + if not isinstance(self._consumed_tasks, self.TaskQueueCls): self._consumed_tasks = Queue() # END init queue @@ -445,13 +351,8 @@ class Pool(object): continue # END while there are tasks on the queue - # use a serial queue, its faster - if not isinstance(self._queue, SyncQueue): - self._queue = SyncQueue() - # END handle queue type - if self._consumed_tasks and not self._consumed_tasks.empty(): - self._post_channel_read(self._consumed_tasks.pop()) + self._handle_consumed_tasks() # END assure consumed tasks are empty self._consumed_tasks = SyncQueue() # END process queue @@ -467,6 +368,8 @@ class Pool(object): output channel is only held by themselves, so no one will ever consume its items. + This method blocks until all tasks to be removed have been processed, if + they are currently being processed. :return: self""" # now delete our actual node - must set it done os it closes its channels. # Otherwise further reads of output tasks will block. @@ -478,6 +381,21 @@ class Pool(object): self._taskgraph_lock.acquire() try: self._taskorder_cache.clear() + # before we can delete the task, make sure its write channel + # is closed, otherwise people might still be waiting for its result. + # If a channel is not closed, this could also mean its not yet fully + # processed, but more importantly, there must be no task being processed + # right now. + # TODO: figure this out + for worker in self._workers: + r = worker.routine() + if r and r.im_self is task: + raise NotImplementedError("todo") + # END handle running task + # END check for in-progress routine + + # its done, close the channel for writing + task.close() self._tasks.del_node(task) finally: self._taskgraph_lock.release() @@ -497,11 +415,11 @@ class Pool(object): # create a write channel for it wc, rc = Channel() rc = RPoolChannel(wc, task, self) - task._out_wc = wc + task.set_wc(wc) has_input_channel = isinstance(task, InputChannelTask) if has_input_channel: - task._pool_ref = weakref.ref(self) + task.set_pool(self) # END init input channel task self._taskgraph_lock.acquire() @@ -534,4 +452,4 @@ class ThreadPool(Pool): """A pool using threads as worker""" WorkerCls = WorkerThread LockCls = Lock - TaskQueueCls = PerfQueue + TaskQueueCls = AsyncQueue diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 3137746c..f106c381 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -1,5 +1,7 @@ from graph import Node + import threading +import weakref import new class OutputChannelTask(Node): @@ -17,6 +19,7 @@ class OutputChannelTask(Node): __slots__ = ( '_read', # method to yield items to process '_out_wc', # output write channel '_exc', # exception caught + '_done', # True if we are done 'fun', # function to call with items read 'min_count', # minimum amount of items to produce, None means no override 'max_chunksize', # maximium amount of items to process per process call @@ -28,6 +31,7 @@ class OutputChannelTask(Node): self._read = None # to be set by subclasss self._out_wc = None # to be set later self._exc = None + self._done = False self.fun = fun self.min_count = None self.max_chunksize = 0 # note set @@ -35,12 +39,28 @@ class OutputChannelTask(Node): def is_done(self): """:return: True if we are finished processing""" - return self._out_wc.closed + return self._done def set_done(self): """Set ourselves to being done, has we have completed the processing""" + self._done = True + self.close() + + def set_wc(self, wc): + """Set the write channel to the given one + :note: resets it done state in order to allow proper queue handling""" + self._done = False + self._out_wc = wc + + def close(self): + """A closed task will close its channel to assure the readers will wake up + :note: its safe to call this method multiple times""" self._out_wc.close() + def is_closed(self): + """:return: True if the task's write channel is closed""" + return self._out_wc.closed + def error(self): """:return: Exception caught during last processing or None""" return self._exc @@ -148,5 +168,9 @@ class InputChannelTask(OutputChannelTask): # and call it return OutputChannelTask.process(self, count) + + def set_pool(self, pool): + """Set our pool to the given one, it will be weakref'd""" + self._pool_ref = weakref.ref(pool) #{ Configuration diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index f7f0c978..4240a664 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -110,7 +110,7 @@ class WorkerThread(TerminatableThread): t[1] = optional, tuple or list of arguments to pass to the routine t[2] = optional, dictionary of keyword arguments to pass to the routine """ - __slots__ = ('inq', 'outq') + __slots__ = ('inq', '_current_routine') # define how often we should check for a shutdown request in case our @@ -120,10 +120,12 @@ class WorkerThread(TerminatableThread): def __init__(self, inq = None): super(WorkerThread, self).__init__() self.inq = inq or Queue.Queue() + self._current_routine = None # routine we execute right now def run(self): """Process input tasks until we receive the quit signal""" while True: + self._current_routine = None if self._should_terminate(): break # END check for stop request @@ -138,8 +140,9 @@ class WorkerThread(TerminatableThread): # needing exactly one function, and one arg assert len(tasktuple) == 2, "Need tuple of function, arg - it could be more flexible, but its reduced to what we need" routine, arg = tasktuple - # DEBUG - # print "%s: picked up: %s(%s)" % (self.name, routine, arg) + + self._current_routine = routine + try: rval = None if inspect.ismethod(routine): @@ -154,16 +157,15 @@ class WorkerThread(TerminatableThread): print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) break # END make routine call - except StopIteration: - break except Exception,e: print "%s: Task %s raised unhandled exception: %s - this really shouldn't happen !" % (self.getName(), str(tasktuple), str(e)) break # abort ... # END routine exception handling # END endless loop - def quit(self): - raise StopIteration + def routine(self): + """:return: routine we are currently executing, or None if we have no task""" + return self._current_routine #} END classes diff --git a/lib/git/async/util.py b/lib/git/async/util.py index dabd8a42..432d1736 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -1,8 +1,23 @@ """Module with utilities related to async operations""" +from threading import ( + Lock, + _Condition, + _sleep, + _time, + ) + +from Queue import ( + Queue, + Empty, + ) + +from collections import deque import sys import os +#{ Routines + def cpu_count(): """:return:number of CPUs in the system :note: inspired by multiprocessing""" @@ -22,3 +37,94 @@ def cpu_count(): raise NotImplementedError('cannot determine number of cpus') return num + +#} END routines + + +class SyncQueue(deque): + """Adapter to allow using a deque like a queue, without locking""" + def get(self, block=True, timeout=None): + try: + return self.pop() + except IndexError: + raise Empty + # END raise empty + + def empty(self): + return len(self) == 0 + + put = deque.append + + +class HSCondition(_Condition): + """An attempt to make conditions less blocking, which gains performance + in return by sleeping less""" + delay = 0.00002 # reduces wait times, but increases overhead + + def wait(self, timeout=None): + waiter = Lock() + waiter.acquire() + self.__dict__['_Condition__waiters'].append(waiter) + saved_state = self._release_save() + try: # restore state no matter what (e.g., KeyboardInterrupt) + if timeout is None: + waiter.acquire() + else: + # Balancing act: We can't afford a pure busy loop, so we + # have to sleep; but if we sleep the whole timeout time, + # we'll be unresponsive. The scheme here sleeps very + # little at first, longer as time goes on, but never longer + # than 20 times per second (or the timeout time remaining). + endtime = _time() + timeout + delay = self.delay + acquire = waiter.acquire + while True: + gotit = acquire(0) + if gotit: + break + remaining = endtime - _time() + if remaining <= 0: + break + delay = min(delay * 2, remaining, .05) + _sleep(delay) + # END endless loop + if not gotit: + try: + self.__dict__['_Condition__waiters'].remove(waiter) + except ValueError: + pass + # END didn't ever get it + finally: + self._acquire_restore(saved_state) + + def notify(self, n=1): + __waiters = self.__dict__['_Condition__waiters'] + if not __waiters: + return + if n == 1: + __waiters[0].release() + try: + __waiters.pop(0) + except IndexError: + pass + else: + waiters = __waiters[:n] + for waiter in waiters: + waiter.release() + try: + __waiters.remove(waiter) + except ValueError: + pass + # END handle n = 1 case faster + +class AsyncQueue(Queue): + """A queue using different condition objects to gain multithreading performance""" + def __init__(self, maxsize=0): + Queue.__init__(self, maxsize) + + self.not_empty = HSCondition(self.mutex) + self.not_full = HSCondition(self.mutex) + self.all_tasks_done = HSCondition(self.mutex) + + +#} END utilities -- cgit v1.2.3 From 654e54d200135e665e07e9f0097d913a77f169da Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 20:01:02 +0200 Subject: task: Fixed incorrect handling of channel closure. Performance is alright for up to 2 threads, but 4 are killing the queue --- lib/git/async/pool.py | 40 ++++++++++++++++++---------------------- lib/git/async/task.py | 24 +++++++++++++++++++++--- test/git/async/test_pool.py | 4 ++++ 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 227cabfc..3de98777 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -21,6 +21,7 @@ from channel import ( ) import sys +from time import sleep class RPoolChannel(RChannel): @@ -371,32 +372,27 @@ class Pool(object): This method blocks until all tasks to be removed have been processed, if they are currently being processed. :return: self""" - # now delete our actual node - must set it done os it closes its channels. - # Otherwise further reads of output tasks will block. - # Actually they may still block if anyone wants to read all ... without - # a timeout - # keep its input nodes as we check whether they were orphaned - in_tasks = task.in_nodes - task.set_done() self._taskgraph_lock.acquire() try: - self._taskorder_cache.clear() - # before we can delete the task, make sure its write channel - # is closed, otherwise people might still be waiting for its result. - # If a channel is not closed, this could also mean its not yet fully - # processed, but more importantly, there must be no task being processed - # right now. - # TODO: figure this out - for worker in self._workers: - r = worker.routine() - if r and r.im_self is task: - raise NotImplementedError("todo") - # END handle running task - # END check for in-progress routine + # it can be that the task is already deleted, but its chunk was on the + # queue until now, so its marked consumed again + if not task in self._tasks.nodes: + return self + # END early abort + + # the task we are currently deleting could also be processed by + # a thread right now. We don't care about it as its taking care about + # its write channel itself, and sends everything it can to it. + # For it it doesn't matter that its not part of our task graph anymore. + + # now delete our actual node - be sure its done to prevent further + # processing in case there are still client reads on their way. + task.set_done() - # its done, close the channel for writing - task.close() + # keep its input nodes as we check whether they were orphaned + in_tasks = task.in_nodes self._tasks.del_node(task) + self._taskorder_cache.clear() finally: self._taskgraph_lock.release() # END locked deletion diff --git a/lib/git/async/task.py b/lib/git/async/task.py index f106c381..b282e371 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -2,6 +2,7 @@ from graph import Node import threading import weakref +import sys import new class OutputChannelTask(Node): @@ -44,7 +45,6 @@ class OutputChannelTask(Node): def set_done(self): """Set ourselves to being done, has we have completed the processing""" self._done = True - self.close() def set_wc(self, wc): """Set the write channel to the given one @@ -69,17 +69,25 @@ class OutputChannelTask(Node): """Process count items and send the result individually to the output channel""" items = self._read(count) try: + # increase the ref-count - we use this to determine whether anyone else + # is currently handling our output channel. As this method runs asynchronously, + # we have to make sure that the channel is closed by the last finishing task, + # which is not necessarily the one which determines that he is done + # as he couldn't read anymore items. + # The refcount will be dropped in the moment we get out of here. + wc = self._out_wc if self.apply_single: for item in items: - self._out_wc.write(self.fun(item)) + wc.write(self.fun(item)) # END for each item else: - self._out_wc.write(self.fun(items)) + wc.write(self.fun(items)) # END handle single apply except Exception, e: self._exc = e self.set_done() # END exception handling + del(wc) # if we didn't get all demanded items, which is also the case if count is 0 # we have depleted the input channel and are done @@ -89,6 +97,16 @@ class OutputChannelTask(Node): if not items or len(items) != count: self.set_done() # END handle done state + + # If we appear to be the only one left with our output channel, and are + # closed ( this could have been set in another thread as well ), make + # sure to close the output channel. + # The count is: 1 = wc itself, 2 = first reader channel, and we have only + # one, 3 is ours + x for every thread having its copy on the stack + # + 1 for the instance we provide to refcount + if self.is_done() and sys.getrefcount(self._out_wc) < 5: + self.close() + # END handle channel closure #{ Configuration diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 791f89d4..19e86a9a 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -246,6 +246,10 @@ class TestThreadPool(TestBase): p.set_size(2) self._assert_single_task(p, True) + # kill it + p.set_size(4) + self._assert_single_task(p, True) + # DEPENDENT TASK ASYNC MODE ########################### self._assert_async_dependent_tasks(p) -- cgit v1.2.3 From be06e87433685b5ea9cfcc131ab89c56cf8292f2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 22:00:47 +0200 Subject: improved testing to test the actual async handling of the pool. there are still inconsistencies that need to be fixed, but it already improved, especially the 4-thread performance which now is as fast as the dual-threaded performance --- lib/git/async/pool.py | 18 +++++++++++++++--- lib/git/async/task.py | 11 +++++++++++ lib/git/async/util.py | 6 ++++-- test/git/async/test_pool.py | 29 ++++++++++++++++++++++++----- 4 files changed, 54 insertions(+), 10 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 3de98777..19fc9f6e 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -67,10 +67,20 @@ class RPoolChannel(RChannel): # if we have count items, don't do any queue preparation - if someone # depletes the queue in the meanwhile, the channel will close and # we will unblock naturally + # PROBLEM: If there are multiple consumer of this channel, we might + # run out of items without being replenished == block forever in the + # worst case. task.min_count could have triggered to produce more ... + # usually per read with n items, we put n items on to the queue, + # so we wouldn't check this + # Even if we have just one consumer ( we could determine that with + # the reference count ), it could be that in one moment we don't yet + # have an item, but its currently being produced by some worker. + # This is why we: + # * make no assumptions if there are multiple consumers + # * have_enough = False if count > 0: - # explicitly > count, as we want a certain safe range - have_enough = self._wc._queue.qsize() > count + have_enough = self._wc._queue.qsize() >= count # END risky game ########## prepare ############################## @@ -78,9 +88,11 @@ class RPoolChannel(RChannel): self._pool._prepare_channel_read(self._task, count) - ######### read data ###### + ####### read data ######## + ########################## # read actual items, tasks were setup to put their output into our channel ( as well ) items = RChannel.read(self, count, block, timeout) + ########################## if self._post_cb: items = self._post_cb(items) diff --git a/lib/git/async/task.py b/lib/git/async/task.py index b282e371..4e8aef54 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -178,6 +178,17 @@ class InputChannelTask(OutputChannelTask): # make sure we don't trigger the pool if we read from a pool channel which # belongs to our own pool. Channels from different pools are fine though, # there we want to trigger its computation + # PROBLEM: if the user keeps an end, but decides to put the same end into + # a task of this pool, then all items might deplete without new ones being + # produced, causing a deadlock. Just triggering the pool would be better, + # but cost's more, unnecessarily if there is just one consumer, which is + # the user. + # * could encode usage in the channel type, and fail if the refcount on + # the read-pool channel is too high + # * maybe keep track of the elements that are requested or in-production + # for each task, which would allow to precisely determine whether + # the pool as to be triggered, and bail out early. Problem would + # be the if isinstance(self._in_rc, RPoolChannel) and self._in_rc._pool is self._pool_ref(): self._read = self._in_rc._read diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 432d1736..85d44694 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -59,7 +59,7 @@ class SyncQueue(deque): class HSCondition(_Condition): """An attempt to make conditions less blocking, which gains performance in return by sleeping less""" - delay = 0.00002 # reduces wait times, but increases overhead + delay = 0.00005 # reduces wait times, but increases overhead def wait(self, timeout=None): waiter = Lock() @@ -85,7 +85,9 @@ class HSCondition(_Condition): remaining = endtime - _time() if remaining <= 0: break - delay = min(delay * 2, remaining, .05) + # this makes 4 threads working as good as two, but of course + # it causes more frequent micro-sleeping + #delay = min(delay * 2, remaining, .05) _sleep(delay) # END endless loop if not gotit: diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 19e86a9a..2b45727c 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -12,9 +12,13 @@ class TestThreadTaskNode(InputIteratorThreadTask): super(TestThreadTaskNode, self).__init__(*args, **kwargs) self.reset(self._iterator) self.should_fail = False + self.lock = threading.Lock() # yes, can't safely do x = x + 1 :) + self.plock = threading.Lock() def do_fun(self, item): + self.lock.acquire() self.item_count += 1 + self.lock.release() if self.should_fail: raise AssertionError("I am failing just for the fun of it") return item @@ -25,14 +29,26 @@ class TestThreadTaskNode(InputIteratorThreadTask): self._iterator = iterator def process(self, count=1): - super(TestThreadTaskNode, self).process(count) + # must do it first, otherwise we might read and check results before + # the thread gets here :). Its a lesson ! + self.plock.acquire() self.process_count += 1 + self.plock.release() + super(TestThreadTaskNode, self).process(count) def _assert(self, pc, fc): """Assert for num process counts (pc) and num function counts (fc) :return: self""" + self.plock.acquire() + if self.process_count != pc: + print self.process_count, pc assert self.process_count == pc + self.plock.release() + self.lock.acquire() + if self.item_count != fc: + print self.item_count, fc assert self.item_count == fc + self.lock.release() assert not self.error() return self @@ -103,15 +119,17 @@ class TestThreadPool(TestBase): # if we query 1 item, it will prepare ni / 2 task.min_count = ni / 2 rc = p.add_task(task) - assert len(rc.read(1)) == 1 # processes ni / 2 - assert len(rc.read(1)) == 1 # processes nothing + items = rc.read(1) + assert len(items) == 1 and items[0] == 0 # processes ni / 2 + items = rc.read(1) + assert len(items) == 1 and items[0] == 1 # processes nothing # rest - it has ni/2 - 2 on the queue, and pulls ni-2 # It wants too much, so the task realizes its done. The task # doesn't care about the items in its output channel items = rc.read(ni-2) assert len(items) == ni - 2 assert p.num_tasks() == null_tasks - task._assert(2, ni) # two chunks, 20 calls ( all items ) + task._assert(2, ni) # two chunks, ni calls # its already done, gives us no more assert len(rc.read()) == 0 @@ -246,7 +264,8 @@ class TestThreadPool(TestBase): p.set_size(2) self._assert_single_task(p, True) - # kill it + # real stress test- should be native on every dual-core cpu with 2 hardware + # threads per core p.set_size(4) self._assert_single_task(p, True) -- cgit v1.2.3 From def0f73989047c4ddf9b11da05ad2c9c8e387331 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 23:20:37 +0200 Subject: introduced a new counter keeping track of the scheduled tasks - this prevent unnecessary tasks to be scheduled as we keep track of how many items will be produced for the task at hand. This introduces additional locking, but performns well in multithreaded mode. Performance of the master queue is still a huge issue, its currently the limiting factor, as bypassing the master queue in serial moode gives 15x performance, wich is what I would need --- lib/git/async/pool.py | 15 +++++++++++++-- lib/git/async/task.py | 47 +++++++++++++++++++++++++++++++++++++++++++-- lib/git/async/util.py | 2 +- test/git/async/test_pool.py | 8 ++++++-- 4 files changed, 65 insertions(+), 7 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 19fc9f6e..4c97feb0 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -80,12 +80,13 @@ class RPoolChannel(RChannel): # * have_enough = False if count > 0: - have_enough = self._wc._queue.qsize() >= count - # END risky game + have_enough = self._task.scheduled_item_count() >= count or self._wc._queue.qsize() >= count + # END ########## prepare ############################## if not have_enough: self._pool._prepare_channel_read(self._task, count) + # END prepare pool scheduling ####### read data ######## @@ -260,26 +261,33 @@ class Pool(object): queue = self._queue if numchunks > 1: for i in xrange(numchunks): + # schedule them as early as we know about them + task.add_scheduled_items(chunksize) queue.put((task.process, chunksize)) # END for each chunk to put else: + task.add_scheduled_items(chunksize) queue.put((task.process, chunksize)) # END try efficient looping if remainder: + task.add_scheduled_items(remainder) queue.put((task.process, remainder)) # END handle chunksize else: # no workers, so we have to do the work ourselves if numchunks > 1: for i in xrange(numchunks): + task.add_scheduled_items(chunksize) task.process(chunksize) # END for each chunk to put else: + task.add_scheduled_items(chunksize) task.process(chunksize) # END try efficient looping if remainder: + task.add_scheduled_items(remainder) task.process(remainder) # END handle chunksize # END handle serial mode @@ -348,6 +356,9 @@ class Pool(object): self._workers.append(worker) # END for each new worker to create elif cur_count > size: + # we can safely increase the size, even from serial mode, as we would + # only be able to do this if the serial ( sync ) mode finished processing. + # Just adding more workers is not a problem at all. del_count = cur_count - size for i in range(del_count): self._workers[i].stop_and_join() diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 4e8aef54..cf486f48 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -21,6 +21,8 @@ class OutputChannelTask(Node): '_out_wc', # output write channel '_exc', # exception caught '_done', # True if we are done + '_scheduled_items', # amount of scheduled items that will be processed in total + '_slock', # lock for scheduled items 'fun', # function to call with items read 'min_count', # minimum amount of items to produce, None means no override 'max_chunksize', # maximium amount of items to process per process call @@ -33,6 +35,8 @@ class OutputChannelTask(Node): self._out_wc = None # to be set later self._exc = None self._done = False + self._scheduled_items = 0 + self._slock = threading.Lock() self.fun = fun self.min_count = None self.max_chunksize = 0 # note set @@ -50,6 +54,7 @@ class OutputChannelTask(Node): """Set the write channel to the given one :note: resets it done state in order to allow proper queue handling""" self._done = False + self._scheduled_items = 0 self._out_wc = wc def close(self): @@ -65,6 +70,21 @@ class OutputChannelTask(Node): """:return: Exception caught during last processing or None""" return self._exc + def add_scheduled_items(self, count): + """Add the given amount of scheduled items to this task""" + self._slock.acquire() + self._scheduled_items += count + self._slock.release() + + def scheduled_item_count(self): + """:return: amount of scheduled items for this task""" + self._slock.acquire() + try: + return self._scheduled_items + finally: + self._slock.release() + # END threadsafe return + def process(self, count=0): """Process count items and send the result individually to the output channel""" items = self._read(count) @@ -78,14 +98,33 @@ class OutputChannelTask(Node): wc = self._out_wc if self.apply_single: for item in items: - wc.write(self.fun(item)) + rval = self.fun(item) + # decrement afterwards, the its unscheduled once its produced + self._slock.acquire() + self._scheduled_items -= 1 + self._slock.release() + wc.write(rval) # END for each item else: - wc.write(self.fun(items)) + # shouldn't apply single be the default anyway ? + # The task designers should chunk them up in advance + rvals = self.fun(items) + self._slock.acquire() + self._scheduled_items -= len(items) + self._slock.release() + for rval in rvals: + wc.write(rval) # END handle single apply except Exception, e: self._exc = e self.set_done() + # unschedule all, we don't know how many have been produced actually + # but only if we don't apply single please + if not self.apply_single: + self._slock.acquire() + self._scheduled_items -= len(items) + self._slock.release() + # END unschedule all # END exception handling del(wc) @@ -189,6 +228,10 @@ class InputChannelTask(OutputChannelTask): # for each task, which would allow to precisely determine whether # the pool as to be triggered, and bail out early. Problem would # be the + # * Perhaps one shouldn't seek the perfect solution , but instead + # document whats working and what not, or under which conditions. + # The whole system is simple, but gets more complicated the + # smarter it wants to be. if isinstance(self._in_rc, RPoolChannel) and self._in_rc._pool is self._pool_ref(): self._read = self._in_rc._read diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 85d44694..6bd8a4e8 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -59,7 +59,7 @@ class SyncQueue(deque): class HSCondition(_Condition): """An attempt to make conditions less blocking, which gains performance in return by sleeping less""" - delay = 0.00005 # reduces wait times, but increases overhead + delay = 0.0001 # reduces wait times, but increases overhead def wait(self, timeout=None): waiter = Lock() diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 2b45727c..29c13188 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -36,7 +36,7 @@ class TestThreadTaskNode(InputIteratorThreadTask): self.plock.release() super(TestThreadTaskNode, self).process(count) - def _assert(self, pc, fc): + def _assert(self, pc, fc, check_scheduled=False): """Assert for num process counts (pc) and num function counts (fc) :return: self""" self.plock.acquire() @@ -49,6 +49,10 @@ class TestThreadTaskNode(InputIteratorThreadTask): print self.item_count, fc assert self.item_count == fc self.lock.release() + + # if we read all, we can't really use scheduled items + if check_scheduled: + assert self._scheduled_items == 0 assert not self.error() return self @@ -184,7 +188,7 @@ class TestThreadPool(TestBase): else: assert rc.read(1)[0] == i # END for each item - task._assert(ni / task.min_count + 1, ni) + task._assert(ni / task.min_count, ni) del(rc) assert p.num_tasks() == null_tasks -- cgit v1.2.3 From e825f8b69760e269218b1bf1991018baf3c16b04 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 23:38:08 +0200 Subject: Channel now uses the AsyncQueue, boosting performance by factor 4, its a start --- lib/git/async/channel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 2add9478..2d5ab79c 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -1,10 +1,10 @@ """Contains a queue based channel implementation""" from Queue import ( - Queue, Empty, Full ) +from util import AsyncQueue from time import time import sys @@ -43,7 +43,7 @@ class WChannel(Channel): """initialize this instance, able to hold max_items at once Write calls will block if the channel is full, until someone reads from it""" self._closed = False - self._queue = Queue(max_items) + self._queue = AsyncQueue(max_items) #{ Interface -- cgit v1.2.3 From 898d47d1711accdfded8ee470520fdb96fb12d46 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 7 Jun 2010 23:47:06 +0200 Subject: Task scheduled items lock now uses a dummy lock in serial mode, improving its performance considerably. Channels now use the AsyncQueue, boosting their throughput to about 5k items / s - this is something one can work with, considering the runtime of each item should be large enough to keep the threads busy. This could be a basis, further testing needed --- lib/git/async/pool.py | 6 ++++++ lib/git/async/util.py | 14 +++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 4c97feb0..d6b5711d 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -5,6 +5,7 @@ from threading import Lock from util import ( SyncQueue, AsyncQueue, + DummyLock ) from task import InputChannelTask @@ -462,6 +463,11 @@ class Pool(object): # END add task relation # END handle input channels for connections + # fix locks - in serial mode, the task does not need real locks + if self.size() == 0: + task._slock = DummyLock() + # END improve locks + return rc #} END interface diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 6bd8a4e8..55766579 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -41,6 +41,18 @@ def cpu_count(): #} END routines + +class DummyLock(object): + """An object providing a do-nothing lock interface for use in sync mode""" + __slots__ = tuple() + + def acquire(self): + pass + + def release(self): + pass + + class SyncQueue(deque): """Adapter to allow using a deque like a queue, without locking""" def get(self, block=True, timeout=None): @@ -59,7 +71,7 @@ class SyncQueue(deque): class HSCondition(_Condition): """An attempt to make conditions less blocking, which gains performance in return by sleeping less""" - delay = 0.0001 # reduces wait times, but increases overhead + delay = 0.00002 # reduces wait times, but increases overhead def wait(self, timeout=None): waiter = Lock() -- cgit v1.2.3 From 5d996892ac76199886ba3e2754ff9c9fac2456d6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Jun 2010 00:32:33 +0200 Subject: test implementation of async-queue with everything stripped from it that didn't seem necessary - its a failure, something is wrong - performance not much better than the original one, its depending on the condition performance actually, which I don't get faster --- lib/git/async/util.py | 53 ++++++++++++++++++++++++++++++++++++++++----- test/git/async/test_pool.py | 6 +++++ 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 55766579..e3556c05 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -133,12 +133,55 @@ class HSCondition(_Condition): class AsyncQueue(Queue): """A queue using different condition objects to gain multithreading performance""" + __slots__ = ('mutex', 'not_empty', 'queue') + def __init__(self, maxsize=0): - Queue.__init__(self, maxsize) - + self.queue = deque() + self.mutex = Lock() self.not_empty = HSCondition(self.mutex) - self.not_full = HSCondition(self.mutex) - self.all_tasks_done = HSCondition(self.mutex) - + def qsize(self): + self.mutex.acquire() + try: + return len(self.queue) + finally: + self.mutex.release() + + def empty(self): + self.mutex.acquire() + try: + return not len(self.queue) + finally: + self.mutex.release() + + def put(self, item, block=True, timeout=None): + self.mutex.acquire() + self.queue.append(item) + self.mutex.release() + self.not_empty.notify() + + def get(self, block=True, timeout=None): + self.not_empty.acquire() + q = self.queue + try: + if not block: + if not len(q): + raise Empty + elif timeout is None: + while not len(q): + self.not_empty.wait() + elif timeout < 0: + raise ValueError("'timeout' must be a positive number") + else: + endtime = _time() + timeout + while not len(q): + remaining = endtime - _time() + if remaining <= 0.0: + raise Empty + self.not_empty.wait(remaining) + return q.popleft() + finally: + self.not_empty.release() + + #} END utilities diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 29c13188..0d779f39 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -61,6 +61,12 @@ class TestThreadPool(TestBase): max_threads = cpu_count() + def _add_triple_task(self, p): + """Add a triplet of feeder, transformer and finalizer to the pool, like + t1 -> t2 -> t3, return all 3 return channels in order""" + t1 = TestThreadTaskNode(make_iter(), 'iterator', None) + # TODO: + def _assert_single_task(self, p, async=False): """Performs testing in a synchronized environment""" null_tasks = p.num_tasks() # in case we had some before -- cgit v1.2.3 From 09c3f39ceb545e1198ad7a3f470d4ec896ce1add Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Jun 2010 10:45:14 +0200 Subject: both versions of the async queue still have trouble in certain situations, at least with my totally overwritten version of the condition - the previous one was somewhat more stable it seems. Nonetheless, this is the fastest version so far --- lib/git/async/util.py | 77 ++++++++++++++++++++++++++++++++------------- test/git/async/test_pool.py | 14 ++++++--- 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/lib/git/async/util.py b/lib/git/async/util.py index e3556c05..fb63ccaa 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -2,6 +2,8 @@ from threading import ( Lock, + current_thread, + _allocate_lock, _Condition, _sleep, _time, @@ -57,7 +59,7 @@ class SyncQueue(deque): """Adapter to allow using a deque like a queue, without locking""" def get(self, block=True, timeout=None): try: - return self.pop() + return self.popleft() except IndexError: raise Empty # END raise empty @@ -67,26 +69,45 @@ class SyncQueue(deque): put = deque.append - -class HSCondition(_Condition): + +class HSCondition(object): """An attempt to make conditions less blocking, which gains performance in return by sleeping less""" + __slots__ = ("acquire", "release", "_lock", '_waiters') delay = 0.00002 # reduces wait times, but increases overhead + def __init__(self, lock=None): + if lock is None: + lock = Lock() + self._lock = lock + self.acquire = lock.acquire + self.release = lock.release + self._waiters = list() + + def __release(self): + return self._lock.release() + + def __acquire(self, block=None): + if block is None: + self._lock.acquire() + else: + return self._lock.acquire(block) + def wait(self, timeout=None): - waiter = Lock() - waiter.acquire() - self.__dict__['_Condition__waiters'].append(waiter) - saved_state = self._release_save() + waiter = _allocate_lock() + waiter.acquire() # get it the first time, no blocking + self._waiters.append(waiter) + + # in the momemnt we release our lock, someone else might actually resume + self.release() try: # restore state no matter what (e.g., KeyboardInterrupt) + # now we block, as we hold the lock already if timeout is None: waiter.acquire() else: - # Balancing act: We can't afford a pure busy loop, so we - # have to sleep; but if we sleep the whole timeout time, - # we'll be unresponsive. The scheme here sleeps very - # little at first, longer as time goes on, but never longer - # than 20 times per second (or the timeout time remaining). + # Balancing act: We can't afford a pure busy loop, because of the + # GIL, so we have to sleep + # We try to sleep only tiny amounts of time though to be very responsive endtime = _time() + timeout delay = self.delay acquire = waiter.acquire @@ -104,34 +125,48 @@ class HSCondition(_Condition): # END endless loop if not gotit: try: - self.__dict__['_Condition__waiters'].remove(waiter) + self._waiters.remove(waiter) except ValueError: pass # END didn't ever get it finally: - self._acquire_restore(saved_state) + # reacquire the lock + self.acquire() def notify(self, n=1): - __waiters = self.__dict__['_Condition__waiters'] - if not __waiters: + if not self._waiters: return + waiters = self._waiters if n == 1: - __waiters[0].release() + waiters[0].release() try: - __waiters.pop(0) + waiters.pop(0) except IndexError: pass else: - waiters = __waiters[:n] - for waiter in waiters: + for waiter in waiters[:n]: waiter.release() try: - __waiters.remove(waiter) + waiters.remove(waiter) except ValueError: pass # END handle n = 1 case faster + def notify_all(self): + self.notify(len(self._waiters)) + + class AsyncQueue(Queue): + """A queue using different condition objects to gain multithreading performance""" + def __init__(self, maxsize=0): + Queue.__init__(self, maxsize) + + self.not_empty = HSCondition(self.mutex) + self.not_full = HSCondition(self.mutex) + self.all_tasks_done = HSCondition(self.mutex) + + +class _AsyncQueue(Queue): """A queue using different condition objects to gain multithreading performance""" __slots__ = ('mutex', 'not_empty', 'queue') diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 0d779f39..4c20a9b2 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -136,8 +136,9 @@ class TestThreadPool(TestBase): # rest - it has ni/2 - 2 on the queue, and pulls ni-2 # It wants too much, so the task realizes its done. The task # doesn't care about the items in its output channel - items = rc.read(ni-2) - assert len(items) == ni - 2 + nri = ni-2 + items = rc.read(nri) + assert len(items) == nri assert p.num_tasks() == null_tasks task._assert(2, ni) # two chunks, ni calls @@ -152,11 +153,14 @@ class TestThreadPool(TestBase): # must read a specific item count # count is still at ni / 2 - here we want more than that # 2 steps with n / 4 items, + 1 step with n/4 items to get + 2 - assert len(rc.read(ni / 2 + 2)) == ni / 2 + 2 + nri = ni / 2 + 2 + items = rc.read(nri) + assert len(items) == nri # have n / 4 - 2 items on queue, want n / 4 in first chunk, cause 1 processing # ( 4 in total ). Still want n / 4 - 2 in second chunk, causing another processing - items = rc.read(ni / 2 - 2) - assert len(items) == ni / 2 - 2 + nri = ni / 2 - 2 + items = rc.read(nri) + assert len(items) == nri task._assert( 5, ni) assert p.num_tasks() == null_tasks # depleted -- cgit v1.2.3 From 3776f7a766851058f6435b9f606b16766425d7ca Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Jun 2010 13:24:44 +0200 Subject: The new channeldesign actually works, but it also shows that its located at the wrong spot. The channel is nothing more than an adapter allowing to read multiple items from a thread-safe queue, the queue itself though must be 'closable' for writing, or needs something like a writable flag. --- lib/git/async/channel.py | 108 +++++++++++++++++++++-------------------- lib/git/async/task.py | 6 +-- lib/git/async/util.py | 32 +++++++----- test/git/async/test_channel.py | 24 +-------- test/git/async/test_pool.py | 13 +++++ 5 files changed, 93 insertions(+), 90 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 2d5ab79c..655024fe 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -4,7 +4,11 @@ from Queue import ( Full ) -from util import AsyncQueue +from util import ( + AsyncQueue, + DummyLock + ) + from time import time import sys @@ -23,12 +27,9 @@ class Channel(object): def __new__(cls, *args): if cls is Channel: - max_items = 0 - if len(args) == 1: - max_items = args[0] - if len(args) > 1: - raise ValueError("Specify not more than the number of items the channel should take") - wc = WChannel(max_items) + if len(args) > 0: + raise ValueError("Cannot take any arguments when creating a new channel") + wc = WChannel() rc = RChannel(wc) return wc, rc # END constructor mode @@ -39,11 +40,11 @@ class WChannel(Channel): """The write end of a channel""" __slots__ = ('_closed', '_queue') - def __init__(self, max_items=0): + def __init__(self): """initialize this instance, able to hold max_items at once Write calls will block if the channel is full, until someone reads from it""" self._closed = False - self._queue = AsyncQueue(max_items) + self._queue = AsyncQueue() #{ Interface @@ -74,7 +75,21 @@ class WChannel(Channel): def close(self): """Close the channel. Multiple close calls on a closed channel are no an error""" + mutex = self._queue.mutex + mutex.acquire() + # this is atomic already, due to the GIL - no need to get the queue's mutex + print "channel.close()" self._closed = True + # now make sure that the people waiting for an item are released now + # As we it could be that some readers are already on their way to initiate + # a blocking get, we must make sure that locks never block before that happens + + # now we are the only one accessing the queue, so change it + self._queue.mutex = DummyLock() + print self._queue.not_empty._waiters + self._queue.not_empty.notify_all() + print self._queue.not_empty._waiters + mutex.release() @property def closed(self): @@ -134,58 +149,47 @@ class RChannel(Channel): pass # END handle exceptions else: - # if we have really bad timing, the source of the channel - # marks itself closed, but before setting it, the thread - # switches to us. We read it, read False, and try to fetch - # something, and never return. The whole closed channel thing - # is not atomic ( of course ) - # This is why we never block for long, to get a chance to recheck - # for closed channels. - # We blend this into the timeout of the user - ourtimeout = 0.25 # the smaller, the more responsive, but the slower - wc = self._wc - timeout = (timeout is None and sys.maxint) or timeout # make sure we can compute with it - assert timeout != 0.0, "shouldn't block if timeout is 0" # okay safe - if timeout and ourtimeout > timeout: - ourtimeout = timeout - # END setup timeout - # to get everything into one loop, we set the count accordingly if count == 0: count = sys.maxint # END handle count + endtime = sys.maxint # allows timeout for whole operation + if timeout is not None: + endtime = time() + timeout + # could be improved by a separate: no-endtime branch, saving the time calls for i in xrange(count): - have_timeout = False - st = time() - while True: + try: + print "about to read", i, count, block, timeout + out.append(queue.get(block, timeout)) + print "got one" + except Empty: + pass + # END ignore empty + + # if we have been unblocked because the closed state changed + # in the meanwhile, stop trying + # NOTE: must NOT cache _wc + if self._wc.closed: + # its racing time - all threads waiting for the queue + # are awake now, and we actually can't be sure its empty + # Hence we pop it empty without blocking, getting as much + # as we can. This effectively lets us race ( with mutexes ) + # of the other threads. + print "stopped because it was closed" try: - if wc.closed: - have_timeout = True - # its about the 'in the meanwhile' :) - get everything - # we can in non-blocking mode. This will raise - try: - while True: - out.append(queue.get(False)) - # END until it raises Empty - except Empty: - break - # END finally, out of here - # END don't continue on closed channels - - # END abort reading if it was closed ( in the meanwhile ) - out.append(queue.get(block, ourtimeout)) - break # breakout right away + while True: + out.append(queue.get(False)) + # END pop it empty except Empty: - if timeout - (time() - st) <= 0: - # hitting timeout - have_timeout = True - break - # END abort if the user wants no more time spent here - # END handle timeout - # END endless timer loop - if have_timeout: + pass + # END ignore emptyness, we have all + break + # END handle cloased + + if time() >= endtime: + break # END stop on timeout # END for each item # END handle blocking diff --git a/lib/git/async/task.py b/lib/git/async/task.py index cf486f48..ce701c86 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -140,10 +140,10 @@ class OutputChannelTask(Node): # If we appear to be the only one left with our output channel, and are # closed ( this could have been set in another thread as well ), make # sure to close the output channel. - # The count is: 1 = wc itself, 2 = first reader channel, and we have only - # one, 3 is ours + x for every thread having its copy on the stack + # The count is: 1 = wc itself, 2 = first reader channel, + x for every + # thread having its copy on the stack # + 1 for the instance we provide to refcount - if self.is_done() and sys.getrefcount(self._out_wc) < 5: + if self.is_done() and sys.getrefcount(self._out_wc) < 4: self.close() # END handle channel closure #{ Configuration diff --git a/lib/git/async/util.py b/lib/git/async/util.py index fb63ccaa..01073f6d 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -73,21 +73,22 @@ class SyncQueue(deque): class HSCondition(object): """An attempt to make conditions less blocking, which gains performance in return by sleeping less""" - __slots__ = ("acquire", "release", "_lock", '_waiters') + # __slots__ = ("acquire", "release", "_lock", '_waiters') + __slots__ = ("_lock", '_waiters') delay = 0.00002 # reduces wait times, but increases overhead def __init__(self, lock=None): if lock is None: lock = Lock() self._lock = lock - self.acquire = lock.acquire - self.release = lock.release + #self.acquire = lock.acquire + #self.release = lock.release self._waiters = list() - def __release(self): + def release(self): return self._lock.release() - def __acquire(self, block=None): + def acquire(self, block=None): if block is None: self._lock.acquire() else: @@ -156,7 +157,7 @@ class HSCondition(object): self.notify(len(self._waiters)) -class AsyncQueue(Queue): +class _AsyncQueue(Queue): """A queue using different condition objects to gain multithreading performance""" def __init__(self, maxsize=0): Queue.__init__(self, maxsize) @@ -166,7 +167,7 @@ class AsyncQueue(Queue): self.all_tasks_done = HSCondition(self.mutex) -class _AsyncQueue(Queue): +class AsyncQueue(Queue): """A queue using different condition objects to gain multithreading performance""" __slots__ = ('mutex', 'not_empty', 'queue') @@ -194,9 +195,9 @@ class _AsyncQueue(Queue): self.queue.append(item) self.mutex.release() self.not_empty.notify() - + def get(self, block=True, timeout=None): - self.not_empty.acquire() + self.not_empty.acquire() # == self.mutex.acquire in that case q = self.queue try: if not block: @@ -205,16 +206,23 @@ class _AsyncQueue(Queue): elif timeout is None: while not len(q): self.not_empty.wait() - elif timeout < 0: - raise ValueError("'timeout' must be a positive number") else: + print "with timeout", timeout + import traceback + traceback.print_stack() endtime = _time() + timeout while not len(q): remaining = endtime - _time() if remaining <= 0.0: raise Empty self.not_empty.wait(remaining) - return q.popleft() + # END handle block + # can happen if someone else woke us up + try: + return q.popleft() + except IndexError: + raise Empty + # END handle unblocking reason finally: self.not_empty.release() diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index acfbd15e..25eb974c 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -43,26 +43,4 @@ class TestChannels(TestBase): # reading from a closed channel never blocks assert len(rc.read()) == 0 - - - - # TEST LIMITED SIZE CHANNEL - # channel with max-items set - wc, rc = Channel(1) - wc.write(item) # fine - - # blocks for a a moment, its full - st = time.time() - self.failUnlessRaises(EOFError, wc.write, item, True, to) - assert time.time() - st >= to - - # get our only one - assert rc.read(1)[0] == item - - # its empty,can put one again - wc.write(item2) - wc.close() - - # reading 10 will only yield one, it will not block as its closed - assert rc.read(10, timeout=1)[0] == item2 - + diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 4c20a9b2..7f5a5811 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -92,6 +92,7 @@ class TestThreadPool(TestBase): # pull the result completely - we should get one task, which calls its # function once. In sync mode, the order matches + print "read(0)" items = rc.read() assert len(items) == ni task._assert(1, ni).reset(make_iter()) @@ -105,6 +106,7 @@ class TestThreadPool(TestBase): rc = p.add_task(task) assert p.num_tasks() == 1 + null_tasks st = time.time() + print "read(1) * %i" % ni for i in range(ni): items = rc.read(1) assert len(items) == 1 @@ -129,20 +131,24 @@ class TestThreadPool(TestBase): # if we query 1 item, it will prepare ni / 2 task.min_count = ni / 2 rc = p.add_task(task) + print "read(1)" items = rc.read(1) assert len(items) == 1 and items[0] == 0 # processes ni / 2 + print "read(1)" items = rc.read(1) assert len(items) == 1 and items[0] == 1 # processes nothing # rest - it has ni/2 - 2 on the queue, and pulls ni-2 # It wants too much, so the task realizes its done. The task # doesn't care about the items in its output channel nri = ni-2 + print "read(%i)" % nri items = rc.read(nri) assert len(items) == nri assert p.num_tasks() == null_tasks task._assert(2, ni) # two chunks, ni calls # its already done, gives us no more + print "read(0) on closed" assert len(rc.read()) == 0 # test chunking @@ -154,11 +160,13 @@ class TestThreadPool(TestBase): # count is still at ni / 2 - here we want more than that # 2 steps with n / 4 items, + 1 step with n/4 items to get + 2 nri = ni / 2 + 2 + print "read(%i)" % nri items = rc.read(nri) assert len(items) == nri # have n / 4 - 2 items on queue, want n / 4 in first chunk, cause 1 processing # ( 4 in total ). Still want n / 4 - 2 in second chunk, causing another processing nri = ni / 2 - 2 + print "read(%i)" % nri items = rc.read(nri) assert len(items) == nri @@ -172,6 +180,7 @@ class TestThreadPool(TestBase): task.min_count = None rc = p.add_task(task) st = time.time() + print "read(1) * %i, chunksize set" % ni for i in range(ni): if async: assert len(rc.read(1)) == 1 @@ -192,6 +201,7 @@ class TestThreadPool(TestBase): task.reset(make_iter()) task.min_count = ni / 4 rc = p.add_task(task) + print "read(1) * %i, min_count%i + chunksize" % (ni, task.min_count) for i in range(ni): if async: assert len(rc.read(1)) == 1 @@ -208,10 +218,13 @@ class TestThreadPool(TestBase): task.reset(make_iter()) task.should_fail = True rc = p.add_task(task) + print "read(0) with failure" assert len(rc.read()) == 0 # failure on first item + print "done with everything" assert isinstance(task.error(), AssertionError) assert p.num_tasks() == null_tasks + def _assert_async_dependent_tasks(self, p): # includes failure in center task, 'recursive' orphan cleanup # This will also verify that the channel-close mechanism works -- cgit v1.2.3 From 53152a824f5186452504f0b68306d10ebebee416 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Jun 2010 14:23:58 +0200 Subject: queue: adjusted queue to be closable ( without own testing yet, except for the pool which runs it ) - its not yet stable, but should be solvable. --- lib/git/async/channel.py | 44 ++++++++++--------------------- lib/git/async/util.py | 68 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 62 insertions(+), 50 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 655024fe..08323582 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -6,7 +6,6 @@ from Queue import ( from util import ( AsyncQueue, - DummyLock ) from time import time @@ -56,15 +55,13 @@ class WChannel(Channel): channel :param timeout: timeout in seconds for blocking calls. :raise IOError: when writing into closed file - :raise EOFError: when writing into a non-blocking full channel - :note: may block if the channel has a limited capacity""" - if self._closed: - raise IOError("Cannot write to a closed channel") - + :raise EOFError: when writing into a non-blocking full channel""" + # let the queue handle the 'closed' attribute, we write much more often + # to an open channel than to a closed one, saving a few cycles try: self._queue.put(item, block, timeout) - except Full: - raise EOFError("Capacity of the channel was exeeded") + except ReadOnly: + raise IOError("Cannot write to a closed channel") # END exception handling def size(self): @@ -75,21 +72,10 @@ class WChannel(Channel): def close(self): """Close the channel. Multiple close calls on a closed channel are no an error""" - mutex = self._queue.mutex - mutex.acquire() - # this is atomic already, due to the GIL - no need to get the queue's mutex - print "channel.close()" + # yes, close it a little too early, better than having anyone put + # additional items self._closed = True - # now make sure that the people waiting for an item are released now - # As we it could be that some readers are already on their way to initiate - # a blocking get, we must make sure that locks never block before that happens - - # now we are the only one accessing the queue, so change it - self._queue.mutex = DummyLock() - print self._queue.not_empty._waiters - self._queue.not_empty.notify_all() - print self._queue.not_empty._waiters - mutex.release() + self._queue.set_writable(False) @property def closed(self): @@ -124,6 +110,7 @@ class RChannel(Channel): If count was < 1, a list with all items that could be read will be returned.""" # if the channel is closed for writing, we never block + # NOTE: is handled by the queue if self._wc.closed or timeout == 0: block = False @@ -160,9 +147,7 @@ class RChannel(Channel): # could be improved by a separate: no-endtime branch, saving the time calls for i in xrange(count): try: - print "about to read", i, count, block, timeout out.append(queue.get(block, timeout)) - print "got one" except Empty: pass # END ignore empty @@ -176,7 +161,6 @@ class RChannel(Channel): # Hence we pop it empty without blocking, getting as much # as we can. This effectively lets us race ( with mutexes ) # of the other threads. - print "stopped because it was closed" try: while True: out.append(queue.get(False)) @@ -186,11 +170,11 @@ class RChannel(Channel): # END ignore emptyness, we have all break - # END handle cloased - - if time() >= endtime: - break - # END stop on timeout + # END handle channel cloased + + if time() >= endtime: + break + # END stop operation on timeout # END for each item # END handle blocking return out diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 01073f6d..51219cc4 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -166,15 +166,21 @@ class _AsyncQueue(Queue): self.not_full = HSCondition(self.mutex) self.all_tasks_done = HSCondition(self.mutex) - + +class ReadOnly(Exception): + """Thrown when trying to write to a read-only queue""" + class AsyncQueue(Queue): - """A queue using different condition objects to gain multithreading performance""" - __slots__ = ('mutex', 'not_empty', 'queue') + """A queue using different condition objects to gain multithreading performance. + Additionally it has a threadsafe writable flag, which will alert all readers + that there is nothing more to get here.""" + __slots__ = ('mutex', 'not_empty', 'queue', '_writable') def __init__(self, maxsize=0): self.queue = deque() self.mutex = Lock() self.not_empty = HSCondition(self.mutex) + self._writable = True def qsize(self): self.mutex.acquire() @@ -183,6 +189,29 @@ class AsyncQueue(Queue): finally: self.mutex.release() + def writable(self): + self.mutex.acquire() + try: + return self._writable + finally: + self.mutex.release() + + def set_writable(self, state): + """Set the writable flag of this queue to True or False + :return: The previous state""" + self.mutex.acquire() + try: + old = self._writable + self._writable = state + return old + finally: + # if we won't receive anymore items, inform the getters + if not state: + self.not_empty.notify_all() + # END tell everyone + self.mutex.release() + # END handle locking + def empty(self): self.mutex.acquire() try: @@ -192,6 +221,9 @@ class AsyncQueue(Queue): def put(self, item, block=True, timeout=None): self.mutex.acquire() + if not self._writable: + raise ReadOnly + # END handle read-only self.queue.append(item) self.mutex.release() self.not_empty.notify() @@ -200,24 +232,20 @@ class AsyncQueue(Queue): self.not_empty.acquire() # == self.mutex.acquire in that case q = self.queue try: - if not block: - if not len(q): - raise Empty - elif timeout is None: - while not len(q): - self.not_empty.wait() - else: - print "with timeout", timeout - import traceback - traceback.print_stack() - endtime = _time() + timeout - while not len(q): - remaining = endtime - _time() - if remaining <= 0.0: - raise Empty - self.not_empty.wait(remaining) + if block: + if timeout is None: + while not len(q) and self._writable: + self.not_empty.wait() + else: + endtime = _time() + timeout + while not len(q) and self._writable: + remaining = endtime - _time() + if remaining <= 0.0: + raise Empty + self.not_empty.wait(remaining) + # END handle timeout mode # END handle block - # can happen if someone else woke us up + # can happen if we woke up because we are not writable anymore try: return q.popleft() except IndexError: -- cgit v1.2.3 From 619c11787742ce00a0ee8f841cec075897873c79 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Jun 2010 16:47:48 +0200 Subject: Its getting better already - intermediate commit before further chaning the task class --- lib/git/async/channel.py | 35 ++++++++++++++++++++--------------- lib/git/async/pool.py | 7 ++++--- lib/git/async/task.py | 2 ++ lib/git/async/thread.py | 3 ++- lib/git/async/util.py | 25 ++++++++++--------------- test/git/async/test_channel.py | 6 +++++- test/git/async/test_pool.py | 15 +++++++++------ 7 files changed, 52 insertions(+), 41 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 08323582..5c52b1dc 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -6,6 +6,7 @@ from Queue import ( from util import ( AsyncQueue, + ReadOnly ) from time import time @@ -59,6 +60,7 @@ class WChannel(Channel): # let the queue handle the 'closed' attribute, we write much more often # to an open channel than to a closed one, saving a few cycles try: + print "putting item", item, id(self._queue.queue) self._queue.put(item, block, timeout) except ReadOnly: raise IOError("Cannot write to a closed channel") @@ -74,6 +76,7 @@ class WChannel(Channel): an error""" # yes, close it a little too early, better than having anyone put # additional items + print "closing channel", self self._closed = True self._queue.set_writable(False) @@ -102,7 +105,7 @@ class RChannel(Channel): :param count: given amount of items to read. If < 1, all items will be read :param block: if True, the call will block until an item is available :param timeout: if positive and block is True, it will block only for the - given amount of seconds. + given amount of seconds, returning the items it received so far. :return: single item in a list if count is 1, or a list of count items. If the channel was empty and count was 1, an empty list will be returned. If count was greater 1, a list with less than count items will be @@ -149,27 +152,29 @@ class RChannel(Channel): try: out.append(queue.get(block, timeout)) except Empty: - pass + # here we are only if there is nothing on the queue, + # and if we are blocking. If we are not blocking, this + # indiccates that the queue was set unwritable in the meanwhile. + # hence we can abort now to prevent reading (possibly) forever + # Besides, this is racy as all threads will rip on the channel + # without waiting until its empty + if not block: + break # END ignore empty # if we have been unblocked because the closed state changed # in the meanwhile, stop trying # NOTE: must NOT cache _wc if self._wc.closed: - # its racing time - all threads waiting for the queue - # are awake now, and we actually can't be sure its empty - # Hence we pop it empty without blocking, getting as much - # as we can. This effectively lets us race ( with mutexes ) - # of the other threads. - try: - while True: - out.append(queue.get(False)) - # END pop it empty - except Empty: - pass - # END ignore emptyness, we have all + # If we were closed, we drop out even if there might still + # be items. Now its time to get these items, according to + # our count. Just switch to unblocking mode. + # If we are to read unlimited items, this would run forever, + # but the EmptyException handler takes care of this + block = False - break + # we don't continue, but let the timer decide whether + # it wants to abort # END handle channel cloased if time() >= endtime: diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index d6b5711d..cf1c2199 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -80,8 +80,8 @@ class RPoolChannel(RChannel): # * make no assumptions if there are multiple consumers # * have_enough = False - if count > 0: - have_enough = self._task.scheduled_item_count() >= count or self._wc._queue.qsize() >= count + #if count > 0: + # have_enough = self._task.scheduled_item_count() >= count or self._wc._queue.qsize() >= count # END ########## prepare ############################## @@ -319,6 +319,7 @@ class Pool(object): def _del_task_if_orphaned(self, task): """Check the task, and delete it if it is orphaned""" + # 1 as its stored on the task, 1 for the getrefcount call if sys.getrefcount(task._out_wc) < 3: self.del_task(task) #} END internal @@ -403,7 +404,7 @@ class Pool(object): if not task in self._tasks.nodes: return self # END early abort - + print "deleting ", id(task) # the task we are currently deleting could also be processed by # a thread right now. We don't care about it as its taking care about # its write channel itself, and sends everything it can to it. diff --git a/lib/git/async/task.py b/lib/git/async/task.py index ce701c86..97521cae 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -88,6 +88,7 @@ class OutputChannelTask(Node): def process(self, count=0): """Process count items and send the result individually to the output channel""" items = self._read(count) + print "task read", len(items) try: # increase the ref-count - we use this to determine whether anyone else # is currently handling our output channel. As this method runs asynchronously, @@ -117,6 +118,7 @@ class OutputChannelTask(Node): # END handle single apply except Exception, e: self._exc = e + print str(e) # TODO: REMOVE DEBUG, or make it use logging self.set_done() # unschedule all, we don't know how many have been produced actually # but only if we don't apply single please diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 4240a664..5faad4f8 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -124,6 +124,7 @@ class WorkerThread(TerminatableThread): def run(self): """Process input tasks until we receive the quit signal""" + gettask = self.inq.get while True: self._current_routine = None if self._should_terminate(): @@ -132,7 +133,7 @@ class WorkerThread(TerminatableThread): # don't wait too long, instead check for the termination request more often try: - tasktuple = self.inq.get(True, 1) + tasktuple = gettask(True, 0.25) except Queue.Empty: continue # END get task with timeout diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 51219cc4..6d09de59 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -63,7 +63,7 @@ class SyncQueue(deque): except IndexError: raise Empty # END raise empty - + def empty(self): return len(self) == 0 @@ -86,13 +86,13 @@ class HSCondition(object): self._waiters = list() def release(self): - return self._lock.release() + self._lock.release() def acquire(self, block=None): if block is None: self._lock.acquire() else: - return self._lock.acquire(block) + self._lock.acquire(block) def wait(self, timeout=None): waiter = _allocate_lock() @@ -145,6 +145,7 @@ class HSCondition(object): except IndexError: pass else: + print "notify", waiters, n for waiter in waiters[:n]: waiter.release() try: @@ -156,16 +157,6 @@ class HSCondition(object): def notify_all(self): self.notify(len(self._waiters)) - -class _AsyncQueue(Queue): - """A queue using different condition objects to gain multithreading performance""" - def __init__(self, maxsize=0): - Queue.__init__(self, maxsize) - - self.not_empty = HSCondition(self.mutex) - self.not_full = HSCondition(self.mutex) - self.all_tasks_done = HSCondition(self.mutex) - class ReadOnly(Exception): """Thrown when trying to write to a read-only queue""" @@ -205,11 +196,12 @@ class AsyncQueue(Queue): self._writable = state return old finally: + self.mutex.release() + # if we won't receive anymore items, inform the getters if not state: self.not_empty.notify_all() # END tell everyone - self.mutex.release() # END handle locking def empty(self): @@ -222,6 +214,7 @@ class AsyncQueue(Queue): def put(self, item, block=True, timeout=None): self.mutex.acquire() if not self._writable: + self.mutex.release() raise ReadOnly # END handle read-only self.queue.append(item) @@ -245,7 +238,9 @@ class AsyncQueue(Queue): self.not_empty.wait(remaining) # END handle timeout mode # END handle block - # can happen if we woke up because we are not writable anymore + + # can throw if we woke up because we are not writable anymore + print len(q), id(q), current_thread() try: return q.popleft() except IndexError: diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index 25eb974c..ab4ae015 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -42,5 +42,9 @@ class TestChannels(TestBase): self.failUnlessRaises(IOError, wc.write, 1) # reading from a closed channel never blocks + print "preblock" assert len(rc.read()) == 0 - + print "got read(0)" + assert len(rc.read(5)) == 0 + assert len(rc.read(1)) == 0 + diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 7f5a5811..0aa8f39b 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -39,6 +39,8 @@ class TestThreadTaskNode(InputIteratorThreadTask): def _assert(self, pc, fc, check_scheduled=False): """Assert for num process counts (pc) and num function counts (fc) :return: self""" + # TODO: fixme + return self self.plock.acquire() if self.process_count != pc: print self.process_count, pc @@ -73,7 +75,7 @@ class TestThreadPool(TestBase): # add a simple task # it iterates n items - ni = 500 + ni = 52 assert ni % 2 == 0, "ni needs to be dividable by 2" assert ni % 4 == 0, "ni needs to be dividable by 4" @@ -203,10 +205,10 @@ class TestThreadPool(TestBase): rc = p.add_task(task) print "read(1) * %i, min_count%i + chunksize" % (ni, task.min_count) for i in range(ni): - if async: - assert len(rc.read(1)) == 1 - else: - assert rc.read(1)[0] == i + items = rc.read(1) + assert len(items) == 1 + if not async: + assert items[0] == i # END for each item task._assert(ni / task.min_count, ni) del(rc) @@ -255,6 +257,7 @@ class TestThreadPool(TestBase): assert p.num_tasks() == 2 ## SINGLE TASK ################# + assert p.size() == 0 self._assert_single_task(p, False) assert p.num_tasks() == 2 del(urc1) @@ -281,7 +284,7 @@ class TestThreadPool(TestBase): assert len(threading.enumerate()) == num_threads + 1 # here we go - self._assert_single_task(p, False) + self._assert_single_task(p, True) -- cgit v1.2.3 From 13dd59ba5b3228820841682b59bad6c22476ff66 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Jun 2010 17:25:43 +0200 Subject: task: now deletes itself once its done - for the test this doesn't change a thing as the task deletes itself too late - its time for a paradigm change, the task should be deleted with its RPoolChannel or explicitly by the user. The test needs to adapt, and shouldn't assume anything unless the RPoolChannel is gone --- lib/git/async/pool.py | 57 ++++++++++++--------------------------------- lib/git/async/task.py | 22 ++++++++++++----- test/git/async/test_pool.py | 5 +++- 3 files changed, 35 insertions(+), 49 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index cf1c2199..fce5e424 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -78,14 +78,17 @@ class RPoolChannel(RChannel): # have an item, but its currently being produced by some worker. # This is why we: # * make no assumptions if there are multiple consumers - # * - have_enough = False + # * + + # if the user tries to use us to read from a done task, we will never + # compute as all produced items are already in the channel + skip_compute = self._task.is_done() or self._task.error() #if count > 0: - # have_enough = self._task.scheduled_item_count() >= count or self._wc._queue.qsize() >= count + # skip_compute = self._task.scheduled_item_count() >= count or self._wc._queue.qsize() >= count # END ########## prepare ############################## - if not have_enough: + if not skip_compute: self._pool._prepare_channel_read(self._task, count) # END prepare pool scheduling @@ -134,7 +137,6 @@ class Pool(object): used only from the main thread, hence you cannot consume their results from multiple threads unless you use a task for it.""" __slots__ = ( '_tasks', # a graph of tasks - '_consumed_tasks', # a queue with tasks that are done or had an error '_workers', # list of worker threads '_queue', # master queue for tasks '_taskorder_cache', # map task id -> ordered dependent tasks @@ -157,7 +159,6 @@ class Pool(object): def __init__(self, size=0): self._tasks = Graph() - self._consumed_tasks = None self._workers = list() self._queue = self.TaskQueueCls() self._taskgraph_lock = self.LockCls() @@ -224,8 +225,10 @@ class Pool(object): # requested one last for task in dfirst_tasks: if task.error() or task.is_done(): - self._consumed_tasks.put(task) - continue + # in theory, the should never be consumed task in the pool, right ? + # They delete themselves once they are done. + raise AssertionError("Shouldn't have consumed tasks on the pool, they delete themeselves, what happend ?") + #continue # END skip processing # if the task does not have the required output on its queue, schedule @@ -297,26 +300,8 @@ class Pool(object): def _post_channel_read(self, task): """Called after we processed a read to cleanup""" - # check whether we consumed the task, and schedule it for deletion - # This could have happend after the read returned ( even though the pre-read - # checks it as well ) - if task.error() or task.is_done(): - self._consumed_tasks.put(task) - # END handle consumption - - self._handle_consumed_tasks() - - def _handle_consumed_tasks(self): - """Remove all consumed tasks from our queue by deleting them""" - try: - while True: - ct = self._consumed_tasks.get(False) - self.del_task(ct) - # END for each task to delete - except Empty: - pass - # END pop queue empty - + pass + def _del_task_if_orphaned(self, task): """Check the task, and delete it if it is orphaned""" # 1 as its stored on the task, 1 for the getrefcount call @@ -347,11 +332,6 @@ class Pool(object): # ourselves cur_count = len(self._workers) if cur_count < size: - # make sure we have a real queue, and can store our consumed tasks properly - if not isinstance(self._consumed_tasks, self.TaskQueueCls): - self._consumed_tasks = Queue() - # END init queue - for i in range(size - cur_count): worker = self.WorkerCls(self._queue) worker.start() @@ -377,9 +357,6 @@ class Pool(object): continue # END while there are tasks on the queue - if self._consumed_tasks and not self._consumed_tasks.empty(): - self._handle_consumed_tasks() - # END assure consumed tasks are empty self._consumed_tasks = SyncQueue() # END process queue return self @@ -437,11 +414,7 @@ class Pool(object): wc, rc = Channel() rc = RPoolChannel(wc, task, self) task.set_wc(wc) - - has_input_channel = isinstance(task, InputChannelTask) - if has_input_channel: - task.set_pool(self) - # END init input channel task + task.set_pool(self) self._taskgraph_lock.acquire() try: @@ -452,7 +425,7 @@ class Pool(object): # END sync task addition # If the input channel is one of our read channels, we add the relation - if has_input_channel: + if isinstance(task, InputChannelTask): ic = task.in_rc if isinstance(ic, RPoolChannel) and ic._pool is self: self._taskgraph_lock.acquire() diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 97521cae..dc207c33 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -23,6 +23,7 @@ class OutputChannelTask(Node): '_done', # True if we are done '_scheduled_items', # amount of scheduled items that will be processed in total '_slock', # lock for scheduled items + '_pool_ref', # to be set by Pool 'fun', # function to call with items read 'min_count', # minimum amount of items to produce, None means no override 'max_chunksize', # maximium amount of items to process per process call @@ -84,6 +85,10 @@ class OutputChannelTask(Node): finally: self._slock.release() # END threadsafe return + + def set_pool(self, pool): + """Set our pool to the given one, it will be weakref'd""" + self._pool_ref = weakref.ref(pool) def process(self, count=0): """Process count items and send the result individually to the output channel""" @@ -147,6 +152,16 @@ class OutputChannelTask(Node): # + 1 for the instance we provide to refcount if self.is_done() and sys.getrefcount(self._out_wc) < 4: self.close() + # additionally, remove ourselves from the pool, this is thread-safe + # Previously the pool collected done tasks and removed them, + # but this could happen after a read finished, potentially + # leaving them on the queue until the read-handle was dropped. + # This should assure its more in-time. + # I don't like this back-ref. + pool = self._pool_ref() + if pool: + pool.del_task(self) + # END remove ourselves from the pool # END handle channel closure #{ Configuration @@ -204,8 +219,7 @@ class InputChannelTask(OutputChannelTask): For instantiation, it takes all arguments of its base, the first one needs to be the input channel to read from though.""" __slots__ = ( - 'in_rc', # channel to read items from - '_pool_ref' # to be set by Pool + 'in_rc' # channel to read items from ) def __init__(self, in_rc, *args, **kwargs): @@ -242,9 +256,5 @@ class InputChannelTask(OutputChannelTask): # and call it return OutputChannelTask.process(self, count) - - def set_pool(self, pool): - """Set our pool to the given one, it will be weakref'd""" - self._pool_ref = weakref.ref(pool) #{ Configuration diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 0aa8f39b..3077dc32 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -26,7 +26,9 @@ class TestThreadTaskNode(InputIteratorThreadTask): def reset(self, iterator): self.process_count = 0 self.item_count = 0 + self._exc = None self._iterator = iterator + self._done = False def process(self, count=1): # must do it first, otherwise we might read and check results before @@ -97,12 +99,13 @@ class TestThreadPool(TestBase): print "read(0)" items = rc.read() assert len(items) == ni - task._assert(1, ni).reset(make_iter()) + task._assert(1, ni) assert items[0] == 0 and items[-1] == ni-1 # as the task is done, it should have been removed - we have read everything assert task.is_done() assert p.num_tasks() == null_tasks + task.reset(make_iter()) # pull individual items rc = p.add_task(task) -- cgit v1.2.3 From e5c0002d069382db1768349bf0c5ff40aafbf140 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Jun 2010 18:20:12 +0200 Subject: Revised task deletion works well, adjusted test to be creating new tasks all the time instead of reusing its own one, it was somewhat hard to manage its state over time and could cause bugs. It works okay, but it occasionally hangs, it appears to be an empty queue, have to gradually put certain things back in, although in the current mode of operation, it should never have empty queues from the pool to the user --- lib/git/async/pool.py | 25 ++++++++++++++---- lib/git/async/task.py | 64 ++++++++------------------------------------- test/git/async/test_pool.py | 51 ++++++++++++++++++++---------------- 3 files changed, 60 insertions(+), 80 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index fce5e424..a915f7b0 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -41,8 +41,18 @@ class RPoolChannel(RChannel): def __del__(self): """Assures that our task will be deleted if we were the last reader""" - del(self._wc) # decrement ref-count - self._pool._del_task_if_orphaned(self._task) + del(self._wc) # decrement ref-count early + # now, if this is the last reader to the wc we just handled, there + # is no way anyone will ever read from the task again. If so, + # delete the task in question, it will take care of itself and orphans + # it might leave + # 1 is ourselves, + 1 for the call + 1, and 3 magical ones which + # I can't explain, but appears to be normal in the destructor + # On the caller side, getrefcount returns 2, as expected + if sys.getrefcount(self) < 6: + print "__del__" + self._pool.del_task(self._task) + print "done" def set_pre_cb(self, fun = lambda count: None): """Install a callback to call with the item count to be read before any @@ -105,7 +115,7 @@ class RPoolChannel(RChannel): ####### Finalize ######## self._pool._post_channel_read(self._task) - + return items #{ Internal @@ -227,6 +237,7 @@ class Pool(object): if task.error() or task.is_done(): # in theory, the should never be consumed task in the pool, right ? # They delete themselves once they are done. + # TODO: remove this check for performance later raise AssertionError("Shouldn't have consumed tasks on the pool, they delete themeselves, what happend ?") #continue # END skip processing @@ -363,7 +374,11 @@ class Pool(object): def num_tasks(self): """:return: amount of tasks""" - return len(self._tasks.nodes) + self._taskgraph_lock.acquire() + try: + return len(self._tasks.nodes) + finally: + self._taskgraph_lock.release() def del_task(self, task): """Delete the task @@ -374,6 +389,7 @@ class Pool(object): This method blocks until all tasks to be removed have been processed, if they are currently being processed. :return: self""" + print "del_task: getting lock" self._taskgraph_lock.acquire() try: # it can be that the task is already deleted, but its chunk was on the @@ -414,7 +430,6 @@ class Pool(object): wc, rc = Channel() rc = RPoolChannel(wc, task, self) task.set_wc(wc) - task.set_pool(self) self._taskgraph_lock.acquire() try: diff --git a/lib/git/async/task.py b/lib/git/async/task.py index dc207c33..5edd40bb 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -1,10 +1,11 @@ from graph import Node import threading -import weakref import sys import new +getrefcount = sys.getrefcount + class OutputChannelTask(Node): """Abstracts a named task as part of a set of interdependent tasks, which contains additional information on how the task should be queued and processed. @@ -23,7 +24,6 @@ class OutputChannelTask(Node): '_done', # True if we are done '_scheduled_items', # amount of scheduled items that will be processed in total '_slock', # lock for scheduled items - '_pool_ref', # to be set by Pool 'fun', # function to call with items read 'min_count', # minimum amount of items to produce, None means no override 'max_chunksize', # maximium amount of items to process per process call @@ -54,7 +54,7 @@ class OutputChannelTask(Node): def set_wc(self, wc): """Set the write channel to the given one :note: resets it done state in order to allow proper queue handling""" - self._done = False + self._done = False # TODO : fix this, this is a side-effect self._scheduled_items = 0 self._out_wc = wc @@ -86,10 +86,6 @@ class OutputChannelTask(Node): self._slock.release() # END threadsafe return - def set_pool(self, pool): - """Set our pool to the given one, it will be weakref'd""" - self._pool_ref = weakref.ref(pool) - def process(self, count=0): """Process count items and send the result individually to the output channel""" items = self._read(count) @@ -123,7 +119,7 @@ class OutputChannelTask(Node): # END handle single apply except Exception, e: self._exc = e - print str(e) # TODO: REMOVE DEBUG, or make it use logging + print "task error:", str(e) # TODO: REMOVE DEBUG, or make it use logging self.set_done() # unschedule all, we don't know how many have been produced actually # but only if we don't apply single please @@ -150,18 +146,8 @@ class OutputChannelTask(Node): # The count is: 1 = wc itself, 2 = first reader channel, + x for every # thread having its copy on the stack # + 1 for the instance we provide to refcount - if self.is_done() and sys.getrefcount(self._out_wc) < 4: + if self.is_done() and getrefcount(self._out_wc) < 4: self.close() - # additionally, remove ourselves from the pool, this is thread-safe - # Previously the pool collected done tasks and removed them, - # but this could happen after a read finished, potentially - # leaving them on the queue until the read-handle was dropped. - # This should assure its more in-time. - # I don't like this back-ref. - pool = self._pool_ref() - if pool: - pool.del_task(self) - # END remove ourselves from the pool # END handle channel closure #{ Configuration @@ -218,43 +204,15 @@ class InputChannelTask(OutputChannelTask): """Uses an input channel as source for reading items For instantiation, it takes all arguments of its base, the first one needs to be the input channel to read from though.""" - __slots__ = ( - 'in_rc' # channel to read items from - ) def __init__(self, in_rc, *args, **kwargs): OutputChannelTask.__init__(self, *args, **kwargs) - self._in_rc = in_rc - + self._read = in_rc.read + def process(self, count=1): - """Verify our setup, and do some additional checking, before the - base implementation can permanently perform all operations""" - self._read = self._in_rc.read - # make sure we don't trigger the pool if we read from a pool channel which - # belongs to our own pool. Channels from different pools are fine though, - # there we want to trigger its computation - # PROBLEM: if the user keeps an end, but decides to put the same end into - # a task of this pool, then all items might deplete without new ones being - # produced, causing a deadlock. Just triggering the pool would be better, - # but cost's more, unnecessarily if there is just one consumer, which is - # the user. - # * could encode usage in the channel type, and fail if the refcount on - # the read-pool channel is too high - # * maybe keep track of the elements that are requested or in-production - # for each task, which would allow to precisely determine whether - # the pool as to be triggered, and bail out early. Problem would - # be the - # * Perhaps one shouldn't seek the perfect solution , but instead - # document whats working and what not, or under which conditions. - # The whole system is simple, but gets more complicated the - # smarter it wants to be. - if isinstance(self._in_rc, RPoolChannel) and self._in_rc._pool is self._pool_ref(): - self._read = self._in_rc._read - - # permanently install our base for processing - self.process = new.instancemethod(OutputChannelTask.__dict__['process'], self, type(self)) - - # and call it - return OutputChannelTask.process(self, count) + # for now, just blindly read our input, could trigger a pool, even + # ours, but why not ? It should be able to handle this + # TODO: remove this method + super(InputChannelTask, self).process(count) #{ Configuration diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 3077dc32..82947988 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -6,14 +6,17 @@ from git.async.thread import terminate_threads from git.async.util import cpu_count import threading import time +import sys class TestThreadTaskNode(InputIteratorThreadTask): def __init__(self, *args, **kwargs): super(TestThreadTaskNode, self).__init__(*args, **kwargs) - self.reset(self._iterator) self.should_fail = False self.lock = threading.Lock() # yes, can't safely do x = x + 1 :) self.plock = threading.Lock() + self.item_count = 0 + self.process_count = 0 + self._scheduled_items = 0 def do_fun(self, item): self.lock.acquire() @@ -23,13 +26,6 @@ class TestThreadTaskNode(InputIteratorThreadTask): raise AssertionError("I am failing just for the fun of it") return item - def reset(self, iterator): - self.process_count = 0 - self.item_count = 0 - self._exc = None - self._iterator = iterator - self._done = False - def process(self, count=1): # must do it first, otherwise we might read and check results before # the thread gets here :). Its a lesson ! @@ -68,7 +64,7 @@ class TestThreadPool(TestBase): def _add_triple_task(self, p): """Add a triplet of feeder, transformer and finalizer to the pool, like t1 -> t2 -> t3, return all 3 return channels in order""" - t1 = TestThreadTaskNode(make_iter(), 'iterator', None) + # t1 = TestThreadTaskNode(make_task(), 'iterator', None) # TODO: def _assert_single_task(self, p, async=False): @@ -81,12 +77,13 @@ class TestThreadPool(TestBase): assert ni % 2 == 0, "ni needs to be dividable by 2" assert ni % 4 == 0, "ni needs to be dividable by 4" - def make_iter(): - return iter(range(ni)) + def make_task(): + t = TestThreadTaskNode(iter(range(ni)), 'iterator', None) + t.fun = t.do_fun + return t # END utility - task = TestThreadTaskNode(make_iter(), 'iterator', None) - task.fun = task.do_fun + task = make_task() assert p.num_tasks() == null_tasks rc = p.add_task(task) @@ -104,8 +101,9 @@ class TestThreadPool(TestBase): # as the task is done, it should have been removed - we have read everything assert task.is_done() + del(rc) assert p.num_tasks() == null_tasks - task.reset(make_iter()) + task = make_task() # pull individual items rc = p.add_task(task) @@ -126,14 +124,14 @@ class TestThreadPool(TestBase): # it couldn't yet notice that the input is depleted as we pulled exaclty # ni items - the next one would remove it. Instead, we delete our channel # which triggers orphan handling + assert not task.is_done() assert p.num_tasks() == 1 + null_tasks del(rc) assert p.num_tasks() == null_tasks - task.reset(make_iter()) - # test min count # if we query 1 item, it will prepare ni / 2 + task = make_task() task.min_count = ni / 2 rc = p.add_task(task) print "read(1)" @@ -149,6 +147,7 @@ class TestThreadPool(TestBase): print "read(%i)" % nri items = rc.read(nri) assert len(items) == nri + p.del_task(task) assert p.num_tasks() == null_tasks task._assert(2, ni) # two chunks, ni calls @@ -158,31 +157,36 @@ class TestThreadPool(TestBase): # test chunking # we always want 4 chunks, these could go to individual nodes - task.reset(make_iter()) + task = make_task() + task.min_count = ni / 2 # restore previous value task.max_chunksize = ni / 4 # 4 chunks rc = p.add_task(task) + # must read a specific item count # count is still at ni / 2 - here we want more than that # 2 steps with n / 4 items, + 1 step with n/4 items to get + 2 nri = ni / 2 + 2 - print "read(%i)" % nri + print "read(%i) chunksize set" % nri items = rc.read(nri) assert len(items) == nri # have n / 4 - 2 items on queue, want n / 4 in first chunk, cause 1 processing # ( 4 in total ). Still want n / 4 - 2 in second chunk, causing another processing nri = ni / 2 - 2 - print "read(%i)" % nri + print "read(%i) chunksize set" % nri items = rc.read(nri) assert len(items) == nri task._assert( 5, ni) + assert task.is_done() + del(rc) assert p.num_tasks() == null_tasks # depleted # but this only hits if we want too many items, if we want less, it could # still do too much - hence we set the min_count to the same number to enforce # at least ni / 4 items to be preocessed, no matter what we request - task.reset(make_iter()) + task = make_task() task.min_count = None + task.max_chunksize = ni / 4 # match previous setup rc = p.add_task(task) st = time.time() print "read(1) * %i, chunksize set" % ni @@ -203,8 +207,9 @@ class TestThreadPool(TestBase): assert p.num_tasks() == null_tasks # now with we set the minimum count to reduce the number of processing counts - task.reset(make_iter()) + task = make_task() task.min_count = ni / 4 + task.max_chunksize = ni / 4 # match previous setup rc = p.add_task(task) print "read(1) * %i, min_count%i + chunksize" % (ni, task.min_count) for i in range(ni): @@ -220,13 +225,15 @@ class TestThreadPool(TestBase): # test failure # on failure, the processing stops and the task is finished, keeping # his error for later - task.reset(make_iter()) + task = make_task() task.should_fail = True rc = p.add_task(task) print "read(0) with failure" assert len(rc.read()) == 0 # failure on first item print "done with everything" assert isinstance(task.error(), AssertionError) + assert task.is_done() # on error, its marked done as well + del(rc) assert p.num_tasks() == null_tasks -- cgit v1.2.3 From 772b95631916223e472989b43f3a31f61e237f31 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 8 Jun 2010 19:25:33 +0200 Subject: workerthread: adjusted to use a blocking queue, it will receive termination events only with its queue, with boosts performance into brigt green levels --- lib/git/async/channel.py | 3 +-- lib/git/async/pool.py | 56 ++++++++++++++++++++++++++------------------- lib/git/async/task.py | 2 +- lib/git/async/thread.py | 24 +++++++++++++------ lib/git/async/util.py | 1 - test/git/async/test_pool.py | 5 +++- 6 files changed, 56 insertions(+), 35 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 5c52b1dc..c05f7383 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -60,7 +60,6 @@ class WChannel(Channel): # let the queue handle the 'closed' attribute, we write much more often # to an open channel than to a closed one, saving a few cycles try: - print "putting item", item, id(self._queue.queue) self._queue.put(item, block, timeout) except ReadOnly: raise IOError("Cannot write to a closed channel") @@ -76,7 +75,7 @@ class WChannel(Channel): an error""" # yes, close it a little too early, better than having anyone put # additional items - print "closing channel", self + # print "closing channel", self self._closed = True self._queue.set_writable(False) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index a915f7b0..1767c61c 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -1,5 +1,8 @@ """Implementation of a thread-pool working with channels""" -from thread import WorkerThread +from thread import ( + WorkerThread, + StopProcessing, + ) from threading import Lock from util import ( @@ -147,7 +150,7 @@ class Pool(object): used only from the main thread, hence you cannot consume their results from multiple threads unless you use a task for it.""" __slots__ = ( '_tasks', # a graph of tasks - '_workers', # list of worker threads + '_num_workers', # list of workers '_queue', # master queue for tasks '_taskorder_cache', # map task id -> ordered dependent tasks '_taskgraph_lock', # lock for accessing the task graph @@ -169,7 +172,7 @@ class Pool(object): def __init__(self, size=0): self._tasks = Graph() - self._workers = list() + self._num_workers = 0 self._queue = self.TaskQueueCls() self._taskgraph_lock = self.LockCls() self._taskorder_cache = dict() @@ -270,7 +273,7 @@ class Pool(object): # into the loop would be less code, but ... slower # DEBUG # print actual_count, numchunks, chunksize, remainder, task._out_wc.size() - if self._workers: + if self._num_workers: # respect the chunk size, and split the task up if we want # to process too much. This can be defined per task queue = self._queue @@ -323,7 +326,7 @@ class Pool(object): #{ Interface def size(self): """:return: amount of workers in the pool""" - return len(self._workers) + return self._num_workers def set_size(self, size=0): """Set the amount of workers to use in this pool. When reducing the size, @@ -341,34 +344,41 @@ class Pool(object): # either start new threads, or kill existing ones. # If we end up with no threads, we process the remaining chunks on the queue # ourselves - cur_count = len(self._workers) + cur_count = self._num_workers if cur_count < size: - for i in range(size - cur_count): - worker = self.WorkerCls(self._queue) - worker.start() - self._workers.append(worker) - # END for each new worker to create - elif cur_count > size: # we can safely increase the size, even from serial mode, as we would # only be able to do this if the serial ( sync ) mode finished processing. # Just adding more workers is not a problem at all. + add_count = size - cur_count + for i in range(add_count): + print "Add worker" + self.WorkerCls(self._queue).start() + # END for each new worker to create + self._num_workers += add_count + elif cur_count > size: + # We don't care which thread exactly gets hit by our stop request + # On their way, they will consume remaining tasks, but new ones + # could be added as we speak. del_count = cur_count - size for i in range(del_count): - self._workers[i].stop_and_join() + print "stop worker" + self._queue.put((self.WorkerCls.stop, True)) # arg doesnt matter # END for each thread to stop - del(self._workers[:del_count]) + self._num_workers -= del_count # END handle count if size == 0: - while not self._queue.empty(): - try: - taskmethod, count = self._queue.get(False) - taskmethod(count) - except Queue.Empty: - continue - # END while there are tasks on the queue - - self._consumed_tasks = SyncQueue() + # NOTE: we do not preocess any tasks still on the queue, as we ill + # naturally do that once we read the next time, only on the tasks + # that are actually required. The queue will keep the tasks, + # and once we are deleted, they will vanish without additional + # time spend on them. If there shouldn't be any consumers anyway. + # If we should reenable some workers again, they will continue on the + # remaining tasks, probably with nothing to do. + # We can't clear the task queue if we have removed workers + # as they will receive the termination signal through it, and if + # we had added workers, we wouldn't be here ;). + pass # END process queue return self diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 5edd40bb..f9536a45 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -89,7 +89,7 @@ class OutputChannelTask(Node): def process(self, count=0): """Process count items and send the result individually to the output channel""" items = self._read(count) - print "task read", len(items) + # print "task read", len(items) try: # increase the ref-count - we use this to determine whether anyone else # is currently handling our output channel. As this method runs asynchronously, diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 5faad4f8..556b7e92 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -80,6 +80,10 @@ class TerminatableThread(threading.Thread): self._terminated() #} END interface + +class StopProcessing(Exception): + """If thrown in a function processed by a WorkerThread, it will terminate""" + class WorkerThread(TerminatableThread): """ This base allows to call functions on class instances natively. @@ -122,6 +126,11 @@ class WorkerThread(TerminatableThread): self.inq = inq or Queue.Queue() self._current_routine = None # routine we execute right now + @classmethod + def stop(cls, *args): + """If send via the inq of the thread, it will stop once it processed the function""" + raise StopProcessing + def run(self): """Process input tasks until we receive the quit signal""" gettask = self.inq.get @@ -131,12 +140,8 @@ class WorkerThread(TerminatableThread): break # END check for stop request - # don't wait too long, instead check for the termination request more often - try: - tasktuple = gettask(True, 0.25) - except Queue.Empty: - continue - # END get task with timeout + # we wait and block - to terminate, send the 'stop' method + tasktuple = gettask() # needing exactly one function, and one arg assert len(tasktuple) == 2, "Need tuple of function, arg - it could be more flexible, but its reduced to what we need" @@ -158,6 +163,8 @@ class WorkerThread(TerminatableThread): print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) break # END make routine call + except StopProcessing: + break except Exception,e: print "%s: Task %s raised unhandled exception: %s - this really shouldn't happen !" % (self.getName(), str(tasktuple), str(e)) break # abort ... @@ -168,5 +175,8 @@ class WorkerThread(TerminatableThread): """:return: routine we are currently executing, or None if we have no task""" return self._current_routine - + def stop_and_join(self): + """Send stop message to ourselves""" + self.inq.put((self.stop, None)) + super(WorkerThread, self).stop_and_join() #} END classes diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 6d09de59..f3213ed6 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -240,7 +240,6 @@ class AsyncQueue(Queue): # END handle block # can throw if we woke up because we are not writable anymore - print len(q), id(q), current_thread() try: return q.popleft() except IndexError: diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 82947988..756f1562 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -69,11 +69,12 @@ class TestThreadPool(TestBase): def _assert_single_task(self, p, async=False): """Performs testing in a synchronized environment""" + print >> sys.stderr, "Threadpool: Starting single task (async = %i) with %i threads" % (async, p.size()) null_tasks = p.num_tasks() # in case we had some before # add a simple task # it iterates n items - ni = 52 + ni = 1000 assert ni % 2 == 0, "ni needs to be dividable by 2" assert ni % 4 == 0, "ni needs to be dividable by 4" @@ -287,7 +288,9 @@ class TestThreadPool(TestBase): p.set_size(1) assert len(threading.enumerate()) == num_threads + 1 # deleting the pool stops its threads - just to be sure ;) + # Its not synchronized, hence we wait a moment del(p) + time.sleep(0.15) assert len(threading.enumerate()) == num_threads p = ThreadPool(1) -- cgit v1.2.3 From 15941ca090a2c3c987324fc911bbc6f89e941c47 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 10:34:12 +0200 Subject: queue: fixed critical bug in the notify method, as it was not at all thread-safe, causing locks to be released multiple times. Now it runs very fast, and very stable apparently. Now its about putting previous features back in, and studying their results, before more complex task graphs can be examined --- lib/git/async/channel.py | 1 - lib/git/async/pool.py | 30 +++++++++++++-------------- lib/git/async/task.py | 3 +-- lib/git/async/thread.py | 9 +++++--- lib/git/async/util.py | 50 ++++++++++++++++++++++++++++----------------- test/git/async/test_pool.py | 11 +++++----- 6 files changed, 57 insertions(+), 47 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index c05f7383..58c35f96 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -75,7 +75,6 @@ class WChannel(Channel): an error""" # yes, close it a little too early, better than having anyone put # additional items - # print "closing channel", self self._closed = True self._queue.set_writable(False) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 1767c61c..7bddf7da 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -53,9 +53,8 @@ class RPoolChannel(RChannel): # I can't explain, but appears to be normal in the destructor # On the caller side, getrefcount returns 2, as expected if sys.getrefcount(self) < 6: - print "__del__" - self._pool.del_task(self._task) - print "done" + self._pool.remove_task(self._task) + # END handle refcount based removal of task def set_pre_cb(self, fun = lambda count: None): """Install a callback to call with the item count to be read before any @@ -237,12 +236,14 @@ class Pool(object): # the list includes our tasks - the first one to evaluate first, the # requested one last for task in dfirst_tasks: - if task.error() or task.is_done(): + # if task.error() or task.is_done(): # in theory, the should never be consumed task in the pool, right ? - # They delete themselves once they are done. - # TODO: remove this check for performance later - raise AssertionError("Shouldn't have consumed tasks on the pool, they delete themeselves, what happend ?") - #continue + # They delete themselves once they are done. But as we run asynchronously, + # It can be that someone reads, while a task realizes its done, and + # we get here to prepare the read although it already is done. + # Its not a problem though, the task wiill not do anything. + # Hence we don't waste our time with checking for it + # raise AssertionError("Shouldn't have consumed tasks on the pool, they delete themeselves, what happend ?") # END skip processing # if the task does not have the required output on its queue, schedule @@ -316,11 +317,11 @@ class Pool(object): """Called after we processed a read to cleanup""" pass - def _del_task_if_orphaned(self, task): + def _remove_task_if_orphaned(self, task): """Check the task, and delete it if it is orphaned""" # 1 as its stored on the task, 1 for the getrefcount call if sys.getrefcount(task._out_wc) < 3: - self.del_task(task) + self.remove_task(task) #} END internal #{ Interface @@ -351,7 +352,6 @@ class Pool(object): # Just adding more workers is not a problem at all. add_count = size - cur_count for i in range(add_count): - print "Add worker" self.WorkerCls(self._queue).start() # END for each new worker to create self._num_workers += add_count @@ -361,7 +361,6 @@ class Pool(object): # could be added as we speak. del_count = cur_count - size for i in range(del_count): - print "stop worker" self._queue.put((self.WorkerCls.stop, True)) # arg doesnt matter # END for each thread to stop self._num_workers -= del_count @@ -390,7 +389,7 @@ class Pool(object): finally: self._taskgraph_lock.release() - def del_task(self, task): + def remove_task(self, task): """Delete the task Additionally we will remove orphaned tasks, which can be identified if their output channel is only held by themselves, so no one will ever consume @@ -399,7 +398,6 @@ class Pool(object): This method blocks until all tasks to be removed have been processed, if they are currently being processed. :return: self""" - print "del_task: getting lock" self._taskgraph_lock.acquire() try: # it can be that the task is already deleted, but its chunk was on the @@ -407,7 +405,7 @@ class Pool(object): if not task in self._tasks.nodes: return self # END early abort - print "deleting ", id(task) + # the task we are currently deleting could also be processed by # a thread right now. We don't care about it as its taking care about # its write channel itself, and sends everything it can to it. @@ -426,7 +424,7 @@ class Pool(object): # END locked deletion for t in in_tasks: - self._del_task_if_orphaned(t) + self._remove_task_if_orphaned(t) # END handle orphans recursively return self diff --git a/lib/git/async/task.py b/lib/git/async/task.py index f9536a45..f1448f96 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -89,7 +89,6 @@ class OutputChannelTask(Node): def process(self, count=0): """Process count items and send the result individually to the output channel""" items = self._read(count) - # print "task read", len(items) try: # increase the ref-count - we use this to determine whether anyone else # is currently handling our output channel. As this method runs asynchronously, @@ -119,7 +118,7 @@ class OutputChannelTask(Node): # END handle single apply except Exception, e: self._exc = e - print "task error:", str(e) # TODO: REMOVE DEBUG, or make it use logging + print >> sys.stderr, "task error:", str(e) # TODO: REMOVE DEBUG, or make it use logging self.set_done() # unschedule all, we don't know how many have been produced actually # but only if we don't apply single please diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 556b7e92..cd964f1c 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -5,6 +5,8 @@ import threading import inspect import Queue +import sys + #{ Decorators def do_terminate_threads(whitelist=list()): @@ -160,14 +162,15 @@ class WorkerThread(TerminatableThread): rval = routine(arg) else: # ignore unknown items - print "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) + print >> sys.stderr, "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) break # END make routine call except StopProcessing: + print self.name, "stops processing" break except Exception,e: - print "%s: Task %s raised unhandled exception: %s - this really shouldn't happen !" % (self.getName(), str(tasktuple), str(e)) - break # abort ... + print >> sys.stderr, "%s: Task %s raised unhandled exception: %s - this really shouldn't happen !" % (self.getName(), str(tasktuple), str(e)) + continue # just continue # END routine exception handling # END endless loop diff --git a/lib/git/async/util.py b/lib/git/async/util.py index f3213ed6..dff38f58 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -71,18 +71,15 @@ class SyncQueue(deque): class HSCondition(object): - """An attempt to make conditions less blocking, which gains performance - in return by sleeping less""" - # __slots__ = ("acquire", "release", "_lock", '_waiters') + """Cleaned up code of the original condition object in order + to make it run and respond faster.""" __slots__ = ("_lock", '_waiters') - delay = 0.00002 # reduces wait times, but increases overhead + delay = 0.0002 # reduces wait times, but increases overhead def __init__(self, lock=None): if lock is None: lock = Lock() self._lock = lock - #self.acquire = lock.acquire - #self.release = lock.release self._waiters = list() def release(self): @@ -109,6 +106,8 @@ class HSCondition(object): # Balancing act: We can't afford a pure busy loop, because of the # GIL, so we have to sleep # We try to sleep only tiny amounts of time though to be very responsive + # NOTE: this branch is not used by the async system anyway, but + # will be hit when the user reads with timeout endtime = _time() + timeout delay = self.delay acquire = waiter.acquire @@ -133,25 +132,36 @@ class HSCondition(object): finally: # reacquire the lock self.acquire() + # END assure release lock def notify(self, n=1): + """Its vital that this method is threadsafe - to be fast we don'd get a lock, + but instead rely on pseudo-atomic operations that come with the GIL. + Hence we use pop in the n=1 case to be truly atomic. + In the multi-notify case, we acquire a lock just for safety, as otherwise + we might pop too much of someone else notifies n waiters as well, which + would in the worst case lead to double-releases of locks.""" if not self._waiters: return - waiters = self._waiters if n == 1: - waiters[0].release() + # so here we assume this is thead-safe ! It wouldn't be in any other + # language, but python it is. try: - waiters.pop(0) + self._waiters.pop(0).release() except IndexError: pass else: - print "notify", waiters, n - for waiter in waiters[:n]: - waiter.release() - try: - waiters.remove(waiter) - except ValueError: - pass + self.acquire() + # once the waiter resumes, he will want to acquire the lock + # and waits again, but only until we are done, which is important + # to do that in a thread-safe fashion + try: + for i in range(min(n, len(self._waiters))): + self._waiters.pop(0).release() + # END for each waiter to resume + finally: + self.release() + # END assure we release our lock # END handle n = 1 case faster def notify_all(self): @@ -164,7 +174,8 @@ class ReadOnly(Exception): class AsyncQueue(Queue): """A queue using different condition objects to gain multithreading performance. Additionally it has a threadsafe writable flag, which will alert all readers - that there is nothing more to get here.""" + that there is nothing more to get here. + All default-queue code was cleaned up for performance.""" __slots__ = ('mutex', 'not_empty', 'queue', '_writable') def __init__(self, maxsize=0): @@ -222,7 +233,7 @@ class AsyncQueue(Queue): self.not_empty.notify() def get(self, block=True, timeout=None): - self.not_empty.acquire() # == self.mutex.acquire in that case + self.mutex.acquire() q = self.queue try: if block: @@ -246,7 +257,8 @@ class AsyncQueue(Queue): raise Empty # END handle unblocking reason finally: - self.not_empty.release() + self.mutex.release() + # END assure lock is released #} END utilities diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 756f1562..ac8f1244 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -74,7 +74,7 @@ class TestThreadPool(TestBase): # add a simple task # it iterates n items - ni = 1000 + ni = 5000 assert ni % 2 == 0, "ni needs to be dividable by 2" assert ni % 4 == 0, "ni needs to be dividable by 4" @@ -148,7 +148,7 @@ class TestThreadPool(TestBase): print "read(%i)" % nri items = rc.read(nri) assert len(items) == nri - p.del_task(task) + p.remove_task(task) assert p.num_tasks() == null_tasks task._assert(2, ni) # two chunks, ni calls @@ -204,7 +204,7 @@ class TestThreadPool(TestBase): task._assert(ni, ni) assert p.num_tasks() == 1 + null_tasks - assert p.del_task(task) is p # del manually this time + assert p.remove_task(task) is p # del manually this time assert p.num_tasks() == null_tasks # now with we set the minimum count to reduce the number of processing counts @@ -231,7 +231,7 @@ class TestThreadPool(TestBase): rc = p.add_task(task) print "read(0) with failure" assert len(rc.read()) == 0 # failure on first item - print "done with everything" + print >> sys.stderr, "done with everything" assert isinstance(task.error(), AssertionError) assert task.is_done() # on error, its marked done as well del(rc) @@ -290,7 +290,7 @@ class TestThreadPool(TestBase): # deleting the pool stops its threads - just to be sure ;) # Its not synchronized, hence we wait a moment del(p) - time.sleep(0.15) + time.sleep(0.25) assert len(threading.enumerate()) == num_threads p = ThreadPool(1) @@ -311,7 +311,6 @@ class TestThreadPool(TestBase): # threads per core p.set_size(4) self._assert_single_task(p, True) - # DEPENDENT TASK ASYNC MODE ########################### self._assert_async_dependent_tasks(p) -- cgit v1.2.3 From f2c8d26d3b25b864ad48e6de018757266b59f708 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 11:28:37 +0200 Subject: thread: fixed initialization problem if an empty iterable was handed in queue: Queue now derives from deque directly, which safes one dict lookup as the queue does not need to be accessed through self anymore pool test improved to better verify threads are started correctly --- lib/git/async/pool.py | 5 ++--- lib/git/async/task.py | 23 ++++++++++++++++++++++- lib/git/async/thread.py | 8 ++++++-- lib/git/async/util.py | 19 ++++++++----------- test/git/async/test_pool.py | 17 +++++++++++++---- 5 files changed, 51 insertions(+), 21 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 7bddf7da..7ed6fd8e 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -331,9 +331,8 @@ class Pool(object): def set_size(self, size=0): """Set the amount of workers to use in this pool. When reducing the size, - the call may block as it waits for threads to finish. - When reducing the size to zero, this thread will process all remaining - items on the queue. + threads will continue with their work until they are done before effectively + being removed. :return: self :param size: if 0, the pool will do all work itself in the calling thread, diff --git a/lib/git/async/task.py b/lib/git/async/task.py index f1448f96..dd2bd351 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -1,4 +1,5 @@ from graph import Node +from util import ReadOnly import threading import sys @@ -117,8 +118,9 @@ class OutputChannelTask(Node): wc.write(rval) # END handle single apply except Exception, e: - self._exc = e print >> sys.stderr, "task error:", str(e) # TODO: REMOVE DEBUG, or make it use logging + + # be sure our task is not scheduled again self.set_done() # unschedule all, we don't know how many have been produced actually # but only if we don't apply single please @@ -127,6 +129,25 @@ class OutputChannelTask(Node): self._scheduled_items -= len(items) self._slock.release() # END unschedule all + + # PROBLEM: We have failed to create at least one item, hence its not + # garantueed that enough items will be produced for a possibly blocking + # client on the other end. This is why we have no other choice but + # to close the channel, preventing the possibility of blocking. + # This implies that dependent tasks will go down with us, but that is + # just the right thing to do of course - one loose link in the chain ... + # Other chunks of our kind currently being processed will then + # fail to write to the channel and fail as well + # self.close() + + # If some other chunk of our Task had an error, the channel will be closed + # This is not an issue, just be sure we don't overwrite the original + # exception with the ReadOnly error that would be emitted in that case. + # We imply that ReadOnly is exclusive to us, as it won't be an error + # if the user emits it + if not isinstance(e, ReadOnly): + self._exc = e + # END set error flag # END exception handling del(wc) diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index cd964f1c..faeda04f 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -125,7 +125,9 @@ class WorkerThread(TerminatableThread): def __init__(self, inq = None): super(WorkerThread, self).__init__() - self.inq = inq or Queue.Queue() + self.inq = inq + if inq is None: + self.inq = Queue.Queue() self._current_routine = None # routine we execute right now @classmethod @@ -135,6 +137,8 @@ class WorkerThread(TerminatableThread): def run(self): """Process input tasks until we receive the quit signal""" + print self.name, "starts processing" # DEBUG + gettask = self.inq.get while True: self._current_routine = None @@ -166,7 +170,7 @@ class WorkerThread(TerminatableThread): break # END make routine call except StopProcessing: - print self.name, "stops processing" + print self.name, "stops processing" # DEBUG break except Exception,e: print >> sys.stderr, "%s: Task %s raised unhandled exception: %s - this really shouldn't happen !" % (self.getName(), str(tasktuple), str(e)) diff --git a/lib/git/async/util.py b/lib/git/async/util.py index dff38f58..b5e1a0c0 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -10,7 +10,6 @@ from threading import ( ) from Queue import ( - Queue, Empty, ) @@ -171,15 +170,14 @@ class HSCondition(object): class ReadOnly(Exception): """Thrown when trying to write to a read-only queue""" -class AsyncQueue(Queue): +class AsyncQueue(deque): """A queue using different condition objects to gain multithreading performance. Additionally it has a threadsafe writable flag, which will alert all readers that there is nothing more to get here. All default-queue code was cleaned up for performance.""" - __slots__ = ('mutex', 'not_empty', 'queue', '_writable') + __slots__ = ('mutex', 'not_empty', '_writable') def __init__(self, maxsize=0): - self.queue = deque() self.mutex = Lock() self.not_empty = HSCondition(self.mutex) self._writable = True @@ -187,7 +185,7 @@ class AsyncQueue(Queue): def qsize(self): self.mutex.acquire() try: - return len(self.queue) + return len(self) finally: self.mutex.release() @@ -218,7 +216,7 @@ class AsyncQueue(Queue): def empty(self): self.mutex.acquire() try: - return not len(self.queue) + return not len(self) finally: self.mutex.release() @@ -228,21 +226,20 @@ class AsyncQueue(Queue): self.mutex.release() raise ReadOnly # END handle read-only - self.queue.append(item) + self.append(item) self.mutex.release() self.not_empty.notify() def get(self, block=True, timeout=None): self.mutex.acquire() - q = self.queue try: if block: if timeout is None: - while not len(q) and self._writable: + while not len(self) and self._writable: self.not_empty.wait() else: endtime = _time() + timeout - while not len(q) and self._writable: + while not len(self) and self._writable: remaining = endtime - _time() if remaining <= 0.0: raise Empty @@ -252,7 +249,7 @@ class AsyncQueue(Queue): # can throw if we woke up because we are not writable anymore try: - return q.popleft() + return self.popleft() except IndexError: raise Empty # END handle unblocking reason diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index ac8f1244..d38cbebd 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -253,13 +253,22 @@ class TestThreadPool(TestBase): assert p.size() == 0 # increase and decrease the size + num_threads = len(threading.enumerate()) for i in range(self.max_threads): p.set_size(i) assert p.size() == i + assert len(threading.enumerate()) == num_threads + i + for i in range(self.max_threads, -1, -1): p.set_size(i) assert p.size() == i - + + assert p.size() == 0 + # threads should be killed already, but we let them a tiny amount of time + # just to be sure + time.sleep(0.05) + assert len(threading.enumerate()) == num_threads + # SINGLE TASK SERIAL SYNC MODE ############################## # put a few unrelated tasks that we forget about @@ -268,7 +277,6 @@ class TestThreadPool(TestBase): assert p.num_tasks() == 2 ## SINGLE TASK ################# - assert p.size() == 0 self._assert_single_task(p, False) assert p.num_tasks() == 2 del(urc1) @@ -281,11 +289,12 @@ class TestThreadPool(TestBase): self._assert_async_dependent_tasks(p) - # SINGLE TASK THREADED SYNC MODE + # SINGLE TASK THREADED ASYNC MODE ################################ # step one gear up - just one thread for now. - num_threads = len(threading.enumerate()) p.set_size(1) + assert p.size() == 1 + print len(threading.enumerate()), num_threads assert len(threading.enumerate()) == num_threads + 1 # deleting the pool stops its threads - just to be sure ;) # Its not synchronized, hence we wait a moment -- cgit v1.2.3 From 2054561da184955c4be4a92f0b4fa5c5c1c01350 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 11:31:24 +0200 Subject: HSCondition: using a deck to store waiters, for further speedup --- lib/git/async/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/git/async/util.py b/lib/git/async/util.py index b5e1a0c0..2c18a1b9 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -79,7 +79,7 @@ class HSCondition(object): if lock is None: lock = Lock() self._lock = lock - self._waiters = list() + self._waiters = deque() def release(self): self._lock.release() @@ -146,7 +146,7 @@ class HSCondition(object): # so here we assume this is thead-safe ! It wouldn't be in any other # language, but python it is. try: - self._waiters.pop(0).release() + self._waiters.popleft().release() except IndexError: pass else: @@ -156,7 +156,7 @@ class HSCondition(object): # to do that in a thread-safe fashion try: for i in range(min(n, len(self._waiters))): - self._waiters.pop(0).release() + self._waiters.popleft().release() # END for each waiter to resume finally: self.release() -- cgit v1.2.3 From 1090701721888474d34f8a4af28ee1bb1c3fdaaa Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 11:35:41 +0200 Subject: HSCondition: now deriving from deque, as the AsyncQeue does, to elimitate one more level of indirection. Clearly this not good from a design standpoint, as a Condition is no Deque, but it helps speeding things up which is what this is about. Could make it a hidden class to indicate how 'special' it is --- lib/git/async/util.py | 19 +++++++++---------- test/git/async/test_pool.py | 8 ++++---- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 2c18a1b9..ffdb14a2 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -69,17 +69,16 @@ class SyncQueue(deque): put = deque.append -class HSCondition(object): +class HSCondition(deque): """Cleaned up code of the original condition object in order to make it run and respond faster.""" - __slots__ = ("_lock", '_waiters') + __slots__ = ("_lock") delay = 0.0002 # reduces wait times, but increases overhead def __init__(self, lock=None): if lock is None: lock = Lock() self._lock = lock - self._waiters = deque() def release(self): self._lock.release() @@ -93,7 +92,7 @@ class HSCondition(object): def wait(self, timeout=None): waiter = _allocate_lock() waiter.acquire() # get it the first time, no blocking - self._waiters.append(waiter) + self.append(waiter) # in the momemnt we release our lock, someone else might actually resume self.release() @@ -124,7 +123,7 @@ class HSCondition(object): # END endless loop if not gotit: try: - self._waiters.remove(waiter) + self.remove(waiter) except ValueError: pass # END didn't ever get it @@ -140,13 +139,13 @@ class HSCondition(object): In the multi-notify case, we acquire a lock just for safety, as otherwise we might pop too much of someone else notifies n waiters as well, which would in the worst case lead to double-releases of locks.""" - if not self._waiters: + if not self: return if n == 1: # so here we assume this is thead-safe ! It wouldn't be in any other # language, but python it is. try: - self._waiters.popleft().release() + self.popleft().release() except IndexError: pass else: @@ -155,8 +154,8 @@ class HSCondition(object): # and waits again, but only until we are done, which is important # to do that in a thread-safe fashion try: - for i in range(min(n, len(self._waiters))): - self._waiters.popleft().release() + for i in range(min(n, len(self))): + self.popleft().release() # END for each waiter to resume finally: self.release() @@ -164,7 +163,7 @@ class HSCondition(object): # END handle n = 1 case faster def notify_all(self): - self.notify(len(self._waiters)) + self.notify(len(self)) class ReadOnly(Exception): diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index d38cbebd..dacbf0be 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -289,8 +289,8 @@ class TestThreadPool(TestBase): self._assert_async_dependent_tasks(p) - # SINGLE TASK THREADED ASYNC MODE - ################################ + # SINGLE TASK THREADED ASYNC MODE ( 1 thread ) + ############################################## # step one gear up - just one thread for now. p.set_size(1) assert p.size() == 1 @@ -310,8 +310,8 @@ class TestThreadPool(TestBase): - # SINGLE TASK ASYNC MODE - ######################## + # SINGLE TASK ASYNC MODE ( 2 threads ) + ###################################### # two threads to compete for a single task p.set_size(2) self._assert_single_task(p, True) -- cgit v1.2.3 From a988e6985849e4f6a561b4a5468d525c25ce74fe Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 11:45:25 +0200 Subject: HSCondition: now gets a lock even in the single-notify case, as it was required due to the non-atomiciy of the invovled operation. Removed one level of indirection for the lock, by refraining from calling my own 'wrapper' methods, which brought it back to the performance it had before the locking was introduced for the n==1 case --- lib/git/async/util.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/lib/git/async/util.py b/lib/git/async/util.py index ffdb14a2..008e60a3 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -95,7 +95,7 @@ class HSCondition(deque): self.append(waiter) # in the momemnt we release our lock, someone else might actually resume - self.release() + self._lock.release() try: # restore state no matter what (e.g., KeyboardInterrupt) # now we block, as we hold the lock already if timeout is None: @@ -129,7 +129,7 @@ class HSCondition(deque): # END didn't ever get it finally: # reacquire the lock - self.acquire() + self._lock.acquire() # END assure release lock def notify(self, n=1): @@ -144,12 +144,23 @@ class HSCondition(deque): if n == 1: # so here we assume this is thead-safe ! It wouldn't be in any other # language, but python it is. + # But ... its two objects here - first the popleft, then the relasecall. + # If the timing is really really bad, and that happens if you let it + # run often enough ( its a matter of statistics ), this will fail, + # which is why we lock it. + # And yes, this causes some slow down, as single notifications happen + # alot + self._lock.acquire() try: - self.popleft().release() - except IndexError: - pass + try: + self.popleft().release() + except IndexError: + pass + finally: + self._lock.release() + # END assure lock is released else: - self.acquire() + self._lock.acquire() # once the waiter resumes, he will want to acquire the lock # and waits again, but only until we are done, which is important # to do that in a thread-safe fashion @@ -158,7 +169,7 @@ class HSCondition(deque): self.popleft().release() # END for each waiter to resume finally: - self.release() + self._lock.release() # END assure we release our lock # END handle n = 1 case faster -- cgit v1.2.3 From 4e6bece08aea01859a232e99a1e1ad8cc1eb7d36 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 14:01:51 +0200 Subject: HSCondition: Fixed terrible bug which it inherited from its default python Condition implementation, related to the notify method not being treadsafe. Although I was aware of it, I missed the first check which tests for the size - the result could be incorrect if the whole method wasn't locked. Testing runs stable now, allowing to move on \! --- lib/git/async/pool.py | 7 +++++-- lib/git/async/task.py | 2 +- lib/git/async/util.py | 47 +++++++++++++++------------------------------ test/git/async/test_pool.py | 17 ++++++++++++++-- 4 files changed, 36 insertions(+), 37 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 7ed6fd8e..66a2a105 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -58,14 +58,17 @@ class RPoolChannel(RChannel): def set_pre_cb(self, fun = lambda count: None): """Install a callback to call with the item count to be read before any - item is actually read from the channel. + item is actually read from the channel. The call must be threadsafe if + the channel is passed to more than one tasks. If it fails, the read will fail with an IOError If a function is not provided, the call is effectively uninstalled.""" self._pre_cb = fun def set_post_cb(self, fun = lambda item: item): """Install a callback to call after the items were read. The function - returns a possibly changed item list. If it raises, the exception will be propagated. + returns a possibly changed item list.The call must be threadsafe if + the channel is passed to more than one tasks. + If it raises, the exception will be propagated. If a function is not provided, the call is effectively uninstalled.""" self._post_cb = fun diff --git a/lib/git/async/task.py b/lib/git/async/task.py index dd2bd351..d18cedca 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -138,7 +138,7 @@ class OutputChannelTask(Node): # just the right thing to do of course - one loose link in the chain ... # Other chunks of our kind currently being processed will then # fail to write to the channel and fail as well - # self.close() + self.close() # If some other chunk of our Task had an error, the channel will be closed # This is not an issue, just be sure we don't overwrite the original diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 008e60a3..2f46d55f 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -133,45 +133,28 @@ class HSCondition(deque): # END assure release lock def notify(self, n=1): - """Its vital that this method is threadsafe - to be fast we don'd get a lock, - but instead rely on pseudo-atomic operations that come with the GIL. - Hence we use pop in the n=1 case to be truly atomic. - In the multi-notify case, we acquire a lock just for safety, as otherwise - we might pop too much of someone else notifies n waiters as well, which - would in the worst case lead to double-releases of locks.""" - if not self: - return - if n == 1: - # so here we assume this is thead-safe ! It wouldn't be in any other - # language, but python it is. - # But ... its two objects here - first the popleft, then the relasecall. - # If the timing is really really bad, and that happens if you let it - # run often enough ( its a matter of statistics ), this will fail, - # which is why we lock it. - # And yes, this causes some slow down, as single notifications happen - # alot - self._lock.acquire() - try: + """Its vital that this method is threadsafe - we absolutely have to + get a lock at the beginning of this method to be sure we get the + correct amount of waiters back. If we bail out, although a waiter + is about to be added, it will miss its wakeup notification, and block + forever (possibly)""" + self._lock.acquire() + try: + if not self: # len(self) == 0, but this should be faster + return + if n == 1: try: self.popleft().release() except IndexError: pass - finally: - self._lock.release() - # END assure lock is released - else: - self._lock.acquire() - # once the waiter resumes, he will want to acquire the lock - # and waits again, but only until we are done, which is important - # to do that in a thread-safe fashion - try: + else: for i in range(min(n, len(self))): self.popleft().release() # END for each waiter to resume - finally: - self._lock.release() - # END assure we release our lock - # END handle n = 1 case faster + # END handle n = 1 case faster + finally: + self._lock.release() + # END assure lock is released def notify_all(self): self.notify(len(self)) diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index dacbf0be..cccafddc 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -98,7 +98,8 @@ class TestThreadPool(TestBase): items = rc.read() assert len(items) == ni task._assert(1, ni) - assert items[0] == 0 and items[-1] == ni-1 + if not async: + assert items[0] == 0 and items[-1] == ni-1 # as the task is done, it should have been removed - we have read everything assert task.is_done() @@ -152,8 +153,14 @@ class TestThreadPool(TestBase): assert p.num_tasks() == null_tasks task._assert(2, ni) # two chunks, ni calls - # its already done, gives us no more + # its already done, gives us no more, its still okay to use it though + # as a task doesn't have to be in the graph to allow reading its produced + # items print "read(0) on closed" + # it can happen that a thread closes the channel just a tiny fraction of time + # after we check this, so the test fails, although it is nearly closed. + # When we start reading, we should wake up once it sends its signal + # assert task.is_closed() assert len(rc.read()) == 0 # test chunking @@ -231,12 +238,18 @@ class TestThreadPool(TestBase): rc = p.add_task(task) print "read(0) with failure" assert len(rc.read()) == 0 # failure on first item + print >> sys.stderr, "done with everything" + assert isinstance(task.error(), AssertionError) assert task.is_done() # on error, its marked done as well del(rc) assert p.num_tasks() == null_tasks + # test failure after ni / 2 items + # This makes sure it correctly closes the channel on failure to prevent blocking + + def _assert_async_dependent_tasks(self, p): # includes failure in center task, 'recursive' orphan cleanup -- cgit v1.2.3 From 0974f8737a3c56a7c076f9d0b757c6cb106324fb Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 14:47:41 +0200 Subject: Channel: Read method revised - now it really really doesn't block anymore, and it runs faster as well, about 2/3 of the performance we have when being in serial mode --- lib/git/async/channel.py | 85 ++++++++++++++++++------------------------ lib/git/async/task.py | 2 +- test/git/async/test_channel.py | 10 ++--- test/git/async/test_pool.py | 4 ++ 4 files changed, 46 insertions(+), 55 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 58c35f96..3a277e7e 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -38,12 +38,11 @@ class Channel(object): class WChannel(Channel): """The write end of a channel""" - __slots__ = ('_closed', '_queue') + __slots__ = ('_queue') def __init__(self): """initialize this instance, able to hold max_items at once Write calls will block if the channel is full, until someone reads from it""" - self._closed = False self._queue = AsyncQueue() @@ -55,15 +54,10 @@ class WChannel(Channel): :param block: If True, the call will block until there is free space in the channel :param timeout: timeout in seconds for blocking calls. - :raise IOError: when writing into closed file - :raise EOFError: when writing into a non-blocking full channel""" + :raise ReadOnly: when writing into closed channel""" # let the queue handle the 'closed' attribute, we write much more often # to an open channel than to a closed one, saving a few cycles - try: - self._queue.put(item, block, timeout) - except ReadOnly: - raise IOError("Cannot write to a closed channel") - # END exception handling + self._queue.put(item, block, timeout) def size(self): """:return: approximate number of items that could be read from the read-ends @@ -73,15 +67,11 @@ class WChannel(Channel): def close(self): """Close the channel. Multiple close calls on a closed channel are no an error""" - # yes, close it a little too early, better than having anyone put - # additional items - self._closed = True self._queue.set_writable(False) - @property def closed(self): """:return: True if the channel was closed""" - return self._closed + return not self._queue.writable() #} END interface @@ -104,6 +94,7 @@ class RChannel(Channel): :param block: if True, the call will block until an item is available :param timeout: if positive and block is True, it will block only for the given amount of seconds, returning the items it received so far. + The timeout is applied to each read item, not for the whole operation. :return: single item in a list if count is 1, or a list of count items. If the channel was empty and count was 1, an empty list will be returned. If count was greater 1, a list with less than count items will be @@ -112,9 +103,11 @@ class RChannel(Channel): returned.""" # if the channel is closed for writing, we never block # NOTE: is handled by the queue - if self._wc.closed or timeout == 0: - block = False - + # We don't check for a closed state here has it costs time - most of + # the time, it will not be closed, and will bail out automatically once + # it gets closed + + # in non-blocking mode, its all not a problem out = list() queue = self._wc._queue @@ -142,42 +135,38 @@ class RChannel(Channel): count = sys.maxint # END handle count - endtime = sys.maxint # allows timeout for whole operation - if timeout is not None: - endtime = time() + timeout - # could be improved by a separate: no-endtime branch, saving the time calls - for i in xrange(count): + i = 0 + while i < count: try: out.append(queue.get(block, timeout)) + i += 1 except Empty: - # here we are only if there is nothing on the queue, - # and if we are blocking. If we are not blocking, this - # indiccates that the queue was set unwritable in the meanwhile. - # hence we can abort now to prevent reading (possibly) forever - # Besides, this is racy as all threads will rip on the channel - # without waiting until its empty - if not block: - break - # END ignore empty - - # if we have been unblocked because the closed state changed - # in the meanwhile, stop trying - # NOTE: must NOT cache _wc - if self._wc.closed: - # If we were closed, we drop out even if there might still - # be items. Now its time to get these items, according to - # our count. Just switch to unblocking mode. - # If we are to read unlimited items, this would run forever, - # but the EmptyException handler takes care of this - block = False + # here we are only if + # someone woke us up to inform us about the queue that changed + # its writable state + # The following branch checks for closed channels, and pulls + # as many items as we need and as possible, before + # leaving the loop. + if not queue.writable(): + try: + while i < count: + out.append(queue.get(False, None)) + i += 1 + # END count loop + except Empty: + break # out of count loop + # END handle absolutely empty queue + # END handle closed channel - # we don't continue, but let the timer decide whether - # it wants to abort - # END handle channel cloased - - if time() >= endtime: + # if we are here, we woke up and the channel is not closed + # Either the queue became writable again, which currently shouldn't + # be able to happen in the channel, or someone read with a timeout + # that actually timed out. + # As it timed out, which is the only reason we are here, + # we have to abort break - # END stop operation on timeout + # END ignore empty + # END for each item # END handle blocking return out diff --git a/lib/git/async/task.py b/lib/git/async/task.py index d18cedca..539b240f 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -66,7 +66,7 @@ class OutputChannelTask(Node): def is_closed(self): """:return: True if the task's write channel is closed""" - return self._out_wc.closed + return self._out_wc.closed() def error(self): """:return: Exception caught during last processing or None""" diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index ab4ae015..32458f31 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -33,18 +33,16 @@ class TestChannels(TestBase): assert time.time() - st >= to # writing to a closed channel raises - assert not wc.closed + assert not wc.closed() wc.close() - assert wc.closed + assert wc.closed() wc.close() # fine - assert wc.closed + assert wc.closed() - self.failUnlessRaises(IOError, wc.write, 1) + self.failUnlessRaises(ReadOnly, wc.write, 1) # reading from a closed channel never blocks - print "preblock" assert len(rc.read()) == 0 - print "got read(0)" assert len(rc.read(5)) == 0 assert len(rc.read(1)) == 0 diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index cccafddc..202fdb66 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -57,6 +57,10 @@ class TestThreadTaskNode(InputIteratorThreadTask): return self +class TestThreadFailureNode(TestThreadTaskNode): + """Fails after X items""" + + class TestThreadPool(TestBase): max_threads = cpu_count() -- cgit v1.2.3 From 57a4e09294230a36cc874a6272c71757c48139f2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 15:29:47 +0200 Subject: Channel: removed pseudoconstructor, which clearly improves the design and makes it easier to constomize pool: in serial mode, created channels will be serial-only, which brings 15% of performance --- lib/git/async/channel.py | 36 +++++++++++++++++++++++------------- lib/git/async/pool.py | 36 +++++++++++++++++++++++------------- lib/git/async/util.py | 9 ++++++++- test/git/async/test_channel.py | 6 ++---- 4 files changed, 56 insertions(+), 31 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 3a277e7e..bb118f30 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -6,6 +6,7 @@ from Queue import ( from util import ( AsyncQueue, + SyncQueue, ReadOnly ) @@ -24,27 +25,19 @@ class Channel(object): Create a new channel """ __slots__ = tuple() - - def __new__(cls, *args): - if cls is Channel: - if len(args) > 0: - raise ValueError("Cannot take any arguments when creating a new channel") - wc = WChannel() - rc = RChannel(wc) - return wc, rc - # END constructor mode - return object.__new__(cls) class WChannel(Channel): - """The write end of a channel""" + """The write end of a channel - it is thread-safe""" __slots__ = ('_queue') + # The queue to use to store the actual data + QueueCls = AsyncQueue + def __init__(self): """initialize this instance, able to hold max_items at once Write calls will block if the channel is full, until someone reads from it""" - self._queue = AsyncQueue() - + self._queue = self.QueueCls() #{ Interface def write(self, item, block=True, timeout=None): @@ -75,6 +68,12 @@ class WChannel(Channel): #} END interface +class SerialWChannel(WChannel): + """A slightly faster version of a WChannel, which sacrificed thead-safety for + performance""" + QueueCls = SyncQueue + + class RChannel(Channel): """The read-end of a corresponding write channel""" __slots__ = '_wc' @@ -174,3 +173,14 @@ class RChannel(Channel): #} END interface #} END classes + +#{ Constructors +def mkchannel(wctype = WChannel, rctype = RChannel): + """Create a channel, which consists of one write end and one read end + :return: tuple(write_channel, read_channel) + :param wctype: The type of the write channel to instantiate + :param rctype: The type of the read channel to instantiate""" + wc = wctype() + rc = rctype(wc) + return wc, rc +#} END constructors diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 66a2a105..549c801e 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -6,7 +6,6 @@ from thread import ( from threading import Lock from util import ( - SyncQueue, AsyncQueue, DummyLock ) @@ -19,8 +18,9 @@ from Queue import ( from graph import Graph from channel import ( - Channel, + mkchannel, WChannel, + SerialWChannel, RChannel ) @@ -329,7 +329,8 @@ class Pool(object): #{ Interface def size(self): - """:return: amount of workers in the pool""" + """:return: amount of workers in the pool + :note: method is not threadsafe !""" return self._num_workers def set_size(self, size=0): @@ -339,7 +340,9 @@ class Pool(object): :return: self :param size: if 0, the pool will do all work itself in the calling thread, - otherwise the work will be distributed among the given amount of threads + otherwise the work will be distributed among the given amount of threads. + If the size is 0, newly added tasks will use channels which are NOT + threadsafe to optimize item throughput. :note: currently NOT threadsafe !""" assert size > -1, "Size cannot be negative" @@ -437,17 +440,29 @@ class Pool(object): the task will be considered orphaned and will be deleted on the next occasion.""" # create a write channel for it - wc, rc = Channel() - rc = RPoolChannel(wc, task, self) - task.set_wc(wc) + wctype = WChannel self._taskgraph_lock.acquire() try: self._taskorder_cache.clear() self._tasks.add_node(task) + + # fix locks - in serial mode, the task does not need real locks + # Additionally, use a non-threadsafe queue + # This brings about 15% more performance, but sacrifices thread-safety + # when reading from multiple threads. + if self.size() == 0: + task._slock = DummyLock() + wctype = SerialWChannel + # END improve locks + + # setup the tasks channel + wc = wctype() + rc = RPoolChannel(wc, task, self) + task.set_wc(wc) finally: self._taskgraph_lock.release() - # END sync task addition + # END sync task addition # If the input channel is one of our read channels, we add the relation if isinstance(task, InputChannelTask): @@ -462,11 +477,6 @@ class Pool(object): # END add task relation # END handle input channels for connections - # fix locks - in serial mode, the task does not need real locks - if self.size() == 0: - task._slock = DummyLock() - # END improve locks - return rc #} END interface diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 2f46d55f..00d0dbab 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -66,7 +66,14 @@ class SyncQueue(deque): def empty(self): return len(self) == 0 - put = deque.append + def set_writable(self, state): + pass + + def writable(self): + return True + + def put(self, item, block=True, timeout=None): + self.append(item) class HSCondition(deque): diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index 32458f31..444a076a 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -8,12 +8,10 @@ class TestChannels(TestBase): def test_base(self): # creating channel yields a write and a read channal - wc, rc = Channel() - assert isinstance(wc, WChannel) + wc, rc = mkchannel() + assert isinstance(wc, WChannel) # default args assert isinstance(rc, RChannel) - # everything else fails - self.failUnlessRaises(ValueError, Channel, 1, "too many args") # TEST UNLIMITED SIZE CHANNEL - writing+reading is FIFO item = 1 -- cgit v1.2.3 From 07996a1a1e53ffdd2680d4bfbc2f4059687859a5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 15:40:51 +0200 Subject: task: removed scheduled task support, which at some point was introduced to improve performance, but which now hinders performance, besides being unnecessary ;) --- lib/git/async/pool.py | 43 +++++++++++-------------------------------- lib/git/async/task.py | 33 --------------------------------- 2 files changed, 11 insertions(+), 65 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 549c801e..284c41c7 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -80,27 +80,21 @@ class RPoolChannel(RChannel): self._pre_cb() # END pre callback - # if we have count items, don't do any queue preparation - if someone - # depletes the queue in the meanwhile, the channel will close and - # we will unblock naturally - # PROBLEM: If there are multiple consumer of this channel, we might - # run out of items without being replenished == block forever in the - # worst case. task.min_count could have triggered to produce more ... - # usually per read with n items, we put n items on to the queue, - # so we wouldn't check this - # Even if we have just one consumer ( we could determine that with - # the reference count ), it could be that in one moment we don't yet - # have an item, but its currently being produced by some worker. - # This is why we: - # * make no assumptions if there are multiple consumers - # * + # NOTE: we always queue the operation that would give us count items + # as tracking the scheduled items or testing the channels size + # is in herently unsafe depending on the design of the task network + # If we put on tasks onto the queue for every request, we are sure + # to always produce enough items, even if the task.min_count actually + # provided enough - its better to have some possibly empty task runs + # than having and empty queue that blocks. + + # NOTE: TODO: that case is only possible if one Task could be connected + # to multiple input channels in a manner known by the system. Currently + # this is not possible, but should be implemented at some point # if the user tries to use us to read from a done task, we will never # compute as all produced items are already in the channel skip_compute = self._task.is_done() or self._task.error() - #if count > 0: - # skip_compute = self._task.scheduled_item_count() >= count or self._wc._queue.qsize() >= count - # END ########## prepare ############################## if not skip_compute: @@ -249,13 +243,6 @@ class Pool(object): # raise AssertionError("Shouldn't have consumed tasks on the pool, they delete themeselves, what happend ?") # END skip processing - # if the task does not have the required output on its queue, schedule - # it for processing. If we should process all, we don't care about the - # amount as it should process until its all done. - #if count > 1 and task._out_wc.size() >= count: - # continue - # END skip if we have enough - # but use the actual count to produce the output, we may produce # more than requested numchunks = 1 @@ -283,33 +270,26 @@ class Pool(object): queue = self._queue if numchunks > 1: for i in xrange(numchunks): - # schedule them as early as we know about them - task.add_scheduled_items(chunksize) queue.put((task.process, chunksize)) # END for each chunk to put else: - task.add_scheduled_items(chunksize) queue.put((task.process, chunksize)) # END try efficient looping if remainder: - task.add_scheduled_items(remainder) queue.put((task.process, remainder)) # END handle chunksize else: # no workers, so we have to do the work ourselves if numchunks > 1: for i in xrange(numchunks): - task.add_scheduled_items(chunksize) task.process(chunksize) # END for each chunk to put else: - task.add_scheduled_items(chunksize) task.process(chunksize) # END try efficient looping if remainder: - task.add_scheduled_items(remainder) task.process(remainder) # END handle chunksize # END handle serial mode @@ -452,7 +432,6 @@ class Pool(object): # This brings about 15% more performance, but sacrifices thread-safety # when reading from multiple threads. if self.size() == 0: - task._slock = DummyLock() wctype = SerialWChannel # END improve locks diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 539b240f..be02cfe8 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -23,8 +23,6 @@ class OutputChannelTask(Node): '_out_wc', # output write channel '_exc', # exception caught '_done', # True if we are done - '_scheduled_items', # amount of scheduled items that will be processed in total - '_slock', # lock for scheduled items 'fun', # function to call with items read 'min_count', # minimum amount of items to produce, None means no override 'max_chunksize', # maximium amount of items to process per process call @@ -37,8 +35,6 @@ class OutputChannelTask(Node): self._out_wc = None # to be set later self._exc = None self._done = False - self._scheduled_items = 0 - self._slock = threading.Lock() self.fun = fun self.min_count = None self.max_chunksize = 0 # note set @@ -72,21 +68,6 @@ class OutputChannelTask(Node): """:return: Exception caught during last processing or None""" return self._exc - def add_scheduled_items(self, count): - """Add the given amount of scheduled items to this task""" - self._slock.acquire() - self._scheduled_items += count - self._slock.release() - - def scheduled_item_count(self): - """:return: amount of scheduled items for this task""" - self._slock.acquire() - try: - return self._scheduled_items - finally: - self._slock.release() - # END threadsafe return - def process(self, count=0): """Process count items and send the result individually to the output channel""" items = self._read(count) @@ -101,19 +82,12 @@ class OutputChannelTask(Node): if self.apply_single: for item in items: rval = self.fun(item) - # decrement afterwards, the its unscheduled once its produced - self._slock.acquire() - self._scheduled_items -= 1 - self._slock.release() wc.write(rval) # END for each item else: # shouldn't apply single be the default anyway ? # The task designers should chunk them up in advance rvals = self.fun(items) - self._slock.acquire() - self._scheduled_items -= len(items) - self._slock.release() for rval in rvals: wc.write(rval) # END handle single apply @@ -122,13 +96,6 @@ class OutputChannelTask(Node): # be sure our task is not scheduled again self.set_done() - # unschedule all, we don't know how many have been produced actually - # but only if we don't apply single please - if not self.apply_single: - self._slock.acquire() - self._scheduled_items -= len(items) - self._slock.release() - # END unschedule all # PROBLEM: We have failed to create at least one item, hence its not # garantueed that enough items will be produced for a possibly blocking -- cgit v1.2.3 From ea81f14dafbfb24d70373c74b5f8dabf3f2225d9 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 16:38:21 +0200 Subject: Channel: Callbacks reviewed - they are now part of Subclasses of the default channel implementation, one of which is used as base by the Pool Read channel, releasing it of the duty to call these itself. The write channel with callback subclass allows the transformation of the item to be written --- lib/git/async/channel.py | 51 ++++++++++++++++++++++++++++++++++++++++++ lib/git/async/pool.py | 51 +++++------------------------------------- test/git/async/test_channel.py | 27 ++++++++++++++++++++++ 3 files changed, 84 insertions(+), 45 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index bb118f30..abb31035 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -68,6 +68,32 @@ class WChannel(Channel): #} END interface +class CallbackWChannel(WChannel): + """The write end of a channel which allows you to setup a callback to be + called after an item was written to the channel""" + __slots__ = ('_pre_cb') + + def __init__(self): + WChannel.__init__(self) + self._pre_cb = None + + def set_pre_cb(self, fun = lambda item: item): + """Install a callback to be called before the given item is written. + It returns a possibly altered item which will be written to the channel + instead, making it useful for pre-write item conversions. + Providing None uninstalls the current method. + :return: the previously installed function or None + :note: Must be thread-safe if the channel is used in multiple threads""" + prev = self._pre_cb + self._pre_cb = fun + return prev + + def write(self, item, block=True, timeout=None): + if self._pre_cb: + item = self._pre_cb(item) + WChannel.write(self, item, block, timeout) + + class SerialWChannel(WChannel): """A slightly faster version of a WChannel, which sacrificed thead-safety for performance""" @@ -171,7 +197,32 @@ class RChannel(Channel): return out #} END interface + +class CallbackRChannel(RChannel): + """A channel which sends a callback before items are read from the channel""" + __slots__ = "_pre_cb" + + def __init__(self, wc): + RChannel.__init__(self, wc) + self._pre_cb = None + + def set_pre_cb(self, fun = lambda count: None): + """Install a callback to call with the item count to be read before any + item is actually read from the channel. + Exceptions will be propagated. + If a function is not provided, the call is effectively uninstalled. + :return: the previously installed callback or None + :note: The callback must be threadsafe if the channel is used by multiple threads.""" + prev = self._pre_cb + self._pre_cb = fun + return prev + def read(self, count=0, block=True, timeout=None): + if self._pre_cb: + self._pre_cb(count) + return RChannel.read(self, count, block, timeout) + + #} END classes #{ Constructors diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 284c41c7..7d4e96d1 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -21,26 +21,24 @@ from channel import ( mkchannel, WChannel, SerialWChannel, - RChannel + CallbackRChannel ) import sys from time import sleep -class RPoolChannel(RChannel): +class RPoolChannel(CallbackRChannel): """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call before and after an item is to be read. It acts like a handle to the underlying task in the pool.""" - __slots__ = ('_task', '_pool', '_pre_cb', '_post_cb') + __slots__ = ('_task', '_pool') def __init__(self, wchannel, task, pool): - RChannel.__init__(self, wchannel) + CallbackRChannel.__init__(self, wchannel) self._task = task self._pool = pool - self._pre_cb = None - self._post_cb = None def __del__(self): """Assures that our task will be deleted if we were the last reader""" @@ -56,30 +54,10 @@ class RPoolChannel(RChannel): self._pool.remove_task(self._task) # END handle refcount based removal of task - def set_pre_cb(self, fun = lambda count: None): - """Install a callback to call with the item count to be read before any - item is actually read from the channel. The call must be threadsafe if - the channel is passed to more than one tasks. - If it fails, the read will fail with an IOError - If a function is not provided, the call is effectively uninstalled.""" - self._pre_cb = fun - - def set_post_cb(self, fun = lambda item: item): - """Install a callback to call after the items were read. The function - returns a possibly changed item list.The call must be threadsafe if - the channel is passed to more than one tasks. - If it raises, the exception will be propagated. - If a function is not provided, the call is effectively uninstalled.""" - self._post_cb = fun - def read(self, count=0, block=True, timeout=None): """Read an item that was processed by one of our threads :note: Triggers task dependency handling needed to provide the necessary input""" - if self._pre_cb: - self._pre_cb() - # END pre callback - # NOTE: we always queue the operation that would give us count items # as tracking the scheduled items or testing the channels size # is in herently unsafe depending on the design of the task network @@ -90,7 +68,7 @@ class RPoolChannel(RChannel): # NOTE: TODO: that case is only possible if one Task could be connected # to multiple input channels in a manner known by the system. Currently - # this is not possible, but should be implemented at some point + # this is not possible, but should be implemented at some point. # if the user tries to use us to read from a done task, we will never # compute as all produced items are already in the channel @@ -105,25 +83,12 @@ class RPoolChannel(RChannel): ####### read data ######## ########################## # read actual items, tasks were setup to put their output into our channel ( as well ) - items = RChannel.read(self, count, block, timeout) + items = CallbackRChannel.read(self, count, block, timeout) ########################## - if self._post_cb: - items = self._post_cb(items) - - - ####### Finalize ######## - self._pool._post_channel_read(self._task) return items - #{ Internal - def _read(self, count=0, block=False, timeout=None): - """Calls the underlying channel's read directly, without triggering - the pool""" - return RChannel.read(self, count, block, timeout) - - #} END internal class Pool(object): @@ -296,10 +261,6 @@ class Pool(object): # END for each task to process - def _post_channel_read(self, task): - """Called after we processed a read to cleanup""" - pass - def _remove_task_if_orphaned(self, task): """Check the task, and delete it if it is orphaned""" # 1 as its stored on the task, 1 for the getrefcount call diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index 444a076a..215081cd 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -44,3 +44,30 @@ class TestChannels(TestBase): assert len(rc.read(5)) == 0 assert len(rc.read(1)) == 0 + + # test callback channels + wc, rc = mkchannel(wctype = CallbackWChannel, rctype = CallbackRChannel) + + cb = [0, 0] # set slots to one if called + def pre_write(item): + cb[0] = 1 + return item + 1 + def pre_read(count): + cb[1] = 1 + + # set, verify it returns previous one + assert wc.set_pre_cb(pre_write) is None + assert rc.set_pre_cb(pre_read) is None + assert wc.set_pre_cb(pre_write) is pre_write + assert rc.set_pre_cb(pre_read) is pre_read + + # writer transforms input + val = 5 + wc.write(val) + assert cb[0] == 1 and cb[1] == 0 + + rval = rc.read(1)[0] # read one item, must not block + assert cb[0] == 1 and cb[1] == 1 + assert rval == val + 1 + + -- cgit v1.2.3 From 365fb14ced88a5571d3287ff1698582ceacd80d6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 16:59:17 +0200 Subject: task: redesigned write channel access to allow the task creator to set own write channels, possibly some with callbacks installed etc.. Pool.add_task will respect the users choice now, but provide defaults which are optimized for performance --- lib/git/async/pool.py | 13 ++++++++----- lib/git/async/task.py | 37 ++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 7d4e96d1..f7c1cfe0 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -388,18 +388,21 @@ class Pool(object): self._taskorder_cache.clear() self._tasks.add_node(task) - # fix locks - in serial mode, the task does not need real locks - # Additionally, use a non-threadsafe queue + # Use a non-threadsafe queue # This brings about 15% more performance, but sacrifices thread-safety # when reading from multiple threads. if self.size() == 0: wctype = SerialWChannel # END improve locks - # setup the tasks channel - wc = wctype() + # setup the tasks channel - respect the task creators choice though + # if it is set. + wc = task.wchannel() + if wc is None: + wc = wctype() + # END create write channel ifunset rc = RPoolChannel(wc, task, self) - task.set_wc(wc) + task.set_wchannel(wc) finally: self._taskgraph_lock.release() # END sync task addition diff --git a/lib/git/async/task.py b/lib/git/async/task.py index be02cfe8..f98336b2 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -1,4 +1,5 @@ from graph import Node +from channel import WChannel from util import ReadOnly import threading @@ -11,8 +12,8 @@ class OutputChannelTask(Node): """Abstracts a named task as part of a set of interdependent tasks, which contains additional information on how the task should be queued and processed. - Results of the item processing are sent to an output channel, which is to be - set by the creator + Results of the item processing are sent to a write channel, which is to be + set by the creator using the ``set_wchannel`` method. * **min_count** assures that not less than min_count items will be processed per call. * **max_chunksize** assures that multi-threading is happening in smaller chunks. If @@ -29,10 +30,11 @@ class OutputChannelTask(Node): 'apply_single' # apply single items even if multiple where read ) - def __init__(self, id, fun, apply_single=True, min_count=None, max_chunksize=0): + def __init__(self, id, fun, apply_single=True, min_count=None, max_chunksize=0, + wchannel=None): Node.__init__(self, id) self._read = None # to be set by subclasss - self._out_wc = None # to be set later + self._out_wc = wchannel # to be set later self._exc = None self._done = False self.fun = fun @@ -48,13 +50,21 @@ class OutputChannelTask(Node): """Set ourselves to being done, has we have completed the processing""" self._done = True - def set_wc(self, wc): - """Set the write channel to the given one - :note: resets it done state in order to allow proper queue handling""" - self._done = False # TODO : fix this, this is a side-effect - self._scheduled_items = 0 + def set_wchannel(self, wc): + """Set the write channel to the given one""" self._out_wc = wc + def wchannel(self): + """:return: a proxy to our write channel or None if non is set + :note: you must not hold a reference to our write channel when the + task is being processed. This would cause the write channel never + to be closed as the task will think there is still another instance + being processed which can close the channel once it is done. + In the worst case, this will block your reads.""" + if self._out_wc is None: + return None + return self._out_wc + def close(self): """A closed task will close its channel to assure the readers will wake up :note: its safe to call this method multiple times""" @@ -128,8 +138,10 @@ class OutputChannelTask(Node): # END handle done state # If we appear to be the only one left with our output channel, and are - # closed ( this could have been set in another thread as well ), make + # done ( this could have been set in another thread as well ), make # sure to close the output channel. + # Waiting with this to be the last one helps to keep the + # write-channel writable longer # The count is: 1 = wc itself, 2 = first reader channel, + x for every # thread having its copy on the stack # + 1 for the instance we provide to refcount @@ -196,10 +208,5 @@ class InputChannelTask(OutputChannelTask): OutputChannelTask.__init__(self, *args, **kwargs) self._read = in_rc.read - def process(self, count=1): - # for now, just blindly read our input, could trigger a pool, even - # ours, but why not ? It should be able to handle this - # TODO: remove this method - super(InputChannelTask, self).process(count) #{ Configuration -- cgit v1.2.3 From 257a8a9441fca9a9bc384f673ba86ef5c3f1715d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 9 Jun 2010 21:19:54 +0200 Subject: test: prepared task dependency test, which already helped to find bug in the reference counting mechanism, causing references to the pool to be kepts via cycles --- lib/git/async/pool.py | 55 ++++++++++----- lib/git/async/task.py | 7 +- test/git/async/test_pool.py | 159 +++++++++++++++++++++++++++++++++++--------- 3 files changed, 172 insertions(+), 49 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index f7c1cfe0..2ec18f1a 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -25,6 +25,7 @@ from channel import ( ) import sys +import weakref from time import sleep @@ -33,25 +34,37 @@ class RPoolChannel(CallbackRChannel): before and after an item is to be read. It acts like a handle to the underlying task in the pool.""" - __slots__ = ('_task', '_pool') + __slots__ = ('_task_ref', '_pool_ref') def __init__(self, wchannel, task, pool): CallbackRChannel.__init__(self, wchannel) - self._task = task - self._pool = pool + self._task_ref = weakref.ref(task) + self._pool_ref = weakref.ref(pool) def __del__(self): """Assures that our task will be deleted if we were the last reader""" - del(self._wc) # decrement ref-count early - # now, if this is the last reader to the wc we just handled, there + task = self._task_ref() + if task is None: + return + + pool = self._pool_ref() + if pool is None: + return + + # if this is the last reader to the wc we just handled, there # is no way anyone will ever read from the task again. If so, # delete the task in question, it will take care of itself and orphans # it might leave # 1 is ourselves, + 1 for the call + 1, and 3 magical ones which # I can't explain, but appears to be normal in the destructor # On the caller side, getrefcount returns 2, as expected + # When just calling remove_task, + # it has no way of knowing that the write channel is about to diminsh. + # which is why we pass the info as a private kwarg - not nice, but + # okay for now + # TODO: Fix this - private/public method if sys.getrefcount(self) < 6: - self._pool.remove_task(self._task) + pool.remove_task(task, _from_destructor_=True) # END handle refcount based removal of task def read(self, count=0, block=True, timeout=None): @@ -72,11 +85,16 @@ class RPoolChannel(CallbackRChannel): # if the user tries to use us to read from a done task, we will never # compute as all produced items are already in the channel - skip_compute = self._task.is_done() or self._task.error() + task = self._task_ref() + if task is None: + return list() + # END abort if task was deleted + + skip_compute = task.is_done() or task.error() ########## prepare ############################## if not skip_compute: - self._pool._prepare_channel_read(self._task, count) + self._pool_ref()._prepare_channel_read(task, count) # END prepare pool scheduling @@ -261,11 +279,16 @@ class Pool(object): # END for each task to process - def _remove_task_if_orphaned(self, task): + def _remove_task_if_orphaned(self, task, from_destructor): """Check the task, and delete it if it is orphaned""" # 1 as its stored on the task, 1 for the getrefcount call - if sys.getrefcount(task._out_wc) < 3: - self.remove_task(task) + # If we are getting here from the destructor of an RPool channel, + # its totally valid to virtually decrement the refcount by 1 as + # we can expect it to drop once the destructor completes, which is when + # we finish all recursive calls + max_ref_count = 3 + from_destructor + if sys.getrefcount(task.wchannel()) < max_ref_count: + self.remove_task(task, from_destructor) #} END internal #{ Interface @@ -335,7 +358,7 @@ class Pool(object): finally: self._taskgraph_lock.release() - def remove_task(self, task): + def remove_task(self, task, _from_destructor_=False): """Delete the task Additionally we will remove orphaned tasks, which can be identified if their output channel is only held by themselves, so no one will ever consume @@ -370,7 +393,7 @@ class Pool(object): # END locked deletion for t in in_tasks: - self._remove_task_if_orphaned(t) + self._remove_task_if_orphaned(t, _from_destructor_) # END handle orphans recursively return self @@ -409,11 +432,11 @@ class Pool(object): # If the input channel is one of our read channels, we add the relation if isinstance(task, InputChannelTask): - ic = task.in_rc - if isinstance(ic, RPoolChannel) and ic._pool is self: + ic = task.rchannel() + if isinstance(ic, RPoolChannel) and ic._pool_ref() is self: self._taskgraph_lock.acquire() try: - self._tasks.add_edge(ic._task, task) + self._tasks.add_edge(ic._task_ref(), task) finally: self._taskgraph_lock.release() # END handle edge-adding diff --git a/lib/git/async/task.py b/lib/git/async/task.py index f98336b2..03b40492 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -208,5 +208,8 @@ class InputChannelTask(OutputChannelTask): OutputChannelTask.__init__(self, *args, **kwargs) self._read = in_rc.read - #{ Configuration - + def rchannel(self): + """:return: input channel from which we read""" + # the instance is bound in its instance method - lets use this to keep + # the refcount at one ( per consumer ) + return self._read.im_self diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 202fdb66..2a5e4647 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -8,15 +8,14 @@ import threading import time import sys -class TestThreadTaskNode(InputIteratorThreadTask): +class _TestTaskBase(object): def __init__(self, *args, **kwargs): - super(TestThreadTaskNode, self).__init__(*args, **kwargs) + super(_TestTaskBase, self).__init__(*args, **kwargs) self.should_fail = False self.lock = threading.Lock() # yes, can't safely do x = x + 1 :) self.plock = threading.Lock() self.item_count = 0 self.process_count = 0 - self._scheduled_items = 0 def do_fun(self, item): self.lock.acquire() @@ -32,44 +31,118 @@ class TestThreadTaskNode(InputIteratorThreadTask): self.plock.acquire() self.process_count += 1 self.plock.release() - super(TestThreadTaskNode, self).process(count) + super(_TestTaskBase, self).process(count) def _assert(self, pc, fc, check_scheduled=False): """Assert for num process counts (pc) and num function counts (fc) :return: self""" - # TODO: fixme - return self - self.plock.acquire() - if self.process_count != pc: - print self.process_count, pc - assert self.process_count == pc - self.plock.release() self.lock.acquire() if self.item_count != fc: print self.item_count, fc assert self.item_count == fc self.lock.release() - # if we read all, we can't really use scheduled items - if check_scheduled: - assert self._scheduled_items == 0 - assert not self.error() return self + +class TestThreadTaskNode(_TestTaskBase, InputIteratorThreadTask): + pass class TestThreadFailureNode(TestThreadTaskNode): """Fails after X items""" + def __init__(self, *args, **kwargs): + self.fail_after = kwargs.pop('fail_after') + super(TestThreadFailureNode, self).__init__(*args, **kwargs) + def do_fun(self, item): + item = TestThreadTaskNode.do_fun(self, item) + if self.item_count > self.fail_after: + raise AssertionError("Simulated failure after processing %i items" % self.fail_after) + return item + + +class TestThreadInputChannelTaskNode(_TestTaskBase, InputChannelTask): + """Apply a transformation on items read from an input channel""" + + def do_fun(self, item): + """return tuple(i, i*2)""" + item = super(TestThreadInputChannelTaskNode, self).do_fun(item) + if isinstance(item, tuple): + i = item[0] + return item + (i * self.id, ) + else: + return (item, item * self.id) + # END handle tuple + + +class TestThreadInputChannelVerifyTaskNode(_TestTaskBase, InputChannelTask): + """An input channel task, which verifies the result of its input channels, + should be last in the chain. + Id must be int""" + + def do_fun(self, item): + """return tuple(i, i*2)""" + item = super(TestThreadInputChannelTaskNode, self).do_fun(item) + + # make sure the computation order matches + assert isinstance(item, tuple) + + base = item[0] + for num in item[1:]: + assert num == base * 2 + base = num + # END verify order + + return item + + class TestThreadPool(TestBase): max_threads = cpu_count() - def _add_triple_task(self, p): - """Add a triplet of feeder, transformer and finalizer to the pool, like - t1 -> t2 -> t3, return all 3 return channels in order""" - # t1 = TestThreadTaskNode(make_task(), 'iterator', None) - # TODO: + def _add_task_chain(self, p, ni, count=1): + """Create a task chain of feeder, count transformers and order verifcator + to the pool p, like t1 -> t2 -> t3 + :return: tuple(list(task1, taskN, ...), list(rc1, rcN, ...))""" + nt = p.num_tasks() + + feeder = self._make_iterator_task(ni) + frc = p.add_task(feeder) + + assert p.num_tasks() == nt + 1 + + rcs = [frc] + tasks = [feeder] + + inrc = frc + for tc in xrange(count): + t = TestThreadInputChannelTaskNode(inrc, tc, None) + t.fun = t.do_fun + inrc = p.add_task(t) + + tasks.append(t) + rcs.append(inrc) + assert p.num_tasks() == nt + 2 + tc + # END create count transformers + + verifier = TestThreadInputChannelVerifyTaskNode(inrc, 'verifier', None) + verifier.fun = verifier.do_fun + vrc = p.add_task(verifier) + + assert p.num_tasks() == nt + tc + 3 + + tasks.append(verifier) + rcs.append(vrc) + return tasks, rcs + + def _make_iterator_task(self, ni, taskcls=TestThreadTaskNode, **kwargs): + """:return: task which yields ni items + :param taskcls: the actual iterator type to use + :param **kwargs: additional kwargs to be passed to the task""" + t = taskcls(iter(range(ni)), 'iterator', None, **kwargs) + t.fun = t.do_fun + return t def _assert_single_task(self, p, async=False): """Performs testing in a synchronized environment""" @@ -82,11 +155,7 @@ class TestThreadPool(TestBase): assert ni % 2 == 0, "ni needs to be dividable by 2" assert ni % 4 == 0, "ni needs to be dividable by 4" - def make_task(): - t = TestThreadTaskNode(iter(range(ni)), 'iterator', None) - t.fun = t.do_fun - return t - # END utility + make_task = lambda *args, **kwargs: self._make_iterator_task(ni, *args, **kwargs) task = make_task() @@ -252,15 +321,44 @@ class TestThreadPool(TestBase): # test failure after ni / 2 items # This makes sure it correctly closes the channel on failure to prevent blocking + nri = ni/2 + task = make_task(TestThreadFailureNode, fail_after=ni/2) + rc = p.add_task(task) + assert len(rc.read()) == nri + assert task.is_done() + assert isinstance(task.error(), AssertionError) - def _assert_async_dependent_tasks(self, p): + def _assert_async_dependent_tasks(self, pool): # includes failure in center task, 'recursive' orphan cleanup # This will also verify that the channel-close mechanism works # t1 -> t2 -> t3 # t1 -> x -> t3 - pass + null_tasks = pool.num_tasks() + ni = 100 + count = 1 + make_task = lambda *args, **kwargs: self._add_task_chain(pool, ni, count, *args, **kwargs) + + ts, rcs = make_task() + assert len(ts) == count + 2 + assert len(rcs) == count + 2 + assert pool.num_tasks() == null_tasks + len(ts) + print pool._tasks.nodes + + + # in the end, we expect all tasks to be gone, automatically + + + + # order of deletion matters - just keep the end, then delete + final_rc = rcs[-1] + del(ts) + del(rcs) + del(final_rc) + assert pool.num_tasks() == null_tasks + + @terminate_threads def test_base(self): @@ -301,8 +399,8 @@ class TestThreadPool(TestBase): assert p.num_tasks() == 0 - # DEPENDENT TASKS SERIAL - ######################## + # DEPENDENT TASKS SYNC MODE + ########################### self._assert_async_dependent_tasks(p) @@ -311,12 +409,11 @@ class TestThreadPool(TestBase): # step one gear up - just one thread for now. p.set_size(1) assert p.size() == 1 - print len(threading.enumerate()), num_threads assert len(threading.enumerate()) == num_threads + 1 # deleting the pool stops its threads - just to be sure ;) # Its not synchronized, hence we wait a moment del(p) - time.sleep(0.25) + time.sleep(0.05) assert len(threading.enumerate()) == num_threads p = ThreadPool(1) -- cgit v1.2.3 From 3323464f85b986cba23176271da92a478b33ab9c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 10 Jun 2010 00:24:49 +0200 Subject: messy first version of a properly working depth-first graph method, which allows the pool to work as expected. Many more tests need to be added, and there still is a problem with shutdown as sometimes it won't kill all threads, mainly because the process came up with worker threads started, which cannot be --- lib/git/async/graph.py | 23 +++++++++++---------- lib/git/async/pool.py | 6 +++--- lib/git/async/task.py | 5 ++++- lib/git/async/util.py | 8 +++++++- test/git/async/test_graph.py | 29 ++++---------------------- test/git/async/test_pool.py | 48 ++++++++++++++++++++++++++++++-------------- 6 files changed, 63 insertions(+), 56 deletions(-) diff --git a/lib/git/async/graph.py b/lib/git/async/graph.py index 6386cbaa..e3999cdc 100644 --- a/lib/git/async/graph.py +++ b/lib/git/async/graph.py @@ -87,25 +87,26 @@ class Graph(object): return self - def visit_input_inclusive_depth_first(self, node, visitor=lambda n: True ): - """Visit all input nodes of the given node, depth first, calling visitor - for each node on our way. If the function returns False, the traversal - will not go any deeper, but continue at the next branch - It will return the actual input node in the end !""" - nodes = node.in_nodes[:] + def input_inclusive_dfirst_reversed(self, node): + """Return all input nodes of the given node, depth first, + It will return the actual input node last, as it is required + like that by the pool""" + stack = [node] seen = set() # depth first - while nodes: - n = nodes.pop() + out = list() + while stack: + n = stack.pop() if n in seen: continue seen.add(n) + out.append(n) # only proceed in that direction if visitor is fine with it - if visitor(n): - nodes.extend(n.in_nodes) + stack.extend(n.in_nodes) # END call visitor # END while walking - visitor(node) + out.reverse() + return out diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 2ec18f1a..5ebc3655 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -182,14 +182,13 @@ class Pool(object): dfirst_tasks = self._taskorder_cache[id(task)] except KeyError: # have to retrieve the list from the graph - dfirst_tasks = list() - self._tasks.visit_input_inclusive_depth_first(task, lambda n: dfirst_tasks.append(n)) + dfirst_tasks = self._tasks.input_inclusive_dfirst_reversed(task) self._taskorder_cache[id(task)] = dfirst_tasks # END handle cached order retrieval finally: self._taskgraph_lock.release() # END handle locking - + print dfirst_tasks # check the min count on all involved tasks, and be sure that we don't # have any task which produces less than the maximum min-count of all tasks # The actual_count is used when chunking tasks up for the queue, whereas @@ -309,6 +308,7 @@ class Pool(object): threadsafe to optimize item throughput. :note: currently NOT threadsafe !""" + print "set_size", size assert size > -1, "Size cannot be negative" # either start new threads, or kill existing ones. diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 03b40492..57dd285d 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -80,7 +80,9 @@ class OutputChannelTask(Node): def process(self, count=0): """Process count items and send the result individually to the output channel""" + print "%r: reading %i" % (self.id, count) items = self._read(count) + print "%r: done reading" % self.id try: # increase the ref-count - we use this to determine whether anyone else # is currently handling our output channel. As this method runs asynchronously, @@ -102,7 +104,7 @@ class OutputChannelTask(Node): wc.write(rval) # END handle single apply except Exception, e: - print >> sys.stderr, "task error:", str(e) # TODO: REMOVE DEBUG, or make it use logging + print >> sys.stderr, "task %s error:" % self.id, type(e), str(e) # TODO: REMOVE DEBUG, or make it use logging # be sure our task is not scheduled again self.set_done() @@ -146,6 +148,7 @@ class OutputChannelTask(Node): # thread having its copy on the stack # + 1 for the instance we provide to refcount if self.is_done() and getrefcount(self._out_wc) < 4: + print "Closing channel of %r" % self.id self.close() # END handle channel closure #{ Configuration diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 00d0dbab..b7750b0b 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -206,7 +206,6 @@ class AsyncQueue(deque): return old finally: self.mutex.release() - # if we won't receive anymore items, inform the getters if not state: self.not_empty.notify_all() @@ -222,6 +221,13 @@ class AsyncQueue(deque): def put(self, item, block=True, timeout=None): self.mutex.acquire() + # NOTE: we explicitly do NOT check for our writable state + # Its just used as a notification signal, and we need to be able + # to continue writing to prevent threads ( easily ) from failing + # to write their computed results, which we want in fact + # NO: we want them to fail and stop processing, as the one who caused + # the channel to close had a reason and wants the threads to + # stop on the task as soon as possible if not self._writable: self.mutex.release() raise ReadOnly diff --git a/test/git/async/test_graph.py b/test/git/async/test_graph.py index 1a153e2d..d0e36159 100644 --- a/test/git/async/test_graph.py +++ b/test/git/async/test_graph.py @@ -61,31 +61,10 @@ class TestGraph(TestBase): assert len(n1.out_nodes) == 0 # check the history from the last node - last = g.nodes[-1] - class Visitor(object): - def __init__(self, origin): - self.origin_seen = False - self.origin = origin - self.num_seen = 0 - - def __call__(self, n): - if n is self.origin: - self.origin_seen = True - else: - assert not self.origin_seen, "should see origin last" - # END check origin - self.num_seen += 1 - return True - - def _assert(self, num_expected): - assert self.origin_seen - assert self.num_seen == num_expected - # END visitor helper - end = g.nodes[-1] - visitor = Visitor(end) - g.visit_input_inclusive_depth_first(end, visitor) - + dfirst_nodes = g.input_inclusive_dfirst_reversed(end) num_nodes_seen = nn - 2 # deleted second, which leaves first one disconnected - visitor._assert(num_nodes_seen) + assert len(dfirst_nodes) == num_nodes_seen + assert dfirst_nodes[-1] == end and dfirst_nodes[-2].id == end.id-1 + diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 2a5e4647..788ca3bf 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -67,6 +67,8 @@ class TestThreadInputChannelTaskNode(_TestTaskBase, InputChannelTask): def do_fun(self, item): """return tuple(i, i*2)""" item = super(TestThreadInputChannelTaskNode, self).do_fun(item) + #print "transformer.doit", self.id, item + if isinstance(item, tuple): i = item[0] return item + (i * self.id, ) @@ -82,15 +84,16 @@ class TestThreadInputChannelVerifyTaskNode(_TestTaskBase, InputChannelTask): def do_fun(self, item): """return tuple(i, i*2)""" - item = super(TestThreadInputChannelTaskNode, self).do_fun(item) + item = super(TestThreadInputChannelVerifyTaskNode, self).do_fun(item) + + # print "verifier.doit", self.id, item # make sure the computation order matches - assert isinstance(item, tuple) + assert isinstance(item, tuple), "input was no tuple: %s" % item base = item[0] - for num in item[1:]: - assert num == base * 2 - base = num + for id, num in enumerate(item[1:]): + assert num == base * (id), "%i != %i, orig = %s" % (num, base * id+1, str(item)) # END verify order return item @@ -146,6 +149,7 @@ class TestThreadPool(TestBase): def _assert_single_task(self, p, async=False): """Performs testing in a synchronized environment""" + return # DEBUG TODO: Fixme deactivated it print >> sys.stderr, "Threadpool: Starting single task (async = %i) with %i threads" % (async, p.size()) null_tasks = p.num_tasks() # in case we had some before @@ -335,33 +339,47 @@ class TestThreadPool(TestBase): # This will also verify that the channel-close mechanism works # t1 -> t2 -> t3 # t1 -> x -> t3 + print >> sys.stderr, "Threadpool: starting async dependency test in %i threads" % pool.size() null_tasks = pool.num_tasks() - ni = 100 - count = 1 + ni = 5000 + count = 3 + aic = count + 2 make_task = lambda *args, **kwargs: self._add_task_chain(pool, ni, count, *args, **kwargs) ts, rcs = make_task() - assert len(ts) == count + 2 - assert len(rcs) == count + 2 + assert len(ts) == aic + assert len(rcs) == aic assert pool.num_tasks() == null_tasks + len(ts) print pool._tasks.nodes - # in the end, we expect all tasks to be gone, automatically + # read all at once + print "read(0)" + st = time.time() + items = rcs[-1].read() + print "finished read(0)" + elapsed = time.time() - st + assert len(items) == ni + print >> sys.stderr, "Dependent Tasks: evaluated %i items of %i dependent in %f s ( %i items / s )" % (ni, aic, elapsed, ni / elapsed) - # order of deletion matters - just keep the end, then delete - final_rc = rcs[-1] + # in the end, we expect all tasks to be gone, automatically + # order of deletion doesnt matter + print "del ts" del(ts) + print "del rcs" del(rcs) - del(final_rc) assert pool.num_tasks() == null_tasks - @terminate_threads + # for some reason, sometimes it has multiple workerthreads already when he + # enters the method ... dunno yet, pools should clean up themselvess + # @terminate_threads def test_base(self): + assert len(threading.enumerate()) == 1 + p = ThreadPool() # default pools have no workers @@ -438,4 +456,4 @@ class TestThreadPool(TestBase): ########################### self._assert_async_dependent_tasks(p) - + print >> sys.stderr, "Done with everything" -- cgit v1.2.3 From cfb278d74ad01f3f1edf5e0ad113974a9555038d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 10 Jun 2010 10:14:32 +0200 Subject: InputChannelTask now has interface for properly handling the reading from the same and different pools --- lib/git/async/pool.py | 43 +++++++++++++++++++++++++++++++++++++++---- lib/git/async/task.py | 26 +++++++++++++++++++++++++- test/git/async/test_pool.py | 7 ++++--- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 5ebc3655..1b3c2748 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -10,7 +10,6 @@ from util import ( DummyLock ) -from task import InputChannelTask from Queue import ( Queue, Empty @@ -66,6 +65,24 @@ class RPoolChannel(CallbackRChannel): if sys.getrefcount(self) < 6: pool.remove_task(task, _from_destructor_=True) # END handle refcount based removal of task + + #{ Internal + def _read(self, count=0, block=True, timeout=None): + """Direct read, bypassing the pool handling""" + return CallbackRChannel.read(self, count, block, timeout) + #} END internal + + #{ Interface + + def pool_ref(self): + """:return: reference to the pool we belong to""" + return self._pool_ref + + def task_ref(self): + """:return: reference to the task producing our items""" + return self._task_ref + + #} END interface def read(self, count=0, block=True, timeout=None): """Read an item that was processed by one of our threads @@ -188,7 +205,7 @@ class Pool(object): finally: self._taskgraph_lock.release() # END handle locking - print dfirst_tasks + # check the min count on all involved tasks, and be sure that we don't # have any task which produces less than the maximum min-count of all tasks # The actual_count is used when chunking tasks up for the queue, whereas @@ -406,6 +423,18 @@ class Pool(object): # create a write channel for it wctype = WChannel + # adjust the task with our pool ref, if it has the slot and is empty + # For now, we don't allow tasks to be used in multiple pools, except + # for by their channels + if hasattr(task, 'pool'): + their_pool = task.pool() + if their_pool is None: + task.set_pool(self) + elif their_pool is not self: + raise ValueError("Task %r is already registered to another pool" % task.id) + # END handle pool exclusivity + # END handle pool aware tasks + self._taskgraph_lock.acquire() try: self._taskorder_cache.clear() @@ -431,12 +460,18 @@ class Pool(object): # END sync task addition # If the input channel is one of our read channels, we add the relation - if isinstance(task, InputChannelTask): + if hasattr(task, 'rchannel'): ic = task.rchannel() - if isinstance(ic, RPoolChannel) and ic._pool_ref() is self: + if hasattr(ic, 'pool_ref') and ic.pool_ref()() is self: self._taskgraph_lock.acquire() try: self._tasks.add_edge(ic._task_ref(), task) + + # additionally, bypass ourselves when reading from the + # task, if possible + if hasattr(ic, '_read'): + task.set_read(ic._read) + # END handle read bypass finally: self._taskgraph_lock.release() # END handle edge-adding diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 57dd285d..d5b45609 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -1,8 +1,12 @@ from graph import Node -from channel import WChannel from util import ReadOnly +from channel import ( + WChannel, + CallbackRChannel + ) import threading +import weakref import sys import new @@ -147,6 +151,7 @@ class OutputChannelTask(Node): # The count is: 1 = wc itself, 2 = first reader channel, + x for every # thread having its copy on the stack # + 1 for the instance we provide to refcount + # Soft close, so others can continue writing their results if self.is_done() and getrefcount(self._out_wc) < 4: print "Closing channel of %r" % self.id self.close() @@ -206,13 +211,32 @@ class InputChannelTask(OutputChannelTask): """Uses an input channel as source for reading items For instantiation, it takes all arguments of its base, the first one needs to be the input channel to read from though.""" + __slots__ = "_pool_ref" def __init__(self, in_rc, *args, **kwargs): OutputChannelTask.__init__(self, *args, **kwargs) self._read = in_rc.read + self._pool_ref = None + + #{ Internal Interface def rchannel(self): """:return: input channel from which we read""" # the instance is bound in its instance method - lets use this to keep # the refcount at one ( per consumer ) return self._read.im_self + + def set_read(self, read): + """Adjust the read method to the given one""" + self._read = read + + def set_pool(self, pool): + self._pool_ref = weakref.ref(pool) + + def pool(self): + """:return: pool we are attached to, or None""" + if self._pool_ref is None: + return None + return self._pool_ref() + + #} END intenral interface diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 788ca3bf..3fb55e31 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -359,6 +359,7 @@ class TestThreadPool(TestBase): items = rcs[-1].read() print "finished read(0)" elapsed = time.time() - st + print len(items), ni assert len(items) == ni print >> sys.stderr, "Dependent Tasks: evaluated %i items of %i dependent in %f s ( %i items / s )" % (ni, aic, elapsed, ni / elapsed) @@ -366,9 +367,7 @@ class TestThreadPool(TestBase): # in the end, we expect all tasks to be gone, automatically # order of deletion doesnt matter - print "del ts" del(ts) - print "del rcs" del(rcs) assert pool.num_tasks() == null_tasks @@ -376,7 +375,7 @@ class TestThreadPool(TestBase): # for some reason, sometimes it has multiple workerthreads already when he # enters the method ... dunno yet, pools should clean up themselvess - # @terminate_threads + #@terminate_threads def test_base(self): assert len(threading.enumerate()) == 1 @@ -457,3 +456,5 @@ class TestThreadPool(TestBase): self._assert_async_dependent_tasks(p) print >> sys.stderr, "Done with everything" + + # TODO: test multi-pool connections -- cgit v1.2.3 From 01eac1a959c1fa5894a86bf11e6b92f96762bdd8 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 10 Jun 2010 12:06:16 +0200 Subject: Added more dependency task tests, especially the single-reads are not yet fully deterministic as tasks still run into the problem that they try to write into a closed channel, it was closed by one of their task-mates who didn't know someone else was still computing --- lib/git/async/task.py | 7 ++- test/git/async/test_pool.py | 129 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 113 insertions(+), 23 deletions(-) diff --git a/lib/git/async/task.py b/lib/git/async/task.py index d5b45609..0b1d0666 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -84,9 +84,9 @@ class OutputChannelTask(Node): def process(self, count=0): """Process count items and send the result individually to the output channel""" - print "%r: reading %i" % (self.id, count) + # print "%r: reading %i" % (self.id, count) items = self._read(count) - print "%r: done reading" % self.id + # print "%r: done reading %i items" % (self.id, len(items)) try: # increase the ref-count - we use this to determine whether anyone else # is currently handling our output channel. As this method runs asynchronously, @@ -109,7 +109,6 @@ class OutputChannelTask(Node): # END handle single apply except Exception, e: print >> sys.stderr, "task %s error:" % self.id, type(e), str(e) # TODO: REMOVE DEBUG, or make it use logging - # be sure our task is not scheduled again self.set_done() @@ -153,7 +152,7 @@ class OutputChannelTask(Node): # + 1 for the instance we provide to refcount # Soft close, so others can continue writing their results if self.is_done() and getrefcount(self._out_wc) < 4: - print "Closing channel of %r" % self.id + # print "Closing channel of %r" % self.id self.close() # END handle channel closure #{ Configuration diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 3fb55e31..679bab31 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -9,6 +9,7 @@ import time import sys class _TestTaskBase(object): + """Note: causes great slowdown due to the required locking of task variables""" def __init__(self, *args, **kwargs): super(_TestTaskBase, self).__init__(*args, **kwargs) self.should_fail = False @@ -43,7 +44,8 @@ class _TestTaskBase(object): self.lock.release() return self - + + class TestThreadTaskNode(_TestTaskBase, InputIteratorThreadTask): pass @@ -56,18 +58,36 @@ class TestThreadFailureNode(TestThreadTaskNode): def do_fun(self, item): item = TestThreadTaskNode.do_fun(self, item) - if self.item_count > self.fail_after: - raise AssertionError("Simulated failure after processing %i items" % self.fail_after) + + self.lock.acquire() + try: + if self.item_count > self.fail_after: + raise AssertionError("Simulated failure after processing %i items" % self.fail_after) + finally: + self.lock.release() + # END handle fail after return item class TestThreadInputChannelTaskNode(_TestTaskBase, InputChannelTask): """Apply a transformation on items read from an input channel""" + def __init__(self, *args, **kwargs): + self.fail_after = kwargs.pop('fail_after', 0) + super(TestThreadInputChannelTaskNode, self).__init__(*args, **kwargs) def do_fun(self, item): """return tuple(i, i*2)""" item = super(TestThreadInputChannelTaskNode, self).do_fun(item) - #print "transformer.doit", self.id, item + + # fail after support + if self.fail_after: + self.lock.acquire() + try: + if self.item_count > self.fail_after: + raise AssertionError("Simulated failure after processing %i items" % self.fail_after) + finally: + self.lock.release() + # END handle fail-after if isinstance(item, tuple): i = item[0] @@ -86,14 +106,12 @@ class TestThreadInputChannelVerifyTaskNode(_TestTaskBase, InputChannelTask): """return tuple(i, i*2)""" item = super(TestThreadInputChannelVerifyTaskNode, self).do_fun(item) - # print "verifier.doit", self.id, item - # make sure the computation order matches assert isinstance(item, tuple), "input was no tuple: %s" % item base = item[0] for id, num in enumerate(item[1:]): - assert num == base * (id), "%i != %i, orig = %s" % (num, base * id+1, str(item)) + assert num == base * id, "%i != %i, orig = %s" % (num, base * id, str(item)) # END verify order return item @@ -104,9 +122,11 @@ class TestThreadPool(TestBase): max_threads = cpu_count() - def _add_task_chain(self, p, ni, count=1): + def _add_task_chain(self, p, ni, count=1, fail_setup=list()): """Create a task chain of feeder, count transformers and order verifcator to the pool p, like t1 -> t2 -> t3 + :param fail_setup: a list of pairs, task_id, fail_after, i.e. [(2, 20)] would + make the third transformer fail after 20 items :return: tuple(list(task1, taskN, ...), list(rc1, rcN, ...))""" nt = p.num_tasks() @@ -129,6 +149,11 @@ class TestThreadPool(TestBase): assert p.num_tasks() == nt + 2 + tc # END create count transformers + # setup failure + for id, fail_after in fail_setup: + tasks[1+id].fail_after = fail_after + # END setup failure + verifier = TestThreadInputChannelVerifyTaskNode(inrc, 'verifier', None) verifier.fun = verifier.do_fun vrc = p.add_task(verifier) @@ -149,7 +174,7 @@ class TestThreadPool(TestBase): def _assert_single_task(self, p, async=False): """Performs testing in a synchronized environment""" - return # DEBUG TODO: Fixme deactivated it + # return # DEBUG TODO: Fixme deactivated it print >> sys.stderr, "Threadpool: Starting single task (async = %i) with %i threads" % (async, p.size()) null_tasks = p.num_tasks() # in case we had some before @@ -316,8 +341,6 @@ class TestThreadPool(TestBase): print "read(0) with failure" assert len(rc.read()) == 0 # failure on first item - print >> sys.stderr, "done with everything" - assert isinstance(task.error(), AssertionError) assert task.is_done() # on error, its marked done as well del(rc) @@ -332,39 +355,107 @@ class TestThreadPool(TestBase): assert task.is_done() assert isinstance(task.error(), AssertionError) + print >> sys.stderr, "done with everything" + def _assert_async_dependent_tasks(self, pool): # includes failure in center task, 'recursive' orphan cleanup # This will also verify that the channel-close mechanism works # t1 -> t2 -> t3 - # t1 -> x -> t3 + print >> sys.stderr, "Threadpool: starting async dependency test in %i threads" % pool.size() null_tasks = pool.num_tasks() ni = 5000 count = 3 aic = count + 2 make_task = lambda *args, **kwargs: self._add_task_chain(pool, ni, count, *args, **kwargs) - ts, rcs = make_task() assert len(ts) == aic assert len(rcs) == aic assert pool.num_tasks() == null_tasks + len(ts) print pool._tasks.nodes - - # read all at once - print "read(0)" + # read(0) + ######### st = time.time() items = rcs[-1].read() - print "finished read(0)" elapsed = time.time() - st - print len(items), ni assert len(items) == ni + del(rcs) + assert pool.num_tasks() == 0 # tasks depleted, all done, no handles + print >> sys.stderr, "Dependent Tasks: evaluated %i items of %i dependent in %f s ( %i items / s )" % (ni, aic, elapsed, ni / elapsed) - print >> sys.stderr, "Dependent Tasks: evaluated %i items of %i dependent in %f s ( %i items / s )" % (ni, aic, elapsed, ni / elapsed) + + # read(1) + ######### + ts, rcs = make_task() + st = time.time() + for i in xrange(ni): + items = rcs[-1].read(1) + assert len(items) == 1 + # END for each item to pull + elapsed_single = time.time() - st + # another read yields nothing, its empty + assert len(rcs[-1].read()) == 0 + print >> sys.stderr, "Dependent Tasks: evaluated %i items with read(1) of %i dependent in %f s ( %i items / s )" % (ni, aic, elapsed_single, ni / elapsed_single) + + + # read with min-count size + ########################### + # must be faster, as it will read ni / 4 chunks + # Its enough to set one task, as it will force all others in the chain + # to min_size as well. + ts, rcs = make_task() + assert pool.num_tasks() == len(ts) + nri = ni / 4 + ts[-1].min_count = nri + st = time.time() + for i in xrange(ni): + items = rcs[-1].read(1) + assert len(items) == 1 + # END for each item to read + elapsed_minsize = time.time() - st + # its empty + assert len(rcs[-1].read()) == 0 + print >> sys.stderr, "Dependent Tasks: evaluated %i items with read(1), min_size=%i, of %i dependent in %f s ( %i items / s )" % (ni, nri, aic, elapsed_minsize, ni / elapsed_minsize) + + # it should have been a bit faster at least, and most of the time it is + # Sometimes, its not, mainly because: + # * The test tasks lock a lot, hence they slow down the system + # * Each read will still trigger the pool to evaluate, causing some overhead + # even though there are enough items on the queue in that case. Keeping + # track of the scheduled items helped there, but it caused further inacceptable + # slowdown + # assert elapsed_minsize < elapsed_single + + + # read with failure + ################### + # it should recover and give at least fail_after items + # t1 -> x -> t3 + fail_after = ni/2 + ts, rcs = make_task(fail_setup=[(0, fail_after)]) + items = rcs[-1].read() + assert len(items) == fail_after + # MULTI-POOL + # If two pools are connected, this shold work as well. + # The second one has just one more thread + if False: + p2 = ThreadPool(1) + assert p2.size() == 1 + p2ts, p2rcs = self._add_task_chain(p2, ni, count) + + ts, rcs = make_task() + + + del(p2ts) + del(p2rcs) + assert p2.num_tasks() == 0 + del(p2) + # in the end, we expect all tasks to be gone, automatically # order of deletion doesnt matter del(ts) -- cgit v1.2.3 From 55e757928e493ce93056822d510482e4ffcaac2d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 10 Jun 2010 14:39:57 +0200 Subject: channel: Changed design to be more logical - a channel now has any amount of readers and writers, a ready is not connected to its writer anymore. This changes the refcounting of course, which is why the auto-cleanup for the pool is currently broken. The benefit of this are faster writes to the channel, reading didn't improve, refcounts should be clearer now --- lib/git/async/channel.py | 102 +++++++++++++++++++---------------------- lib/git/async/pool.py | 76 +++++++++++++++--------------- lib/git/async/task.py | 54 ++++++++++------------ test/git/async/test_channel.py | 6 +-- test/git/async/test_pool.py | 5 +- 5 files changed, 113 insertions(+), 130 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index abb31035..9b019707 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -21,61 +21,57 @@ class Channel(object): If the channel is closed, any read operation will result in an exception This base class is not instantiated directly, but instead serves as constructor - for RWChannel pairs. + for Rwriter pairs. Create a new channel """ - __slots__ = tuple() - - -class WChannel(Channel): - """The write end of a channel - it is thread-safe""" - __slots__ = ('_queue') + __slots__ = 'queue' # The queue to use to store the actual data QueueCls = AsyncQueue def __init__(self): - """initialize this instance, able to hold max_items at once - Write calls will block if the channel is full, until someone reads from it""" - self._queue = self.QueueCls() - - #{ Interface - def write(self, item, block=True, timeout=None): - """Send an item into the channel, it can be read from the read end of the - channel accordingly - :param item: Item to send - :param block: If True, the call will block until there is free space in the - channel - :param timeout: timeout in seconds for blocking calls. - :raise ReadOnly: when writing into closed channel""" - # let the queue handle the 'closed' attribute, we write much more often - # to an open channel than to a closed one, saving a few cycles - self._queue.put(item, block, timeout) - + """initialize this instance with a queue holding the channel contents""" + self.queue = self.QueueCls() + + +class SerialChannel(Channel): + """A slightly faster version of a Channel, which sacrificed thead-safety for performance""" + QueueCls = SyncQueue + + +class Writer(object): + """The write end of a channel, a file-like interface for a channel""" + __slots__ = ('write', 'channel') + + def __init__(self, channel): + """Initialize the writer to use the given channel""" + self.channel = channel + self.write = channel.queue.put + + #{ Interface def size(self): - """:return: approximate number of items that could be read from the read-ends - of this channel""" - return self._queue.qsize() + return self.channel.queue.qsize() def close(self): """Close the channel. Multiple close calls on a closed channel are no an error""" - self._queue.set_writable(False) + self.channel.queue.set_writable(False) def closed(self): """:return: True if the channel was closed""" - return not self._queue.writable() + return not self.channel.queue.writable() #} END interface -class CallbackWChannel(WChannel): +class CallbackWriter(Writer): """The write end of a channel which allows you to setup a callback to be called after an item was written to the channel""" __slots__ = ('_pre_cb') - def __init__(self): - WChannel.__init__(self) + def __init__(self, channel): + Writer.__init__(self, channel) self._pre_cb = None + self.write = self._write def set_pre_cb(self, fun = lambda item: item): """Install a callback to be called before the given item is written. @@ -88,25 +84,19 @@ class CallbackWChannel(WChannel): self._pre_cb = fun return prev - def write(self, item, block=True, timeout=None): + def _write(self, item, block=True, timeout=None): if self._pre_cb: item = self._pre_cb(item) - WChannel.write(self, item, block, timeout) + self.channel.queue.put(item, block, timeout) - -class SerialWChannel(WChannel): - """A slightly faster version of a WChannel, which sacrificed thead-safety for - performance""" - QueueCls = SyncQueue - -class RChannel(Channel): - """The read-end of a corresponding write channel""" - __slots__ = '_wc' +class Reader(object): + """Allows reading from a channel""" + __slots__ = 'channel' - def __init__(self, wchannel): + def __init__(self, channel): """Initialize this instance from its parent write channel""" - self._wc = wchannel + self.channel = channel #{ Interface @@ -135,7 +125,7 @@ class RChannel(Channel): # in non-blocking mode, its all not a problem out = list() - queue = self._wc._queue + queue = self.channel.queue if not block: # be as fast as possible in non-blocking mode, hence # its a bit 'unrolled' @@ -198,12 +188,12 @@ class RChannel(Channel): #} END interface -class CallbackRChannel(RChannel): +class CallbackReader(Reader): """A channel which sends a callback before items are read from the channel""" __slots__ = "_pre_cb" - def __init__(self, wc): - RChannel.__init__(self, wc) + def __init__(self, channel): + Reader.__init__(self, channel) self._pre_cb = None def set_pre_cb(self, fun = lambda count: None): @@ -220,18 +210,20 @@ class CallbackRChannel(RChannel): def read(self, count=0, block=True, timeout=None): if self._pre_cb: self._pre_cb(count) - return RChannel.read(self, count, block, timeout) + return Reader.read(self, count, block, timeout) #} END classes #{ Constructors -def mkchannel(wctype = WChannel, rctype = RChannel): - """Create a channel, which consists of one write end and one read end - :return: tuple(write_channel, read_channel) +def mkchannel(ctype = Channel, wtype = Writer, rtype = Reader): + """Create a channel, with a reader and a writer + :return: tuple(reader, writer) + :param ctype: Channel to instantiate :param wctype: The type of the write channel to instantiate :param rctype: The type of the read channel to instantiate""" - wc = wctype() - rc = rctype(wc) + c = ctype() + wc = wtype(c) + rc = rtype(c) return wc, rc #} END constructors diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 1b3c2748..68551ea3 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -18,27 +18,28 @@ from Queue import ( from graph import Graph from channel import ( mkchannel, - WChannel, - SerialWChannel, - CallbackRChannel + Writer, + Channel, + SerialChannel, + CallbackReader ) import sys import weakref from time import sleep +import new -class RPoolChannel(CallbackRChannel): - """ A read-only pool channel may not be wrapped or derived from, but it provides slots to call - before and after an item is to be read. - +class PoolReader(CallbackReader): + """A reader designed to read from channels which take part in pools It acts like a handle to the underlying task in the pool.""" - __slots__ = ('_task_ref', '_pool_ref') + __slots__ = ('_task_ref', '_pool_ref', '_read') - def __init__(self, wchannel, task, pool): - CallbackRChannel.__init__(self, wchannel) + def __init__(self, channel, task, pool): + CallbackReader.__init__(self, channel) self._task_ref = weakref.ref(task) self._pool_ref = weakref.ref(pool) + self._read = new.instancemethod(CallbackReader.__dict__['read'], self, CallbackReader) def __del__(self): """Assures that our task will be deleted if we were the last reader""" @@ -63,15 +64,9 @@ class RPoolChannel(CallbackRChannel): # okay for now # TODO: Fix this - private/public method if sys.getrefcount(self) < 6: - pool.remove_task(task, _from_destructor_=True) + pool.remove_task(task) # END handle refcount based removal of task - #{ Internal - def _read(self, count=0, block=True, timeout=None): - """Direct read, bypassing the pool handling""" - return CallbackRChannel.read(self, count, block, timeout) - #} END internal - #{ Interface def pool_ref(self): @@ -118,7 +113,7 @@ class RPoolChannel(CallbackRChannel): ####### read data ######## ########################## # read actual items, tasks were setup to put their output into our channel ( as well ) - items = CallbackRChannel.read(self, count, block, timeout) + items = CallbackReader.read(self, count, block, timeout) ########################## @@ -262,21 +257,21 @@ class Pool(object): # should make things execute faster. Putting the if statements # into the loop would be less code, but ... slower # DEBUG - # print actual_count, numchunks, chunksize, remainder, task._out_wc.size() + # print actual_count, numchunks, chunksize, remainder, task._out_writer.size() if self._num_workers: # respect the chunk size, and split the task up if we want # to process too much. This can be defined per task - queue = self._queue + qput = self._queue if numchunks > 1: for i in xrange(numchunks): - queue.put((task.process, chunksize)) + qput((task.process, chunksize)) # END for each chunk to put else: - queue.put((task.process, chunksize)) + qput((task.process, chunksize)) # END try efficient looping if remainder: - queue.put((task.process, remainder)) + qput((task.process, remainder)) # END handle chunksize else: # no workers, so we have to do the work ourselves @@ -295,16 +290,16 @@ class Pool(object): # END for each task to process - def _remove_task_if_orphaned(self, task, from_destructor): + def _remove_task_if_orphaned(self, task): """Check the task, and delete it if it is orphaned""" - # 1 as its stored on the task, 1 for the getrefcount call + # 1 for writer on task, 1 for the getrefcount call + 1 for each other writer/reader # If we are getting here from the destructor of an RPool channel, # its totally valid to virtually decrement the refcount by 1 as # we can expect it to drop once the destructor completes, which is when # we finish all recursive calls - max_ref_count = 3 + from_destructor - if sys.getrefcount(task.wchannel()) < max_ref_count: - self.remove_task(task, from_destructor) + max_ref_count = 3 + if sys.getrefcount(task.writer().channel) < max_ref_count: + self.remove_task(task) #} END internal #{ Interface @@ -375,7 +370,7 @@ class Pool(object): finally: self._taskgraph_lock.release() - def remove_task(self, task, _from_destructor_=False): + def remove_task(self, task): """Delete the task Additionally we will remove orphaned tasks, which can be identified if their output channel is only held by themselves, so no one will ever consume @@ -410,7 +405,7 @@ class Pool(object): # END locked deletion for t in in_tasks: - self._remove_task_if_orphaned(t, _from_destructor_) + self._remove_task_if_orphaned(t) # END handle orphans recursively return self @@ -421,7 +416,7 @@ class Pool(object): the task will be considered orphaned and will be deleted on the next occasion.""" # create a write channel for it - wctype = WChannel + ctype = Channel # adjust the task with our pool ref, if it has the slot and is empty # For now, we don't allow tasks to be used in multiple pools, except @@ -442,26 +437,29 @@ class Pool(object): # Use a non-threadsafe queue # This brings about 15% more performance, but sacrifices thread-safety - # when reading from multiple threads. if self.size() == 0: - wctype = SerialWChannel + ctype = SerialChannel # END improve locks # setup the tasks channel - respect the task creators choice though # if it is set. - wc = task.wchannel() + wc = task.writer() + ch = None if wc is None: - wc = wctype() + ch = ctype() + wc = Writer(ch) + task.set_writer(wc) + else: + ch = wc.channel # END create write channel ifunset - rc = RPoolChannel(wc, task, self) - task.set_wchannel(wc) + rc = PoolReader(ch, task, self) finally: self._taskgraph_lock.release() # END sync task addition # If the input channel is one of our read channels, we add the relation - if hasattr(task, 'rchannel'): - ic = task.rchannel() + if hasattr(task, 'reader'): + ic = task.reader() if hasattr(ic, 'pool_ref') and ic.pool_ref()() is self: self._taskgraph_lock.acquire() try: diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 0b1d0666..5a6c1e95 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -1,23 +1,17 @@ from graph import Node from util import ReadOnly -from channel import ( - WChannel, - CallbackRChannel - ) import threading import weakref import sys import new -getrefcount = sys.getrefcount - class OutputChannelTask(Node): """Abstracts a named task as part of a set of interdependent tasks, which contains additional information on how the task should be queued and processed. Results of the item processing are sent to a write channel, which is to be - set by the creator using the ``set_wchannel`` method. + set by the creator using the ``set_writer`` method. * **min_count** assures that not less than min_count items will be processed per call. * **max_chunksize** assures that multi-threading is happening in smaller chunks. If @@ -25,9 +19,11 @@ class OutputChannelTask(Node): one worker, as well as dependent tasks. If you want finer granularity , you can specify this here, causing chunks to be no larger than max_chunksize""" __slots__ = ( '_read', # method to yield items to process - '_out_wc', # output write channel + '_out_writer', # output write channel '_exc', # exception caught '_done', # True if we are done + '_num_writers', # number of concurrent writers + '_wlock', # lock for the above 'fun', # function to call with items read 'min_count', # minimum amount of items to produce, None means no override 'max_chunksize', # maximium amount of items to process per process call @@ -35,12 +31,14 @@ class OutputChannelTask(Node): ) def __init__(self, id, fun, apply_single=True, min_count=None, max_chunksize=0, - wchannel=None): + writer=None): Node.__init__(self, id) self._read = None # to be set by subclasss - self._out_wc = wchannel # to be set later + self._out_writer = writer self._exc = None self._done = False + self._num_writers = 0 + self._wlock = threading.Lock() self.fun = fun self.min_count = None self.max_chunksize = 0 # note set @@ -54,29 +52,29 @@ class OutputChannelTask(Node): """Set ourselves to being done, has we have completed the processing""" self._done = True - def set_wchannel(self, wc): + def set_writer(self, writer): """Set the write channel to the given one""" - self._out_wc = wc + self._out_writer = writer - def wchannel(self): + def writer(self): """:return: a proxy to our write channel or None if non is set :note: you must not hold a reference to our write channel when the task is being processed. This would cause the write channel never to be closed as the task will think there is still another instance being processed which can close the channel once it is done. In the worst case, this will block your reads.""" - if self._out_wc is None: + if self._out_writer is None: return None - return self._out_wc + return self._out_writer def close(self): """A closed task will close its channel to assure the readers will wake up :note: its safe to call this method multiple times""" - self._out_wc.close() + self._out_writer.close() def is_closed(self): """:return: True if the task's write channel is closed""" - return self._out_wc.closed() + return self._out_writer.closed() def error(self): """:return: Exception caught during last processing or None""" @@ -88,24 +86,18 @@ class OutputChannelTask(Node): items = self._read(count) # print "%r: done reading %i items" % (self.id, len(items)) try: - # increase the ref-count - we use this to determine whether anyone else - # is currently handling our output channel. As this method runs asynchronously, - # we have to make sure that the channel is closed by the last finishing task, - # which is not necessarily the one which determines that he is done - # as he couldn't read anymore items. - # The refcount will be dropped in the moment we get out of here. - wc = self._out_wc + write = self._out_writer.write if self.apply_single: for item in items: rval = self.fun(item) - wc.write(rval) + write(rval) # END for each item else: # shouldn't apply single be the default anyway ? # The task designers should chunk them up in advance rvals = self.fun(items) for rval in rvals: - wc.write(rval) + write(rval) # END handle single apply except Exception, e: print >> sys.stderr, "task %s error:" % self.id, type(e), str(e) # TODO: REMOVE DEBUG, or make it use logging @@ -131,7 +123,7 @@ class OutputChannelTask(Node): self._exc = e # END set error flag # END exception handling - del(wc) + # if we didn't get all demanded items, which is also the case if count is 0 # we have depleted the input channel and are done @@ -151,7 +143,7 @@ class OutputChannelTask(Node): # thread having its copy on the stack # + 1 for the instance we provide to refcount # Soft close, so others can continue writing their results - if self.is_done() and getrefcount(self._out_wc) < 4: + if self.is_done(): # print "Closing channel of %r" % self.id self.close() # END handle channel closure @@ -212,14 +204,14 @@ class InputChannelTask(OutputChannelTask): to be the input channel to read from though.""" __slots__ = "_pool_ref" - def __init__(self, in_rc, *args, **kwargs): + def __init__(self, in_reader, *args, **kwargs): OutputChannelTask.__init__(self, *args, **kwargs) - self._read = in_rc.read + self._read = in_reader.read self._pool_ref = None #{ Internal Interface - def rchannel(self): + def reader(self): """:return: input channel from which we read""" # the instance is bound in its instance method - lets use this to keep # the refcount at one ( per consumer ) diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index 215081cd..a24c7c91 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -9,8 +9,8 @@ class TestChannels(TestBase): def test_base(self): # creating channel yields a write and a read channal wc, rc = mkchannel() - assert isinstance(wc, WChannel) # default args - assert isinstance(rc, RChannel) + assert isinstance(wc, Writer) # default args + assert isinstance(rc, Reader) # TEST UNLIMITED SIZE CHANNEL - writing+reading is FIFO @@ -46,7 +46,7 @@ class TestChannels(TestBase): # test callback channels - wc, rc = mkchannel(wctype = CallbackWChannel, rctype = CallbackRChannel) + wc, rc = mkchannel(wtype = CallbackWriter, rtype = CallbackReader) cb = [0, 0] # set slots to one if called def pre_write(item): diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 679bab31..d34f6773 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -191,8 +191,8 @@ class TestThreadPool(TestBase): assert p.num_tasks() == null_tasks rc = p.add_task(task) assert p.num_tasks() == 1 + null_tasks - assert isinstance(rc, RPoolChannel) - assert task._out_wc is not None + assert isinstance(rc, PoolReader) + assert task._out_writer is not None # pull the result completely - we should get one task, which calls its # function once. In sync mode, the order matches @@ -460,6 +460,7 @@ class TestThreadPool(TestBase): # order of deletion doesnt matter del(ts) del(rcs) + print pool.num_tasks() assert pool.num_tasks() == null_tasks -- cgit v1.2.3 From 7c36f3648e39ace752c67c71867693ce1eee52a3 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 10 Jun 2010 15:38:40 +0200 Subject: Now tracking the amount of concurrent writers to assure the channel is closed only when there is no one else writing to it. This assures that all tasks can continue working, and put their results accordingly. Shutdown is still not working correctly, but that should be solvable as well. Its still not perfect though ... --- lib/git/async/pool.py | 24 ++++++++++------- lib/git/async/task.py | 64 +++++++++++++++++++++++++++++++-------------- test/git/async/test_pool.py | 3 ++- 3 files changed, 61 insertions(+), 30 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 68551ea3..3fd99c7b 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -33,13 +33,12 @@ import new class PoolReader(CallbackReader): """A reader designed to read from channels which take part in pools It acts like a handle to the underlying task in the pool.""" - __slots__ = ('_task_ref', '_pool_ref', '_read') + __slots__ = ('_task_ref', '_pool_ref') def __init__(self, channel, task, pool): CallbackReader.__init__(self, channel) self._task_ref = weakref.ref(task) self._pool_ref = weakref.ref(pool) - self._read = new.instancemethod(CallbackReader.__dict__['read'], self, CallbackReader) def __del__(self): """Assures that our task will be deleted if we were the last reader""" @@ -62,11 +61,16 @@ class PoolReader(CallbackReader): # it has no way of knowing that the write channel is about to diminsh. # which is why we pass the info as a private kwarg - not nice, but # okay for now - # TODO: Fix this - private/public method if sys.getrefcount(self) < 6: - pool.remove_task(task) + pool.remove_task(task, _from_destructor_ = True) # END handle refcount based removal of task + #{ Internal + def _read(self, count=0, block=True, timeout=None): + return CallbackReader.read(self, count, block, timeout) + + #} END internal + #{ Interface def pool_ref(self): @@ -261,7 +265,7 @@ class Pool(object): if self._num_workers: # respect the chunk size, and split the task up if we want # to process too much. This can be defined per task - qput = self._queue + qput = self._queue.put if numchunks > 1: for i in xrange(numchunks): qput((task.process, chunksize)) @@ -290,16 +294,16 @@ class Pool(object): # END for each task to process - def _remove_task_if_orphaned(self, task): + def _remove_task_if_orphaned(self, task, from_destructor): """Check the task, and delete it if it is orphaned""" # 1 for writer on task, 1 for the getrefcount call + 1 for each other writer/reader # If we are getting here from the destructor of an RPool channel, # its totally valid to virtually decrement the refcount by 1 as # we can expect it to drop once the destructor completes, which is when # we finish all recursive calls - max_ref_count = 3 + max_ref_count = 3 + from_destructor if sys.getrefcount(task.writer().channel) < max_ref_count: - self.remove_task(task) + self.remove_task(task, from_destructor) #} END internal #{ Interface @@ -370,7 +374,7 @@ class Pool(object): finally: self._taskgraph_lock.release() - def remove_task(self, task): + def remove_task(self, task, _from_destructor_ = False): """Delete the task Additionally we will remove orphaned tasks, which can be identified if their output channel is only held by themselves, so no one will ever consume @@ -405,7 +409,7 @@ class Pool(object): # END locked deletion for t in in_tasks: - self._remove_task_if_orphaned(t) + self._remove_task_if_orphaned(t, _from_destructor_) # END handle orphans recursively return self diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 5a6c1e95..ae2532d9 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -82,23 +82,36 @@ class OutputChannelTask(Node): def process(self, count=0): """Process count items and send the result individually to the output channel""" - # print "%r: reading %i" % (self.id, count) + # first thing: increment the writer count + self._wlock.acquire() + self._num_writers += 1 + self._wlock.release() + + #print "%r: reading %i" % (self.id, count) + #if hasattr(self, 'reader'): + # print "from", self.reader().channel items = self._read(count) - # print "%r: done reading %i items" % (self.id, len(items)) + #print "%r: done reading %i items" % (self.id, len(items)) try: - write = self._out_writer.write - if self.apply_single: - for item in items: - rval = self.fun(item) - write(rval) - # END for each item - else: - # shouldn't apply single be the default anyway ? - # The task designers should chunk them up in advance - rvals = self.fun(items) - for rval in rvals: - write(rval) - # END handle single apply + try: + write = self._out_writer.write + if self.apply_single: + for item in items: + rval = self.fun(item) + write(rval) + # END for each item + else: + # shouldn't apply single be the default anyway ? + # The task designers should chunk them up in advance + rvals = self.fun(items) + for rval in rvals: + write(rval) + # END handle single apply + finally: + self._wlock.acquire() + self._num_writers -= 1 + self._wlock.release() + # END handle writer count except Exception, e: print >> sys.stderr, "task %s error:" % self.id, type(e), str(e) # TODO: REMOVE DEBUG, or make it use logging # be sure our task is not scheduled again @@ -144,8 +157,13 @@ class OutputChannelTask(Node): # + 1 for the instance we provide to refcount # Soft close, so others can continue writing their results if self.is_done(): - # print "Closing channel of %r" % self.id - self.close() + self._wlock.acquire() + if self._num_writers == 0: + #if not self.is_closed(): # DEBUG + # print "Closing channel of %r" % self.id, len(self._out_writer.channel.queue), self._out_writer.channel + self.close() + # END handle writers + self._wlock.release() # END handle channel closure #{ Configuration @@ -158,7 +176,7 @@ class ThreadTaskBase(object): class InputIteratorTaskBase(OutputChannelTask): """Implements a task which processes items from an iterable in a multi-processing safe manner""" - __slots__ = ('_iterator', '_lock') + __slots__ = ('_iterator', '_lock', '_empty') # the type of the lock to use when reading from the iterator lock_type = None @@ -169,12 +187,19 @@ class InputIteratorTaskBase(OutputChannelTask): self._iterator = iterator self._lock = self.lock_type() self._read = self.__read + self._empty = False def __read(self, count=0): """Read count items from the iterator, and return them""" + # not threadsafe, but worst thing that could happen is that + # we try to get items one more time + if self._empty: + return list() + # END early abort self._lock.acquire() try: if count == 0: + self._empty = True return list(self._iterator) else: out = list() @@ -183,6 +208,7 @@ class InputIteratorTaskBase(OutputChannelTask): try: out.append(it.next()) except StopIteration: + self._empty = True break # END handle empty iterator # END for each item to take @@ -198,7 +224,7 @@ class InputIteratorThreadTask(InputIteratorTaskBase, ThreadTaskBase): lock_type = threading.Lock -class InputChannelTask(OutputChannelTask): +class InputChannelTask(OutputChannelTask, ThreadTaskBase): """Uses an input channel as source for reading items For instantiation, it takes all arguments of its base, the first one needs to be the input channel to read from though.""" diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index d34f6773..7cb94a86 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -42,7 +42,7 @@ class _TestTaskBase(object): print self.item_count, fc assert self.item_count == fc self.lock.release() - + assert self._num_writers == 0 return self @@ -381,6 +381,7 @@ class TestThreadPool(TestBase): st = time.time() items = rcs[-1].read() elapsed = time.time() - st + print len(items), ni assert len(items) == ni del(rcs) assert pool.num_tasks() == 0 # tasks depleted, all done, no handles -- cgit v1.2.3 From c34343d0b714d2c4657972020afea034a167a682 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 10 Jun 2010 15:52:32 +0200 Subject: tasks can now terminate faster when no items were read, without neglecting their duty to close the channel if required. Code is a little less maintainable now, but faster, it appears --- lib/git/async/task.py | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/lib/git/async/task.py b/lib/git/async/task.py index ae2532d9..a8ba5ac6 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -92,21 +92,24 @@ class OutputChannelTask(Node): # print "from", self.reader().channel items = self._read(count) #print "%r: done reading %i items" % (self.id, len(items)) + try: try: - write = self._out_writer.write - if self.apply_single: - for item in items: - rval = self.fun(item) - write(rval) - # END for each item - else: - # shouldn't apply single be the default anyway ? - # The task designers should chunk them up in advance - rvals = self.fun(items) - for rval in rvals: - write(rval) - # END handle single apply + if items: + write = self._out_writer.write + if self.apply_single: + for item in items: + rval = self.fun(item) + write(rval) + # END for each item + else: + # shouldn't apply single be the default anyway ? + # The task designers should chunk them up in advance + rvals = self.fun(items) + for rval in rvals: + write(rval) + # END handle single apply + # END if there is anything to do finally: self._wlock.acquire() self._num_writers -= 1 @@ -158,12 +161,14 @@ class OutputChannelTask(Node): # Soft close, so others can continue writing their results if self.is_done(): self._wlock.acquire() - if self._num_writers == 0: - #if not self.is_closed(): # DEBUG - # print "Closing channel of %r" % self.id, len(self._out_writer.channel.queue), self._out_writer.channel - self.close() - # END handle writers - self._wlock.release() + try: + if self._num_writers == 0: + # print "Closing channel of %r" % self.id, len(self._out_writer.channel.queue), self._out_writer.channel + self.close() + # END handle writers + finally: + self._wlock.release() + # END assure lock release # END handle channel closure #{ Configuration -- cgit v1.2.3 From fbe062bf6dacd3ad63dd827d898337fa542931ac Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 10 Jun 2010 23:55:50 +0200 Subject: Added dependency-task tests, and fixed plenty of ref-count related bugs, as well as concurrency issues. Now it works okay, but the thread-shutdown is still an issue, as it causes incorrect behaviour making the tests fail. Its good, as it hints at additional issues that need to be solved. There is just a little more left on the feature side, but its nearly there --- lib/git/async/graph.py | 14 ++++- lib/git/async/pool.py | 2 +- lib/git/async/task.py | 10 +++- lib/git/async/thread.py | 43 ++++++++------- test/git/async/test_graph.py | 16 +++++- test/git/async/test_pool.py | 129 ++++++++++++++++++++++++++++++++----------- 6 files changed, 155 insertions(+), 59 deletions(-) diff --git a/lib/git/async/graph.py b/lib/git/async/graph.py index e3999cdc..9ee0e891 100644 --- a/lib/git/async/graph.py +++ b/lib/git/async/graph.py @@ -25,14 +25,24 @@ class Graph(object): def __init__(self): self.nodes = list() + + def __del__(self): + """Deletes bidericational dependencies""" + for node in self.nodes: + node.in_nodes = None + node.out_nodes = None + # END cleanup nodes + + # otherwise the nodes would keep floating around + def add_node(self, node): """Add a new node to the graph :return: the newly added node""" self.nodes.append(node) return node - def del_node(self, node): + def remove_node(self, node): """Delete a node from the graph :return: self""" try: @@ -46,6 +56,8 @@ class Graph(object): del(outn.in_nodes[outn.in_nodes.index(node)]) for inn in node.in_nodes: del(inn.out_nodes[inn.out_nodes.index(node)]) + node.out_nodes = list() + node.in_nodes = list() return self def add_edge(self, u, v): diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 3fd99c7b..0aad90ae 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -402,7 +402,7 @@ class Pool(object): # keep its input nodes as we check whether they were orphaned in_tasks = task.in_nodes - self._tasks.del_node(task) + self._tasks.remove_node(task) self._taskorder_cache.clear() finally: self._taskgraph_lock.release() diff --git a/lib/git/async/task.py b/lib/git/async/task.py index a8ba5ac6..49e7e7cf 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -82,7 +82,8 @@ class OutputChannelTask(Node): def process(self, count=0): """Process count items and send the result individually to the output channel""" - # first thing: increment the writer count + # first thing: increment the writer count - other tasks must be able + # to respond properly ( even if it turns out we don't need it later ) self._wlock.acquire() self._num_writers += 1 self._wlock.release() @@ -191,7 +192,11 @@ class InputIteratorTaskBase(OutputChannelTask): raise ValueError("Iterator %r needs a next() function" % iterator) self._iterator = iterator self._lock = self.lock_type() - self._read = self.__read + + # this is necessary to prevent a cyclic ref, preventing us from + # getting deleted ( and collected ) + weakself = weakref.ref(self) + self._read = lambda count: weakself().__read(count) self._empty = False def __read(self, count=0): @@ -201,6 +206,7 @@ class InputIteratorTaskBase(OutputChannelTask): if self._empty: return list() # END early abort + self._lock.acquire() try: if count == 0: diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index faeda04f..b8d2e418 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -116,7 +116,7 @@ class WorkerThread(TerminatableThread): t[1] = optional, tuple or list of arguments to pass to the routine t[2] = optional, dictionary of keyword arguments to pass to the routine """ - __slots__ = ('inq', '_current_routine') + __slots__ = ('inq') # define how often we should check for a shutdown request in case our @@ -128,7 +128,6 @@ class WorkerThread(TerminatableThread): self.inq = inq if inq is None: self.inq = Queue.Queue() - self._current_routine = None # routine we execute right now @classmethod def stop(cls, *args): @@ -141,7 +140,6 @@ class WorkerThread(TerminatableThread): gettask = self.inq.get while True: - self._current_routine = None if self._should_terminate(): break # END check for stop request @@ -153,22 +151,27 @@ class WorkerThread(TerminatableThread): assert len(tasktuple) == 2, "Need tuple of function, arg - it could be more flexible, but its reduced to what we need" routine, arg = tasktuple - self._current_routine = routine - try: - rval = None - if inspect.ismethod(routine): - if routine.im_self is None: - rval = routine(self, arg) - else: + try: + rval = None + if inspect.ismethod(routine): + if routine.im_self is None: + rval = routine(self, arg) + else: + rval = routine(arg) + elif inspect.isroutine(routine): rval = routine(arg) - elif inspect.isroutine(routine): - rval = routine(arg) - else: - # ignore unknown items - print >> sys.stderr, "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) - break - # END make routine call + else: + # ignore unknown items + print >> sys.stderr, "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) + break + # END make routine call + finally: + # make sure we delete the routine to release the reference as soon + # as possible. Otherwise objects might not be destroyed + # while we are waiting + del(routine) + del(tasktuple) except StopProcessing: print self.name, "stops processing" # DEBUG break @@ -176,12 +179,10 @@ class WorkerThread(TerminatableThread): print >> sys.stderr, "%s: Task %s raised unhandled exception: %s - this really shouldn't happen !" % (self.getName(), str(tasktuple), str(e)) continue # just continue # END routine exception handling + + # END handle routine release # END endless loop - def routine(self): - """:return: routine we are currently executing, or None if we have no task""" - return self._current_routine - def stop_and_join(self): """Send stop message to ourselves""" self.inq.put((self.stop, None)) diff --git a/test/git/async/test_graph.py b/test/git/async/test_graph.py index d0e36159..7630226b 100644 --- a/test/git/async/test_graph.py +++ b/test/git/async/test_graph.py @@ -3,6 +3,7 @@ from test.testlib import * from git.async.graph import * import time +import sys class TestGraph(TestBase): @@ -19,7 +20,7 @@ class TestGraph(TestBase): # delete unconnected nodes for n in g.nodes[:]: - g.del_node(n) + g.remove_node(n) # END del nodes # add a chain of connected nodes @@ -54,8 +55,8 @@ class TestGraph(TestBase): # deleting a connected node clears its neighbour connections assert n3.in_nodes[0] is n2 - assert g.del_node(n2) is g - assert g.del_node(n2) is g # multi-deletion okay + assert g.remove_node(n2) is g + assert g.remove_node(n2) is g # multi-deletion okay assert len(g.nodes) == nn - 1 assert len(n3.in_nodes) == 0 assert len(n1.out_nodes) == 0 @@ -68,3 +69,12 @@ class TestGraph(TestBase): assert dfirst_nodes[-1] == end and dfirst_nodes[-2].id == end.id-1 + # test cleanup + # its at least kept by its graph + assert sys.getrefcount(end) > 3 + del(g) + del(n1); del(n2); del(n3) + del(dfirst_nodes) + del(last) + del(n) + assert sys.getrefcount(end) == 2 diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 7cb94a86..4851f61b 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -5,6 +5,7 @@ from git.async.task import * from git.async.thread import terminate_threads from git.async.util import cpu_count import threading +import weakref import time import sys @@ -42,7 +43,9 @@ class _TestTaskBase(object): print self.item_count, fc assert self.item_count == fc self.lock.release() + self._wlock.acquire() assert self._num_writers == 0 + self._wlock.release() return self @@ -122,31 +125,47 @@ class TestThreadPool(TestBase): max_threads = cpu_count() - def _add_task_chain(self, p, ni, count=1, fail_setup=list()): + + def _make_proxy_method(self, t): + """required to prevent binding self into the method we call""" + wt = weakref.proxy(t) + return lambda item: wt.do_fun(item) + + def _add_task_chain(self, p, ni, count=1, fail_setup=list(), feeder_channel=None, id_offset=0): """Create a task chain of feeder, count transformers and order verifcator to the pool p, like t1 -> t2 -> t3 :param fail_setup: a list of pairs, task_id, fail_after, i.e. [(2, 20)] would make the third transformer fail after 20 items + :param feeder_channel: if set to a channel, it will be used as input of the + first transformation task. The respective first task in the return value + will be None. + :param id_offset: defines the id of the first transformation task, all subsequent + ones will add one :return: tuple(list(task1, taskN, ...), list(rc1, rcN, ...))""" nt = p.num_tasks() - feeder = self._make_iterator_task(ni) - frc = p.add_task(feeder) - - assert p.num_tasks() == nt + 1 + feeder = None + frc = feeder_channel + if feeder_channel is None: + feeder = self._make_iterator_task(ni) + frc = p.add_task(feeder) + # END handle specific feeder rcs = [frc] tasks = [feeder] + make_proxy_method = self._make_proxy_method + inrc = frc for tc in xrange(count): - t = TestThreadInputChannelTaskNode(inrc, tc, None) - t.fun = t.do_fun + t = TestThreadInputChannelTaskNode(inrc, tc+id_offset, None) + + t.fun = make_proxy_method(t) + #t.fun = t.do_fun inrc = p.add_task(t) tasks.append(t) rcs.append(inrc) - assert p.num_tasks() == nt + 2 + tc # END create count transformers # setup failure @@ -155,10 +174,10 @@ class TestThreadPool(TestBase): # END setup failure verifier = TestThreadInputChannelVerifyTaskNode(inrc, 'verifier', None) - verifier.fun = verifier.do_fun + #verifier.fun = verifier.do_fun + verifier.fun = make_proxy_method(verifier) vrc = p.add_task(verifier) - assert p.num_tasks() == nt + tc + 3 tasks.append(verifier) rcs.append(vrc) @@ -169,7 +188,7 @@ class TestThreadPool(TestBase): :param taskcls: the actual iterator type to use :param **kwargs: additional kwargs to be passed to the task""" t = taskcls(iter(range(ni)), 'iterator', None, **kwargs) - t.fun = t.do_fun + t.fun = self._make_proxy_method(t) return t def _assert_single_task(self, p, async=False): @@ -385,6 +404,14 @@ class TestThreadPool(TestBase): assert len(items) == ni del(rcs) assert pool.num_tasks() == 0 # tasks depleted, all done, no handles + # wait a tiny moment - there could still be something unprocessed on the + # queue, increasing the refcount + time.sleep(0.15) + import gc + print gc.get_referrers(ts[-1]) + print len(pool._queue) + assert sys.getrefcount(ts[-1]) == 2 # ts + call + assert sys.getrefcount(ts[0]) == 2 # ts + call print >> sys.stderr, "Dependent Tasks: evaluated %i items of %i dependent in %f s ( %i items / s )" % (ni, aic, elapsed, ni / elapsed) @@ -444,25 +471,53 @@ class TestThreadPool(TestBase): # MULTI-POOL # If two pools are connected, this shold work as well. # The second one has just one more thread - if False: - p2 = ThreadPool(1) - assert p2.size() == 1 - p2ts, p2rcs = self._add_task_chain(p2, ni, count) - - ts, rcs = make_task() - - - del(p2ts) - del(p2rcs) - assert p2.num_tasks() == 0 - del(p2) - - # in the end, we expect all tasks to be gone, automatically - # order of deletion doesnt matter + ts, rcs = make_task() + + # connect verifier channel as feeder of the second pool + p2 = ThreadPool(1) + assert p2.size() == 1 + p2ts, p2rcs = self._add_task_chain(p2, ni, count, feeder_channel=rcs[-1], id_offset=count) + assert p2ts[0] is None # we have no feeder task + assert rcs[-1].pool_ref()() is pool # it didnt change the pool + assert rcs[-1] is p2ts[1].reader() + assert p2.num_tasks() == len(p2ts)-1 # first is None + + # reading from the last one will evaluate all pools correctly + print "read(0) multi-pool" + items = p2rcs[-1].read() + assert len(items) == ni + + # now that both are connected, I can drop my handle to the reader + # without affecting the task-count, but whats more important: + # They remove their tasks correctly once we drop our references in the + # right order + del(p2ts) + assert p2rcs[0] is rcs[-1] + del(p2rcs) + assert p2.num_tasks() == 0 + del(p2) + + assert pool.num_tasks() == null_tasks + len(ts) + + del(ts) + print "del rcs" + print rcs[-1] + print sys.getrefcount(rcs[-1]) del(rcs) + # TODO: make this work - something with the refcount goes wrong, + # they never get cleaned up properly + ts = pool._tasks.nodes print pool.num_tasks() - assert pool.num_tasks() == null_tasks + assert pool.num_tasks() == null_tasks + + + # TODO: Test multi-read(1) + + # in the end, we expect all tasks to be gone, automatically + # order of deletion doesnt matter + + @@ -496,17 +551,28 @@ class TestThreadPool(TestBase): # SINGLE TASK SERIAL SYNC MODE ############################## - # put a few unrelated tasks that we forget about - urc1 = p.add_task(TestThreadTaskNode(iter(list()), "nothing", None)) - urc2 = p.add_task(TestThreadTaskNode(iter(list()), "nothing", None)) + # put a few unrelated tasks that we forget about - check ref counts and cleanup + t1, t2 = TestThreadTaskNode(iter(list()), "nothing1", None), TestThreadTaskNode(iter(list()), "nothing2", None) + urc1 = p.add_task(t1) + urc2 = p.add_task(t2) assert p.num_tasks() == 2 ## SINGLE TASK ################# self._assert_single_task(p, False) assert p.num_tasks() == 2 del(urc1) - del(urc2) + assert p.num_tasks() == 1 + + p.remove_task(t2) + assert p.num_tasks() == 0 + assert sys.getrefcount(t2) == 2 + + t3 = TestThreadInputChannelTaskNode(urc2, "channel", None) + urc3 = p.add_task(t3) + assert p.num_tasks() == 1 + del(urc3) assert p.num_tasks() == 0 + assert sys.getrefcount(t3) == 2 # DEPENDENT TASKS SYNC MODE @@ -519,6 +585,7 @@ class TestThreadPool(TestBase): # step one gear up - just one thread for now. p.set_size(1) assert p.size() == 1 + print len(threading.enumerate()) assert len(threading.enumerate()) == num_threads + 1 # deleting the pool stops its threads - just to be sure ;) # Its not synchronized, hence we wait a moment -- cgit v1.2.3 From 6d1212e8c412b0b4802bc1080d38d54907db879d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 11 Jun 2010 11:52:01 +0200 Subject: IMPORTANT: sometimes, when notifying waiters by releasing their lock, the lock is not actually released or they are not actually notifyied, staying in a beautysleep. This glitch is probably caused by some detail not treated correctly in the thread python module, which is something we cannot fix. It works most of the time as expected though - maybe some cleanup is not done correctly which causes this --- lib/git/async/pool.py | 1 - lib/git/async/thread.py | 2 -- lib/git/async/util.py | 8 +++++--- test/git/async/test_pool.py | 26 ++++++++------------------ 4 files changed, 13 insertions(+), 24 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 0aad90ae..dbc201a9 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -324,7 +324,6 @@ class Pool(object): threadsafe to optimize item throughput. :note: currently NOT threadsafe !""" - print "set_size", size assert size > -1, "Size cannot be negative" # either start new threads, or kill existing ones. diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index b8d2e418..4d046a2f 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -146,9 +146,7 @@ class WorkerThread(TerminatableThread): # we wait and block - to terminate, send the 'stop' method tasktuple = gettask() - # needing exactly one function, and one arg - assert len(tasktuple) == 2, "Need tuple of function, arg - it could be more flexible, but its reduced to what we need" routine, arg = tasktuple try: diff --git a/lib/git/async/util.py b/lib/git/async/util.py index b7750b0b..11ab75a6 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -101,10 +101,12 @@ class HSCondition(deque): waiter.acquire() # get it the first time, no blocking self.append(waiter) - # in the momemnt we release our lock, someone else might actually resume - self._lock.release() - try: # restore state no matter what (e.g., KeyboardInterrupt) + + try: + # restore state no matter what (e.g., KeyboardInterrupt) # now we block, as we hold the lock already + # in the momemnt we release our lock, someone else might actually resume + self._lock.release() if timeout is None: waiter.acquire() else: diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 4851f61b..5bb48cc2 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -199,7 +199,7 @@ class TestThreadPool(TestBase): # add a simple task # it iterates n items - ni = 5000 + ni = 1000 assert ni % 2 == 0, "ni needs to be dividable by 2" assert ni % 4 == 0, "ni needs to be dividable by 4" @@ -382,18 +382,18 @@ class TestThreadPool(TestBase): # includes failure in center task, 'recursive' orphan cleanup # This will also verify that the channel-close mechanism works # t1 -> t2 -> t3 - + print >> sys.stderr, "Threadpool: starting async dependency test in %i threads" % pool.size() null_tasks = pool.num_tasks() - ni = 5000 + ni = 1000 count = 3 aic = count + 2 make_task = lambda *args, **kwargs: self._add_task_chain(pool, ni, count, *args, **kwargs) + ts, rcs = make_task() assert len(ts) == aic assert len(rcs) == aic assert pool.num_tasks() == null_tasks + len(ts) - print pool._tasks.nodes # read(0) ######### @@ -407,9 +407,6 @@ class TestThreadPool(TestBase): # wait a tiny moment - there could still be something unprocessed on the # queue, increasing the refcount time.sleep(0.15) - import gc - print gc.get_referrers(ts[-1]) - print len(pool._queue) assert sys.getrefcount(ts[-1]) == 2 # ts + call assert sys.getrefcount(ts[0]) == 2 # ts + call print >> sys.stderr, "Dependent Tasks: evaluated %i items of %i dependent in %f s ( %i items / s )" % (ni, aic, elapsed, ni / elapsed) @@ -467,15 +464,15 @@ class TestThreadPool(TestBase): items = rcs[-1].read() assert len(items) == fail_after - + # MULTI-POOL # If two pools are connected, this shold work as well. # The second one has just one more thread ts, rcs = make_task() # connect verifier channel as feeder of the second pool - p2 = ThreadPool(1) - assert p2.size() == 1 + p2 = ThreadPool(0) # don't spawn new threads, they have the tendency not to wake up on mutexes + assert p2.size() == 0 p2ts, p2rcs = self._add_task_chain(p2, ni, count, feeder_channel=rcs[-1], id_offset=count) assert p2ts[0] is None # we have no feeder task assert rcs[-1].pool_ref()() is pool # it didnt change the pool @@ -501,14 +498,8 @@ class TestThreadPool(TestBase): del(ts) - print "del rcs" - print rcs[-1] - print sys.getrefcount(rcs[-1]) del(rcs) - # TODO: make this work - something with the refcount goes wrong, - # they never get cleaned up properly - ts = pool._tasks.nodes - print pool.num_tasks() + assert pool.num_tasks() == null_tasks @@ -585,7 +576,6 @@ class TestThreadPool(TestBase): # step one gear up - just one thread for now. p.set_size(1) assert p.size() == 1 - print len(threading.enumerate()) assert len(threading.enumerate()) == num_threads + 1 # deleting the pool stops its threads - just to be sure ;) # Its not synchronized, hence we wait a moment -- cgit v1.2.3 From 5ff864138cd1e680a78522c26b583639f8f5e313 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 11 Jun 2010 14:37:51 +0200 Subject: test.async: split test_pool up into task implenetations and related utilities, as well as the tests themselves. File became too large --- test/git/async/task.py | 190 ++++++++++++++++++++++++++++++++++++++++++++ test/git/async/test_pool.py | 186 ++----------------------------------------- 2 files changed, 196 insertions(+), 180 deletions(-) create mode 100644 test/git/async/task.py diff --git a/test/git/async/task.py b/test/git/async/task.py new file mode 100644 index 00000000..9cc3cb9d --- /dev/null +++ b/test/git/async/task.py @@ -0,0 +1,190 @@ +"""Module containing task implementations useful for testing them""" +from git.async.task import * + +import threading +import weakref + +class _TestTaskBase(object): + """Note: causes great slowdown due to the required locking of task variables""" + def __init__(self, *args, **kwargs): + super(_TestTaskBase, self).__init__(*args, **kwargs) + self.should_fail = False + self.lock = threading.Lock() # yes, can't safely do x = x + 1 :) + self.plock = threading.Lock() + self.item_count = 0 + self.process_count = 0 + + def do_fun(self, item): + self.lock.acquire() + self.item_count += 1 + self.lock.release() + if self.should_fail: + raise AssertionError("I am failing just for the fun of it") + return item + + def process(self, count=1): + # must do it first, otherwise we might read and check results before + # the thread gets here :). Its a lesson ! + self.plock.acquire() + self.process_count += 1 + self.plock.release() + super(_TestTaskBase, self).process(count) + + def _assert(self, pc, fc, check_scheduled=False): + """Assert for num process counts (pc) and num function counts (fc) + :return: self""" + self.lock.acquire() + if self.item_count != fc: + print self.item_count, fc + assert self.item_count == fc + self.lock.release() + + # NOTE: asserting num-writers fails every now and then, implying a thread is + # still processing (an empty chunk) when we are checking it. This can + # only be prevented by checking the scheduled items, which requires locking + # and causes slowdows, so we don't do that. If the num_writers + # counter wouldn't be maintained properly, more tests would fail, so + # we can safely refrain from checking this here + # self._wlock.acquire() + # assert self._num_writers == 0 + # self._wlock.release() + return self + + +class TestThreadTaskNode(_TestTaskBase, InputIteratorThreadTask): + pass + + +class TestThreadFailureNode(TestThreadTaskNode): + """Fails after X items""" + def __init__(self, *args, **kwargs): + self.fail_after = kwargs.pop('fail_after') + super(TestThreadFailureNode, self).__init__(*args, **kwargs) + + def do_fun(self, item): + item = TestThreadTaskNode.do_fun(self, item) + + self.lock.acquire() + try: + if self.item_count > self.fail_after: + raise AssertionError("Simulated failure after processing %i items" % self.fail_after) + finally: + self.lock.release() + # END handle fail after + return item + + +class TestThreadInputChannelTaskNode(_TestTaskBase, InputChannelTask): + """Apply a transformation on items read from an input channel""" + def __init__(self, *args, **kwargs): + self.fail_after = kwargs.pop('fail_after', 0) + super(TestThreadInputChannelTaskNode, self).__init__(*args, **kwargs) + + def do_fun(self, item): + """return tuple(i, i*2)""" + item = super(TestThreadInputChannelTaskNode, self).do_fun(item) + + # fail after support + if self.fail_after: + self.lock.acquire() + try: + if self.item_count > self.fail_after: + raise AssertionError("Simulated failure after processing %i items" % self.fail_after) + finally: + self.lock.release() + # END handle fail-after + + if isinstance(item, tuple): + i = item[0] + return item + (i * self.id, ) + else: + return (item, item * self.id) + # END handle tuple + + +class TestThreadInputChannelVerifyTaskNode(_TestTaskBase, InputChannelTask): + """An input channel task, which verifies the result of its input channels, + should be last in the chain. + Id must be int""" + + def do_fun(self, item): + """return tuple(i, i*2)""" + item = super(TestThreadInputChannelVerifyTaskNode, self).do_fun(item) + + # make sure the computation order matches + assert isinstance(item, tuple), "input was no tuple: %s" % item + + base = item[0] + for id, num in enumerate(item[1:]): + assert num == base * id, "%i != %i, orig = %s" % (num, base * id, str(item)) + # END verify order + + return item + + +#{ Utilities + +def make_proxy_method(t): + """required to prevent binding self into the method we call""" + wt = weakref.proxy(t) + return lambda item: wt.do_fun(item) + +def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_offset=0): + """Create a task chain of feeder, count transformers and order verifcator + to the pool p, like t1 -> t2 -> t3 + :param fail_setup: a list of pairs, task_id, fail_after, i.e. [(2, 20)] would + make the third transformer fail after 20 items + :param feeder_channel: if set to a channel, it will be used as input of the + first transformation task. The respective first task in the return value + will be None. + :param id_offset: defines the id of the first transformation task, all subsequent + ones will add one + :return: tuple(list(task1, taskN, ...), list(rc1, rcN, ...))""" + nt = p.num_tasks() + + feeder = None + frc = feeder_channel + if feeder_channel is None: + feeder = make_iterator_task(ni) + frc = p.add_task(feeder) + # END handle specific feeder + + rcs = [frc] + tasks = [feeder] + + inrc = frc + for tc in xrange(count): + t = TestThreadInputChannelTaskNode(inrc, tc+id_offset, None) + + t.fun = make_proxy_method(t) + #t.fun = t.do_fun + inrc = p.add_task(t) + + tasks.append(t) + rcs.append(inrc) + # END create count transformers + + # setup failure + for id, fail_after in fail_setup: + tasks[1+id].fail_after = fail_after + # END setup failure + + verifier = TestThreadInputChannelVerifyTaskNode(inrc, 'verifier', None) + #verifier.fun = verifier.do_fun + verifier.fun = make_proxy_method(verifier) + vrc = p.add_task(verifier) + + + tasks.append(verifier) + rcs.append(vrc) + return tasks, rcs + +def make_iterator_task(ni, taskcls=TestThreadTaskNode, **kwargs): + """:return: task which yields ni items + :param taskcls: the actual iterator type to use + :param **kwargs: additional kwargs to be passed to the task""" + t = taskcls(iter(range(ni)), 'iterator', None, **kwargs) + t.fun = make_proxy_method(t) + return t + +#} END utilities diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 5bb48cc2..0fa34f6a 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -1,196 +1,22 @@ """Channel testing""" from test.testlib import * +from task import * + from git.async.pool import * -from git.async.task import * from git.async.thread import terminate_threads from git.async.util import cpu_count + import threading import weakref import time import sys -class _TestTaskBase(object): - """Note: causes great slowdown due to the required locking of task variables""" - def __init__(self, *args, **kwargs): - super(_TestTaskBase, self).__init__(*args, **kwargs) - self.should_fail = False - self.lock = threading.Lock() # yes, can't safely do x = x + 1 :) - self.plock = threading.Lock() - self.item_count = 0 - self.process_count = 0 - - def do_fun(self, item): - self.lock.acquire() - self.item_count += 1 - self.lock.release() - if self.should_fail: - raise AssertionError("I am failing just for the fun of it") - return item - - def process(self, count=1): - # must do it first, otherwise we might read and check results before - # the thread gets here :). Its a lesson ! - self.plock.acquire() - self.process_count += 1 - self.plock.release() - super(_TestTaskBase, self).process(count) - - def _assert(self, pc, fc, check_scheduled=False): - """Assert for num process counts (pc) and num function counts (fc) - :return: self""" - self.lock.acquire() - if self.item_count != fc: - print self.item_count, fc - assert self.item_count == fc - self.lock.release() - self._wlock.acquire() - assert self._num_writers == 0 - self._wlock.release() - return self - - -class TestThreadTaskNode(_TestTaskBase, InputIteratorThreadTask): - pass - - -class TestThreadFailureNode(TestThreadTaskNode): - """Fails after X items""" - def __init__(self, *args, **kwargs): - self.fail_after = kwargs.pop('fail_after') - super(TestThreadFailureNode, self).__init__(*args, **kwargs) - - def do_fun(self, item): - item = TestThreadTaskNode.do_fun(self, item) - - self.lock.acquire() - try: - if self.item_count > self.fail_after: - raise AssertionError("Simulated failure after processing %i items" % self.fail_after) - finally: - self.lock.release() - # END handle fail after - return item - - -class TestThreadInputChannelTaskNode(_TestTaskBase, InputChannelTask): - """Apply a transformation on items read from an input channel""" - def __init__(self, *args, **kwargs): - self.fail_after = kwargs.pop('fail_after', 0) - super(TestThreadInputChannelTaskNode, self).__init__(*args, **kwargs) - - def do_fun(self, item): - """return tuple(i, i*2)""" - item = super(TestThreadInputChannelTaskNode, self).do_fun(item) - - # fail after support - if self.fail_after: - self.lock.acquire() - try: - if self.item_count > self.fail_after: - raise AssertionError("Simulated failure after processing %i items" % self.fail_after) - finally: - self.lock.release() - # END handle fail-after - - if isinstance(item, tuple): - i = item[0] - return item + (i * self.id, ) - else: - return (item, item * self.id) - # END handle tuple -class TestThreadInputChannelVerifyTaskNode(_TestTaskBase, InputChannelTask): - """An input channel task, which verifies the result of its input channels, - should be last in the chain. - Id must be int""" - - def do_fun(self, item): - """return tuple(i, i*2)""" - item = super(TestThreadInputChannelVerifyTaskNode, self).do_fun(item) - - # make sure the computation order matches - assert isinstance(item, tuple), "input was no tuple: %s" % item - - base = item[0] - for id, num in enumerate(item[1:]): - assert num == base * id, "%i != %i, orig = %s" % (num, base * id, str(item)) - # END verify order - - return item - - - class TestThreadPool(TestBase): max_threads = cpu_count() - - def _make_proxy_method(self, t): - """required to prevent binding self into the method we call""" - wt = weakref.proxy(t) - return lambda item: wt.do_fun(item) - - def _add_task_chain(self, p, ni, count=1, fail_setup=list(), feeder_channel=None, id_offset=0): - """Create a task chain of feeder, count transformers and order verifcator - to the pool p, like t1 -> t2 -> t3 - :param fail_setup: a list of pairs, task_id, fail_after, i.e. [(2, 20)] would - make the third transformer fail after 20 items - :param feeder_channel: if set to a channel, it will be used as input of the - first transformation task. The respective first task in the return value - will be None. - :param id_offset: defines the id of the first transformation task, all subsequent - ones will add one - :return: tuple(list(task1, taskN, ...), list(rc1, rcN, ...))""" - nt = p.num_tasks() - - feeder = None - frc = feeder_channel - if feeder_channel is None: - feeder = self._make_iterator_task(ni) - frc = p.add_task(feeder) - # END handle specific feeder - - rcs = [frc] - tasks = [feeder] - - make_proxy_method = self._make_proxy_method - - inrc = frc - for tc in xrange(count): - t = TestThreadInputChannelTaskNode(inrc, tc+id_offset, None) - - t.fun = make_proxy_method(t) - #t.fun = t.do_fun - inrc = p.add_task(t) - - tasks.append(t) - rcs.append(inrc) - # END create count transformers - - # setup failure - for id, fail_after in fail_setup: - tasks[1+id].fail_after = fail_after - # END setup failure - - verifier = TestThreadInputChannelVerifyTaskNode(inrc, 'verifier', None) - #verifier.fun = verifier.do_fun - verifier.fun = make_proxy_method(verifier) - vrc = p.add_task(verifier) - - - tasks.append(verifier) - rcs.append(vrc) - return tasks, rcs - - def _make_iterator_task(self, ni, taskcls=TestThreadTaskNode, **kwargs): - """:return: task which yields ni items - :param taskcls: the actual iterator type to use - :param **kwargs: additional kwargs to be passed to the task""" - t = taskcls(iter(range(ni)), 'iterator', None, **kwargs) - t.fun = self._make_proxy_method(t) - return t - def _assert_single_task(self, p, async=False): """Performs testing in a synchronized environment""" # return # DEBUG TODO: Fixme deactivated it @@ -203,7 +29,7 @@ class TestThreadPool(TestBase): assert ni % 2 == 0, "ni needs to be dividable by 2" assert ni % 4 == 0, "ni needs to be dividable by 4" - make_task = lambda *args, **kwargs: self._make_iterator_task(ni, *args, **kwargs) + make_task = lambda *args, **kwargs: make_iterator_task(ni, *args, **kwargs) task = make_task() @@ -388,7 +214,7 @@ class TestThreadPool(TestBase): ni = 1000 count = 3 aic = count + 2 - make_task = lambda *args, **kwargs: self._add_task_chain(pool, ni, count, *args, **kwargs) + make_task = lambda *args, **kwargs: add_task_chain(pool, ni, count, *args, **kwargs) ts, rcs = make_task() assert len(ts) == aic @@ -473,7 +299,7 @@ class TestThreadPool(TestBase): # connect verifier channel as feeder of the second pool p2 = ThreadPool(0) # don't spawn new threads, they have the tendency not to wake up on mutexes assert p2.size() == 0 - p2ts, p2rcs = self._add_task_chain(p2, ni, count, feeder_channel=rcs[-1], id_offset=count) + p2ts, p2rcs = add_task_chain(p2, ni, count, feeder_channel=rcs[-1], id_offset=count) assert p2ts[0] is None # we have no feeder task assert rcs[-1].pool_ref()() is pool # it didnt change the pool assert rcs[-1] is p2ts[1].reader() -- cgit v1.2.3 From 18e3252a1f655f09093a4cffd5125342a8f94f3b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 11 Jun 2010 14:58:51 +0200 Subject: Finished dependent task testing according to the features we would currently like to see --- test/git/async/test_pool.py | 48 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 0fa34f6a..40c6d66e 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -132,8 +132,13 @@ class TestThreadPool(TestBase): assert len(items) == nri task._assert( 5, ni) - assert task.is_done() + + # delete the handle first, causing the task to be removed and to be set + # done. We check for the set-done state later. Depending on the timing, + # The task is not yet set done when we are checking it because we were + # scheduled in before the flag could be set. del(rc) + assert task.is_done() assert p.num_tasks() == null_tasks # depleted # but this only hits if we want too many items, if we want less, it could @@ -307,9 +312,41 @@ class TestThreadPool(TestBase): # reading from the last one will evaluate all pools correctly print "read(0) multi-pool" + st = time.time() items = p2rcs[-1].read() + elapsed = time.time() - st assert len(items) == ni + print >> sys.stderr, "Dependent Tasks: evaluated 2 connected pools and %i items with read(0), of %i dependent tasks in %f s ( %i items / s )" % (ni, aic + aic-1, elapsed, ni / elapsed) + + + # loose the handles of the second pool to allow others to go as well + del(p2rcs); del(p2ts) + assert p2.num_tasks() == 0 + + # now we lost our old handles as well, and the tasks go away + ts, rcs = make_task() + assert pool.num_tasks() == len(ts) + + p2ts, p2rcs = add_task_chain(p2, ni, count, feeder_channel=rcs[-1], id_offset=count) + assert p2.num_tasks() == len(p2ts) - 1 + + # Test multi-read(1) + print "read(1) * %i" % ni + reader = rcs[-1] + st = time.time() + for i in xrange(ni): + items = reader.read(1) + assert len(items) == 1 + # END for each item to get + elapsed = time.time() - st + del(reader) # decrement refcount + + print >> sys.stderr, "Dependent Tasks: evaluated 2 connected pools and %i items with read(1), of %i dependent tasks in %f s ( %i items / s )" % (ni, aic + aic-1, elapsed, ni / elapsed) + + # another read is empty + assert len(rcs[-1].read()) == 0 + # now that both are connected, I can drop my handle to the reader # without affecting the task-count, but whats more important: # They remove their tasks correctly once we drop our references in the @@ -329,11 +366,10 @@ class TestThreadPool(TestBase): assert pool.num_tasks() == null_tasks - # TODO: Test multi-read(1) - - # in the end, we expect all tasks to be gone, automatically - # order of deletion doesnt matter - + # ASSERTION: We already tested that one pool behaves correctly when an error + # occours - if two pools handle their ref-counts correctly, which they + # do if we are here, then they should handle errors happening during + # the task processing as expected as well. Hence we can safe this here -- cgit v1.2.3 From 1873db442dc7511fc2c92fbaeb8d998d3e62723d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 11 Jun 2010 16:25:27 +0200 Subject: Improved shutdown handling - although its impossible to prevent some stderr printing thanks to the underlying threading implementation, we can at least make sure that the interpreter doesn't block during shutdown. Now it appears to be running smoothly --- lib/git/async/__init__.py | 29 +++++++++++++++++++++++++++++ lib/git/async/thread.py | 25 +++++++++++++++++-------- test/git/async/test_pool.py | 8 +++----- 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/lib/git/async/__init__.py b/lib/git/async/__init__.py index 89b9eb47..e212f1b2 100644 --- a/lib/git/async/__init__.py +++ b/lib/git/async/__init__.py @@ -1 +1,30 @@ """Initialize the multi-processing package""" + +#{ Initialization +def _init_atexit(): + """Setup an at-exit job to be sure our workers are shutdown correctly before + the interpreter quits""" + import atexit + import thread + atexit.register(thread.do_terminate_threads) + +def _init_signals(): + """Assure we shutdown our threads correctly when being interrupted""" + import signal + import thread + + prev_handler = signal.getsignal(signal.SIGINT) + def thread_interrupt_handler(signum, frame): + thread.do_terminate_threads() + if callable(prev_handler): + prev_handler(signum, frame) + raise KeyboardInterrupt() + # END call previous handler + # END signal handler + signal.signal(signal.SIGINT, thread_interrupt_handler) + + +#} END init + +_init_atexit() +_init_signals() diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index 4d046a2f..afe0d79d 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -136,16 +136,21 @@ class WorkerThread(TerminatableThread): def run(self): """Process input tasks until we receive the quit signal""" - print self.name, "starts processing" # DEBUG - gettask = self.inq.get while True: if self._should_terminate(): break # END check for stop request - # we wait and block - to terminate, send the 'stop' method + # note: during shutdown, this turns None in the middle of waiting + # for an item to be put onto it - we can't du anything about it - + # even if we catch everything and break gracefully, the parent + # call will think we failed with an empty exception. + # Hence we just don't do anything about it. Alternatively + # we could override the start method to get our own bootstrapping, + # which would mean repeating plenty of code in of the threading module. tasktuple = gettask() + # needing exactly one function, and one arg routine, arg = tasktuple @@ -161,7 +166,7 @@ class WorkerThread(TerminatableThread): rval = routine(arg) else: # ignore unknown items - print >> sys.stderr, "%s: task %s was not understood - terminating" % (self.getName(), str(tasktuple)) + sys.stderr.write("%s: task %s was not understood - terminating\n" % (self.getName(), str(tasktuple))) break # END make routine call finally: @@ -171,10 +176,9 @@ class WorkerThread(TerminatableThread): del(routine) del(tasktuple) except StopProcessing: - print self.name, "stops processing" # DEBUG break except Exception,e: - print >> sys.stderr, "%s: Task %s raised unhandled exception: %s - this really shouldn't happen !" % (self.getName(), str(tasktuple), str(e)) + sys.stderr.write("%s: Task %s raised unhandled exception: %s - this really shouldn't happen !\n" % (self.getName(), str(tasktuple), str(e))) continue # just continue # END routine exception handling @@ -182,7 +186,12 @@ class WorkerThread(TerminatableThread): # END endless loop def stop_and_join(self): - """Send stop message to ourselves""" + """Send stop message to ourselves - we don't block, the thread will terminate + once it has finished processing its input queue to receive our termination + event""" + # DONT call superclass as it will try to join - join's don't work for + # some reason, as python apparently doesn't switch threads (so often) + # while waiting ... I don't know, but the threads respond properly, + # but only if dear python switches to them self.inq.put((self.stop, None)) - super(WorkerThread, self).stop_and_join() #} END classes diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 40c6d66e..c786770a 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -373,10 +373,7 @@ class TestThreadPool(TestBase): - - # for some reason, sometimes it has multiple workerthreads already when he - # enters the method ... dunno yet, pools should clean up themselvess - #@terminate_threads + @terminate_threads def test_base(self): assert len(threading.enumerate()) == 1 @@ -463,10 +460,11 @@ class TestThreadPool(TestBase): # threads per core p.set_size(4) self._assert_single_task(p, True) + + # DEPENDENT TASK ASYNC MODE ########################### self._assert_async_dependent_tasks(p) print >> sys.stderr, "Done with everything" - # TODO: test multi-pool connections -- cgit v1.2.3 From e14e3f143e7260de9581aee27e5a9b2645db72de Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 11 Jun 2010 16:42:09 +0200 Subject: Removed commented-out debug code and additional debug printings. Verified it works on py2.4, 2.5 and 2.6 --- lib/git/async/pool.py | 6 ------ lib/git/async/task.py | 6 ------ lib/git/async/util.py | 1 - test/git/async/test_pool.py | 1 - 4 files changed, 14 deletions(-) diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index dbc201a9..7ee3e8eb 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -95,10 +95,6 @@ class PoolReader(CallbackReader): # provided enough - its better to have some possibly empty task runs # than having and empty queue that blocks. - # NOTE: TODO: that case is only possible if one Task could be connected - # to multiple input channels in a manner known by the system. Currently - # this is not possible, but should be implemented at some point. - # if the user tries to use us to read from a done task, we will never # compute as all produced items are already in the channel task = self._task_ref() @@ -260,8 +256,6 @@ class Pool(object): # the following loops are kind of unrolled - code duplication # should make things execute faster. Putting the if statements # into the loop would be less code, but ... slower - # DEBUG - # print actual_count, numchunks, chunksize, remainder, task._out_writer.size() if self._num_workers: # respect the chunk size, and split the task up if we want # to process too much. This can be defined per task diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 49e7e7cf..10b22649 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -88,11 +88,7 @@ class OutputChannelTask(Node): self._num_writers += 1 self._wlock.release() - #print "%r: reading %i" % (self.id, count) - #if hasattr(self, 'reader'): - # print "from", self.reader().channel items = self._read(count) - #print "%r: done reading %i items" % (self.id, len(items)) try: try: @@ -117,7 +113,6 @@ class OutputChannelTask(Node): self._wlock.release() # END handle writer count except Exception, e: - print >> sys.stderr, "task %s error:" % self.id, type(e), str(e) # TODO: REMOVE DEBUG, or make it use logging # be sure our task is not scheduled again self.set_done() @@ -164,7 +159,6 @@ class OutputChannelTask(Node): self._wlock.acquire() try: if self._num_writers == 0: - # print "Closing channel of %r" % self.id, len(self._out_writer.channel.queue), self._out_writer.channel self.close() # END handle writers finally: diff --git a/lib/git/async/util.py b/lib/git/async/util.py index 11ab75a6..4c4f3929 100644 --- a/lib/git/async/util.py +++ b/lib/git/async/util.py @@ -2,7 +2,6 @@ from threading import ( Lock, - current_thread, _allocate_lock, _Condition, _sleep, diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index c786770a..0042c4a8 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -19,7 +19,6 @@ class TestThreadPool(TestBase): def _assert_single_task(self, p, async=False): """Performs testing in a synchronized environment""" - # return # DEBUG TODO: Fixme deactivated it print >> sys.stderr, "Threadpool: Starting single task (async = %i) with %i threads" % (async, p.size()) null_tasks = p.num_tasks() # in case we had some before -- cgit v1.2.3 From cac6e06cc9ef2903a15e594186445f3baa989a1a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 11 Jun 2010 16:58:44 +0200 Subject: test_task: fixed import error, made all modules from x import * safe --- lib/git/async/channel.py | 3 +++ lib/git/async/graph.py | 2 ++ lib/git/async/pool.py | 2 ++ lib/git/async/task.py | 5 ++++- lib/git/async/thread.py | 4 ++++ test/git/async/test_task.py | 5 ++++- 6 files changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 9b019707..ae476cda 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -13,6 +13,9 @@ from util import ( from time import time import sys +__all__ = ('Channel', 'SerialChannel', 'Writer', 'CallbackWriter', 'Reader', + 'CallbackReader', 'mkchannel', 'ReadOnly') + #{ Classes class Channel(object): """A channel is similar to a file like object. It has a write end as well as one or diff --git a/lib/git/async/graph.py b/lib/git/async/graph.py index 9ee0e891..4e14c81e 100644 --- a/lib/git/async/graph.py +++ b/lib/git/async/graph.py @@ -1,5 +1,7 @@ """Simplistic implementation of a graph""" +__all__ = ('Node', 'Graph') + class Node(object): """A Node in the graph. They know their neighbours, and have an id which should resolve into a string""" diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index 7ee3e8eb..cf14e47b 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -30,6 +30,8 @@ from time import sleep import new +__all__ = ('PoolReader', 'Pool', 'ThreadPool') + class PoolReader(CallbackReader): """A reader designed to read from channels which take part in pools It acts like a handle to the underlying task in the pool.""" diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 10b22649..d7f331b7 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -6,8 +6,11 @@ import weakref import sys import new +__all__ = ('OutputChannelTask', 'ThreadTaskBase', 'InputIteratorTaskBase', + 'InputIteratorThreadTask', 'InputChannelTask') + class OutputChannelTask(Node): - """Abstracts a named task as part of a set of interdependent tasks, which contains + """Abstracts a named task, which contains additional information on how the task should be queued and processed. Results of the item processing are sent to a write channel, which is to be diff --git a/lib/git/async/thread.py b/lib/git/async/thread.py index afe0d79d..96b4f0c4 100644 --- a/lib/git/async/thread.py +++ b/lib/git/async/thread.py @@ -7,6 +7,10 @@ import Queue import sys +__all__ = ('do_terminate_threads', 'terminate_threads', 'TerminatableThread', + 'WorkerThread') + + #{ Decorators def do_terminate_threads(whitelist=list()): diff --git a/test/git/async/test_task.py b/test/git/async/test_task.py index 91ac4dc3..c6a796e9 100644 --- a/test/git/async/test_task.py +++ b/test/git/async/test_task.py @@ -1,5 +1,6 @@ """Channel testing""" from test.testlib import * +from git.async.util import * from git.async.task import * import time @@ -9,4 +10,6 @@ class TestTask(TestBase): max_threads = cpu_count() def test_iterator_task(self): - self.fail("test iterator task") + # tested via test_pool + pass + -- cgit v1.2.3 From a28942bdf01f4ddb9d0b5a0489bd6f4e101dd775 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 11 Jun 2010 20:13:21 +0200 Subject: Added performance test, improved iterator task which will now be usable by default. It shows that there must be the notion of a producer, which can work if there are no items read --- lib/git/async/task.py | 3 +++ test/git/async/task.py | 38 ++++++++++++++++++---------- test/git/async/test_performance.py | 51 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 13 deletions(-) create mode 100644 test/git/async/test_performance.py diff --git a/lib/git/async/task.py b/lib/git/async/task.py index d7f331b7..0eb4527c 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -196,6 +196,9 @@ class InputIteratorTaskBase(OutputChannelTask): self._read = lambda count: weakself().__read(count) self._empty = False + # defaults to returning our items unchanged + self.fun = lambda item: item + def __read(self, count=0): """Read count items from the iterator, and return them""" # not threadsafe, but worst thing that could happen is that diff --git a/test/git/async/task.py b/test/git/async/task.py index 9cc3cb9d..f3599efe 100644 --- a/test/git/async/task.py +++ b/test/git/async/task.py @@ -102,6 +102,14 @@ class TestThreadInputChannelTaskNode(_TestTaskBase, InputChannelTask): # END handle tuple +class TestThreadPerformanceTaskNode(InputChannelTask): + """Applies no operation to the item, and does not lock, measuring + the actual throughput of the system""" + + def do_fun(self, item): + return item + + class TestThreadInputChannelVerifyTaskNode(_TestTaskBase, InputChannelTask): """An input channel task, which verifies the result of its input channels, should be last in the chain. @@ -121,7 +129,6 @@ class TestThreadInputChannelVerifyTaskNode(_TestTaskBase, InputChannelTask): return item - #{ Utilities def make_proxy_method(t): @@ -129,7 +136,9 @@ def make_proxy_method(t): wt = weakref.proxy(t) return lambda item: wt.do_fun(item) -def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_offset=0): +def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_offset=0, + feedercls=TestThreadTaskNode, transformercls=TestThreadInputChannelTaskNode, + include_verifier=True): """Create a task chain of feeder, count transformers and order verifcator to the pool p, like t1 -> t2 -> t3 :param fail_setup: a list of pairs, task_id, fail_after, i.e. [(2, 20)] would @@ -145,7 +154,7 @@ def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_of feeder = None frc = feeder_channel if feeder_channel is None: - feeder = make_iterator_task(ni) + feeder = make_iterator_task(ni, taskcls=feedercls) frc = p.add_task(feeder) # END handle specific feeder @@ -154,7 +163,7 @@ def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_of inrc = frc for tc in xrange(count): - t = TestThreadInputChannelTaskNode(inrc, tc+id_offset, None) + t = transformercls(inrc, tc+id_offset, None) t.fun = make_proxy_method(t) #t.fun = t.do_fun @@ -169,14 +178,16 @@ def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_of tasks[1+id].fail_after = fail_after # END setup failure - verifier = TestThreadInputChannelVerifyTaskNode(inrc, 'verifier', None) - #verifier.fun = verifier.do_fun - verifier.fun = make_proxy_method(verifier) - vrc = p.add_task(verifier) - - - tasks.append(verifier) - rcs.append(vrc) + if include_verifier: + verifier = TestThreadInputChannelVerifyTaskNode(inrc, 'verifier', None) + #verifier.fun = verifier.do_fun + verifier.fun = make_proxy_method(verifier) + vrc = p.add_task(verifier) + + + tasks.append(verifier) + rcs.append(vrc) + # END handle include verifier return tasks, rcs def make_iterator_task(ni, taskcls=TestThreadTaskNode, **kwargs): @@ -184,7 +195,8 @@ def make_iterator_task(ni, taskcls=TestThreadTaskNode, **kwargs): :param taskcls: the actual iterator type to use :param **kwargs: additional kwargs to be passed to the task""" t = taskcls(iter(range(ni)), 'iterator', None, **kwargs) - t.fun = make_proxy_method(t) + if isinstance(t, _TestTaskBase): + t.fun = make_proxy_method(t) return t #} END utilities diff --git a/test/git/async/test_performance.py b/test/git/async/test_performance.py new file mode 100644 index 00000000..896d230e --- /dev/null +++ b/test/git/async/test_performance.py @@ -0,0 +1,51 @@ +"""Channel testing""" +from test.testlib import * +from task import * + +from git.async.pool import * +from git.async.thread import terminate_threads +from git.async.util import cpu_count + +import time +import sys + + + +class TestThreadPoolPerformance(TestBase): + + max_threads = cpu_count() + + def test_base(self): + # create a dependency network, and see how the performance changes + # when adjusting the amount of threads + pool = ThreadPool(0) + ni = 1000 # number of items to process + print self.max_threads + for num_threads in range(self.max_threads*2 + 1): + pool.set_size(num_threads) + for num_transformers in (1, 5, 10): + for read_mode in range(2): + ts, rcs = add_task_chain(pool, ni, count=num_transformers, + feedercls=InputIteratorThreadTask, + transformercls=TestThreadPerformanceTaskNode, + include_verifier=False) + + mode_info = "read(0)" + if read_mode == 1: + mode_info = "read(1) * %i" % ni + # END mode info + fmt = "Threadcount=%%i: Produced %%i items using %s in %%i transformations in %%f s (%%f items / s)" % mode_info + reader = rcs[-1] + st = time.time() + if read_mode == 1: + for i in xrange(ni): + assert len(reader.read(1)) == 1 + # END for each item to read + else: + assert len(reader.read(0)) == ni + # END handle read mode + elapsed = time.time() - st + print >> sys.stderr, fmt % (num_threads, ni, num_transformers, elapsed, ni / elapsed) + # END for each read-mode + # END for each amount of processors + # END for each thread count -- cgit v1.2.3 From be8955a0fbb77d673587974b763f17c214904b57 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 12 Jun 2010 11:19:18 +0200 Subject: Cleaned up channel design, Reader and Writer bases don't require a channel anymore, but are abstract. Added IteratorReader, implementing the reader interface from an iterator. The implementation moved from the TaskIterator to the channel --- lib/git/async/channel.py | 139 ++++++++++++++++++++++++++++++++++------- lib/git/async/pool.py | 15 ++--- lib/git/async/task.py | 51 ++------------- test/git/async/test_channel.py | 20 +++++- 4 files changed, 147 insertions(+), 78 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index ae476cda..79cb5294 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -11,10 +11,12 @@ from util import ( ) from time import time +import threading import sys -__all__ = ('Channel', 'SerialChannel', 'Writer', 'CallbackWriter', 'Reader', - 'CallbackReader', 'mkchannel', 'ReadOnly') +__all__ = ('Channel', 'SerialChannel', 'Writer', 'ChannelWriter', 'CallbackChannelWriter', + 'Reader', 'ChannelReader', 'CallbackChannelReader', 'mkchannel', 'ReadOnly', + 'IteratorReader') #{ Classes class Channel(object): @@ -43,15 +45,50 @@ class SerialChannel(Channel): class Writer(object): + """A writer is an object providing write access to a possibly blocking reading device""" + __slots__ = tuple() + + #{ Interface + + def __init__(self, device): + """Initialize the instance with the device to write to""" + + def write(self, item, block=True, timeout=None): + """Write the given item into the device + :param block: True if the device may block until space for the item is available + :param timeout: The time in seconds to wait for the device to become ready + in blocking mode""" + raise NotImplementedError() + + def size(self): + """:return: number of items already in the device, they could be read with a reader""" + raise NotImplementedError() + + def close(self): + """Close the channel. Multiple close calls on a closed channel are no + an error""" + raise NotImplementedError() + + def closed(self): + """:return: True if the channel was closed""" + raise NotImplementedError() + + #} END interface + + +class ChannelWriter(Writer): """The write end of a channel, a file-like interface for a channel""" - __slots__ = ('write', 'channel') + __slots__ = ('channel', '_put') def __init__(self, channel): """Initialize the writer to use the given channel""" self.channel = channel - self.write = channel.queue.put + self._put = self.channel.queue.put #{ Interface + def write(self, item, block=False, timeout=None): + return self._put(item, block, timeout) + def size(self): return self.channel.queue.qsize() @@ -66,15 +103,14 @@ class Writer(object): #} END interface -class CallbackWriter(Writer): +class CallbackChannelWriter(ChannelWriter): """The write end of a channel which allows you to setup a callback to be called after an item was written to the channel""" __slots__ = ('_pre_cb') def __init__(self, channel): - Writer.__init__(self, channel) + super(CallbackChannelWriter, self).__init__(channel) self._pre_cb = None - self.write = self._write def set_pre_cb(self, fun = lambda item: item): """Install a callback to be called before the given item is written. @@ -87,25 +123,22 @@ class CallbackWriter(Writer): self._pre_cb = fun return prev - def _write(self, item, block=True, timeout=None): + def write(self, item, block=True, timeout=None): if self._pre_cb: item = self._pre_cb(item) - self.channel.queue.put(item, block, timeout) + super(CallbackChannelWriter, self).write(item, block, timeout) class Reader(object): - """Allows reading from a channel""" - __slots__ = 'channel' + """Allows reading from a device""" + __slots__ = tuple() - def __init__(self, channel): - """Initialize this instance from its parent write channel""" - self.channel = channel - - #{ Interface - + def __init__(self, device): + """Initialize the instance with the device to read from""" + def read(self, count=0, block=True, timeout=None): - """read a list of items read from the channel. The list, as a sequence + """read a list of items read from the device. The list, as a sequence of items, is similar to the string of characters returned when reading from file like objects. :param count: given amount of items to read. If < 1, all items will be read @@ -114,11 +147,25 @@ class Reader(object): given amount of seconds, returning the items it received so far. The timeout is applied to each read item, not for the whole operation. :return: single item in a list if count is 1, or a list of count items. - If the channel was empty and count was 1, an empty list will be returned. + If the device was empty and count was 1, an empty list will be returned. If count was greater 1, a list with less than count items will be returned. If count was < 1, a list with all items that could be read will be returned.""" + raise NotImplementedError() + + +class ChannelReader(Reader): + """Allows reading from a channel. The reader is thread-safe if the channel is as well""" + __slots__ = 'channel' + + def __init__(self, channel): + """Initialize this instance from its parent write channel""" + self.channel = channel + + #{ Interface + + def read(self, count=0, block=True, timeout=None): # if the channel is closed for writing, we never block # NOTE: is handled by the queue # We don't check for a closed state here has it costs time - most of @@ -191,12 +238,12 @@ class Reader(object): #} END interface -class CallbackReader(Reader): +class CallbackChannelReader(ChannelReader): """A channel which sends a callback before items are read from the channel""" __slots__ = "_pre_cb" def __init__(self, channel): - Reader.__init__(self, channel) + super(CallbackChannelReader, self).__init__(channel) self._pre_cb = None def set_pre_cb(self, fun = lambda count: None): @@ -213,13 +260,59 @@ class CallbackReader(Reader): def read(self, count=0, block=True, timeout=None): if self._pre_cb: self._pre_cb(count) - return Reader.read(self, count, block, timeout) + return super(CallbackChannelReader, self).read(count, block, timeout) + +class IteratorReader(Reader): + """A Reader allowing to read items from an iterator, instead of a channel. + Reads will never block. Its thread-safe""" + __slots__ = ("_empty", '_iter', '_lock') + + # the type of the lock to use when reading from the iterator + lock_type = threading.Lock + + def __init__(self, iterator): + self._empty = False + if not hasattr(iterator, 'next'): + raise ValueError("Iterator %r needs a next() function" % iterator) + self._iter = iterator + self._lock = self.lock_type() + + def read(self, count=0, block=True, timeout=None): + """Non-Blocking implementation of read""" + # not threadsafe, but worst thing that could happen is that + # we try to get items one more time + if self._empty: + return list() + # END early abort + + self._lock.acquire() + try: + if count == 0: + self._empty = True + return list(self._iter) + else: + out = list() + it = self._iter + for i in xrange(count): + try: + out.append(it.next()) + except StopIteration: + self._empty = True + break + # END handle empty iterator + # END for each item to take + return out + # END handle count + finally: + self._lock.release() + # END handle locking + #} END classes #{ Constructors -def mkchannel(ctype = Channel, wtype = Writer, rtype = Reader): +def mkchannel(ctype = Channel, wtype = ChannelWriter, rtype = ChannelReader): """Create a channel, with a reader and a writer :return: tuple(reader, writer) :param ctype: Channel to instantiate diff --git a/lib/git/async/pool.py b/lib/git/async/pool.py index cf14e47b..8f33a029 100644 --- a/lib/git/async/pool.py +++ b/lib/git/async/pool.py @@ -18,10 +18,10 @@ from Queue import ( from graph import Graph from channel import ( mkchannel, - Writer, + ChannelWriter, Channel, SerialChannel, - CallbackReader + CallbackChannelReader ) import sys @@ -32,13 +32,14 @@ import new __all__ = ('PoolReader', 'Pool', 'ThreadPool') -class PoolReader(CallbackReader): + +class PoolReader(CallbackChannelReader): """A reader designed to read from channels which take part in pools It acts like a handle to the underlying task in the pool.""" __slots__ = ('_task_ref', '_pool_ref') def __init__(self, channel, task, pool): - CallbackReader.__init__(self, channel) + CallbackChannelReader.__init__(self, channel) self._task_ref = weakref.ref(task) self._pool_ref = weakref.ref(pool) @@ -69,7 +70,7 @@ class PoolReader(CallbackReader): #{ Internal def _read(self, count=0, block=True, timeout=None): - return CallbackReader.read(self, count, block, timeout) + return CallbackChannelReader.read(self, count, block, timeout) #} END internal @@ -115,7 +116,7 @@ class PoolReader(CallbackReader): ####### read data ######## ########################## # read actual items, tasks were setup to put their output into our channel ( as well ) - items = CallbackReader.read(self, count, block, timeout) + items = CallbackChannelReader.read(self, count, block, timeout) ########################## @@ -446,7 +447,7 @@ class Pool(object): ch = None if wc is None: ch = ctype() - wc = Writer(ch) + wc = ChannelWriter(ch) task.set_writer(wc) else: ch = wc.channel diff --git a/lib/git/async/task.py b/lib/git/async/task.py index 0eb4527c..b7b5e699 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -1,5 +1,7 @@ from graph import Node from util import ReadOnly +from channel import IteratorReader + import threading import weakref @@ -179,56 +181,15 @@ class ThreadTaskBase(object): class InputIteratorTaskBase(OutputChannelTask): """Implements a task which processes items from an iterable in a multi-processing safe manner""" - __slots__ = ('_iterator', '_lock', '_empty') - # the type of the lock to use when reading from the iterator - lock_type = None + __slots__ = tuple() + def __init__(self, iterator, *args, **kwargs): OutputChannelTask.__init__(self, *args, **kwargs) - if not hasattr(iterator, 'next'): - raise ValueError("Iterator %r needs a next() function" % iterator) - self._iterator = iterator - self._lock = self.lock_type() - - # this is necessary to prevent a cyclic ref, preventing us from - # getting deleted ( and collected ) - weakself = weakref.ref(self) - self._read = lambda count: weakself().__read(count) - self._empty = False - + self._read = IteratorReader(iterator).read # defaults to returning our items unchanged self.fun = lambda item: item - - def __read(self, count=0): - """Read count items from the iterator, and return them""" - # not threadsafe, but worst thing that could happen is that - # we try to get items one more time - if self._empty: - return list() - # END early abort - - self._lock.acquire() - try: - if count == 0: - self._empty = True - return list(self._iterator) - else: - out = list() - it = self._iterator - for i in xrange(count): - try: - out.append(it.next()) - except StopIteration: - self._empty = True - break - # END handle empty iterator - # END for each item to take - return out - # END handle count - finally: - self._lock.release() - # END handle locking - + class InputIteratorThreadTask(InputIteratorTaskBase, ThreadTaskBase): """An input iterator for threaded pools""" diff --git a/test/git/async/test_channel.py b/test/git/async/test_channel.py index a24c7c91..e9e1b64c 100644 --- a/test/git/async/test_channel.py +++ b/test/git/async/test_channel.py @@ -9,8 +9,8 @@ class TestChannels(TestBase): def test_base(self): # creating channel yields a write and a read channal wc, rc = mkchannel() - assert isinstance(wc, Writer) # default args - assert isinstance(rc, Reader) + assert isinstance(wc, ChannelWriter) # default args + assert isinstance(rc, ChannelReader) # TEST UNLIMITED SIZE CHANNEL - writing+reading is FIFO @@ -46,7 +46,7 @@ class TestChannels(TestBase): # test callback channels - wc, rc = mkchannel(wtype = CallbackWriter, rtype = CallbackReader) + wc, rc = mkchannel(wtype = CallbackChannelWriter, rtype = CallbackChannelReader) cb = [0, 0] # set slots to one if called def pre_write(item): @@ -71,3 +71,17 @@ class TestChannels(TestBase): assert rval == val + 1 + + # ITERATOR READER + reader = IteratorReader(iter(range(10))) + assert len(reader.read(2)) == 2 + assert len(reader.read(0)) == 8 + # its empty now + assert len(reader.read(0)) == 0 + assert len(reader.read(5)) == 0 + + # doesn't work if item is not an iterator + self.failUnlessRaises(ValueError, IteratorReader, list()) + + # NOTE: its thread-safety is tested by the pool + -- cgit v1.2.3 From 1d8a577ffc6ad7ce1465001ddebdc157aecc1617 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 12 Jun 2010 11:41:10 +0200 Subject: channel: cleaned up inheritance hierarchy, adding mixing for callback functionality - previously the callback functionality was bound to channel based readers/writers --- lib/git/async/channel.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/lib/git/async/channel.py b/lib/git/async/channel.py index 79cb5294..a29ff17c 100644 --- a/lib/git/async/channel.py +++ b/lib/git/async/channel.py @@ -103,13 +103,14 @@ class ChannelWriter(Writer): #} END interface -class CallbackChannelWriter(ChannelWriter): +class CallbackWriterMixin(object): """The write end of a channel which allows you to setup a callback to be called after an item was written to the channel""" - __slots__ = ('_pre_cb') + # slots don't work with mixin's :( + # __slots__ = ('_pre_cb') - def __init__(self, channel): - super(CallbackChannelWriter, self).__init__(channel) + def __init__(self, *args): + super(CallbackWriterMixin, self).__init__(*args) self._pre_cb = None def set_pre_cb(self, fun = lambda item: item): @@ -126,7 +127,12 @@ class CallbackChannelWriter(ChannelWriter): def write(self, item, block=True, timeout=None): if self._pre_cb: item = self._pre_cb(item) - super(CallbackChannelWriter, self).write(item, block, timeout) + super(CallbackWriterMixin, self).write(item, block, timeout) + + +class CallbackChannelWriter(CallbackWriterMixin, ChannelWriter): + """Implements a channel writer with callback functionality""" + pass class Reader(object): @@ -238,12 +244,14 @@ class ChannelReader(Reader): #} END interface -class CallbackChannelReader(ChannelReader): + +class CallbackReaderMixin(object): """A channel which sends a callback before items are read from the channel""" - __slots__ = "_pre_cb" + # unfortunately, slots can only use direct inheritance, have to turn it off :( + # __slots__ = "_pre_cb" - def __init__(self, channel): - super(CallbackChannelReader, self).__init__(channel) + def __init__(self, *args): + super(CallbackReaderMixin, self).__init__(*args) self._pre_cb = None def set_pre_cb(self, fun = lambda count: None): @@ -260,7 +268,12 @@ class CallbackChannelReader(ChannelReader): def read(self, count=0, block=True, timeout=None): if self._pre_cb: self._pre_cb(count) - return super(CallbackChannelReader, self).read(count, block, timeout) + return super(CallbackReaderMixin, self).read(count, block, timeout) + + +class CallbackChannelReader(CallbackReaderMixin, ChannelReader): + """Implements a channel reader with callback functionality""" + pass class IteratorReader(Reader): -- cgit v1.2.3 From 7a0b79ee574999ecbc76696506352e4a5a0d7159 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 12 Jun 2010 12:38:02 +0200 Subject: task: improved naming of task types, improved pool test to be less dependent on starting with just the main thread --- lib/git/async/task.py | 26 ++++++++++++++++---------- test/git/async/task.py | 26 +++++++++++++------------- test/git/async/test_performance.py | 4 ++-- test/git/async/test_pool.py | 15 +++++++++++---- 4 files changed, 42 insertions(+), 29 deletions(-) diff --git a/lib/git/async/task.py b/lib/git/async/task.py index b7b5e699..ac948dc0 100644 --- a/lib/git/async/task.py +++ b/lib/git/async/task.py @@ -8,21 +8,27 @@ import weakref import sys import new -__all__ = ('OutputChannelTask', 'ThreadTaskBase', 'InputIteratorTaskBase', - 'InputIteratorThreadTask', 'InputChannelTask') +__all__ = ('Task', 'ThreadTaskBase', 'IteratorTaskBase', + 'IteratorThreadTask', 'ChannelThreadTask') -class OutputChannelTask(Node): +class Task(Node): """Abstracts a named task, which contains additional information on how the task should be queued and processed. - Results of the item processing are sent to a write channel, which is to be + Results of the item processing are sent to a writer, which is to be set by the creator using the ``set_writer`` method. + Items are read using the internal ``_read`` callable, subclasses are meant to + set this to a callable that supports the Reader interface's read function. + * **min_count** assures that not less than min_count items will be processed per call. * **max_chunksize** assures that multi-threading is happening in smaller chunks. If someone wants all items to be processed, using read(0), the whole task would go to one worker, as well as dependent tasks. If you want finer granularity , you can - specify this here, causing chunks to be no larger than max_chunksize""" + specify this here, causing chunks to be no larger than max_chunksize + * **apply_single** if True, default True, individual items will be given to the + worker function. If False, a list of possibly multiple items will be passed + instead.""" __slots__ = ( '_read', # method to yield items to process '_out_writer', # output write channel '_exc', # exception caught @@ -178,32 +184,32 @@ class ThreadTaskBase(object): pass -class InputIteratorTaskBase(OutputChannelTask): +class IteratorTaskBase(Task): """Implements a task which processes items from an iterable in a multi-processing safe manner""" __slots__ = tuple() def __init__(self, iterator, *args, **kwargs): - OutputChannelTask.__init__(self, *args, **kwargs) + Task.__init__(self, *args, **kwargs) self._read = IteratorReader(iterator).read # defaults to returning our items unchanged self.fun = lambda item: item -class InputIteratorThreadTask(InputIteratorTaskBase, ThreadTaskBase): +class IteratorThreadTask(IteratorTaskBase, ThreadTaskBase): """An input iterator for threaded pools""" lock_type = threading.Lock -class InputChannelTask(OutputChannelTask, ThreadTaskBase): +class ChannelThreadTask(Task, ThreadTaskBase): """Uses an input channel as source for reading items For instantiation, it takes all arguments of its base, the first one needs to be the input channel to read from though.""" __slots__ = "_pool_ref" def __init__(self, in_reader, *args, **kwargs): - OutputChannelTask.__init__(self, *args, **kwargs) + Task.__init__(self, *args, **kwargs) self._read = in_reader.read self._pool_ref = None diff --git a/test/git/async/task.py b/test/git/async/task.py index f3599efe..583cb1f8 100644 --- a/test/git/async/task.py +++ b/test/git/async/task.py @@ -51,18 +51,18 @@ class _TestTaskBase(object): return self -class TestThreadTaskNode(_TestTaskBase, InputIteratorThreadTask): +class TestThreadTask(_TestTaskBase, IteratorThreadTask): pass -class TestThreadFailureNode(TestThreadTaskNode): +class TestFailureThreadTask(TestThreadTask): """Fails after X items""" def __init__(self, *args, **kwargs): self.fail_after = kwargs.pop('fail_after') - super(TestThreadFailureNode, self).__init__(*args, **kwargs) + super(TestFailureThreadTask, self).__init__(*args, **kwargs) def do_fun(self, item): - item = TestThreadTaskNode.do_fun(self, item) + item = TestThreadTask.do_fun(self, item) self.lock.acquire() try: @@ -74,15 +74,15 @@ class TestThreadFailureNode(TestThreadTaskNode): return item -class TestThreadInputChannelTaskNode(_TestTaskBase, InputChannelTask): +class TestChannelThreadTask(_TestTaskBase, ChannelThreadTask): """Apply a transformation on items read from an input channel""" def __init__(self, *args, **kwargs): self.fail_after = kwargs.pop('fail_after', 0) - super(TestThreadInputChannelTaskNode, self).__init__(*args, **kwargs) + super(TestChannelThreadTask, self).__init__(*args, **kwargs) def do_fun(self, item): """return tuple(i, i*2)""" - item = super(TestThreadInputChannelTaskNode, self).do_fun(item) + item = super(TestChannelThreadTask, self).do_fun(item) # fail after support if self.fail_after: @@ -102,7 +102,7 @@ class TestThreadInputChannelTaskNode(_TestTaskBase, InputChannelTask): # END handle tuple -class TestThreadPerformanceTaskNode(InputChannelTask): +class TestPerformanceThreadTask(ChannelThreadTask): """Applies no operation to the item, and does not lock, measuring the actual throughput of the system""" @@ -110,14 +110,14 @@ class TestThreadPerformanceTaskNode(InputChannelTask): return item -class TestThreadInputChannelVerifyTaskNode(_TestTaskBase, InputChannelTask): +class TestVerifyChannelThreadTask(_TestTaskBase, ChannelThreadTask): """An input channel task, which verifies the result of its input channels, should be last in the chain. Id must be int""" def do_fun(self, item): """return tuple(i, i*2)""" - item = super(TestThreadInputChannelVerifyTaskNode, self).do_fun(item) + item = super(TestVerifyChannelThreadTask, self).do_fun(item) # make sure the computation order matches assert isinstance(item, tuple), "input was no tuple: %s" % item @@ -137,7 +137,7 @@ def make_proxy_method(t): return lambda item: wt.do_fun(item) def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_offset=0, - feedercls=TestThreadTaskNode, transformercls=TestThreadInputChannelTaskNode, + feedercls=TestThreadTask, transformercls=TestChannelThreadTask, include_verifier=True): """Create a task chain of feeder, count transformers and order verifcator to the pool p, like t1 -> t2 -> t3 @@ -179,7 +179,7 @@ def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_of # END setup failure if include_verifier: - verifier = TestThreadInputChannelVerifyTaskNode(inrc, 'verifier', None) + verifier = TestVerifyChannelThreadTask(inrc, 'verifier', None) #verifier.fun = verifier.do_fun verifier.fun = make_proxy_method(verifier) vrc = p.add_task(verifier) @@ -190,7 +190,7 @@ def add_task_chain(p, ni, count=1, fail_setup=list(), feeder_channel=None, id_of # END handle include verifier return tasks, rcs -def make_iterator_task(ni, taskcls=TestThreadTaskNode, **kwargs): +def make_iterator_task(ni, taskcls=TestThreadTask, **kwargs): """:return: task which yields ni items :param taskcls: the actual iterator type to use :param **kwargs: additional kwargs to be passed to the task""" diff --git a/test/git/async/test_performance.py b/test/git/async/test_performance.py index 896d230e..703c8593 100644 --- a/test/git/async/test_performance.py +++ b/test/git/async/test_performance.py @@ -26,8 +26,8 @@ class TestThreadPoolPerformance(TestBase): for num_transformers in (1, 5, 10): for read_mode in range(2): ts, rcs = add_task_chain(pool, ni, count=num_transformers, - feedercls=InputIteratorThreadTask, - transformercls=TestThreadPerformanceTaskNode, + feedercls=IteratorThreadTask, + transformercls=TestPerformanceThreadTask, include_verifier=False) mode_info = "read(0)" diff --git a/test/git/async/test_pool.py b/test/git/async/test_pool.py index 0042c4a8..aab618aa 100644 --- a/test/git/async/test_pool.py +++ b/test/git/async/test_pool.py @@ -198,7 +198,7 @@ class TestThreadPool(TestBase): # test failure after ni / 2 items # This makes sure it correctly closes the channel on failure to prevent blocking nri = ni/2 - task = make_task(TestThreadFailureNode, fail_after=ni/2) + task = make_task(TestFailureThreadTask, fail_after=ni/2) rc = p.add_task(task) assert len(rc.read()) == nri assert task.is_done() @@ -374,7 +374,14 @@ class TestThreadPool(TestBase): @terminate_threads def test_base(self): - assert len(threading.enumerate()) == 1 + max_wait_attempts = 3 + sleep_time = 0.1 + for mc in range(max_wait_attempts): + # wait for threads to die + if len(threading.enumerate()) != 1: + time.sleep(sleep_time) + # END for each attempt + assert len(threading.enumerate()) == 1, "Waited %f s for threads to die, its still alive" % (max_wait_attempts, sleep_time) p = ThreadPool() @@ -401,7 +408,7 @@ class TestThreadPool(TestBase): # SINGLE TASK SERIAL SYNC MODE ############################## # put a few unrelated tasks that we forget about - check ref counts and cleanup - t1, t2 = TestThreadTaskNode(iter(list()), "nothing1", None), TestThreadTaskNode(iter(list()), "nothing2", None) + t1, t2 = TestThreadTask(iter(list()), "nothing1", None), TestThreadTask(iter(list()), "nothing2", None) urc1 = p.add_task(t1) urc2 = p.add_task(t2) assert p.num_tasks() == 2 @@ -416,7 +423,7 @@ class TestThreadPool(TestBase): assert p.num_tasks() == 0 assert sys.getrefcount(t2) == 2 - t3 = TestThreadInputChannelTaskNode(urc2, "channel", None) + t3 = TestChannelThreadTask(urc2, "channel", None) urc3 = p.add_task(t3) assert p.num_tasks() == 1 del(urc3) -- cgit v1.2.3