| field | value |
|---|---|
| author | Sebastian Thiel <byronimo@gmail.com> (2010-11-25 18:10:33 +0100) |
| committer | Sebastian Thiel <byronimo@gmail.com> (2010-11-25 18:18:15 +0100) |
| commit | f8ce24a835cae8c623e2936bec2618a8855c605b (patch) |
| tree | d4c1d392579e24285381613a4ac1b7cc2d6b6fae /git/test/performance |
| parent | 65747a216c67c3101c6ae2edaa8119d786b793cb (diff) |
| parent | 9004e3a1cf33110f2cbc458f1dc3259c930ad9b4 (diff) |
| download | GitPython-f8ce24a835cae8c623e2936bec2618a8855c605b.tar.gz, GitPython-f8ce24a835cae8c623e2936bec2618a8855c605b.zip |
-#######->WARNING<-####### Directory structure changed, see commit message
If you use git-python as a submodule of your own project and alter sys.path to import it,
you will have to adjust your code to account for the changed directory structure.
Previously, you would put the path
./git-python/lib
into your sys.path. All modules moved one level up into the 'git' subdirectory, which means that the 'git-python' directory
now contains the 'git' root package. To allow git to be found, add ./git-python to your path instead, as sketched below.
To finalize your update, run the following command:
git submodule update --init --recursive
As this leaves left-over directories behind, consider running git-clean afterwards.
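For submodule users, the required change amounts to one sys.path entry. A minimal sketch, assuming the submodule is checked out at ./git-python next to the importing script:

```python
import os
import sys

base = os.path.dirname(os.path.abspath(__file__))

# before this commit, the package lived in ./git-python/lib:
# sys.path.insert(0, os.path.join(base, 'git-python', 'lib'))

# after this commit, ./git-python itself contains the 'git' root package:
sys.path.insert(0, os.path.join(base, 'git-python'))

import git  # now resolves against the relocated root package
```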
Diffstat (limited to 'git/test/performance')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | git/test/performance/lib.py | 78 |
| -rw-r--r-- | git/test/performance/test_commit.py | 99 |
| -rw-r--r-- | git/test/performance/test_odb.py | 70 |
| -rw-r--r-- | git/test/performance/test_streams.py | 131 |
| -rw-r--r-- | git/test/performance/test_utils.py | 174 |
5 files changed, 552 insertions, 0 deletions
```diff
diff --git a/git/test/performance/lib.py b/git/test/performance/lib.py
new file mode 100644
index 00000000..d0727b60
--- /dev/null
+++ b/git/test/performance/lib.py
@@ -0,0 +1,78 @@
+"""Contains library functions"""
+import os
+from git.test.lib import *
+import shutil
+import tempfile
+
+from git.db import (
+    GitCmdObjectDB,
+    GitDB
+    )
+
+from git import (
+    Repo
+    )
+
+#{ Invariants
+k_env_git_repo = "GIT_PYTHON_TEST_GIT_REPO_BASE"
+#} END invariants
+
+
+#{ Utilities
+def resolve_or_fail(env_var):
+    """:return: resolved environment variable or raise EnvironmentError"""
+    try:
+        return os.environ[env_var]
+    except KeyError:
+        raise EnvironmentError("Please set the %r environment variable and retry" % env_var)
+    # END exception handling
+
+#} END utilities
+
+
+#{ Base Classes
+
+class TestBigRepoR(TestBase):
+    """TestCase providing access to readonly 'big' repositories using the following
+    member variables:
+
+    * gitrorepo
+
+      * Read-Only git repository - actually the repo of git itself
+
+    * puregitrorepo
+
+      * As gitrorepo, but uses a pure python implementation
+    """
+
+    #{ Invariants
+    head_sha_2k = '235d521da60e4699e5bd59ac658b5b48bd76ddca'
+    head_sha_50 = '32347c375250fd470973a5d76185cac718955fd5'
+    #} END invariants
+
+    @classmethod
+    def setUpAll(cls):
+        super(TestBigRepoR, cls).setUpAll()
+        repo_path = resolve_or_fail(k_env_git_repo)
+        cls.gitrorepo = Repo(repo_path, odbt=GitCmdObjectDB)
+        cls.puregitrorepo = Repo(repo_path, odbt=GitDB)
+
+
+class TestBigRepoRW(TestBigRepoR):
+    """As above, but provides a big repository that we can write to.
+
+    Provides ``self.gitrwrepo`` and ``self.puregitrwrepo``"""
+
+    @classmethod
+    def setUpAll(cls):
+        super(TestBigRepoRW, cls).setUpAll()
+        dirname = tempfile.mktemp()
+        os.mkdir(dirname)
+        cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True, odbt=GitCmdObjectDB)
+        cls.puregitrwrepo = Repo(dirname, odbt=GitDB)
+
+    @classmethod
+    def tearDownAll(cls):
+        shutil.rmtree(cls.gitrwrepo.working_dir)
+
+#} END base classes
```
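To show how these base classes are meant to be consumed, here is a minimal hypothetical test module built on them; the class and test names are invented, but the member variables and the GIT_PYTHON_TEST_GIT_REPO_BASE requirement come directly from lib.py above:

```python
# hypothetical usage sketch - requires GIT_PYTHON_TEST_GIT_REPO_BASE to point
# at a local clone of a large repository (e.g. git's own sources)
from lib import TestBigRepoR


class TestMyPerformance(TestBigRepoR):

    def test_backends_see_same_history(self):
        # both members point at the same repository on disk, differing only
        # in the object database implementation used to read it
        c1 = self.gitrorepo.commit(self.head_sha_50)
        c2 = self.puregitrorepo.commit(self.head_sha_50)
        assert c1.binsha == c2.binsha
```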
```diff
diff --git a/git/test/performance/test_commit.py b/git/test/performance/test_commit.py
new file mode 100644
index 00000000..80421aa2
--- /dev/null
+++ b/git/test/performance/test_commit.py
@@ -0,0 +1,99 @@
+# test_performance.py
+# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
+#
+# This module is part of GitPython and is released under
+# the BSD License: http://www.opensource.org/licenses/bsd-license.php
+
+from lib import *
+from git import *
+from gitdb import IStream
+from git.test.test_commit import assert_commit_serialization
+from cStringIO import StringIO
+from time import time
+import sys
+
+class TestPerformance(TestBigRepoRW):
+
+    # ref with about 100 commits in its history
+    ref_100 = '0.1.6'
+
+    def _query_commit_info(self, c):
+        c.author
+        c.authored_date
+        c.author_tz_offset
+        c.committer
+        c.committed_date
+        c.committer_tz_offset
+        c.message
+        c.parents
+
+    def test_iteration(self):
+        no = 0
+        nc = 0
+
+        # find the first commit containing the given path - always do a full
+        # iteration ( restricted to the path in question ), but in fact it should
+        # return quite a lot of commits, we just take one and hence abort the operation
+
+        st = time()
+        for c in self.rorepo.iter_commits(self.ref_100):
+            nc += 1
+            self._query_commit_info(c)
+            for obj in c.tree.traverse():
+                obj.size
+                no += 1
+            # END for each object
+        # END for each commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )" % (nc, no, elapsed_time, no/elapsed_time)
+
+    def test_commit_traversal(self):
+        # bound to cat-file parsing performance
+        nc = 0
+        st = time()
+        for c in self.gitrorepo.commit(self.head_sha_2k).traverse(branch_first=False):
+            nc += 1
+            self._query_commit_info(c)
+        # END for each traversed commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Traversed %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time)
+
+    def test_commit_iteration(self):
+        # bound to stream parsing performance
+        nc = 0
+        st = time()
+        for c in Commit.iter_items(self.gitrorepo, self.head_sha_2k):
+            nc += 1
+            self._query_commit_info(c)
+        # END for each traversed commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time)
+
+    def test_commit_serialization(self):
+        assert_commit_serialization(self.gitrwrepo, self.head_sha_2k, True)
+
+        rwrepo = self.gitrwrepo
+        make_object = rwrepo.odb.store
+        # direct serialization - deserialization can be tested afterwards
+        # serialization is probably limited on IO
+        hc = rwrepo.commit(self.head_sha_2k)
+
+        commits = list()
+        nc = 5000
+        st = time()
+        for i in xrange(nc):
+            cm = Commit(rwrepo, Commit.NULL_BIN_SHA, hc.tree,
+                        hc.author, hc.authored_date, hc.author_tz_offset,
+                        hc.committer, hc.committed_date, hc.committer_tz_offset,
+                        str(i), parents=hc.parents, encoding=hc.encoding)
+
+            stream = StringIO()
+            cm._serialize(stream)
+            slen = stream.tell()
+            stream.seek(0)
+
+            cm.binsha = make_object(IStream(Commit.type, slen, stream)).binsha
+        # END commit creation
+        elapsed = time() - st
+
+        print >> sys.stderr, "Serialized %i commits to loose objects in %f s ( %f commits / s )" % (nc, elapsed, nc / elapsed)
```
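All four benchmarks above follow the same timing pattern: start a clock, touch every lazily parsed attribute, and report throughput. A distilled, standalone sketch of that measurement loop, with a placeholder repository path:

```python
import sys
from time import time

from git import Repo

repo = Repo('/path/to/some/repo')  # placeholder - any local repository
st = time()
nc = 0
for c in repo.iter_commits('master'):
    c.author, c.committed_date, c.message  # force lazy attribute parsing
    nc += 1
elapsed = time() - st
print >> sys.stderr, "Parsed %i commits in %f s ( %f commits/s )" % (nc, elapsed, nc / elapsed)
```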
```diff
diff --git a/git/test/performance/test_odb.py b/git/test/performance/test_odb.py
new file mode 100644
index 00000000..32b70f69
--- /dev/null
+++ b/git/test/performance/test_odb.py
@@ -0,0 +1,70 @@
+"""Performance tests for object store"""
+
+from time import time
+import sys
+import stat
+
+from lib import (
+    TestBigRepoR
+    )
+
+
+class TestObjDBPerformance(TestBigRepoR):
+
+    def test_random_access(self):
+        results = [ ["Iterate Commits"], ["Iterate Blobs"], ["Retrieve Blob Data"] ]
+        for repo in (self.gitrorepo, self.puregitrorepo):
+            # GET COMMITS
+            st = time()
+            root_commit = repo.commit(self.head_sha_2k)
+            commits = list(root_commit.traverse())
+            nc = len(commits)
+            elapsed = time() - st
+
+            print >> sys.stderr, "%s: Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (type(repo.odb), nc, elapsed, nc / elapsed)
+            results[0].append(elapsed)
+
+            # GET TREES
+            # walk all trees of all commits
+            st = time()
+            blobs_per_commit = list()
+            nt = 0
+            for commit in commits:
+                tree = commit.tree
+                blobs = list()
+                for item in tree.traverse():
+                    nt += 1
+                    if item.type == 'blob':
+                        blobs.append(item)
+                    # direct access for speed
+                # END while trees are there for walking
+                blobs_per_commit.append(blobs)
+            # END for each commit
+            elapsed = time() - st
+
+            print >> sys.stderr, "%s: Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (type(repo.odb), nt, len(commits), elapsed, nt / elapsed)
+            results[1].append(elapsed)
+
+            # GET BLOBS
+            st = time()
+            nb = 0
+            too_many = 15000
+            data_bytes = 0
+            for blob_list in blobs_per_commit:
+                for blob in blob_list:
+                    data_bytes += len(blob.data_stream.read())
+                # END for each blobsha
+                nb += len(blob_list)
+                if nb > too_many:
+                    break
+            # END for each bloblist
+            elapsed = time() - st
+
+            print >> sys.stderr, "%s: Retrieved %i blobs (%i KiB) and their data in %g s ( %f blobs / s, %f KiB / s )" % (type(repo.odb), nb, data_bytes/1000, elapsed, nb / elapsed, (data_bytes / 1000) / elapsed)
+            results[2].append(elapsed)
+        # END for each repo type
+
+        # final results
+        for test_name, a, b in results:
+            print >> sys.stderr, "%s: %f s vs %f s, pure is %f times slower" % (test_name, a, b, b / a)
+        # END for each result
```
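The decisive knob in this benchmark is the odbt argument: the same on-disk repository behaves very differently depending on whether objects are retrieved through the git command or the pure-Python object database. A minimal sketch of that switch, assuming a placeholder repository path:

```python
from git import Repo
from git.db import GitCmdObjectDB, GitDB

path = '/path/to/big/repo'  # placeholder
cmd_repo = Repo(path, odbt=GitCmdObjectDB)  # reads objects via git-cat-file
pure_repo = Repo(path, odbt=GitDB)          # pure python object database

# both yield equal objects; only retrieval performance differs
assert cmd_repo.head.commit.hexsha == pure_repo.head.commit.hexsha
```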
```diff
diff --git a/git/test/performance/test_streams.py b/git/test/performance/test_streams.py
new file mode 100644
index 00000000..7f17d722
--- /dev/null
+++ b/git/test/performance/test_streams.py
@@ -0,0 +1,131 @@
+"""Performance tests for data streaming"""
+
+from git.test.lib import *
+from gitdb import *
+from gitdb.util import bin_to_hex
+
+from time import time
+import os
+import sys
+import stat
+import subprocess
+
+from gitdb.test.lib import make_memory_file
+
+from lib import (
+    TestBigRepoR
+    )
+
+
+class TestObjDBPerformance(TestBigRepoR):
+
+    large_data_size_bytes = 1000*1000*10    # some MiB should do it
+    moderate_data_size_bytes = 1000*1000*1  # just 1 MiB
+
+    @with_rw_repo('HEAD', bare=True)
+    def test_large_data_streaming(self, rwrepo):
+        # TODO: This part overlaps with the same file in gitdb.test.performance.test_stream
+        # It should be shared if possible
+        ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects'))
+
+        for randomize in range(2):
+            desc = (randomize and 'random ') or ''
+            print >> sys.stderr, "Creating %s data ..." % desc
+            st = time()
+            size, stream = make_memory_file(self.large_data_size_bytes, randomize)
+            elapsed = time() - st
+            print >> sys.stderr, "Done (in %f s)" % elapsed
+
+            # writing - due to the compression it will seem faster than it is
+            st = time()
+            binsha = ldb.store(IStream('blob', size, stream)).binsha
+            elapsed_add = time() - st
+            assert ldb.has_object(binsha)
+            db_file = ldb.readable_db_object_path(bin_to_hex(binsha))
+            fsize_kib = os.path.getsize(db_file) / 1000
+
+            size_kib = size / 1000
+            print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add)
+
+            # reading all at once
+            st = time()
+            ostream = ldb.stream(binsha)
+            shadata = ostream.read()
+            elapsed_readall = time() - st
+
+            stream.seek(0)
+            assert shadata == stream.getvalue()
+            print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed_readall, size_kib / elapsed_readall)
+
+            # reading in chunks of 512 KiB
+            cs = 512*1000
+            chunks = list()
+            st = time()
+            ostream = ldb.stream(binsha)
+            while True:
+                data = ostream.read(cs)
+                chunks.append(data)
+                if len(data) < cs:
+                    break
+            # END read in chunks
+            elapsed_readchunks = time() - st
+
+            stream.seek(0)
+            assert ''.join(chunks) == stream.getvalue()
+
+            cs_kib = cs / 1000
+            print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks)
+
+            # del db file so git has something to do
+            os.remove(db_file)
+
+            # VS. CGIT
+            ##########
+            # CGIT ! Can using the cgit programs be faster ?
+            proc = rwrepo.git.hash_object('-w', '--stdin', as_process=True, istream=subprocess.PIPE)
+
+            # write file - pump everything in at once to be as fast as possible
+            data = stream.getvalue()    # cache it
+            st = time()
+            proc.stdin.write(data)
+            proc.stdin.close()
+            gitsha = proc.stdout.read().strip()
+            proc.wait()
+            gelapsed_add = time() - st
+            del(data)
+            assert gitsha == bin_to_hex(binsha) # we do it the same way, right ?
+
+            # as it's the same sha, we reuse our path
+            fsize_kib = os.path.getsize(db_file) / 1000
+            print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data using git-hash-object in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, gelapsed_add, size_kib / gelapsed_add)
+
+            # compare ...
+            print >> sys.stderr, "Git-Python is %f %% faster than git when adding big %s files" % (100.0 - (elapsed_add / gelapsed_add) * 100, desc)
+
+            # read all
+            st = time()
+            s, t, size, data = rwrepo.git.get_object_data(gitsha)
+            gelapsed_readall = time() - st
+            print >> sys.stderr, "Read %i KiB of %s data at once using git-cat-file in %f s ( %f Read KiB / s)" % (size_kib, desc, gelapsed_readall, size_kib / gelapsed_readall)
+
+            # compare
+            print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %s files" % (100.0 - (elapsed_readall / gelapsed_readall) * 100, desc)
+
+            # read chunks
+            st = time()
+            s, t, size, stream = rwrepo.git.stream_object_data(gitsha)
+            while True:
+                data = stream.read(cs)
+                if len(data) < cs:
+                    break
+            # END read stream
+            gelapsed_readchunks = time() - st
+            print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from git-cat-file in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, gelapsed_readchunks, size_kib / gelapsed_readchunks)
+
+            # compare
+            print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %s files in chunks" % (100.0 - (elapsed_readchunks / gelapsed_readchunks) * 100, desc)
+        # END for each randomization factor
```
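The chunked read loop above is the general pattern for keeping memory flat when extracting large blobs. A condensed sketch against gitdb's LooseObjectDB, as imported in the test above; the path is a placeholder, and sha_iter is used here only as an assumed way to obtain some object id for demonstration:

```python
from gitdb import LooseObjectDB

ldb = LooseObjectDB('/path/to/repo/.git/objects')  # placeholder path
binsha = ldb.sha_iter().next()  # any object in the database will do
ostream = ldb.stream(binsha)

cs = 512 * 1000  # 512 KiB chunks, matching the test above
while True:
    chunk = ostream.read(cs)
    # process the chunk here instead of accumulating it in memory
    if len(chunk) < cs:
        break
```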
```diff
diff --git a/git/test/performance/test_utils.py b/git/test/performance/test_utils.py
new file mode 100644
index 00000000..19c1e84a
--- /dev/null
+++ b/git/test/performance/test_utils.py
@@ -0,0 +1,174 @@
+"""Performance of utilities"""
+from time import time
+import sys
+import stat
+
+from lib import (
+    TestBigRepoR
+    )
+
+
+class TestUtilPerformance(TestBigRepoR):
+
+    def test_access(self):
+        # compare dict vs. slot access
+        class Slotty(object):
+            __slots__ = "attr"
+            def __init__(self):
+                self.attr = 1
+
+        class Dicty(object):
+            def __init__(self):
+                self.attr = 1
+
+        class BigSlotty(object):
+            __slots__ = ('attr', ) + tuple('abcdefghijk')
+            def __init__(self):
+                for attr in self.__slots__:
+                    setattr(self, attr, 1)
+
+        class BigDicty(object):
+            def __init__(self):
+                for attr in BigSlotty.__slots__:
+                    setattr(self, attr, 1)
+
+        ni = 1000000
+        for cls in (Slotty, Dicty, BigSlotty, BigDicty):
+            cli = cls()
+            st = time()
+            for i in xrange(ni):
+                cli.attr
+            # END for each access
+            elapsed = time() - st
+            print >> sys.stderr, "Accessed %s.attr %i times in %s s ( %f acc / s)" % (cls.__name__, ni, elapsed, ni / elapsed)
+        # END for each class type
+
+        # check cost of sequence accesses
+        for cls in (list, tuple):
+            x = 10
+            st = time()
+            s = cls(range(x))
+            for i in xrange(ni):
+                s[0]
+                s[1]
+                s[2]
+            # END for
+            elapsed = time() - st
+            na = ni * 3
+            print >> sys.stderr, "Accessed %s[x] %i times in %s s ( %f acc / s)" % (cls.__name__, na, elapsed, na / elapsed)
+        # END for each sequence
+
+    def test_instantiation(self):
+        ni = 100000
+        max_num_items = 4
+        for mni in range(max_num_items+1):
+            for cls in (tuple, list):
+                st = time()
+                for i in xrange(ni):
+                    if mni == 0:
+                        cls()
+                    elif mni == 1:
+                        cls((1,))
+                    elif mni == 2:
+                        cls((1,2))
+                    elif mni == 3:
+                        cls((1,2,3))
+                    elif mni == 4:
+                        cls((1,2,3,4))
+                    else:
+                        cls(x for x in xrange(mni))
+                    # END handle empty cls
+                # END for each item
+                elapsed = time() - st
+                print >> sys.stderr, "Created %i %ss of size %i in %f s ( %f inst / s)" % (ni, cls.__name__, mni, elapsed, ni / elapsed)
+            # END for each type
+        # END for each item count
+
+        # tuple and tuple direct
+        st = time()
+        for i in xrange(ni):
+            t = (1,2,3,4)
+        # END for each item
+        elapsed = time() - st
+        print >> sys.stderr, "Created %i tuples (1,2,3,4) in %f s ( %f tuples / s)" % (ni, elapsed, ni / elapsed)
+
+        st = time()
+        for i in xrange(ni):
+            t = tuple((1,2,3,4))
+        # END for each item
+        elapsed = time() - st
+        print >> sys.stderr, "Created %i tuples tuple((1,2,3,4)) in %f s ( %f tuples / s)" % (ni, elapsed, ni / elapsed)
+
+    def test_unpacking_vs_indexing(self):
+        ni = 1000000
+        list_items = [1,2,3,4]
+        tuple_items = (1,2,3,4)
+
+        for sequence in (list_items, tuple_items):
+            st = time()
+            for i in xrange(ni):
+                one, two, three, four = sequence
+            # END for each iteration
+            elapsed = time() - st
+            print >> sys.stderr, "Unpacked %i %ss of size %i in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed)
+
+            st = time()
+            for i in xrange(ni):
+                one, two, three, four = sequence[0], sequence[1], sequence[2], sequence[3]
+            # END for each iteration
+            elapsed = time() - st
+            print >> sys.stderr, "Unpacked %i %ss of size %i individually in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed)
+
+            st = time()
+            for i in xrange(ni):
+                one, two = sequence[0], sequence[1]
+            # END for each iteration
+            elapsed = time() - st
+            print >> sys.stderr, "Unpacked %i %ss of size %i individually (2 of 4) in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed)
+        # END for each sequence
+
+    def test_large_list_vs_iteration(self):
+        # what costs more: alloc/realloc of lists, or the cpu strain of iterators ?
+        def slow_iter(ni):
+            for i in xrange(ni):
+                yield i
+        # END slow iter - be closer to the real world
+
+        # alloc doesn't play a role here it seems
+        for ni in (500, 1000, 10000, 20000, 40000):
+            st = time()
+            for i in list(xrange(ni)):
+                i
+            # END for each item
+            elapsed = time() - st
+            print >> sys.stderr, "Iterated %i items from list in %f s ( %f acc / s)" % (ni, elapsed, ni / elapsed)
+
+            st = time()
+            for i in slow_iter(ni):
+                i
+            # END for each item
+            elapsed = time() - st
+            print >> sys.stderr, "Iterated %i items from iterator in %f s ( %f acc / s)" % (ni, elapsed, ni / elapsed)
+        # END for each number of iterations
+
+    def test_type_vs_inst_class(self):
+        class NewType(object):
+            pass
+
+        # let's see which way is faster
+        inst = NewType()
+
+        ni = 1000000
+        st = time()
+        for i in xrange(ni):
+            inst.__class__()
+        # END for each item
+        elapsed = time() - st
+        print >> sys.stderr, "Created %i items using inst.__class__ in %f s ( %f items / s)" % (ni, elapsed, ni / elapsed)
+
+        st = time()
+        for i in xrange(ni):
+            type(inst)()
+        # END for each item
+        elapsed = time() - st
+        print >> sys.stderr, "Created %i items using type(inst)() in %f s ( %f items / s)" % (ni, elapsed, ni / elapsed)
```
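To actually execute these benchmarks, the repository base must be exported first. A sketch assuming a nose-style runner, which is what the setUpAll/tearDownAll hooks used throughout suggest; the repository path is a placeholder:

```python
import os
import nose

# point the fixtures at a large local repository before lib.py is imported
os.environ['GIT_PYTHON_TEST_GIT_REPO_BASE'] = '/path/to/big/repo'  # placeholder
nose.run(argv=['nosetests', '-v', 'git/test/performance'])
```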
