From df0892351a394d768489b5647d47b73c24d3ef5f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 00:48:16 +0200 Subject: commit: initial version of commit_from_tree which could create commit objects if it could serialize itself --- CHANGES | 6 +- lib/git/objects/base.py | 1 + lib/git/objects/commit.py | 777 +++++++++++++++++++++++++--------------------- lib/git/objects/utils.py | 411 +++++++++++++++--------- test/git/test_utils.py | 233 ++++++++------ 5 files changed, 815 insertions(+), 613 deletions(-) diff --git a/CHANGES b/CHANGES index 5d677b06..e24e723d 100644 --- a/CHANGES +++ b/CHANGES @@ -1,7 +1,11 @@ ======= CHANGES ======= - + +0.2 Beta 2 +=========== + * Commit objects now carry the 'encoding' information of their message. It wasn't parsed previously, and defaults to UTF-8 + 0.2 ===== General diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 6a51eed3..bb15192d 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -141,6 +141,7 @@ class Object(LazyMixin): self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) return self + class IndexObject(Object): """ Base for all objects that can be part of the index file , namely Tree, Blob and diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 826f684c..87eed49b 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -7,372 +7,425 @@ from git.utils import Iterable import git.diff as diff import git.stats as stats +from git.actor import Actor from tree import Tree import base import utils -import tempfile +import time import os class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): - """ - Wraps a git Commit object. - - This class will act lazily on some of its attributes and will query the - value on demand only if it involves calling the git binary. - """ - - # object configuration - type = "commit" - __slots__ = ("tree", - "author", "authored_date", "author_tz_offset", - "committer", "committed_date", "committer_tz_offset", - "message", "parents") - _id_attribute_ = "sha" - - def __init__(self, repo, sha, tree=None, author=None, authored_date=None, author_tz_offset=None, - committer=None, committed_date=None, committer_tz_offset=None, message=None, parents=None): - """ - Instantiate a new Commit. All keyword arguments taking None as default will - be implicitly set if id names a valid sha. - - The parameter documentation indicates the type of the argument after a colon ':'. - - ``sha`` - is the sha id of the commit or a ref - - ``parents`` : tuple( Commit, ... 
) - is a tuple of commit ids or actual Commits - - ``tree`` : Tree - is the corresponding tree id or an actual Tree - - ``author`` : Actor - is the author string ( will be implicitly converted into an Actor object ) - - ``authored_date`` : int_seconds_since_epoch - is the authored DateTime - use time.gmtime() to convert it into a - different format - - ``author_tz_offset``: int_seconds_west_of_utc - is the timezone that the authored_date is in - - ``committer`` : Actor - is the committer string - - ``committed_date`` : int_seconds_since_epoch - is the committed DateTime - use time.gmtime() to convert it into a - different format - - ``committer_tz_offset``: int_seconds_west_of_utc - is the timezone that the authored_date is in - - ``message`` : string - is the commit message - - Returns - git.Commit - """ - super(Commit,self).__init__(repo, sha) - self._set_self_from_args_(locals()) - - if parents is not None: - self.parents = tuple( self.__class__(repo, p) for p in parents ) - # END for each parent to convert - - if self.sha and tree is not None: - self.tree = Tree(repo, tree, path='') - # END id to tree conversion - - @classmethod - def _get_intermediate_items(cls, commit): - return commit.parents - - def _set_cache_(self, attr): - """ - Called by LazyMixin superclass when the given uninitialized member needs - to be set. - We set all values at once. - """ - if attr in Commit.__slots__: - # prepare our data lines to match rev-list - data_lines = self.data.splitlines() - data_lines.insert(0, "commit %s" % self.sha) - temp = self._iter_from_process_or_stream(self.repo, iter(data_lines), False).next() - self.parents = temp.parents - self.tree = temp.tree - self.author = temp.author - self.authored_date = temp.authored_date - self.author_tz_offset = temp.author_tz_offset - self.committer = temp.committer - self.committed_date = temp.committed_date - self.committer_tz_offset = temp.committer_tz_offset - self.message = temp.message - else: - super(Commit, self)._set_cache_(attr) - - @property - def summary(self): - """ - Returns - First line of the commit message. - """ - return self.message.split('\n', 1)[0] - - def count(self, paths='', **kwargs): - """ - Count the number of commits reachable from this commit - - ``paths`` - is an optinal path or a list of paths restricting the return value - to commits actually containing the paths - - ``kwargs`` - Additional options to be passed to git-rev-list. They must not alter - the ouput style of the command, or parsing will yield incorrect results - Returns - int - """ - # yes, it makes a difference whether empty paths are given or not in our case - # as the empty paths version will ignore merge commits for some reason. - if paths: - return len(self.repo.git.rev_list(self.sha, '--', paths, **kwargs).splitlines()) - else: - return len(self.repo.git.rev_list(self.sha, **kwargs).splitlines()) - - - @property - def name_rev(self): - """ - Returns - String describing the commits hex sha based on the closest Reference. - Mostly useful for UI purposes - """ - return self.repo.git.name_rev(self) - - @classmethod - def iter_items(cls, repo, rev, paths='', **kwargs): - """ - Find all commits matching the given criteria. 
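
The count() shown above simply line-counts git-rev-list output, and as its comment notes, passing paths changes which commits are reported. A minimal standalone sketch of the same strategy, assuming Python 2.7's subprocess.check_output and a hypothetical repository path rather than the Git wrapper:

import subprocess

def count_commits(repo_dir, rev, paths=()):
    # mirrors Commit.count(): one output line per reachable commit
    cmd = ["git", "rev-list", rev]
    if paths:
        # path-restricted listings skip merge commits, as noted above
        cmd.extend(["--"] + list(paths))
    output = subprocess.check_output(cmd, cwd=repo_dir)
    return len(output.splitlines())
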
- - ``repo`` - is the Repo - - ``rev`` - revision specifier, see git-rev-parse for viable options - - ``paths`` - is an optinal path or list of paths, if set only Commits that include the path - or paths will be considered - - ``kwargs`` - optional keyword arguments to git rev-list where - ``max_count`` is the maximum number of commits to fetch - ``skip`` is the number of commits to skip - ``since`` all commits since i.e. '1970-01-01' - - Returns - iterator yielding Commit items - """ - options = {'pretty': 'raw', 'as_process' : True } - options.update(kwargs) - - args = list() - if paths: - args.extend(('--', paths)) - # END if paths - - proc = repo.git.rev_list(rev, args, **options) - return cls._iter_from_process_or_stream(repo, proc, True) - - def iter_parents(self, paths='', **kwargs): - """ - Iterate _all_ parents of this commit. - - ``paths`` - Optional path or list of paths limiting the Commits to those that - contain at least one of the paths - - ``kwargs`` - All arguments allowed by git-rev-list - - Return: - Iterator yielding Commit objects which are parents of self - """ - # skip ourselves - skip = kwargs.get("skip", 1) - if skip == 0: # skip ourselves - skip = 1 - kwargs['skip'] = skip - - return self.iter_items( self.repo, self, paths, **kwargs ) - - @property - def stats(self): - """ - Create a git stat from changes between this commit and its first parent - or from all changes done if this is the very first commit. - - Return - git.Stats - """ - if not self.parents: - text = self.repo.git.diff_tree(self.sha, '--', numstat=True, root=True) - text2 = "" - for line in text.splitlines()[1:]: - (insertions, deletions, filename) = line.split("\t") - text2 += "%s\t%s\t%s\n" % (insertions, deletions, filename) - text = text2 - else: - text = self.repo.git.diff(self.parents[0].sha, self.sha, '--', numstat=True) - return stats.Stats._list_from_string(self.repo, text) - - @classmethod - def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list): - """ - Parse out commit information into a list of Commit objects - - ``repo`` - is the Repo - - ``proc`` - git-rev-list process instance (raw format) - - ``from_rev_list`` - If True, the stream was created by rev-list in which case we parse - the message differently - Returns - iterator returning Commit objects - """ - stream = proc_or_stream - if not hasattr(stream,'next'): - stream = proc_or_stream.stdout - - for line in stream: - commit_tokens = line.split() - id = commit_tokens[1] - assert commit_tokens[0] == "commit" - tree = stream.next().split()[1] - - parents = [] - next_line = None - for parent_line in stream: - if not parent_line.startswith('parent'): - next_line = parent_line - break - # END abort reading parents - parents.append(parent_line.split()[-1]) - # END for each parent line - - author, authored_date, author_tz_offset = utils.parse_actor_and_date(next_line) - committer, committed_date, committer_tz_offset = utils.parse_actor_and_date(stream.next()) - - # empty line - stream.next() - - message_lines = [] - if from_rev_list: - for msg_line in stream: - if not msg_line.startswith(' '): - # and forget about this empty marker - break - # END abort message reading - # strip leading 4 spaces - message_lines.append(msg_line[4:]) - # END while there are message lines - else: - # a stream from our data simply gives us the plain message - for msg_line in stream: - message_lines.append(msg_line) - # END message parsing - message = '\n'.join(message_lines) - - yield Commit(repo, id, parents=tuple(parents), tree=tree, - 
author=author, authored_date=authored_date, author_tz_offset=author_tz_offset, - committer=committer, committed_date=committed_date, committer_tz_offset=committer_tz_offset, - message=message) - # END for each line in stream - - - @classmethod - def create_from_tree(cls, repo, tree, message, parent_commits=None, head=False): - """ - Commit the given tree, creating a commit object. - - ``repo`` - is the Repo - - ``tree`` - Sha of a tree or a tree object to become the tree of the new commit - - ``message`` - Commit message. It may be an empty string if no message is provided. - It will be converted to a string in any case. - - ``parent_commits`` - Optional Commit objects to use as parents for the new commit. - If empty list, the commit will have no parents at all and become - a root commit. - If None , the current head commit will be the parent of the - new commit object - - ``head`` - If True, the HEAD will be advanced to the new commit automatically. - Else the HEAD will remain pointing on the previous commit. This could - lead to undesired results when diffing files. - - Returns - Commit object representing the new commit - - Note: - Additional information about hte committer and Author are taken from the - environment or from the git configuration, see git-commit-tree for - more information - """ - parents = parent_commits - if parent_commits is None: - try: - parent_commits = [ repo.head.commit ] - except ValueError: - # empty repositories have no head commit - parent_commits = list() - # END handle parent commits - # END if parent commits are unset - - parent_args = [ ("-p", str(commit)) for commit in parent_commits ] - - # create message stream - tmp_file_path = tempfile.mktemp() - fp = open(tmp_file_path,"wb") - fp.write(str(message)) - fp.close() - fp = open(tmp_file_path,"rb") - fp.seek(0) - - try: - # write the current index as tree - commit_sha = repo.git.commit_tree(tree, parent_args, istream=fp) - new_commit = cls(repo, commit_sha) - - if head: - try: - repo.head.commit = new_commit - except ValueError: - # head is not yet set to the ref our HEAD points to. - import git.refs - master = git.refs.Head.create(repo, repo.head.ref, commit=new_commit) - repo.head.reference = master - # END handle empty repositories - # END advance head handling - - return new_commit - finally: - fp.close() - os.remove(tmp_file_path) - - def __str__(self): - """ Convert commit to string which is SHA1 """ - return self.sha - - def __repr__(self): - return '' % self.sha + """ + Wraps a git Commit object. + + This class will act lazily on some of its attributes and will query the + value on demand only if it involves calling the git binary. 
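
The docstring above describes lazy attribute access. As a rough sketch of that pattern (illustrative only, not the actual LazyMixin implementation), an uninitialized attribute triggers a cache-filling callback on first access:

class Lazy(object):
    # GitPython's real logic lives in LazyMixin and _set_cache_
    def __getattr__(self, attr):
        # invoked only when 'attr' is not yet set on the instance
        self._set_cache_(attr)
        return object.__getattribute__(self, attr)

    def _set_cache_(self, attr):
        # subclasses fill in attribute values here, e.g. from git-cat-file output
        pass
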
+ """ + + # ENVIRONMENT VARIABLES + # read when creating new commits + env_author_name = "GIT_AUTHOR_NAME" + env_author_email = "GIT_AUTHOR_EMAIL" + env_author_date = "GIT_AUTHOR_DATE" + env_committer_name = "GIT_COMMITTER_NAME" + env_committer_email = "GIT_COMMITTER_EMAIL" + env_committer_date = "GIT_COMMITTER_DATE" + env_email = "EMAIL" + + # CONFIGURATION KEYS + conf_email = 'email' + conf_name = 'name' + conf_encoding = 'i18n.commitencoding' + + # INVARIANTS + default_encoding = "UTF-8" + + + # object configuration + type = "commit" + __slots__ = ("tree", + "author", "authored_date", "author_tz_offset", + "committer", "committed_date", "committer_tz_offset", + "message", "parents", "encoding") + _id_attribute_ = "sha" + + def __init__(self, repo, sha, tree=None, author=None, authored_date=None, author_tz_offset=None, + committer=None, committed_date=None, committer_tz_offset=None, + message=None, parents=None, encoding=None): + """ + Instantiate a new Commit. All keyword arguments taking None as default will + be implicitly set if id names a valid sha. + + The parameter documentation indicates the type of the argument after a colon ':'. + + :param sha: is the sha id of the commit or a ref + :param parents: tuple( Commit, ... ) + is a tuple of commit ids or actual Commits + :param tree: Tree + is the corresponding tree id or an actual Tree + :param author: Actor + is the author string ( will be implicitly converted into an Actor object ) + :param authored_date: int_seconds_since_epoch + is the authored DateTime - use time.gmtime() to convert it into a + different format + :param author_tz_offset: int_seconds_west_of_utc + is the timezone that the authored_date is in + :param committer: Actor + is the committer string + :param committed_date: int_seconds_since_epoch + is the committed DateTime - use time.gmtime() to convert it into a + different format + :param committer_tz_offset: int_seconds_west_of_utc + is the timezone that the authored_date is in + :param message: string + is the commit message + :param encoding: string + encoding of the message, defaults to UTF-8 + :return: git.Commit + + :note: Timezone information is in the same format and in the same sign + as what time.altzone returns. The sign is inverted compared to git's + UTC timezone. + """ + super(Commit,self).__init__(repo, sha) + self._set_self_from_args_(locals()) + + if parents is not None: + self.parents = tuple( self.__class__(repo, p) for p in parents ) + # END for each parent to convert + + if self.sha and tree is not None: + self.tree = Tree(repo, tree, path='') + # END id to tree conversion + + @classmethod + def _get_intermediate_items(cls, commit): + return commit.parents + + def _set_cache_(self, attr): + """ + Called by LazyMixin superclass when the given uninitialized member needs + to be set. + We set all values at once. 
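
The environment variable and configuration key constants above feed the author and committer lookup in create_from_tree further down: environment values win over configuration, which wins over a generated default. A hedged sketch of that precedence for the author name (resolve_author_name is a hypothetical helper, not part of the class):

import os

def resolve_author_name(config_reader, default_name):
    # configuration overrides the generated default ...
    conf_name = config_reader.get_value('user', Commit.conf_name, default_name)
    # ... and the environment overrides configuration
    return os.environ.get(Commit.env_author_name, conf_name)
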
+ """ + if attr in Commit.__slots__: + # prepare our data lines to match rev-list + data_lines = self.data.splitlines() + data_lines.insert(0, "commit %s" % self.sha) + temp = self._iter_from_process_or_stream(self.repo, iter(data_lines), False).next() + self.parents = temp.parents + self.tree = temp.tree + self.author = temp.author + self.authored_date = temp.authored_date + self.author_tz_offset = temp.author_tz_offset + self.committer = temp.committer + self.committed_date = temp.committed_date + self.committer_tz_offset = temp.committer_tz_offset + self.message = temp.message + self.encoding = temp.encoding + else: + super(Commit, self)._set_cache_(attr) + + @property + def summary(self): + """ + Returns + First line of the commit message. + """ + return self.message.split('\n', 1)[0] + + def count(self, paths='', **kwargs): + """ + Count the number of commits reachable from this commit + + ``paths`` + is an optinal path or a list of paths restricting the return value + to commits actually containing the paths + + ``kwargs`` + Additional options to be passed to git-rev-list. They must not alter + the ouput style of the command, or parsing will yield incorrect results + Returns + int + """ + # yes, it makes a difference whether empty paths are given or not in our case + # as the empty paths version will ignore merge commits for some reason. + if paths: + return len(self.repo.git.rev_list(self.sha, '--', paths, **kwargs).splitlines()) + else: + return len(self.repo.git.rev_list(self.sha, **kwargs).splitlines()) + + + @property + def name_rev(self): + """ + Returns + String describing the commits hex sha based on the closest Reference. + Mostly useful for UI purposes + """ + return self.repo.git.name_rev(self) + + @classmethod + def iter_items(cls, repo, rev, paths='', **kwargs): + """ + Find all commits matching the given criteria. + + ``repo`` + is the Repo + + ``rev`` + revision specifier, see git-rev-parse for viable options + + ``paths`` + is an optinal path or list of paths, if set only Commits that include the path + or paths will be considered + + ``kwargs`` + optional keyword arguments to git rev-list where + ``max_count`` is the maximum number of commits to fetch + ``skip`` is the number of commits to skip + ``since`` all commits since i.e. '1970-01-01' + + Returns + iterator yielding Commit items + """ + options = {'pretty': 'raw', 'as_process' : True } + options.update(kwargs) + + args = list() + if paths: + args.extend(('--', paths)) + # END if paths + + proc = repo.git.rev_list(rev, args, **options) + return cls._iter_from_process_or_stream(repo, proc, True) + + def iter_parents(self, paths='', **kwargs): + """ + Iterate _all_ parents of this commit. + + ``paths`` + Optional path or list of paths limiting the Commits to those that + contain at least one of the paths + + ``kwargs`` + All arguments allowed by git-rev-list + + Return: + Iterator yielding Commit objects which are parents of self + """ + # skip ourselves + skip = kwargs.get("skip", 1) + if skip == 0: # skip ourselves + skip = 1 + kwargs['skip'] = skip + + return self.iter_items( self.repo, self, paths, **kwargs ) + + @property + def stats(self): + """ + Create a git stat from changes between this commit and its first parent + or from all changes done if this is the very first commit. 
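
The parameter notes above recommend time.gmtime() for converting the stored integer dates; offsets follow the time.altzone convention (seconds west of UTC). A small sketch that renders a commit date in the author's local time:

import time

def format_commit_time(authored_date, author_tz_offset):
    # shift epoch seconds into the author's local time before formatting;
    # the offset is seconds west of UTC, hence the subtraction
    local = time.gmtime(authored_date - author_tz_offset)
    return time.strftime("%Y-%m-%d %H:%M:%S", local)
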
+ + Return + git.Stats + """ + if not self.parents: + text = self.repo.git.diff_tree(self.sha, '--', numstat=True, root=True) + text2 = "" + for line in text.splitlines()[1:]: + (insertions, deletions, filename) = line.split("\t") + text2 += "%s\t%s\t%s\n" % (insertions, deletions, filename) + text = text2 + else: + text = self.repo.git.diff(self.parents[0].sha, self.sha, '--', numstat=True) + return stats.Stats._list_from_string(self.repo, text) + + @classmethod + def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list): + """ + Parse out commit information into a list of Commit objects + + ``repo`` + is the Repo + + ``proc`` + git-rev-list process instance (raw format) + + ``from_rev_list`` + If True, the stream was created by rev-list in which case we parse + the message differently + Returns + iterator returning Commit objects + """ + stream = proc_or_stream + if not hasattr(stream,'next'): + stream = proc_or_stream.stdout + + for line in stream: + commit_tokens = line.split() + id = commit_tokens[1] + assert commit_tokens[0] == "commit" + tree = stream.next().split()[1] + + parents = [] + next_line = None + for parent_line in stream: + if not parent_line.startswith('parent'): + next_line = parent_line + break + # END abort reading parents + parents.append(parent_line.split()[-1]) + # END for each parent line + + author, authored_date, author_tz_offset = utils.parse_actor_and_date(next_line) + committer, committed_date, committer_tz_offset = utils.parse_actor_and_date(stream.next()) + + + # empty line + encoding = stream.next() + encoding.strip() + if encoding: + encoding = encoding[encoding.find(' ')+1:] + # END parse encoding + + message_lines = list() + if from_rev_list: + for msg_line in stream: + if not msg_line.startswith(' '): + # and forget about this empty marker + break + # END abort message reading + # strip leading 4 spaces + message_lines.append(msg_line[4:]) + # END while there are message lines + else: + # a stream from our data simply gives us the plain message + for msg_line in stream: + message_lines.append(msg_line) + # END message parsing + message = '\n'.join(message_lines) + + + yield Commit(repo, id, tree, + author, authored_date, author_tz_offset, + committer, committed_date, committer_tz_offset, + message, tuple(parents), + encoding or cls.default_encoding) + # END for each line in stream + + + @classmethod + def create_from_tree(cls, repo, tree, message, parent_commits=None, head=False): + """Commit the given tree, creating a commit object. + + :param repo: Repo object the commit should be part of + :param tree: Sha of a tree or a tree object to become the tree of the new commit + :param message: Commit message. It may be an empty string if no message is provided. + It will be converted to a string in any case. + :param parent_commits: + Optional Commit objects to use as parents for the new commit. + If empty list, the commit will have no parents at all and become + a root commit. + If None , the current head commit will be the parent of the + new commit object + :param head: + If True, the HEAD will be advanced to the new commit automatically. + Else the HEAD will remain pointing on the previous commit. This could + lead to undesired results when diffing files. 
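
Commit.stats above normalizes git diff --numstat output, which emits one '<insertions>TAB<deletions>TAB<path>' record per file. A sketch of parsing a single record; treating the '-' counts that git emits for binary files as zero is an assumption beyond the code shown:

def parse_numstat_line(line):
    insertions, deletions, path = line.split("\t", 2)
    to_int = lambda field: 0 if field == '-' else int(field)
    return path, to_int(insertions), to_int(deletions)
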
+ + :return: Commit object representing the new commit + + :note: + Additional information about the committer and Author are taken from the + environment or from the git configuration, see git-commit-tree for + more information + """ + parents = parent_commits + if parent_commits is None: + try: + parent_commits = [ repo.head.commit ] + except ValueError: + # empty repositories have no head commit + parent_commits = list() + # END handle parent commits + # END if parent commits are unset + + # retrieve all additional information, create a commit object, and + # serialize it + # Generally: + # * Environment variables override configuration values + # * Sensible defaults are set according to the git documentation + + # COMMITER AND AUTHOR INFO + cr = repo.config_reader() + env = os.environ + default_email = utils.get_user_id() + default_name = default_email.split('@')[0] + + conf_name = cr.get_value('user', cls.conf_name, default_name) + conf_email = cr.get_value('user', cls.conf_email, default_email) + + author_name = env.get(cls.env_author_name, conf_name) + author_email = env.get(cls.env_author_email, default_email) + + committer_name = env.get(cls.env_committer_name, conf_name) + committer_email = env.get(cls.env_committer_email, conf_email) + + # PARSE THE DATES + unix_time = int(time.time()) + offset = time.altzone + + author_date_str = env.get(cls.env_author_date, '') + if author_date_str: + author_time, author_offset = utils.parse_date(author_date_str) + else: + author_time, author_offset = unix_time, offset + # END set author time + + committer_date_str = env.get(cls.env_committer_date, '') + if committer_date_str: + committer_time, committer_offset = utils.parse_date(committer_date_str) + else: + committer_time, committer_offset = unix_time, offset + # END set committer time + + # assume utf8 encoding + enc_section, enc_option = cls.conf_encoding.split('.') + conf_encoding = cr.get_value(enc_section, enc_option, default_encoding) + + author = Actor(author_name, author_email) + committer = Actor(committer_name, committer_email) + + + # CREATE NEW COMMIT + new_commit = cls(repo, cls.NULL_HEX_SHA, tree, + author, author_time, author_offset, + committer, committer_time, committer_offset, + message, parent_commits, conf_encoding) + + # serialize ! + + if head: + try: + repo.head.commit = new_commit + except ValueError: + # head is not yet set to the ref our HEAD points to + # Happens on first commit + import git.refs + master = git.refs.Head.create(repo, repo.head.ref, commit=new_commit) + repo.head.reference = master + # END handle empty repositories + # END advance head handling + + return new_commit + + + def __str__(self): + """ Convert commit to string which is SHA1 """ + return self.sha + + def __repr__(self): + return '' % self.sha diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 4f17b652..7060e293 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -9,159 +9,274 @@ Module for general utility functions import re from collections import deque as Deque from git.actor import Actor +import platform + +from string import digits +import time +import os + +__all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', + 'ProcessStreamAdapter', 'Traversable') def get_object_type_by_name(object_type_name): - """ - Returns - type suitable to handle the given object type name. - Use the type to create new instances. 
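
The create_from_tree body above still ends in a '# serialize !' placeholder, matching the commit message: the object can be built but not yet written. The intended usage once serialization lands might look like this sketch (repository path hypothetical; write_tree invokes git write-tree through the command wrapper):

from git import Repo

repo = Repo("/path/to/repo")               # hypothetical path
tree_sha = repo.git.write_tree()           # write the current index as a tree
new_commit = Commit.create_from_tree(repo, tree_sha, "initial commit",
                                     parent_commits=None,  # default to HEAD
                                     head=True)            # advance HEAD
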
- 
-    ``object_type_name``
-        Member of TYPES
- 
-    Raises
-        ValueError: In case object_type_name is unknown
-    """
-    if object_type_name == "commit":
-        import commit
-        return commit.Commit
-    elif object_type_name == "tag":
-        import tag
-        return tag.TagObject
-    elif object_type_name == "blob":
-        import blob
-        return blob.Blob
-    elif object_type_name == "tree":
-        import tree
-        return tree.Tree
-    else:
-        raise ValueError("Cannot handle unknown object type: %s" % object_type_name)
-    
-    
+    """
+    Returns
+        type suitable to handle the given object type name.
+        Use the type to create new instances.
+        
+    ``object_type_name``
+        Member of TYPES
+        
+    Raises
+        ValueError: In case object_type_name is unknown
+    """
+    if object_type_name == "commit":
+        import commit
+        return commit.Commit
+    elif object_type_name == "tag":
+        import tag
+        return tag.TagObject
+    elif object_type_name == "blob":
+        import blob
+        return blob.Blob
+    elif object_type_name == "tree":
+        import tree
+        return tree.Tree
+    else:
+        raise ValueError("Cannot handle unknown object type: %s" % object_type_name)
+    
+    
+def get_user_id():
+    """:return: string identifying the currently active system user as name@node
+    :note: user can be set with the 'USER' environment variable, usually set on windows"""
+    ukn = 'UNKNOWN'
+    username = os.environ.get('USER', ukn)
+    if username == ukn and hasattr(os, 'getlogin'):
+        username = os.getlogin()
+    # END get username from login
+    return "%s@%s" % (username, platform.node())
+    
+    
+def _utc_tz_to_altz(utctz):
+    """Convert a git UTC timezone string into the seconds-west-of-UTC format
+    that time.altzone returns. Git stores the offset with the opposite sign,
+    which explains the explicit -1 * below.
+    :param utctz: git utc timezone string, e.g. +0200"""
+    return -1 * int(float(utctz)/100*3600)
+
+def _verify_utctz(offset):
+    """:raise ValueError: if offset is incorrect
+    :return: offset"""
+    fmt_exc = ValueError("Invalid timezone offset format: %s" % offset)
+    if len(offset) != 5:
+        raise fmt_exc
+    if offset[0] not in "+-":
+        raise fmt_exc
+    if offset[1] not in digits or \
+       offset[2] not in digits or \
+       offset[3] not in digits or \
+       offset[4] not in digits:
+        raise fmt_exc
+    # END for each char
+    return offset
+
+def parse_date(string_date):
+    """
+    Parse the given date as one of the following
+        * Git internal format: timestamp offset
+        * RFC 2822: Thu, 07 Apr 2005 22:13:13 +0200.
+        * ISO 8601 2005-04-07T22:13:13
+            The T can be a space as well
+        
+    :return: Tuple(int(timestamp), int(offset)), both in seconds since epoch
+    :raise ValueError: If the format could not be understood
+    :note: Date can also be YYYY.MM.DD, MM/DD/YYYY and DD.MM.YYYY
+    """
+    # git time
+    try:
+        if string_date.count(' ') == 1 and string_date.rfind(':') == -1:
+            timestamp, offset = string_date.split()
+            timestamp = int(timestamp)
+            return timestamp, _utc_tz_to_altz(_verify_utctz(offset))
+        else:
+            offset = "+0000"                        # local time by default
+            if string_date[-5] in '-+':
+                offset = _verify_utctz(string_date[-5:])
+                string_date = string_date[:-6]      # skip space as well
+            # END split timezone info
+            
+            # now figure out the date and time portion - split time
+            date_formats = list()
+            splitter = -1
+            if ',' in string_date:
+                date_formats.append("%a, %d %b %Y")
+                splitter = string_date.rfind(' ')
+            else:
+                # iso plus additional
+                date_formats.append("%Y-%m-%d")
+                date_formats.append("%Y.%m.%d")
+                date_formats.append("%m/%d/%Y")
+                date_formats.append("%d.%m.%Y")
+                
+                splitter = string_date.rfind('T')
+                if splitter == -1:
+                    splitter = string_date.rfind(' ')
+                # END handle 'T' and ' '
+            # END handle rfc or iso
+            
+            assert splitter > -1
+            
+            # split date and time
+            time_part = string_date[splitter+1:]    # skip space
+            date_part = string_date[:splitter]
+            
+            # parse time
+            tstruct = time.strptime(time_part, "%H:%M:%S")
+            
+            for fmt in date_formats:
+                try:
+                    dtstruct = time.strptime(date_part, fmt)
+                    fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday,
+                                                tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec,
+                                                dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst))
+                    return int(time.mktime(fstruct)), _utc_tz_to_altz(offset)
+                except ValueError:
+                    continue
+                # END exception handling
+            # END for each fmt
+            
+            # still here ? fail
+            raise ValueError("no format matched")
+        # END handle format
+    except Exception:
+        raise ValueError("Unsupported date format: %s" % string_date)
+    # END handle exceptions
+    
+    
 # precompiled regex
 _re_actor_epoch = re.compile(r'^.+? (.*) (\d+) ([+-]\d+).*$')
 
 def parse_actor_and_date(line):
-    """
-    Parse out the actor (author or committer) info from a line like::
-    
-        author Tom Preston-Werner <tom@mojombo.com> 1191999972 -0700
-    
-    Returns
-        [Actor, int_seconds_since_epoch, int_timezone_offset]
-    """
-    m = _re_actor_epoch.search(line)
-    actor, epoch, offset = m.groups()
-    return (Actor._from_string(actor), int(epoch), -int(float(offset)/100*3600))
-    
-    
-    
+    """
+    Parse out the actor (author or committer) info from a line like::
+    
+        author Tom Preston-Werner <tom@mojombo.com> 1191999972 -0700
+    
+    Returns
+        [Actor, int_seconds_since_epoch, int_timezone_offset]
+    """
+    m = _re_actor_epoch.search(line)
+    actor, epoch, offset = m.groups()
+    return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset))
+    
+    
+    
 class ProcessStreamAdapter(object):
-    """
-    Class wiring all calls to the contained Process instance.
-    
-    Use this type to hide the underlying process to provide access only to a specified
-    stream. The process is usually wrapped into an AutoInterrupt class to kill
-    it if the instance goes out of scope.
-    """
-    __slots__ = ("_proc", "_stream")
-    def __init__(self, process, stream_name):
-        self._proc = process
-        self._stream = getattr(process, stream_name)
-    
-    def __getattr__(self, attr):
-        return getattr(self._stream, attr)
-    
-    
+    """
+    Class wiring all calls to the contained Process instance.
+    
+    Use this type to hide the underlying process to provide access only to a specified
+    stream.
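
Worked examples for the helpers above; returned offsets use the time.altzone sign convention, so zones east of UTC come back negative:

assert _utc_tz_to_altz("+0200") == -7200   # east of UTC -> negative altzone
assert _utc_tz_to_altz("-0700") == 25200   # west of UTC -> positive altzone

# git internal format: "<timestamp> <offset>"
timestamp, offset = parse_date("1191999972 -0700")
assert (timestamp, offset) == (1191999972, 25200)
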
The process is usually wrapped into an AutoInterrupt class to kill + it if the instance goes out of scope. + """ + __slots__ = ("_proc", "_stream") + def __init__(self, process, stream_name): + self._proc = process + self._stream = getattr(process, stream_name) + + def __getattr__(self, attr): + return getattr(self._stream, attr) + + class Traversable(object): - """Simple interface to perforam depth-first or breadth-first traversals - into one direction. - Subclasses only need to implement one function. - Instances of the Subclass must be hashable""" - __slots__ = tuple() - - @classmethod - def _get_intermediate_items(cls, item): - """ - Returns: - List of items connected to the given item. - Must be implemented in subclass - """ - raise NotImplementedError("To be implemented in subclass") - - - def traverse( self, predicate = lambda i,d: True, - prune = lambda i,d: False, depth = -1, branch_first=True, - visit_once = True, ignore_self=1, as_edge = False ): - """ - ``Returns`` - iterator yieling of items found when traversing self - - ``predicate`` - f(i,d) returns False if item i at depth d should not be included in the result - - ``prune`` - f(i,d) return True if the search should stop at item i at depth d. - Item i will not be returned. - - ``depth`` - define at which level the iteration should not go deeper - if -1, there is no limit - if 0, you would effectively only get self, the root of the iteration - i.e. if 1, you would only get the first level of predessessors/successors - - ``branch_first`` - if True, items will be returned branch first, otherwise depth first - - ``visit_once`` - if True, items will only be returned once, although they might be encountered - several times. Loops are prevented that way. - - ``ignore_self`` - if True, self will be ignored and automatically pruned from - the result. Otherwise it will be the first item to be returned. - If as_edge is True, the source of the first edge is None - - ``as_edge`` - if True, return a pair of items, first being the source, second the - destinatination, i.e. tuple(src, dest) with the edge spanning from - source to destination""" - visited = set() - stack = Deque() - stack.append( ( 0 ,self, None ) ) # self is always depth level 0 - - def addToStack( stack, item, branch_first, depth ): - lst = self._get_intermediate_items( item ) - if not lst: - return - if branch_first: - stack.extendleft( ( depth , i, item ) for i in lst ) - else: - reviter = ( ( depth , lst[i], item ) for i in range( len( lst )-1,-1,-1) ) - stack.extend( reviter ) - # END addToStack local method - - while stack: - d, item, src = stack.pop() # depth of item, item, item_source - - if visit_once and item in visited: - continue - - if visit_once: - visited.add(item) - - rval = ( as_edge and (src, item) ) or item - if prune( rval, d ): - continue - - skipStartItem = ignore_self and ( item == self ) - if not skipStartItem and predicate( rval, d ): - yield rval - - # only continue to next level if this is appropriate ! - nd = d + 1 - if depth > -1 and nd > depth: - continue - - addToStack( stack, item, branch_first, nd ) - # END for each item on work stack + """Simple interface to perforam depth-first or breadth-first traversals + into one direction. + Subclasses only need to implement one function. + Instances of the Subclass must be hashable""" + __slots__ = tuple() + + @classmethod + def _get_intermediate_items(cls, item): + """ + Returns: + List of items connected to the given item. 
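
A minimal Traversable subclass sketch: the only required override returns an item's neighbours, after which traverse() (shown next) walks the graph breadth-first or depth-first. The Node type is a toy example; Commit uses its parents as intermediate items:

class Node(Traversable):
    def __init__(self, *children):
        self.children = children

    @classmethod
    def _get_intermediate_items(cls, item):
        return item.children

leaf = Node()
root = Node(Node(leaf), Node())
print list(root.traverse())   # both inner nodes first, then the leaf
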
+            Must be implemented in subclass
+        """
+        raise NotImplementedError("To be implemented in subclass")
+        
+    
+    def traverse( self, predicate = lambda i,d: True,
+                prune = lambda i,d: False, depth = -1, branch_first=True,
+                visit_once = True, ignore_self=1, as_edge = False ):
+        """
+        ``Returns``
+            iterator yielding items found when traversing self
+        
+        ``predicate``
+            f(i,d) returns False if item i at depth d should not be included in the result
+            
+        ``prune``
+            f(i,d) returns True if the search should stop at item i at depth d.
+            Item i will not be returned.
+            
+        ``depth``
+            define at which level the iteration should not go deeper
+            if -1, there is no limit
+            if 0, you would effectively only get self, the root of the iteration
+            i.e. if 1, you would only get the first level of predecessors/successors
+            
+        ``branch_first``
+            if True, items will be returned branch first, otherwise depth first
+            
+        ``visit_once``
+            if True, items will only be returned once, although they might be encountered
+            several times. Loops are prevented that way.
+        
+        ``ignore_self``
+            if True, self will be ignored and automatically pruned from
+            the result. Otherwise it will be the first item to be returned.
+            If as_edge is True, the source of the first edge is None
+            
+        ``as_edge``
+            if True, return a pair of items, first being the source, second the
+            destination, i.e. tuple(src, dest) with the edge spanning from
+            source to destination"""
+        visited = set()
+        stack = Deque()
+        stack.append( ( 0 ,self, None ) )       # self is always depth level 0
+    
+        def addToStack( stack, item, branch_first, depth ):
+            lst = self._get_intermediate_items( item )
+            if not lst:
+                return
+            if branch_first:
+                stack.extendleft( ( depth , i, item ) for i in lst )
+            else:
+                reviter = ( ( depth , lst[i], item ) for i in range( len( lst )-1,-1,-1) )
+                stack.extend( reviter )
+        # END addToStack local method
+    
+        while stack:
+            d, item, src = stack.pop()          # depth of item, item, item_source
+            
+            if visit_once and item in visited:
+                continue
+                
+            if visit_once:
+                visited.add(item)
+            
+            rval = ( as_edge and (src, item) ) or item
+            if prune( rval, d ):
+                continue
+            
+            skipStartItem = ignore_self and ( item == self )
+            if not skipStartItem and predicate( rval, d ):
+                yield rval
+            
+            # only continue to next level if this is appropriate !
+ nd = d + 1 + if depth > -1 and nd > depth: + continue + + addToStack( stack, item, branch_first, nd ) + # END for each item on work stack diff --git a/test/git/test_utils.py b/test/git/test_utils.py index f843c12e..2c3c392b 100644 --- a/test/git/test_utils.py +++ b/test/git/test_utils.py @@ -9,112 +9,141 @@ import tempfile from test.testlib import * from git.utils import * +from git.objects.utils import * from git import * from git.cmd import dashify import time class TestUtils(TestCase): - def setup(self): - self.testdict = { - "string": "42", - "int": 42, - "array": [ 42 ], - } + def setup(self): + self.testdict = { + "string": "42", + "int": 42, + "array": [ 42 ], + } - def test_it_should_dashify(self): - assert_equal('this-is-my-argument', dashify('this_is_my_argument')) - assert_equal('foo', dashify('foo')) - - - def test_lock_file(self): - my_file = tempfile.mktemp() - lock_file = LockFile(my_file) - assert not lock_file._has_lock() - # release lock we don't have - fine - lock_file._release_lock() - - # get lock - lock_file._obtain_lock_or_raise() - assert lock_file._has_lock() - - # concurrent access - other_lock_file = LockFile(my_file) - assert not other_lock_file._has_lock() - self.failUnlessRaises(IOError, other_lock_file._obtain_lock_or_raise) - - lock_file._release_lock() - assert not lock_file._has_lock() - - other_lock_file._obtain_lock_or_raise() - self.failUnlessRaises(IOError, lock_file._obtain_lock_or_raise) - - # auto-release on destruction - del(other_lock_file) - lock_file._obtain_lock_or_raise() - lock_file._release_lock() - - def test_blocking_lock_file(self): - my_file = tempfile.mktemp() - lock_file = BlockingLockFile(my_file) - lock_file._obtain_lock() - - # next one waits for the lock - start = time.time() - wait_time = 0.1 - wait_lock = BlockingLockFile(my_file, 0.05, wait_time) - self.failUnlessRaises(IOError, wait_lock._obtain_lock) - elapsed = time.time() - start - assert elapsed <= wait_time + 0.02 # some extra time it may cost - - def _cmp_contents(self, file_path, data): - # raise if data from file at file_path - # does not match data string - fp = open(file_path, "rb") - try: - assert fp.read() == data - finally: - fp.close() - - def test_safe_operation(self): - my_file = tempfile.mktemp() - orig_data = "hello" - new_data = "world" - my_file_fp = open(my_file, "wb") - my_file_fp.write(orig_data) - my_file_fp.close() - - try: - cwrite = ConcurrentWriteOperation(my_file) - - # didn't start writing, doesnt matter - cwrite._end_writing(False) - cwrite._end_writing(True) - assert not cwrite._is_writing() - - # write data and fail - stream = cwrite._begin_writing() - assert cwrite._is_writing() - stream.write(new_data) - cwrite._end_writing(successful=False) - self._cmp_contents(my_file, orig_data) - assert not os.path.exists(stream.name) - - # write data - concurrently - ocwrite = ConcurrentWriteOperation(my_file) - stream = cwrite._begin_writing() - self.failUnlessRaises(IOError, ocwrite._begin_writing) - - stream.write("world") - cwrite._end_writing(successful=True) - self._cmp_contents(my_file, new_data) - assert not os.path.exists(stream.name) - - # could test automatic _end_writing on destruction - finally: - os.remove(my_file) - # END final cleanup - - - - + def test_it_should_dashify(self): + assert_equal('this-is-my-argument', dashify('this_is_my_argument')) + assert_equal('foo', dashify('foo')) + + + def test_lock_file(self): + my_file = tempfile.mktemp() + lock_file = LockFile(my_file) + assert not lock_file._has_lock() + # release lock we don't 
have - fine + lock_file._release_lock() + + # get lock + lock_file._obtain_lock_or_raise() + assert lock_file._has_lock() + + # concurrent access + other_lock_file = LockFile(my_file) + assert not other_lock_file._has_lock() + self.failUnlessRaises(IOError, other_lock_file._obtain_lock_or_raise) + + lock_file._release_lock() + assert not lock_file._has_lock() + + other_lock_file._obtain_lock_or_raise() + self.failUnlessRaises(IOError, lock_file._obtain_lock_or_raise) + + # auto-release on destruction + del(other_lock_file) + lock_file._obtain_lock_or_raise() + lock_file._release_lock() + + def test_blocking_lock_file(self): + my_file = tempfile.mktemp() + lock_file = BlockingLockFile(my_file) + lock_file._obtain_lock() + + # next one waits for the lock + start = time.time() + wait_time = 0.1 + wait_lock = BlockingLockFile(my_file, 0.05, wait_time) + self.failUnlessRaises(IOError, wait_lock._obtain_lock) + elapsed = time.time() - start + assert elapsed <= wait_time + 0.02 # some extra time it may cost + + def _cmp_contents(self, file_path, data): + # raise if data from file at file_path + # does not match data string + fp = open(file_path, "rb") + try: + assert fp.read() == data + finally: + fp.close() + + def test_safe_operation(self): + my_file = tempfile.mktemp() + orig_data = "hello" + new_data = "world" + my_file_fp = open(my_file, "wb") + my_file_fp.write(orig_data) + my_file_fp.close() + + try: + cwrite = ConcurrentWriteOperation(my_file) + + # didn't start writing, doesnt matter + cwrite._end_writing(False) + cwrite._end_writing(True) + assert not cwrite._is_writing() + + # write data and fail + stream = cwrite._begin_writing() + assert cwrite._is_writing() + stream.write(new_data) + cwrite._end_writing(successful=False) + self._cmp_contents(my_file, orig_data) + assert not os.path.exists(stream.name) + + # write data - concurrently + ocwrite = ConcurrentWriteOperation(my_file) + stream = cwrite._begin_writing() + self.failUnlessRaises(IOError, ocwrite._begin_writing) + + stream.write("world") + cwrite._end_writing(successful=True) + self._cmp_contents(my_file, new_data) + assert not os.path.exists(stream.name) + + # could test automatic _end_writing on destruction + finally: + os.remove(my_file) + # END final cleanup + + + + def test_user_id(self): + assert '@' in get_user_id() + + def test_parse_date(self): + # test all supported formats + def assert_rval(rval, veri_time, offset=0): + assert len(rval) == 2 + assert isinstance(rval[0], int) and isinstance(rval[1], int) + assert rval[0] == veri_time + assert rval[1] == offset + # END assert rval utility + + rfc = ("Thu, 07 Apr 2005 22:13:11 +0000", 0) + iso = ("2005-04-07T22:13:11 -0200", 7200) + iso2 = ("2005-04-07 22:13:11 +0400", -14400) + iso3 = ("2005.04.07 22:13:11 -0000", 0) + alt = ("04/07/2005 22:13:11", 0) + alt2 = ("07.04.2005 22:13:11", 0) + veri_time = 1112904791 # the time this represents + for date, offset in (rfc, iso, iso2, iso3, alt, alt2): + assert_rval(parse_date(date), veri_time, offset) + # END for each date type + + # and failure + self.failUnlessRaises(ValueError, parse_date, 'invalid format') + self.failUnlessRaises(ValueError, parse_date, '123456789 -02000') + self.failUnlessRaises(ValueError, parse_date, ' 123456789 -0200') + + -- cgit v1.2.3 From 8c1a87d11df666d308d14e4ae7ee0e9d614296b6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 12:30:33 +0200 Subject: commit: refactored existing code to decode commits from streams - performance is slightly better git.cmd: added method to provide 
access to the content stream directly. This is more efficient if large objects are handled, if it is actually used test.helpers: removed unnecessary code --- lib/git/cmd.py | 901 +++++++++++++++++++++++++--------------------- lib/git/objects/base.py | 418 ++++++++++----------- lib/git/objects/commit.py | 139 +++---- lib/git/objects/tree.py | 2 +- lib/git/objects/utils.py | 17 + test/git/test_commit.py | 2 +- test/git/test_diff.py | 6 +- test/git/test_repo.py | 5 +- test/testlib/helper.py | 38 +- 9 files changed, 810 insertions(+), 718 deletions(-) diff --git a/lib/git/cmd.py b/lib/git/cmd.py index ef2fdf4e..cef4ea60 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -13,414 +13,505 @@ from errors import GitCommandError GIT_PYTHON_TRACE = os.environ.get("GIT_PYTHON_TRACE", False) execute_kwargs = ('istream', 'with_keep_cwd', 'with_extended_output', - 'with_exceptions', 'as_process', - 'output_stream' ) + 'with_exceptions', 'as_process', + 'output_stream' ) def dashify(string): - return string.replace('_', '-') + return string.replace('_', '-') class Git(object): - """ - The Git class manages communication with the Git binary. - - It provides a convenient interface to calling the Git binary, such as in:: - - g = Git( git_dir ) - g.init() # calls 'git init' program - rval = g.ls_files() # calls 'git ls-files' program - - ``Debugging`` - Set the GIT_PYTHON_TRACE environment variable print each invocation - of the command to stdout. - Set its value to 'full' to see details about the returned values. - """ - __slots__ = ("_working_dir", "cat_file_all", "cat_file_header") - - class AutoInterrupt(object): - """ - Kill/Interrupt the stored process instance once this instance goes out of scope. It is - used to prevent processes piling up in case iterators stop reading. - Besides all attributes are wired through to the contained process object. - - The wait method was overridden to perform automatic status code checking - and possibly raise. - """ - __slots__= ("proc", "args") - - def __init__(self, proc, args ): - self.proc = proc - self.args = args - - def __del__(self): - # did the process finish already so we have a return code ? - if self.proc.poll() is not None: - return - - # can be that nothing really exists anymore ... - if os is None: - return - - # try to kill it - try: - os.kill(self.proc.pid, 2) # interrupt signal - except AttributeError: - # try windows - # for some reason, providing None for stdout/stderr still prints something. This is why - # we simply use the shell and redirect to nul. Its slower than CreateProcess, question - # is whether we really want to see all these messages. Its annoying no matter what. - subprocess.call(("TASKKILL /F /T /PID %s 2>nul 1>nul" % str(self.proc.pid)), shell=True) - # END exception handling - - def __getattr__(self, attr): - return getattr(self.proc, attr) - - def wait(self): - """ - Wait for the process and return its status code. - - Raise - GitCommandError if the return status is not 0 - """ - status = self.proc.wait() - if status != 0: - raise GitCommandError(self.args, status, self.proc.stderr.read()) - # END status handling - return status - - - - def __init__(self, working_dir=None): - """ - Initialize this instance with: - - ``working_dir`` - Git directory we should work in. If None, we always work in the current - directory as returned by os.getcwd(). - It is meant to be the working tree directory if available, or the - .git directory in case of bare repositories. 
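
The dynamic dispatch described above turns attribute access into git invocations: __getattr__ forwards unknown names to _call_process, which dashifies the method name and prepends the transformed keyword options. A sketch with a hypothetical working directory:

g = Git("/path/to/repo")            # hypothetical path
g.ls_files()                        # executes: git ls-files
g.rev_list("master", max_count=5)   # executes: git rev-list --max-count=5 master
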
- """ - super(Git, self).__init__() - self._working_dir = working_dir - - # cached command slots - self.cat_file_header = None - self.cat_file_all = None - - def __getattr__(self, name): - """ - A convenience method as it allows to call the command as if it was - an object. - Returns - Callable object that will execute call _call_process with your arguments. - """ - if name[:1] == '_': - raise AttributeError(name) - return lambda *args, **kwargs: self._call_process(name, *args, **kwargs) - - @property - def working_dir(self): - """ - Returns - Git directory we are working on - """ - return self._working_dir - - def execute(self, command, - istream=None, - with_keep_cwd=False, - with_extended_output=False, - with_exceptions=True, - as_process=False, - output_stream=None, - **subprocess_kwargs - ): - """ - Handles executing the command on the shell and consumes and returns - the returned information (stdout) - - ``command`` - The command argument list to execute. - It should be a string, or a sequence of program arguments. The - program to execute is the first item in the args sequence or string. - - ``istream`` - Standard input filehandle passed to subprocess.Popen. - - ``with_keep_cwd`` - Whether to use the current working directory from os.getcwd(). - The cmd otherwise uses its own working_dir that it has been initialized - with if possible. - - ``with_extended_output`` - Whether to return a (status, stdout, stderr) tuple. - - ``with_exceptions`` - Whether to raise an exception when git returns a non-zero status. - - ``as_process`` - Whether to return the created process instance directly from which - streams can be read on demand. This will render with_extended_output and - with_exceptions ineffective - the caller will have - to deal with the details himself. - It is important to note that the process will be placed into an AutoInterrupt - wrapper that will interrupt the process once it goes out of scope. If you - use the command in iterators, you should pass the whole process instance - instead of a single stream. - - ``output_stream`` - If set to a file-like object, data produced by the git command will be - output to the given stream directly. - This feature only has any effect if as_process is False. Processes will - always be created with a pipe due to issues with subprocess. - This merely is a workaround as data will be copied from the - output pipe to the given output stream directly. - - ``**subprocess_kwargs`` - Keyword arguments to be passed to subprocess.Popen. Please note that - some of the valid kwargs are already set by this method, the ones you - specify may not be the same ones. - - Returns:: - - str(output) # extended_output = False (Default) - tuple(int(status), str(stdout), str(stderr)) # extended_output = True - - if ouput_stream is True, the stdout value will be your output stream: - output_stream # extended_output = False - tuple(int(status), output_stream, str(stderr))# extended_output = True - - Raise - GitCommandError - - NOTE - If you add additional keyword arguments to the signature of this method, - you must update the execute_kwargs tuple housed in this module. - """ - if GIT_PYTHON_TRACE and not GIT_PYTHON_TRACE == 'full': - print ' '.join(command) - - # Allow the user to have the command executed in their working dir. 
- if with_keep_cwd or self._working_dir is None: - cwd = os.getcwd() - else: - cwd=self._working_dir - - # Start the process - proc = subprocess.Popen(command, - cwd=cwd, - stdin=istream, - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - close_fds=(os.name=='posix'),# unsupported on linux - **subprocess_kwargs - ) - if as_process: - return self.AutoInterrupt(proc, command) - - # Wait for the process to return - status = 0 - stdout_value = '' - stderr_value = '' - try: - if output_stream is None: - stdout_value = proc.stdout.read().rstrip() # strip trailing "\n" - else: - max_chunk_size = 1024*64 - while True: - chunk = proc.stdout.read(max_chunk_size) - output_stream.write(chunk) - if len(chunk) < max_chunk_size: - break - # END reading output stream - stdout_value = output_stream - # END stdout handling - stderr_value = proc.stderr.read().rstrip() # strip trailing "\n" - - # waiting here should do nothing as we have finished stream reading - status = proc.wait() - finally: - proc.stdout.close() - proc.stderr.close() - - if with_exceptions and status != 0: - raise GitCommandError(command, status, stderr_value) - - if GIT_PYTHON_TRACE == 'full': - if stderr_value: - print "%s -> %d: '%s' !! '%s'" % (command, status, stdout_value, stderr_value) - elif stdout_value: - print "%s -> %d: '%s'" % (command, status, stdout_value) - else: - print "%s -> %d" % (command, status) - - # Allow access to the command's status code - if with_extended_output: - return (status, stdout_value, stderr_value) - else: - return stdout_value - - def transform_kwargs(self, **kwargs): - """ - Transforms Python style kwargs into git command line options. - """ - args = [] - for k, v in kwargs.items(): - if len(k) == 1: - if v is True: - args.append("-%s" % k) - elif type(v) is not bool: - args.append("-%s%s" % (k, v)) - else: - if v is True: - args.append("--%s" % dashify(k)) - elif type(v) is not bool: - args.append("--%s=%s" % (dashify(k), v)) - return args - - @classmethod - def __unpack_args(cls, arg_list): - if not isinstance(arg_list, (list,tuple)): - return [ str(arg_list) ] - - outlist = list() - for arg in arg_list: - if isinstance(arg_list, (list, tuple)): - outlist.extend(cls.__unpack_args( arg )) - # END recursion - else: - outlist.append(str(arg)) - # END for each arg - return outlist - - def _call_process(self, method, *args, **kwargs): - """ - Run the given git command with the specified arguments and return - the result as a String - - ``method`` - is the command. Contained "_" characters will be converted to dashes, - such as in 'ls_files' to call 'ls-files'. - - ``args`` - is the list of arguments. If None is included, it will be pruned. - This allows your commands to call git more conveniently as None - is realized as non-existent - - ``kwargs`` - is a dict of keyword arguments. - This function accepts the same optional keyword arguments - as execute(). - - Examples:: - git.rev_list('master', max_count=10, header=True) - - Returns - Same as execute() - """ - - # Handle optional arguments prior to calling transform_kwargs - # otherwise these'll end up in args, which is bad. 
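# For reference, the mapping transform_kwargs applies to keyword arguments:
#   transform_kwargs(v=True)        -> ['-v']             (short flag)
#   transform_kwargs(n=7)           -> ['-n7']            (short option with value)
#   transform_kwargs(max_count=10)  -> ['--max-count=10'] (dashified long option)
# Option arguments are placed before the positional arguments when the final
# command line is assembled just below.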
- _kwargs = {} - for kwarg in execute_kwargs: - try: - _kwargs[kwarg] = kwargs.pop(kwarg) - except KeyError: - pass - - # Prepare the argument list - opt_args = self.transform_kwargs(**kwargs) - - ext_args = self.__unpack_args([a for a in args if a is not None]) - args = opt_args + ext_args - - call = ["git", dashify(method)] - call.extend(args) - - return self.execute(call, **_kwargs) - - def _parse_object_header(self, header_line): - """ - ``header_line`` - type_string size_as_int - - Returns - (hex_sha, type_string, size_as_int) - - Raises - ValueError if the header contains indication for an error due to incorrect - input sha - """ - tokens = header_line.split() - if len(tokens) != 3: - raise ValueError("SHA named %s could not be resolved, git returned: %r" % (tokens[0], header_line.strip()) ) - if len(tokens[0]) != 40: - raise ValueError("Failed to parse header: %r" % header_line) - return (tokens[0], tokens[1], int(tokens[2])) - - def __prepare_ref(self, ref): - # required for command to separate refs on stdin - refstr = str(ref) # could be ref-object - if refstr.endswith("\n"): - return refstr - return refstr + "\n" - - def __get_persistent_cmd(self, attr_name, cmd_name, *args,**kwargs): - cur_val = getattr(self, attr_name) - if cur_val is not None: - return cur_val - - options = { "istream" : subprocess.PIPE, "as_process" : True } - options.update( kwargs ) - - cmd = self._call_process( cmd_name, *args, **options ) - setattr(self, attr_name, cmd ) - return cmd - - def __get_object_header(self, cmd, ref): - cmd.stdin.write(self.__prepare_ref(ref)) - cmd.stdin.flush() - return self._parse_object_header(cmd.stdout.readline()) - - def get_object_header(self, ref): - """ - Use this method to quickly examine the type and size of the object behind - the given ref. - - NOTE - The method will only suffer from the costs of command invocation - once and reuses the command in subsequent calls. - - Return: - (hexsha, type_string, size_as_int) - """ - cmd = self.__get_persistent_cmd("cat_file_header", "cat_file", batch_check=True) - return self.__get_object_header(cmd, ref) - - def get_object_data(self, ref): - """ - As get_object_header, but returns object data as well - - Return: - (hexsha, type_string, size_as_int,data_string) - """ - cmd = self.__get_persistent_cmd("cat_file_all", "cat_file", batch=True) - hexsha, typename, size = self.__get_object_header(cmd, ref) - data = cmd.stdout.read(size) - cmd.stdout.read(1) # finishing newlines - - return (hexsha, typename, size, data) - - def clear_cache(self): - """ - Clear all kinds of internal caches to release resources. - - Currently persistent commands will be interrupted. - - Returns - self - """ - self.cat_file_all = None - self.cat_file_header = None - return self + """ + The Git class manages communication with the Git binary. + + It provides a convenient interface to calling the Git binary, such as in:: + + g = Git( git_dir ) + g.init() # calls 'git init' program + rval = g.ls_files() # calls 'git ls-files' program + + ``Debugging`` + Set the GIT_PYTHON_TRACE environment variable print each invocation + of the command to stdout. + Set its value to 'full' to see details about the returned values. + """ + __slots__ = ("_working_dir", "cat_file_all", "cat_file_header") + + class AutoInterrupt(object): + """ + Kill/Interrupt the stored process instance once this instance goes out of scope. It is + used to prevent processes piling up in case iterators stop reading. + Besides all attributes are wired through to the contained process object. 
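
get_object_data above drives a persistent `git cat-file --batch` process: each ref written to stdin yields a '<sha> <type> <size>' header line followed by exactly <size> bytes of content and a terminating newline. A self-contained sketch of one round trip:

import subprocess

proc = subprocess.Popen(["git", "cat-file", "--batch"],
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
proc.stdin.write("HEAD\n")          # one ref per line
proc.stdin.flush()
sha, typename, size = proc.stdout.readline().split()
data = proc.stdout.read(int(size))  # exactly <size> bytes of object data
proc.stdout.read(1)                 # consume the trailing newline
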
+ + The wait method was overridden to perform automatic status code checking + and possibly raise. + """ + __slots__= ("proc", "args") + + def __init__(self, proc, args ): + self.proc = proc + self.args = args + + def __del__(self): + # did the process finish already so we have a return code ? + if self.proc.poll() is not None: + return + + # can be that nothing really exists anymore ... + if os is None: + return + + # try to kill it + try: + os.kill(self.proc.pid, 2) # interrupt signal + except AttributeError: + # try windows + # for some reason, providing None for stdout/stderr still prints something. This is why + # we simply use the shell and redirect to nul. Its slower than CreateProcess, question + # is whether we really want to see all these messages. Its annoying no matter what. + subprocess.call(("TASKKILL /F /T /PID %s 2>nul 1>nul" % str(self.proc.pid)), shell=True) + # END exception handling + + def __getattr__(self, attr): + return getattr(self.proc, attr) + + def wait(self): + """ + Wait for the process and return its status code. + + Raise + GitCommandError if the return status is not 0 + """ + status = self.proc.wait() + if status != 0: + raise GitCommandError(self.args, status, self.proc.stderr.read()) + # END status handling + return status + # END auto interrupt + + class CatFileContentStream(object): + """Object representing a sized read-only stream returning the contents of + an object. + It behaves like a stream, but counts the data read and simulates an empty + stream once our sized content region is empty. + If not all data is read to the end of the objects's lifetime, we read the + rest to assure the underlying stream continues to work""" + + __slots__ = ('_stream', '_nbr', '_size') + + def __init__(self, size, stream): + self._stream = stream + self._size = size + self._nbr = 0 # num bytes read + + def read(self, size=-1): + bytes_left = self._size - self._nbr + if bytes_left == 0: + return '' + if size > -1: + # assure we don't try to read past our limit + size = min(bytes_left, size) + else: + # they try to read all, make sure its not more than what remains + size = bytes_left + # END check early depletion + data = self._stream.read(size) + self._nbr += len(data) + + # check for depletion, read our final byte to make the stream usable by others + if self._size - self._nbr == 0: + self._stream.read(1) # final newline + # END finish reading + + return data + + def readline(self, size=-1): + if self._nbr == self._size: + return '' + + if size > -1: + size = min(self._size - self._nbr, size) + + data = self._stream.readline(size) + self._nbr += len(data) + + # handle final byte + # we inline everything, it must be fast ! 
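+            # when the counted region is exhausted, the terminating newline that
+            # git appends after each object's data is consumed as well, keeping
+            # the underlying stream usable for subsequent queries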
+ if self._size - self._nbr == 0: + self._stream.read(1) + # END finish reading + + return data + + def readlines(self, size=-1): + if self._nbr == self._size: + return list() + + # leave all additional logic to our readline method, we just check the size + out = list() + nbr = 0 + while True: + line = self.readline() + if not line: + break + out.append(line) + if size > -1: + nbr += len(line) + if nbr > size: + break + # END handle size constraint + # END readline loop + return out + + def __iter__(self): + return self + + def next(self): + line = self.readline() + if not line: + raise StopIteration + return line + + def __del__(self): + bytes_left = self._size - self._nbr + if bytes_left: + # seek and discard + self._stream.seek(bytes_left + 1, os.SEEK_CUR) # includes terminating newline + # END handle incomplete read + + + def __init__(self, working_dir=None): + """ + Initialize this instance with: + + ``working_dir`` + Git directory we should work in. If None, we always work in the current + directory as returned by os.getcwd(). + It is meant to be the working tree directory if available, or the + .git directory in case of bare repositories. + """ + super(Git, self).__init__() + self._working_dir = working_dir + + # cached command slots + self.cat_file_header = None + self.cat_file_all = None + + def __getattr__(self, name): + """ + A convenience method as it allows to call the command as if it was + an object. + Returns + Callable object that will execute call _call_process with your arguments. + """ + if name[:1] == '_': + raise AttributeError(name) + return lambda *args, **kwargs: self._call_process(name, *args, **kwargs) + + @property + def working_dir(self): + """ + Returns + Git directory we are working on + """ + return self._working_dir + + def execute(self, command, + istream=None, + with_keep_cwd=False, + with_extended_output=False, + with_exceptions=True, + as_process=False, + output_stream=None, + **subprocess_kwargs + ): + """ + Handles executing the command on the shell and consumes and returns + the returned information (stdout) + + ``command`` + The command argument list to execute. + It should be a string, or a sequence of program arguments. The + program to execute is the first item in the args sequence or string. + + ``istream`` + Standard input filehandle passed to subprocess.Popen. + + ``with_keep_cwd`` + Whether to use the current working directory from os.getcwd(). + The cmd otherwise uses its own working_dir that it has been initialized + with if possible. + + ``with_extended_output`` + Whether to return a (status, stdout, stderr) tuple. + + ``with_exceptions`` + Whether to raise an exception when git returns a non-zero status. + + ``as_process`` + Whether to return the created process instance directly from which + streams can be read on demand. This will render with_extended_output and + with_exceptions ineffective - the caller will have + to deal with the details himself. + It is important to note that the process will be placed into an AutoInterrupt + wrapper that will interrupt the process once it goes out of scope. If you + use the command in iterators, you should pass the whole process instance + instead of a single stream. + + ``output_stream`` + If set to a file-like object, data produced by the git command will be + output to the given stream directly. + This feature only has any effect if as_process is False. Processes will + always be created with a pipe due to issues with subprocess. 
+            This is merely a workaround, as data will be copied from the
+            output pipe to the given output stream directly.
+        
+        ``**subprocess_kwargs``
+            Keyword arguments to be passed to subprocess.Popen. Please note that
+            some of the valid kwargs are already set by this method; passing one
+            of those again is an error.
+        
+        Returns::
+        
+            str(output)                                    # extended_output = False (Default)
+            tuple(int(status), str(stdout), str(stderr))   # extended_output = True
+            
+            if output_stream is set, the stdout value will be your output stream:
+            output_stream                                  # extended_output = False
+            tuple(int(status), output_stream, str(stderr)) # extended_output = True
+        
+        Raise
+            GitCommandError
+        
+        NOTE
+            If you add additional keyword arguments to the signature of this method,
+            you must update the execute_kwargs tuple housed in this module.
+        """
+        if GIT_PYTHON_TRACE and not GIT_PYTHON_TRACE == 'full':
+            print ' '.join(command)
+
+        # Allow the user to have the command executed in their working dir.
+        if with_keep_cwd or self._working_dir is None:
+            cwd = os.getcwd()
+        else:
+            cwd = self._working_dir
+
+        # Start the process
+        proc = subprocess.Popen(command,
+                                cwd=cwd,
+                                stdin=istream,
+                                stderr=subprocess.PIPE,
+                                stdout=subprocess.PIPE,
+                                close_fds=(os.name=='posix'),    # unsupported on windows
+                                **subprocess_kwargs
+                                )
+        if as_process:
+            return self.AutoInterrupt(proc, command)
+
+        # Wait for the process to return
+        status = 0
+        stdout_value = ''
+        stderr_value = ''
+        try:
+            if output_stream is None:
+                stdout_value = proc.stdout.read().rstrip()    # strip trailing "\n"
+            else:
+                max_chunk_size = 1024*64
+                while True:
+                    chunk = proc.stdout.read(max_chunk_size)
+                    output_stream.write(chunk)
+                    if len(chunk) < max_chunk_size:
+                        break
+                # END reading output stream
+                stdout_value = output_stream
+            # END stdout handling
+            stderr_value = proc.stderr.read().rstrip()    # strip trailing "\n"
+
+            # waiting here should do nothing as we have finished stream reading
+            status = proc.wait()
+        finally:
+            proc.stdout.close()
+            proc.stderr.close()
+
+        if with_exceptions and status != 0:
+            raise GitCommandError(command, status, stderr_value)
+
+        if GIT_PYTHON_TRACE == 'full':
+            if stderr_value:
+                print "%s -> %d: '%s' !! '%s'" % (command, status, stdout_value, stderr_value)
+            elif stdout_value:
+                print "%s -> %d: '%s'" % (command, status, stdout_value)
+            else:
+                print "%s -> %d" % (command, status)
+
+        # Allow access to the command's status code
+        if with_extended_output:
+            return (status, stdout_value, stderr_value)
+        else:
+            return stdout_value
+
+    def transform_kwargs(self, **kwargs):
+        """
+        Transforms Python style kwargs into git command line options.
+        """
+        args = []
+        for k, v in kwargs.items():
+            if len(k) == 1:
+                if v is True:
+                    args.append("-%s" % k)
+                elif type(v) is not bool:
+                    args.append("-%s%s" % (k, v))
+            else:
+                if v is True:
+                    args.append("--%s" % dashify(k))
+                elif type(v) is not bool:
+                    args.append("--%s=%s" % (dashify(k), v))
+        return args
+
+    @classmethod
+    def __unpack_args(cls, arg_list):
+        if not isinstance(arg_list, (list,tuple)):
+            return [ str(arg_list) ]
+
+        outlist = list()
+        for arg in arg_list:
+            if isinstance(arg, (list, tuple)):
+                outlist.extend(cls.__unpack_args( arg ))
+            # END recursion
+            else:
+                outlist.append(str(arg))
+        # END for each arg
+        return outlist
+
+    def _call_process(self, method, *args, **kwargs):
+        """
+        Run the given git command with the specified arguments and return
+        the result as a String
+
+        ``method``
+            is the command.
Contained "_" characters will be converted to dashes, + such as in 'ls_files' to call 'ls-files'. + + ``args`` + is the list of arguments. If None is included, it will be pruned. + This allows your commands to call git more conveniently as None + is realized as non-existent + + ``kwargs`` + is a dict of keyword arguments. + This function accepts the same optional keyword arguments + as execute(). + + Examples:: + git.rev_list('master', max_count=10, header=True) + + Returns + Same as execute() + """ + + # Handle optional arguments prior to calling transform_kwargs + # otherwise these'll end up in args, which is bad. + _kwargs = {} + for kwarg in execute_kwargs: + try: + _kwargs[kwarg] = kwargs.pop(kwarg) + except KeyError: + pass + + # Prepare the argument list + opt_args = self.transform_kwargs(**kwargs) + + ext_args = self.__unpack_args([a for a in args if a is not None]) + args = opt_args + ext_args + + call = ["git", dashify(method)] + call.extend(args) + + return self.execute(call, **_kwargs) + + def _parse_object_header(self, header_line): + """ + ``header_line`` + type_string size_as_int + + Returns + (hex_sha, type_string, size_as_int) + + Raises + ValueError if the header contains indication for an error due to incorrect + input sha + """ + tokens = header_line.split() + if len(tokens) != 3: + raise ValueError("SHA named %s could not be resolved, git returned: %r" % (tokens[0], header_line.strip()) ) + if len(tokens[0]) != 40: + raise ValueError("Failed to parse header: %r" % header_line) + return (tokens[0], tokens[1], int(tokens[2])) + + def __prepare_ref(self, ref): + # required for command to separate refs on stdin + refstr = str(ref) # could be ref-object + if refstr.endswith("\n"): + return refstr + return refstr + "\n" + + def __get_persistent_cmd(self, attr_name, cmd_name, *args,**kwargs): + cur_val = getattr(self, attr_name) + if cur_val is not None: + return cur_val + + options = { "istream" : subprocess.PIPE, "as_process" : True } + options.update( kwargs ) + + cmd = self._call_process( cmd_name, *args, **options ) + setattr(self, attr_name, cmd ) + return cmd + + def __get_object_header(self, cmd, ref): + cmd.stdin.write(self.__prepare_ref(ref)) + cmd.stdin.flush() + return self._parse_object_header(cmd.stdout.readline()) + + def get_object_header(self, ref): + """ Use this method to quickly examine the type and size of the object behind + the given ref. + + :note: The method will only suffer from the costs of command invocation + once and reuses the command in subsequent calls. 
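+        
+        Example (illustrative only; any rev-parse compatible ref works)::
+        
+            hexsha, typename, size = g.get_object_header('HEAD')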
+ + :return: (hexsha, type_string, size_as_int) """ + cmd = self.__get_persistent_cmd("cat_file_header", "cat_file", batch_check=True) + return self.__get_object_header(cmd, ref) + + def get_object_data(self, ref): + """ As get_object_header, but returns object data as well + :return: (hexsha, type_string, size_as_int,data_string) + :note: not threadsafe + """ + hexsha, typename, size, stream = self.stream_object_data(ref) + data = stream.read(size) + del(stream) + return (hexsha, typename, size, data) + + def stream_object_data(self, ref): + """As get_object_header, but returns the data as a stream + :return: (hexsha, type_string, size_as_int, stream) + :note: This method is not threadsafe, you need one independent Command instance + per thread to be safe !""" + cmd = self.__get_persistent_cmd("cat_file_all", "cat_file", batch=True) + hexsha, typename, size = self.__get_object_header(cmd, ref) + return (hexsha, typename, size, self.CatFileContentStream(size, cmd.stdout)) + + def clear_cache(self): + """ + Clear all kinds of internal caches to release resources. + + Currently persistent commands will be interrupted. + + Returns + self + """ + self.cat_file_all = None + self.cat_file_header = None + return self diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index bb15192d..f7043199 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -6,223 +6,223 @@ import os from git.utils import LazyMixin, join_path_native import utils - + _assertion_msg_format = "Created object %r whose python type %r disagrees with the acutal git object type %r" class Object(LazyMixin): - """ - Implements an Object which may be Blobs, Trees, Commits and Tags - - This Object also serves as a constructor for instances of the correct type:: - - inst = Object.new(repo,id) - inst.sha # objects sha in hex - inst.size # objects uncompressed data size - inst.data # byte string containing the whole data of the object - """ - NULL_HEX_SHA = '0'*40 - TYPES = ("blob", "tree", "commit", "tag") - __slots__ = ("repo", "sha", "size", "data" ) - type = None # to be set by subclass - - def __init__(self, repo, id): - """ - Initialize an object by identifying it by its id. All keyword arguments - will be set on demand if None. - - ``repo`` - repository this object is located in - - ``id`` - SHA1 or ref suitable for git-rev-parse - """ - super(Object,self).__init__() - self.repo = repo - self.sha = id + """ + Implements an Object which may be Blobs, Trees, Commits and Tags + + This Object also serves as a constructor for instances of the correct type:: + + inst = Object.new(repo,id) + inst.sha # objects sha in hex + inst.size # objects uncompressed data size + inst.data # byte string containing the whole data of the object + """ + NULL_HEX_SHA = '0'*40 + TYPES = ("blob", "tree", "commit", "tag") + __slots__ = ("repo", "sha", "size", "data" ) + type = None # to be set by subclass + + def __init__(self, repo, id): + """ + Initialize an object by identifying it by its id. All keyword arguments + will be set on demand if None. + + ``repo`` + repository this object is located in + + ``id`` + SHA1 or ref suitable for git-rev-parse + """ + super(Object,self).__init__() + self.repo = repo + self.sha = id - @classmethod - def new(cls, repo, id): - """ - Return - New Object instance of a type appropriate to the object type behind - id. 
The id of the newly created object will be a hexsha even though - the input id may have been a Reference or Rev-Spec - - Note - This cannot be a __new__ method as it would always call __init__ - with the input id which is not necessarily a hexsha. - """ - hexsha, typename, size = repo.git.get_object_header(id) - obj_type = utils.get_object_type_by_name(typename) - inst = obj_type(repo, hexsha) - inst.size = size - return inst - - def _set_self_from_args_(self, args_dict): - """ - Initialize attributes on self from the given dict that was retrieved - from locals() in the calling method. - - Will only set an attribute on self if the corresponding value in args_dict - is not None - """ - for attr, val in args_dict.items(): - if attr != "self" and val is not None: - setattr( self, attr, val ) - # END set all non-None attributes - - def _set_cache_(self, attr): - """ - Retrieve object information - """ - if attr == "size": - hexsha, typename, self.size = self.repo.git.get_object_header(self.sha) - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) - elif attr == "data": - hexsha, typename, self.size, self.data = self.repo.git.get_object_data(self.sha) - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) - else: - super(Object,self)._set_cache_(attr) - - def __eq__(self, other): - """ - Returns - True if the objects have the same SHA1 - """ - return self.sha == other.sha - - def __ne__(self, other): - """ - Returns - True if the objects do not have the same SHA1 - """ - return self.sha != other.sha - - def __hash__(self): - """ - Returns - Hash of our id allowing objects to be used in dicts and sets - """ - return hash(self.sha) - - def __str__(self): - """ - Returns - string of our SHA1 as understood by all git commands - """ - return self.sha - - def __repr__(self): - """ - Returns - string with pythonic representation of our object - """ - return '' % (self.__class__.__name__, self.sha) + @classmethod + def new(cls, repo, id): + """ + Return + New Object instance of a type appropriate to the object type behind + id. The id of the newly created object will be a hexsha even though + the input id may have been a Reference or Rev-Spec + + Note + This cannot be a __new__ method as it would always call __init__ + with the input id which is not necessarily a hexsha. + """ + hexsha, typename, size = repo.git.get_object_header(id) + obj_type = utils.get_object_type_by_name(typename) + inst = obj_type(repo, hexsha) + inst.size = size + return inst + + def _set_self_from_args_(self, args_dict): + """ + Initialize attributes on self from the given dict that was retrieved + from locals() in the calling method. 
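+        (A typical call, as done by the constructors in this module, is
+        ``self._set_self_from_args_(locals())`` right after argument handling.)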
+ + Will only set an attribute on self if the corresponding value in args_dict + is not None + """ + for attr, val in args_dict.items(): + if attr != "self" and val is not None: + setattr( self, attr, val ) + # END set all non-None attributes + + def _set_cache_(self, attr): + """ + Retrieve object information + """ + if attr == "size": + hexsha, typename, self.size = self.repo.git.get_object_header(self.sha) + assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) + elif attr == "data": + hexsha, typename, self.size, self.data = self.repo.git.get_object_data(self.sha) + assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) + else: + super(Object,self)._set_cache_(attr) + + def __eq__(self, other): + """ + Returns + True if the objects have the same SHA1 + """ + return self.sha == other.sha + + def __ne__(self, other): + """ + Returns + True if the objects do not have the same SHA1 + """ + return self.sha != other.sha + + def __hash__(self): + """ + Returns + Hash of our id allowing objects to be used in dicts and sets + """ + return hash(self.sha) + + def __str__(self): + """ + Returns + string of our SHA1 as understood by all git commands + """ + return self.sha + + def __repr__(self): + """ + Returns + string with pythonic representation of our object + """ + return '' % (self.__class__.__name__, self.sha) - @property - def data_stream(self): - """ - Returns - File Object compatible stream to the uncompressed raw data of the object - """ - proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) - return utils.ProcessStreamAdapter(proc, "stdout") - - def stream_data(self, ostream): - """ - Writes our data directly to the given output stream - - ``ostream`` - File object compatible stream object. - - Returns - self - """ - self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) - return self + @property + def data_stream(self): + """ + Returns + File Object compatible stream to the uncompressed raw data of the object + """ + proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) + return utils.ProcessStreamAdapter(proc, "stdout") + def stream_data(self, ostream): + """ + Writes our data directly to the given output stream + + ``ostream`` + File object compatible stream object. + + Returns + self + """ + self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) + return self + class IndexObject(Object): - """ - Base for all objects that can be part of the index file , namely Tree, Blob and - SubModule objects - """ - __slots__ = ("path", "mode") - - def __init__(self, repo, sha, mode=None, path=None): - """ - Initialize a newly instanced IndexObject - ``repo`` - is the Repo we are located in + """ + Base for all objects that can be part of the index file , namely Tree, Blob and + SubModule objects + """ + __slots__ = ("path", "mode") + + def __init__(self, repo, sha, mode=None, path=None): + """ + Initialize a newly instanced IndexObject + ``repo`` + is the Repo we are located in - ``sha`` : string - is the git object id as hex sha + ``sha`` : string + is the git object id as hex sha - ``mode`` : int - is the file mode as int, use the stat module to evaluate the infomration + ``mode`` : int + is the file mode as int, use the stat module to evaluate the infomration - ``path`` : str - is the path to the file in the file system, relative to the git repository root, i.e. 
-        file.ext or folder/other.ext
-        
-        NOTE
-        Path may not be set of the index object has been created directly as it cannot
-        be retrieved without knowing the parent tree.
-        """
-        super(IndexObject, self).__init__(repo, sha)
-        self._set_self_from_args_(locals())
-        if isinstance(mode, basestring):
-            self.mode = self._mode_str_to_int(mode)
-    
-    def __hash__(self):
-        """
-        Returns
-            Hash of our path as index items are uniquely identifyable by path, not
-            by their data !
-        """
-        return hash(self.path)
-    
-    def _set_cache_(self, attr):
-        if attr in IndexObject.__slots__:
-            # they cannot be retrieved lateron ( not without searching for them )
-            raise AttributeError( "path and mode attributes must have been set during %s object creation" % type(self).__name__ )
-        else:
-            super(IndexObject, self)._set_cache_(attr)
-    
-    @classmethod
-    def _mode_str_to_int(cls, modestr):
-        """
-        ``modestr``
-            string like 755 or 644 or 100644 - only the last 6 chars will be used
-            
-        Returns
-            String identifying a mode compatible to the mode methods ids of the
-            stat module regarding the rwx permissions for user, group and other,
-            special flags and file system flags, i.e. whether it is a symlink
-            for example.
-        """
-        mode = 0
-        for iteration,char in enumerate(reversed(modestr[-6:])):
-            mode += int(char) << iteration*3
-        # END for each char
-        return mode
-    
-    @property
-    def name(self):
-        """
-        Returns
-            Name portion of the path, effectively being the basename
-        """
-        return os.path.basename(self.path)
-    
-    @property
-    def abspath(self):
-        """
-        Returns
-            Absolute path to this index object in the file system ( as opposed to the
-            .path field which is a path relative to the git repository ).
-            
-            The returned path will be native to the system and contains '\' on windows.
-        """
-        return join_path_native(self.repo.working_tree_dir, self.path)
-    
+    ``path`` : str
+        is the path to the file in the file system, relative to the git repository root, i.e.
+        file.ext or folder/other.ext
+    
+    NOTE
+        Path may not be set if the index object has been created directly, as it cannot
+        be retrieved without knowing the parent tree.
+    """
+    super(IndexObject, self).__init__(repo, sha)
+    self._set_self_from_args_(locals())
+    if isinstance(mode, basestring):
+        self.mode = self._mode_str_to_int(mode)
+
+    def __hash__(self):
+        """
+        Returns
+            Hash of our path as index items are uniquely identifiable by path, not
+            by their data !
+        """
+        return hash(self.path)
+
+    def _set_cache_(self, attr):
+        if attr in IndexObject.__slots__:
+            # they cannot be retrieved later on ( not without searching for them )
+            raise AttributeError( "path and mode attributes must have been set during %s object creation" % type(self).__name__ )
+        else:
+            super(IndexObject, self)._set_cache_(attr)
+
+    @classmethod
+    def _mode_str_to_int(cls, modestr):
+        """
+        ``modestr``
+            string like 755 or 644 or 100644 - only the last 6 chars will be used
+            
+        Returns
+            Integer identifying a mode compatible to the mode method ids of the
+            stat module regarding the rwx permissions for user, group and other,
+            special flags and file system flags, i.e. whether it is a symlink
+            for example.
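+            
+            As an illustration, ``"100644"``, the mode string of a regular
+            non-executable file, maps to the integer 33188 (0100644 in octal).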
+        """
+        mode = 0
+        for iteration,char in enumerate(reversed(modestr[-6:])):
+            mode += int(char) << iteration*3
+        # END for each char
+        return mode
+
+    @property
+    def name(self):
+        """
+        Returns
+            Name portion of the path, effectively being the basename
+        """
+        return os.path.basename(self.path)
+
+    @property
+    def abspath(self):
+        """
+        Returns
+            Absolute path to this index object in the file system ( as opposed to the
+            .path field which is a path relative to the git repository ).
+            
+            The returned path will be native to the system and contains '\' on windows.
+        """
+        return join_path_native(self.repo.working_tree_dir, self.path)
+    
diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py
index 87eed49b..948e9a54 100644
--- a/lib/git/objects/commit.py
+++ b/lib/git/objects/commit.py
@@ -9,12 +9,14 @@ import git.diff as diff
 import git.stats as stats
 from git.actor import Actor
 from tree import Tree
+from cStringIO import StringIO
 import base
 import utils
 import time
 import os
 
-class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable):
+
+class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Serializable):
     """
     Wraps a git Commit object.
 
@@ -91,7 +93,8 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable):
         self._set_self_from_args_(locals())
 
         if parents is not None:
-            self.parents = tuple( self.__class__(repo, p) for p in parents )
+            cls = type(self)
+            self.parents = tuple(p if isinstance(p, cls) else cls(repo, p) for p in parents)
         # END for each parent to convert
 
         if self.sha and tree is not None:
@@ -109,20 +112,9 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable):
         We set all values at once.
         """
         if attr in Commit.__slots__:
-            # prepare our data lines to match rev-list
-            data_lines = self.data.splitlines()
-            data_lines.insert(0, "commit %s" % self.sha)
-            temp = self._iter_from_process_or_stream(self.repo, iter(data_lines), False).next()
-            self.parents = temp.parents
-            self.tree = temp.tree
-            self.author = temp.author
-            self.authored_date = temp.authored_date
-            self.author_tz_offset = temp.author_tz_offset
-            self.committer = temp.committer
-            self.committed_date = temp.committed_date
-            self.committer_tz_offset = temp.committer_tz_offset
-            self.message = temp.message
-            self.encoding = temp.encoding
+            # read the data in a chunk, it's faster - then provide a file wrapper
+            hexsha, typename, size, data = self.repo.git.get_object_data(self)
+            self._deserialize(StringIO(data))
         else:
             super(Commit, self)._set_cache_(attr)
 
@@ -260,59 +252,18 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable):
             iterator returning Commit objects
         """
         stream = proc_or_stream
-        if not hasattr(stream,'next'):
+        if not hasattr(stream,'readline'):
             stream = proc_or_stream.stdout
         
-        for line in stream:
-            commit_tokens = line.split()
+        while True:
+            line = stream.readline()
+            if not line:
+                break
+            commit_tokens = line.split()
             id = commit_tokens[1]
             assert commit_tokens[0] == "commit"
-            tree = stream.next().split()[1]
-
-            parents = []
-            next_line = None
-            for parent_line in stream:
-                if not parent_line.startswith('parent'):
-                    next_line = parent_line
-                    break
-                # END abort reading parents
-                parents.append(parent_line.split()[-1])
-            # END for each parent line
-            
-            author, authored_date, author_tz_offset = utils.parse_actor_and_date(next_line)
-            committer, committed_date, committer_tz_offset = utils.parse_actor_and_date(stream.next())
-            
-            # empty line
-            encoding = stream.next()
-            encoding.strip()
-            if encoding:
-                encoding = 
encoding[encoding.find(' ')+1:] - # END parse encoding - - message_lines = list() - if from_rev_list: - for msg_line in stream: - if not msg_line.startswith(' '): - # and forget about this empty marker - break - # END abort message reading - # strip leading 4 spaces - message_lines.append(msg_line[4:]) - # END while there are message lines - else: - # a stream from our data simply gives us the plain message - for msg_line in stream: - message_lines.append(msg_line) - # END message parsing - message = '\n'.join(message_lines) - - - yield Commit(repo, id, tree, - author, authored_date, author_tz_offset, - committer, committed_date, committer_tz_offset, - message, tuple(parents), - encoding or cls.default_encoding) + yield Commit(repo, id)._deserialize(stream, from_rev_list) # END for each line in stream @@ -393,7 +344,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): # assume utf8 encoding enc_section, enc_option = cls.conf_encoding.split('.') - conf_encoding = cr.get_value(enc_section, enc_option, default_encoding) + conf_encoding = cr.get_value(enc_section, enc_option, cls.default_encoding) author = Actor(author_name, author_email) committer = Actor(committer_name, committer_email) @@ -429,3 +380,61 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable): def __repr__(self): return '' % self.sha + #{ Serializable Implementation + + def _serialize(self, stream): + # for now, this is very inefficient and in fact shouldn't be used like this + return super(Commit, self)._serialize(stream) + + def _deserialize(self, stream, from_rev_list=False): + """:param from_rev_list: if true, the stream format is coming from the rev-list command + Otherwise it is assumed to be a plain data stream from our object""" + self.tree = Tree(self.repo, stream.readline().split()[1], 0, '') + + self.parents = list() + next_line = None + while True: + parent_line = stream.readline() + if not parent_line.startswith('parent'): + next_line = parent_line + break + # END abort reading parents + self.parents.append(type(self)(self.repo, parent_line.split()[-1])) + # END for each parent line + self.parents = tuple(self.parents) + + self.author, self.authored_date, self.author_tz_offset = utils.parse_actor_and_date(next_line) + self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(stream.readline()) + + + # empty line + self.encoding = self.default_encoding + enc = stream.readline() + enc.strip() + if enc: + self.encoding = enc[enc.find(' ')+1:] + # END parse encoding + + message_lines = list() + if from_rev_list: + while True: + msg_line = stream.readline() + if not msg_line.startswith(' '): + # and forget about this empty marker + # cut the last newline to get rid of the artificial newline added + # by rev-list command. 
Lets hope its just linux style \n + message_lines[-1] = message_lines[-1][:-1] + break + # END abort message reading + # strip leading 4 spaces + message_lines.append(msg_line[4:]) + # END while there are message lines + self.message = ''.join(message_lines) + else: + # a stream from our data simply gives us the plain message + # The end of our message stream is marked with a newline that we strip + self.message = stream.read()[:-1] + # END message parsing + return self + + #} END serializable implementation diff --git a/lib/git/objects/tree.py b/lib/git/objects/tree.py index a9e60981..285d3b5b 100644 --- a/lib/git/objects/tree.py +++ b/lib/git/objects/tree.py @@ -209,7 +209,7 @@ class Tree(base.IndexObject, diff.Diffable, utils.Traversable): visit_once = False, ignore_self=1 ): """For documentation, see utils.Traversable.traverse - Trees are set to visist_once = False to gain more performance in the traversal""" + Trees are set to visit_once = False to gain more performance in the traversal""" return super(Tree, self).traverse(predicate, prune, depth, branch_first, visit_once, ignore_self) # List protocol diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 7060e293..6d378a72 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -280,3 +280,20 @@ class Traversable(object): addToStack( stack, item, branch_first, nd ) # END for each item on work stack + + +class Serializable(object): + """Defines methods to serialize and deserialize objects from and into a data stream""" + + def _serialize(self, stream): + """Serialize the data of this object into the given data stream + :note: a serialized object would ``_deserialize`` into the same objet + :param stream: a file-like object + :return: self""" + raise NotImplementedError("To be implemented in subclass") + + def _deserialize(self, stream): + """Deserialize all information regarding this object from the stream + :param stream: a file-like object + :return: self""" + raise NotImplementedError("To be implemented in subclass") diff --git a/test/git/test_commit.py b/test/git/test_commit.py index 48937c93..28b407ac 100644 --- a/test/git/test_commit.py +++ b/test/git/test_commit.py @@ -129,7 +129,7 @@ class TestCommit(TestBase): bisect_all=True) assert_true(git.called) - commits = Commit._iter_from_process_or_stream(self.rorepo, ListProcessAdapter(revs), True) + commits = Commit._iter_from_process_or_stream(self.rorepo, StringProcessAdapter(revs), True) expected_ids = ( 'cf37099ea8d1d8c7fbf9b6d12d7ec0249d3acb8b', '33ebe7acec14b25c5f84f35a664803fcab2f7781', diff --git a/test/git/test_diff.py b/test/git/test_diff.py index 2f6a19bd..a113b992 100644 --- a/test/git/test_diff.py +++ b/test/git/test_diff.py @@ -20,7 +20,7 @@ class TestDiff(TestBase): return diffs def test_list_from_string_new_mode(self): - output = ListProcessAdapter(fixture('diff_new_mode')) + output = StringProcessAdapter(fixture('diff_new_mode')) diffs = Diff._index_from_patch_format(self.rorepo, output.stdout) self._assert_diff_format(diffs) @@ -28,7 +28,7 @@ class TestDiff(TestBase): assert_equal(10, len(diffs[0].diff.splitlines())) def test_diff_with_rename(self): - output = ListProcessAdapter(fixture('diff_rename')) + output = StringProcessAdapter(fixture('diff_rename')) diffs = Diff._index_from_patch_format(self.rorepo, output.stdout) self._assert_diff_format(diffs) @@ -47,7 +47,7 @@ class TestDiff(TestBase): "diff_tree_numstat_root" ) for fixture_name in fixtures: - diff_proc = ListProcessAdapter(fixture(fixture_name)) + diff_proc = 
StringProcessAdapter(fixture(fixture_name)) diffs = Diff._index_from_patch_format(self.rorepo, diff_proc.stdout) # END for each fixture diff --git a/test/git/test_repo.py b/test/git/test_repo.py index ce79402a..9316245b 100644 --- a/test/git/test_repo.py +++ b/test/git/test_repo.py @@ -48,6 +48,7 @@ class TestRepo(TestBase): def test_tree_from_revision(self): tree = self.rorepo.tree('0.1.6') + assert len(tree.sha) == 40 assert tree.type == "tree" assert self.rorepo.tree(tree) == tree @@ -56,9 +57,9 @@ class TestRepo(TestBase): @patch_object(Git, '_call_process') def test_commits(self, git): - git.return_value = ListProcessAdapter(fixture('rev_list')) + git.return_value = StringProcessAdapter(fixture('rev_list')) - commits = list( self.rorepo.iter_commits('master', max_count=10) ) + commits = list(self.rorepo.iter_commits('master', max_count=10)) c = commits[0] assert_equal('4c8124ffcf4039d292442eeccabdeca5af5c5017', c.sha) diff --git a/test/testlib/helper.py b/test/testlib/helper.py index 9c38ffd5..c9b4c2ac 100644 --- a/test/testlib/helper.py +++ b/test/testlib/helper.py @@ -9,6 +9,7 @@ from git import Repo, Remote, GitCommandError from unittest import TestCase import tempfile import shutil +import cStringIO GIT_REPO = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) @@ -23,40 +24,13 @@ def absolute_project_path(): return os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) -class ListProcessAdapter(object): - """Allows to use lists as Process object as returned by SubProcess.Popen. +class StringProcessAdapter(object): + """Allows to use strings as Process object as returned by SubProcess.Popen. Its tailored to work with the test system only""" - class Stream(object): - """Simple stream emulater meant to work only with tests""" - def __init__(self, data): - self.data = data - self.cur_iter = None - - def __iter__(self): - dat = self.data - if isinstance(dat, basestring): - dat = dat.splitlines() - if self.cur_iter is None: - self.cur_iter = iter(dat) - return self.cur_iter - - def read(self): - dat = self.data - if isinstance(dat, (tuple,list)): - dat = "\n".join(dat) - return dat - - def next(self): - if self.cur_iter is None: - self.cur_iter = iter(self) - return self.cur_iter.next() - - # END stream - - def __init__(self, input_list_or_string): - self.stdout = self.Stream(input_list_or_string) - self.stderr = self.Stream('') + def __init__(self, input_string): + self.stdout = cStringIO.StringIO(input_string) + self.stderr = cStringIO.StringIO() def wait(self): return 0 -- cgit v1.2.3 From 4a25347d7f4c371345da2348ac6cceec7a143da2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 13:01:27 +0200 Subject: Added commit-iteration test --- test/git/test_performance.py | 96 ++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/test/git/test_performance.py b/test/git/test_performance.py index 72acfcac..c1f8ce59 100644 --- a/test/git/test_performance.py +++ b/test/git/test_performance.py @@ -7,46 +7,62 @@ from test.testlib import * from git import * from time import time +import sys class TestPerformance(TestBase): - def _query_commit_info(self, c): - c.author - c.authored_date - c.author_tz_offset - c.committer - c.committed_date - c.committer_tz_offset - c.message - c.parents - - def test_iteration(self): - num_objs = 0 - num_commits = 0 - - # find the first commit containing the given path - always do a full - # iteration ( restricted to the path in question ), but in fact it should - # return 
quite a lot of commits, we just take one and hence abort the operation
+
+        st = time()
+        for c in self.rorepo.iter_commits(self.ref_100):
+            nc += 1
+            self._query_commit_info(c)
+            for obj in c.tree.traverse():
+                obj.size
+                no += 1
+            # END for each object
+        # END for each commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )" % (nc, no, elapsed_time, no/elapsed_time)
+
+    def test_commit_traversal(self):
+        # bound to cat-file parsing performance
+        nc = 0
+        st = time()
+        for c in self.rorepo.commit(self.ref_100).traverse(branch_first=False):
+            nc += 1
+            self._query_commit_info(c)
+        # END for each traversed commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Traversed %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time)
+
+    def test_commit_iteration(self):
+        # bound to stream parsing performance
+        nc = 0
+        st = time()
+        for c in Commit.iter_items(self.rorepo, self.ref_100):
+            nc += 1
+            self._query_commit_info(c)
+        # END for each traversed commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time)
+
-- 
cgit v1.2.3


From 4e1c89ec97ec90037583e85d0e9e71e9c845a19b Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Wed, 2 Jun 2010 16:13:32 +0200
Subject: Added performance testing foundation library, reworked existing
 performance tests to work on larger repositories

---
 test/git/performance/lib.py         | 46 +++++++++++++++++++++++++
 test/git/performance/test_commit.py | 68 +++++++++++++++++++++++++++++++++++++
 test/git/test_performance.py        | 68 -------------------------------------
 3 files changed, 114 insertions(+), 68 deletions(-)
 create mode 100644 test/git/performance/lib.py
 create mode 100644 test/git/performance/test_commit.py
 delete mode 100644 test/git/test_performance.py

diff --git a/test/git/performance/lib.py b/test/git/performance/lib.py
new file mode 100644
index 00000000..4b552b20
--- /dev/null
+++ b/test/git/performance/lib.py
@@ -0,0 +1,46 @@
+"""Contains library functions"""
+import os
+from test.testlib import *
+
+from git import (
+    Repo
+    )
+
+#{ Invariants
+k_env_git_repo = "GIT_PYTHON_TEST_GIT_REPO_BASE"
+#} END invariants
+
+
+#{ Utilities
+def resolve_or_fail(env_var):
+    """:return: resolved environment variable or raise EnvironmentError"""
+    try:
+        return os.environ[env_var]
+    except KeyError:
+        raise EnvironmentError("Please set the %r environment variable and retry" % env_var)
+    # END exception handling
+
+#} END utilities
+
+
+#{ Base Classes
+
+class TestBigRepoReadOnly(TestBase):
+    """TestCase providing access to readonly 'big' repositories using the following
+    member variables:
+    
+    * gitrepo
+    
+     * Read-Only git repository - actually the repo of git itself"""
+    
+    #{ Invariants
+    head_sha_2k = '235d521da60e4699e5bd59ac658b5b48bd76ddca'
+    head_sha_50 = '32347c375250fd470973a5d76185cac718955fd5'
+    #} END invariants
+    
+    @classmethod
+    def setUpAll(cls):
+        super(TestBigRepoReadOnly, cls).setUpAll()
+        cls.gitrepo = Repo(resolve_or_fail(k_env_git_repo))
+
+#} END base classes
diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py
new file mode 100644
index 00000000..c1f8ce59
--- /dev/null
+++ b/test/git/performance/test_commit.py
@@ -0,0 +1,68 @@
+# test_performance.py
+# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
+#
+# This module is part of GitPython and is released under
+# the BSD License: http://www.opensource.org/licenses/bsd-license.php
+
+from test.testlib import *
+from git import *
+from time import time
+import sys
+
+class TestPerformance(TestBase):
+
+    # ref with about 100 commits in its history
+    ref_100 = '0.1.6'
+
+    def _query_commit_info(self, c):
+        c.author
+        c.authored_date
+        c.author_tz_offset
+        c.committer
+        c.committed_date
+        c.committer_tz_offset
+        c.message
+        c.parents
+
+    def test_iteration(self):
+        no = 0
+        nc = 0
+
+        # find the first commit containing the given path - always do a full
+        # iteration ( restricted to the path in question ), but in fact it should
+        # return quite a lot of commits, we just take one and hence abort the operation
+
+        st = time()
+        for c in self.rorepo.iter_commits(self.ref_100):
+            nc += 1
+            self._query_commit_info(c)
+            for obj in c.tree.traverse():
+                obj.size
+                no += 1
+            # END for each object
+        # END for each commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )" % (nc, no, elapsed_time, no/elapsed_time)
+
+    def test_commit_traversal(self):
+        # bound to cat-file parsing performance
+        nc = 0
+        st = time()
+        for c in self.rorepo.commit(self.ref_100).traverse(branch_first=False):
+            nc += 1
+            self._query_commit_info(c)
+        # END for each traversed commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Traversed %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time)
+
+    def test_commit_iteration(self):
+        # bound to stream parsing performance
+        nc = 0
+        st = time()
+        for c in Commit.iter_items(self.rorepo, self.ref_100):
+            nc += 1
+            self._query_commit_info(c)
+        # END for each traversed commit
+        elapsed_time = time() - st
+        print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time)
+
diff --git a/test/git/test_performance.py b/test/git/test_performance.py
deleted file mode 100644
index c1f8ce59..00000000
--- a/test/git/test_performance.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# test_performance.py
-# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
-#
-# This module is part of GitPython and is released under
-# the BSD License: http://www.opensource.org/licenses/bsd-license.php
-
-from test.testlib import *
-from git import *
-from time import time -import sys - -class TestPerformance(TestBase): - - # ref with about 100 commits in its history - ref_100 = '0.1.6' - - def _query_commit_info(self, c): - c.author - c.authored_date - c.author_tz_offset - c.committer - c.committed_date - c.committer_tz_offset - c.message - c.parents - - def test_iteration(self): - no = 0 - nc = 0 - - # find the first commit containing the given path - always do a full - # iteration ( restricted to the path in question ), but in fact it should - # return quite a lot of commits, we just take one and hence abort the operation - - st = time() - for c in self.rorepo.iter_commits(self.ref_100): - nc += 1 - self._query_commit_info(c) - for obj in c.tree.traverse(): - obj.size - no += 1 - # END for each object - # END for each commit - elapsed_time = time() - st - print >> sys.stderr, "Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )" % (nc, no, elapsed_time, no/elapsed_time) - - def test_commit_traversal(self): - # bound to cat-file parsing performance - nc = 0 - st = time() - for c in self.rorepo.commit(self.ref_100).traverse(branch_first=False): - nc += 1 - self._query_commit_info(c) - # END for each traversed commit - elapsed_time = time() - st - print >> sys.stderr, "Traversed %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time) - - def test_commit_iteration(self): - # bound to stream parsing performance - nc = 0 - st = time() - for c in Commit.iter_items(self.rorepo, self.ref_100): - nc += 1 - self._query_commit_info(c) - # END for each traversed commit - elapsed_time = time() - st - print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time) - -- cgit v1.2.3 From ae5a69f67822d81bbbd8f4af93be68703e730b37 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 16:41:28 +0200 Subject: commit: redesigned revlist and commit parsing, commits are always retrieved from their object information directly. This is faster, and resolves issues with the rev-list format and empty commit messages Adjusted many tests to go with the changes, as they were still mocked. The mock was removed if necessary and replaced by code that actually executes --- lib/git/objects/commit.py | 98 +++++------- test/fixtures/rev_list | 27 +--- test/git/performance/test_commit.py | 8 +- test/git/test_commit.py | 310 ++++++++++++++++++------------------ test/git/test_repo.py | 39 ++--- 5 files changed, 217 insertions(+), 265 deletions(-) diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 948e9a54..98aca360 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -106,13 +106,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri return commit.parents def _set_cache_(self, attr): - """ - Called by LazyMixin superclass when the given uninitialized member needs + """ Called by LazyMixin superclass when the given uninitialized member needs to be set. - We set all values at once. - """ + We set all values at once. 
""" if attr in Commit.__slots__: # read the data in a chunk, its faster - then provide a file wrapper + # Could use self.data, but lets try to get it with less calls hexsha, typename, size, data = self.repo.git.get_object_data(self) self._deserialize(StringIO(data)) else: @@ -181,16 +180,16 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri Returns iterator yielding Commit items """ - options = {'pretty': 'raw', 'as_process' : True } - options.update(kwargs) - + if 'pretty' in kwargs: + raise ValueError("--pretty cannot be used as parsing expects single sha's only") + # END handle pretty args = list() if paths: args.extend(('--', paths)) # END if paths - proc = repo.git.rev_list(rev, args, **options) - return cls._iter_from_process_or_stream(repo, proc, True) + proc = repo.git.rev_list(rev, args, as_process=True, **kwargs) + return cls._iter_from_process_or_stream(repo, proc) def iter_parents(self, paths='', **kwargs): """ @@ -235,35 +234,30 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri return stats.Stats._list_from_string(self.repo, text) @classmethod - def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list): - """ - Parse out commit information into a list of Commit objects - - ``repo`` - is the Repo - - ``proc`` - git-rev-list process instance (raw format) + def _iter_from_process_or_stream(cls, repo, proc_or_stream): + """Parse out commit information into a list of Commit objects + We expect one-line per commit, and parse the actual commit information directly + from our lighting fast object database - ``from_rev_list`` - If True, the stream was created by rev-list in which case we parse - the message differently - Returns - iterator returning Commit objects - """ + :param proc: git-rev-list process instance - one sha per line + :return: iterator returning Commit objects""" stream = proc_or_stream if not hasattr(stream,'readline'): stream = proc_or_stream.stdout + readline = stream.readline while True: - line = stream.readline() + line = readline() if not line: break - commit_tokens = line.split() - id = commit_tokens[1] - assert commit_tokens[0] == "commit" + sha = line.strip() + if len(sha) > 40: + # split additional information, as returned by bisect for instance + sha, rest = line.split(None, 1) + # END handle extra info - yield Commit(repo, id)._deserialize(stream, from_rev_list) + assert len(sha) == 40, "Invalid line: %s" % sha + yield Commit(repo, sha) # END for each line in stream @@ -386,15 +380,16 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri # for now, this is very inefficient and in fact shouldn't be used like this return super(Commit, self)._serialize(stream) - def _deserialize(self, stream, from_rev_list=False): + def _deserialize(self, stream): """:param from_rev_list: if true, the stream format is coming from the rev-list command Otherwise it is assumed to be a plain data stream from our object""" - self.tree = Tree(self.repo, stream.readline().split()[1], 0, '') + readline = stream.readline + self.tree = Tree(self.repo, readline().split()[1], 0, '') self.parents = list() next_line = None while True: - parent_line = stream.readline() + parent_line = readline() if not parent_line.startswith('parent'): next_line = parent_line break @@ -404,37 +399,24 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri self.parents = tuple(self.parents) self.author, self.authored_date, self.author_tz_offset = 
utils.parse_actor_and_date(next_line) - self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(stream.readline()) + self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(readline()) - # empty line + # now we can have the encoding line, or an empty line followed by the optional + # message. self.encoding = self.default_encoding - enc = stream.readline() - enc.strip() + # read encoding or empty line to separate message + enc = readline() + enc = enc.strip() if enc: self.encoding = enc[enc.find(' ')+1:] - # END parse encoding - - message_lines = list() - if from_rev_list: - while True: - msg_line = stream.readline() - if not msg_line.startswith(' '): - # and forget about this empty marker - # cut the last newline to get rid of the artificial newline added - # by rev-list command. Lets hope its just linux style \n - message_lines[-1] = message_lines[-1][:-1] - break - # END abort message reading - # strip leading 4 spaces - message_lines.append(msg_line[4:]) - # END while there are message lines - self.message = ''.join(message_lines) - else: - # a stream from our data simply gives us the plain message - # The end of our message stream is marked with a newline that we strip - self.message = stream.read()[:-1] - # END message parsing + # now comes the message separator + readline() + # END handle encoding + + # a stream from our data simply gives us the plain message + # The end of our message stream is marked with a newline that we strip + self.message = stream.read()[:-1] return self #} END serializable implementation diff --git a/test/fixtures/rev_list b/test/fixtures/rev_list index 95a1ebff..1a576118 100644 --- a/test/fixtures/rev_list +++ b/test/fixtures/rev_list @@ -1,24 +1,3 @@ -commit 4c8124ffcf4039d292442eeccabdeca5af5c5017 -tree 672eca9b7f9e09c22dcb128c283e8c3c8d7697a4 -parent 634396b2f541a9f2d58b00be1a07f0c358b999b3 -author Tom Preston-Werner 1191999972 -0700 -committer Tom Preston-Werner 1191999972 -0700 - - implement Grit#heads - -commit 634396b2f541a9f2d58b00be1a07f0c358b999b3 -tree b35b4bf642d667fdd613eebcfe4e17efd420fb8a -author Tom Preston-Werner 1191997100 -0700 -committer Tom Preston-Werner 1191997100 -0700 - - initial grit setup - -commit ab25fd8483882c3bda8a458ad2965d2248654335 -tree c20b5ec543bde1e43a931449b196052c06ed8acc -parent 6e64c55896aabb9a7d8e9f8f296f426d21a78c2c -parent 7f874954efb9ba35210445be456c74e037ba6af2 -author Tom Preston-Werner 1182645538 -0700 -committer Tom Preston-Werner 1182645538 -0700 - - Merge branch 'site' - Some other stuff +4c8124ffcf4039d292442eeccabdeca5af5c5017 +634396b2f541a9f2d58b00be1a07f0c358b999b3 +ab25fd8483882c3bda8a458ad2965d2248654335 diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py index c1f8ce59..b4a9d868 100644 --- a/test/git/performance/test_commit.py +++ b/test/git/performance/test_commit.py @@ -4,12 +4,12 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php -from test.testlib import * +from lib import * from git import * from time import time import sys -class TestPerformance(TestBase): +class TestPerformance(TestBigRepoReadOnly): # ref with about 100 commits in its history ref_100 = '0.1.6' @@ -48,7 +48,7 @@ class TestPerformance(TestBase): # bound to cat-file parsing performance nc = 0 st = time() - for c in self.rorepo.commit(self.ref_100).traverse(branch_first=False): + for c in 
self.gitrepo.commit(self.head_sha_2k).traverse(branch_first=False): nc += 1 self._query_commit_info(c) # END for each traversed commit @@ -59,7 +59,7 @@ class TestPerformance(TestBase): # bound to stream parsing performance nc = 0 st = time() - for c in Commit.iter_items(self.rorepo, self.ref_100): + for c in Commit.iter_items(self.gitrepo, self.head_sha_2k): nc += 1 self._query_commit_info(c) # END for each traversed commit diff --git a/test/git/test_commit.py b/test/git/test_commit.py index 28b407ac..ad7a0082 100644 --- a/test/git/test_commit.py +++ b/test/git/test_commit.py @@ -9,169 +9,165 @@ from git import * class TestCommit(TestBase): - def test_bake(self): + def test_bake(self): - commit = Commit(self.rorepo, **{'sha': '2454ae89983a4496a445ce347d7a41c0bb0ea7ae'}) - commit.author # bake + commit = Commit(self.rorepo, '2454ae89983a4496a445ce347d7a41c0bb0ea7ae') + commit.author # bake - assert_equal("Sebastian Thiel", commit.author.name) - assert_equal("byronimo@gmail.com", commit.author.email) - assert commit.author == commit.committer - assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int) - assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int) - assert commit.message == "Added missing information to docstrings of commit and stats module" + assert_equal("Sebastian Thiel", commit.author.name) + assert_equal("byronimo@gmail.com", commit.author.email) + assert commit.author == commit.committer + assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int) + assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int) + assert commit.message == "Added missing information to docstrings of commit and stats module" - def test_stats(self): - commit = Commit(self.rorepo, '33ebe7acec14b25c5f84f35a664803fcab2f7781') - stats = commit.stats - - def check_entries(d): - assert isinstance(d, dict) - for key in ("insertions", "deletions", "lines"): - assert key in d - # END assertion helper - assert stats.files - assert stats.total - - check_entries(stats.total) - assert "files" in stats.total - - for filepath, d in stats.files.items(): - check_entries(d) - # END for each stated file - - # assure data is parsed properly - michael = Actor._from_string("Michael Trier ") - assert commit.author == michael - assert commit.committer == michael - assert commit.authored_date == 1210193388 - assert commit.committed_date == 1210193388 - assert commit.author_tz_offset == 14400, commit.author_tz_offset - assert commit.committer_tz_offset == 14400, commit.committer_tz_offset - assert commit.message == "initial project" - - def test_traversal(self): - start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff") - first = self.rorepo.commit("33ebe7acec14b25c5f84f35a664803fcab2f7781") - p0 = start.parents[0] - p1 = start.parents[1] - p00 = p0.parents[0] - p10 = p1.parents[0] - - # basic branch first, depth first - dfirst = start.traverse(branch_first=False) - bfirst = start.traverse(branch_first=True) - assert dfirst.next() == p0 - assert dfirst.next() == p00 - - assert bfirst.next() == p0 - assert bfirst.next() == p1 - assert bfirst.next() == p00 - assert bfirst.next() == p10 - - # at some point, both iterations should stop - assert list(bfirst)[-1] == first - stoptraverse = self.rorepo.commit("254d04aa3180eb8b8daf7b7ff25f010cd69b4e7d").traverse(as_edge=True) - l = list(stoptraverse) - assert len(l[0]) == 2 - - # ignore self - assert 
start.traverse(ignore_self=False).next() == start - - # depth - assert len(list(start.traverse(ignore_self=False, depth=0))) == 1 - - # prune - assert start.traverse(branch_first=1, prune=lambda i,d: i==p0).next() == p1 - - # predicate - assert start.traverse(branch_first=1, predicate=lambda i,d: i==p1).next() == p1 - - # traversal should stop when the beginning is reached - self.failUnlessRaises(StopIteration, first.traverse().next) - - # parents of the first commit should be empty ( as the only parent has a null - # sha ) - assert len(first.parents) == 0 - - def test_iteration(self): - # we can iterate commits - all_commits = Commit.list_items(self.rorepo, self.rorepo.head) - assert all_commits - assert all_commits == list(self.rorepo.iter_commits()) - - # this includes merge commits - mcomit = Commit(self.rorepo, 'd884adc80c80300b4cc05321494713904ef1df2d') - assert mcomit in all_commits - - # we can limit the result to paths - ltd_commits = list(self.rorepo.iter_commits(paths='CHANGES')) - assert ltd_commits and len(ltd_commits) < len(all_commits) - - # show commits of multiple paths, resulting in a union of commits - less_ltd_commits = list(Commit.iter_items(self.rorepo, 'master', paths=('CHANGES', 'AUTHORS'))) - assert len(ltd_commits) < len(less_ltd_commits) - - - @patch_object(Git, '_call_process') - def test_rev_list_bisect_all(self, git): - """ - 'git rev-list --bisect-all' returns additional information - in the commit header. This test ensures that we properly parse it. - """ + def test_stats(self): + commit = Commit(self.rorepo, '33ebe7acec14b25c5f84f35a664803fcab2f7781') + stats = commit.stats + + def check_entries(d): + assert isinstance(d, dict) + for key in ("insertions", "deletions", "lines"): + assert key in d + # END assertion helper + assert stats.files + assert stats.total + + check_entries(stats.total) + assert "files" in stats.total + + for filepath, d in stats.files.items(): + check_entries(d) + # END for each stated file + + # assure data is parsed properly + michael = Actor._from_string("Michael Trier ") + assert commit.author == michael + assert commit.committer == michael + assert commit.authored_date == 1210193388 + assert commit.committed_date == 1210193388 + assert commit.author_tz_offset == 14400, commit.author_tz_offset + assert commit.committer_tz_offset == 14400, commit.committer_tz_offset + assert commit.message == "initial project" + + def test_traversal(self): + start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff") + first = self.rorepo.commit("33ebe7acec14b25c5f84f35a664803fcab2f7781") + p0 = start.parents[0] + p1 = start.parents[1] + p00 = p0.parents[0] + p10 = p1.parents[0] + + # basic branch first, depth first + dfirst = start.traverse(branch_first=False) + bfirst = start.traverse(branch_first=True) + assert dfirst.next() == p0 + assert dfirst.next() == p00 + + assert bfirst.next() == p0 + assert bfirst.next() == p1 + assert bfirst.next() == p00 + assert bfirst.next() == p10 + + # at some point, both iterations should stop + assert list(bfirst)[-1] == first + stoptraverse = self.rorepo.commit("254d04aa3180eb8b8daf7b7ff25f010cd69b4e7d").traverse(as_edge=True) + l = list(stoptraverse) + assert len(l[0]) == 2 + + # ignore self + assert start.traverse(ignore_self=False).next() == start + + # depth + assert len(list(start.traverse(ignore_self=False, depth=0))) == 1 + + # prune + assert start.traverse(branch_first=1, prune=lambda i,d: i==p0).next() == p1 + + # predicate + assert start.traverse(branch_first=1, predicate=lambda i,d: 
i==p1).next() == p1 + + # traversal should stop when the beginning is reached + self.failUnlessRaises(StopIteration, first.traverse().next) + + # parents of the first commit should be empty ( as the only parent has a null + # sha ) + assert len(first.parents) == 0 + + def test_iteration(self): + # we can iterate commits + all_commits = Commit.list_items(self.rorepo, self.rorepo.head) + assert all_commits + assert all_commits == list(self.rorepo.iter_commits()) + + # this includes merge commits + mcomit = Commit(self.rorepo, 'd884adc80c80300b4cc05321494713904ef1df2d') + assert mcomit in all_commits + + # we can limit the result to paths + ltd_commits = list(self.rorepo.iter_commits(paths='CHANGES')) + assert ltd_commits and len(ltd_commits) < len(all_commits) + + # show commits of multiple paths, resulting in a union of commits + less_ltd_commits = list(Commit.iter_items(self.rorepo, 'master', paths=('CHANGES', 'AUTHORS'))) + assert len(ltd_commits) < len(less_ltd_commits) + + def test_iter_items(self): + # pretty not allowed + self.failUnlessRaises(ValueError, Commit.iter_items, self.rorepo, 'master', pretty="raw") + + def test_rev_list_bisect_all(self): + """ + 'git rev-list --bisect-all' returns additional information + in the commit header. This test ensures that we properly parse it. + """ + revs = self.rorepo.git.rev_list('933d23bf95a5bd1624fbcdf328d904e1fa173474', + first_parent=True, + bisect_all=True) - git.return_value = fixture('rev_list_bisect_all') + commits = Commit._iter_from_process_or_stream(self.rorepo, StringProcessAdapter(revs)) + expected_ids = ( + '7156cece3c49544abb6bf7a0c218eb36646fad6d', + '1f66cfbbce58b4b552b041707a12d437cc5f400a', + '33ebe7acec14b25c5f84f35a664803fcab2f7781', + '933d23bf95a5bd1624fbcdf328d904e1fa173474' + ) + for sha1, commit in zip(expected_ids, commits): + assert_equal(sha1, commit.sha) - revs = self.rorepo.git.rev_list('HEAD', - pretty='raw', - first_parent=True, - bisect_all=True) - assert_true(git.called) + def test_count(self): + assert self.rorepo.tag('refs/tags/0.1.5').commit.count( ) == 143 + + def test_list(self): + assert isinstance(Commit.list_items(self.rorepo, '0.1.5', max_count=5)['5117c9c8a4d3af19a9958677e45cda9269de1541'], Commit) - commits = Commit._iter_from_process_or_stream(self.rorepo, StringProcessAdapter(revs), True) - expected_ids = ( - 'cf37099ea8d1d8c7fbf9b6d12d7ec0249d3acb8b', - '33ebe7acec14b25c5f84f35a664803fcab2f7781', - 'a6604a00a652e754cb8b6b0b9f194f839fc38d7c', - '8df638c22c75ddc9a43ecdde90c0c9939f5009e7', - 'c231551328faa864848bde6ff8127f59c9566e90', - ) - for sha1, commit in zip(expected_ids, commits): - assert_equal(sha1, commit.sha) + def test_str(self): + commit = Commit(self.rorepo, 'abc') + assert_equal ("abc", str(commit)) - def test_count(self): - assert self.rorepo.tag('refs/tags/0.1.5').commit.count( ) == 143 - - def test_list(self): - assert isinstance(Commit.list_items(self.rorepo, '0.1.5', max_count=5)['5117c9c8a4d3af19a9958677e45cda9269de1541'], Commit) + def test_repr(self): + commit = Commit(self.rorepo, 'abc') + assert_equal('', repr(commit)) - def test_str(self): - commit = Commit(self.rorepo, 'abc') - assert_equal ("abc", str(commit)) - - def test_repr(self): - commit = Commit(self.rorepo, 'abc') - assert_equal('', repr(commit)) - - def test_equality(self): - commit1 = Commit(self.rorepo, 'abc') - commit2 = Commit(self.rorepo, 'abc') - commit3 = Commit(self.rorepo, 'zyx') - assert_equal(commit1, commit2) - assert_not_equal(commit2, commit3) - - def test_iter_parents(self): - # should return all 
but ourselves, even if skip is defined - c = self.rorepo.commit('0.1.5') - for skip in (0, 1): - piter = c.iter_parents(skip=skip) - first_parent = piter.next() - assert first_parent != c - assert first_parent == c.parents[0] - # END for each - - def test_base(self): - name_rev = self.rorepo.head.commit.name_rev - assert isinstance(name_rev, basestring) - + def test_equality(self): + commit1 = Commit(self.rorepo, 'abc') + commit2 = Commit(self.rorepo, 'abc') + commit3 = Commit(self.rorepo, 'zyx') + assert_equal(commit1, commit2) + assert_not_equal(commit2, commit3) + + def test_iter_parents(self): + # should return all but ourselves, even if skip is defined + c = self.rorepo.commit('0.1.5') + for skip in (0, 1): + piter = c.iter_parents(skip=skip) + first_parent = piter.next() + assert first_parent != c + assert first_parent == c.parents[0] + # END for each + + def test_base(self): + name_rev = self.rorepo.head.commit.name_rev + assert isinstance(name_rev, basestring) + diff --git a/test/git/test_repo.py b/test/git/test_repo.py index 9316245b..ddf2b3e1 100644 --- a/test/git/test_repo.py +++ b/test/git/test_repo.py @@ -55,32 +55,27 @@ class TestRepo(TestBase): # try from invalid revision that does not exist self.failUnlessRaises(ValueError, self.rorepo.tree, 'hello world') - @patch_object(Git, '_call_process') - def test_commits(self, git): - git.return_value = StringProcessAdapter(fixture('rev_list')) - - commits = list(self.rorepo.iter_commits('master', max_count=10)) + def test_commits(self): + mc = 10 + commits = list(self.rorepo.iter_commits('0.1.6', max_count=mc)) + assert len(commits) == mc c = commits[0] - assert_equal('4c8124ffcf4039d292442eeccabdeca5af5c5017', c.sha) - assert_equal(["634396b2f541a9f2d58b00be1a07f0c358b999b3"], [p.sha for p in c.parents]) - assert_equal("672eca9b7f9e09c22dcb128c283e8c3c8d7697a4", c.tree.sha) - assert_equal("Tom Preston-Werner", c.author.name) - assert_equal("tom@mojombo.com", c.author.email) - assert_equal(1191999972, c.authored_date) - assert_equal("Tom Preston-Werner", c.committer.name) - assert_equal("tom@mojombo.com", c.committer.email) - assert_equal(1191999972, c.committed_date) - assert_equal("implement Grit#heads", c.message) + assert_equal('9a4b1d4d11eee3c5362a4152216376e634bd14cf', c.sha) + assert_equal(["c76852d0bff115720af3f27acdb084c59361e5f6"], [p.sha for p in c.parents]) + assert_equal("ce41fc29549042f1aa09cc03174896cf23f112e3", c.tree.sha) + assert_equal("Michael Trier", c.author.name) + assert_equal("mtrier@gmail.com", c.author.email) + assert_equal(1232829715, c.authored_date) + assert_equal(5*3600, c.author_tz_offset) + assert_equal("Michael Trier", c.committer.name) + assert_equal("mtrier@gmail.com", c.committer.email) + assert_equal(1232829715, c.committed_date) + assert_equal(5*3600, c.committer_tz_offset) + assert_equal("Bumped version 0.1.6", c.message) c = commits[1] - assert_equal(tuple(), c.parents) - - c = commits[2] - assert_equal(["6e64c55896aabb9a7d8e9f8f296f426d21a78c2c", "7f874954efb9ba35210445be456c74e037ba6af2"], map(lambda p: p.sha, c.parents)) - assert_equal("Merge branch 'site'", c.summary) - - assert_true(git.called) + assert isinstance(c.parents, tuple) def test_trees(self): mc = 30 -- cgit v1.2.3 From 538820055ce1bf9dd07ecda48210832f96194504 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 17:39:17 +0200 Subject: git.cmd: added test for stream section constraint used in git command, found bug of course which just didn't kick in yet --- lib/git/cmd.py | 7 +- test/git/test_repo.py | 651 
++++++++++++++++++++++++++++---------------------- 2 files changed, 369 insertions(+), 289 deletions(-) diff --git a/lib/git/cmd.py b/lib/git/cmd.py index cef4ea60..aaa27adc 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -130,8 +130,13 @@ class Git(object): if self._nbr == self._size: return '' + # clamp size to lowest allowed value + bytes_left = self._size - self._nbr if size > -1: - size = min(self._size - self._nbr, size) + size = min(bytes_left, size) + else: + size = bytes_left + # END handle size data = self._stream.readline(size) self._nbr += len(data) diff --git a/test/git/test_repo.py b/test/git/test_repo.py index ddf2b3e1..7dc645b6 100644 --- a/test/git/test_repo.py +++ b/test/git/test_repo.py @@ -10,321 +10,396 @@ from git import * from git.utils import join_path_native import tempfile import shutil +from cStringIO import StringIO class TestRepo(TestBase): - - @raises(InvalidGitRepositoryError) - def test_new_should_raise_on_invalid_repo_location(self): - Repo(tempfile.gettempdir()) + + @raises(InvalidGitRepositoryError) + def test_new_should_raise_on_invalid_repo_location(self): + Repo(tempfile.gettempdir()) - @raises(NoSuchPathError) - def test_new_should_raise_on_non_existant_path(self): - Repo("repos/foobar") + @raises(NoSuchPathError) + def test_new_should_raise_on_non_existant_path(self): + Repo("repos/foobar") - def test_repo_creation_from_different_paths(self): - r_from_gitdir = Repo(self.rorepo.git_dir) - assert r_from_gitdir.git_dir == self.rorepo.git_dir - assert r_from_gitdir.git_dir.endswith('.git') - assert not self.rorepo.git.working_dir.endswith('.git') - assert r_from_gitdir.git.working_dir == self.rorepo.git.working_dir + def test_repo_creation_from_different_paths(self): + r_from_gitdir = Repo(self.rorepo.git_dir) + assert r_from_gitdir.git_dir == self.rorepo.git_dir + assert r_from_gitdir.git_dir.endswith('.git') + assert not self.rorepo.git.working_dir.endswith('.git') + assert r_from_gitdir.git.working_dir == self.rorepo.git.working_dir - def test_description(self): - txt = "Test repository" - self.rorepo.description = txt - assert_equal(self.rorepo.description, txt) + def test_description(self): + txt = "Test repository" + self.rorepo.description = txt + assert_equal(self.rorepo.description, txt) - def test_heads_should_return_array_of_head_objects(self): - for head in self.rorepo.heads: - assert_equal(Head, head.__class__) + def test_heads_should_return_array_of_head_objects(self): + for head in self.rorepo.heads: + assert_equal(Head, head.__class__) - def test_heads_should_populate_head_data(self): - for head in self.rorepo.heads: - assert head.name - assert isinstance(head.commit,Commit) - # END for each head - - assert isinstance(self.rorepo.heads.master, Head) - assert isinstance(self.rorepo.heads['master'], Head) - - def test_tree_from_revision(self): - tree = self.rorepo.tree('0.1.6') - assert len(tree.sha) == 40 - assert tree.type == "tree" - assert self.rorepo.tree(tree) == tree - - # try from invalid revision that does not exist - self.failUnlessRaises(ValueError, self.rorepo.tree, 'hello world') + def test_heads_should_populate_head_data(self): + for head in self.rorepo.heads: + assert head.name + assert isinstance(head.commit,Commit) + # END for each head + + assert isinstance(self.rorepo.heads.master, Head) + assert isinstance(self.rorepo.heads['master'], Head) + + def test_tree_from_revision(self): + tree = self.rorepo.tree('0.1.6') + assert len(tree.sha) == 40 + assert tree.type == "tree" + assert self.rorepo.tree(tree) == tree + + # 
try from invalid revision that does not exist + self.failUnlessRaises(ValueError, self.rorepo.tree, 'hello world') - def test_commits(self): - mc = 10 - commits = list(self.rorepo.iter_commits('0.1.6', max_count=mc)) - assert len(commits) == mc - - c = commits[0] - assert_equal('9a4b1d4d11eee3c5362a4152216376e634bd14cf', c.sha) - assert_equal(["c76852d0bff115720af3f27acdb084c59361e5f6"], [p.sha for p in c.parents]) - assert_equal("ce41fc29549042f1aa09cc03174896cf23f112e3", c.tree.sha) - assert_equal("Michael Trier", c.author.name) - assert_equal("mtrier@gmail.com", c.author.email) - assert_equal(1232829715, c.authored_date) - assert_equal(5*3600, c.author_tz_offset) - assert_equal("Michael Trier", c.committer.name) - assert_equal("mtrier@gmail.com", c.committer.email) - assert_equal(1232829715, c.committed_date) - assert_equal(5*3600, c.committer_tz_offset) - assert_equal("Bumped version 0.1.6", c.message) + def test_commits(self): + mc = 10 + commits = list(self.rorepo.iter_commits('0.1.6', max_count=mc)) + assert len(commits) == mc + + c = commits[0] + assert_equal('9a4b1d4d11eee3c5362a4152216376e634bd14cf', c.sha) + assert_equal(["c76852d0bff115720af3f27acdb084c59361e5f6"], [p.sha for p in c.parents]) + assert_equal("ce41fc29549042f1aa09cc03174896cf23f112e3", c.tree.sha) + assert_equal("Michael Trier", c.author.name) + assert_equal("mtrier@gmail.com", c.author.email) + assert_equal(1232829715, c.authored_date) + assert_equal(5*3600, c.author_tz_offset) + assert_equal("Michael Trier", c.committer.name) + assert_equal("mtrier@gmail.com", c.committer.email) + assert_equal(1232829715, c.committed_date) + assert_equal(5*3600, c.committer_tz_offset) + assert_equal("Bumped version 0.1.6", c.message) - c = commits[1] - assert isinstance(c.parents, tuple) + c = commits[1] + assert isinstance(c.parents, tuple) - def test_trees(self): - mc = 30 - num_trees = 0 - for tree in self.rorepo.iter_trees('0.1.5', max_count=mc): - num_trees += 1 - assert isinstance(tree, Tree) - # END for each tree - assert num_trees == mc + def test_trees(self): + mc = 30 + num_trees = 0 + for tree in self.rorepo.iter_trees('0.1.5', max_count=mc): + num_trees += 1 + assert isinstance(tree, Tree) + # END for each tree + assert num_trees == mc - def _test_empty_repo(self, repo): - # test all kinds of things with an empty, freshly initialized repo. - # It should throw good errors - - # entries should be empty - assert len(repo.index.entries) == 0 - - # head is accessible - assert repo.head - assert repo.head.ref - assert not repo.head.is_valid() - - # we can change the head to some other ref - head_ref = Head.from_path(repo, Head.to_full_path('some_head')) - assert not head_ref.is_valid() - repo.head.ref = head_ref - - # is_dirty can handle all kwargs - for args in ((1, 0, 0), (0, 1, 0), (0, 0, 1)): - assert not repo.is_dirty(*args) - # END for each arg - - # we can add a file to the index ( if we are not bare ) - if not repo.bare: - pass - # END test repos with working tree - + def _test_empty_repo(self, repo): + # test all kinds of things with an empty, freshly initialized repo. 
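The stream-section constraint fixed in the cmd.py hunk at the top of this patch is easiest to see in isolation: every read, including the unbounded size=-1 case, must be clamped to the bytes remaining in the section. A minimal sketch of that rule (hypothetical class, not part of the patch):

	from cStringIO import StringIO

	class BoundedReader(object):
		"""Stand-in for the clamping logic in Git.CatFileContentStream."""
		def __init__(self, size, stream):
			self._size = size		# total bytes this reader may yield
			self._nbr = 0			# bytes yielded so far
			self._stream = stream

		def readline(self, size=-1):
			if self._nbr == self._size:
				return ''
			bytes_left = self._size - self._nbr
			if size > -1:
				size = min(bytes_left, size)
			else:
				size = bytes_left	# the case the original code failed to clamp
			data = self._stream.readline(size)
			self._nbr += len(data)
			return data

	r = BoundedReader(5, StringIO("0123456789\n"))
	assert r.readline() == '01234' and r.readline() == ''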
+ # It should throw good errors + + # entries should be empty + assert len(repo.index.entries) == 0 + + # head is accessible + assert repo.head + assert repo.head.ref + assert not repo.head.is_valid() + + # we can change the head to some other ref + head_ref = Head.from_path(repo, Head.to_full_path('some_head')) + assert not head_ref.is_valid() + repo.head.ref = head_ref + + # is_dirty can handle all kwargs + for args in ((1, 0, 0), (0, 1, 0), (0, 0, 1)): + assert not repo.is_dirty(*args) + # END for each arg + + # we can add a file to the index ( if we are not bare ) + if not repo.bare: + pass + # END test repos with working tree + - def test_init(self): - prev_cwd = os.getcwd() - os.chdir(tempfile.gettempdir()) - git_dir_rela = "repos/foo/bar.git" - del_dir_abs = os.path.abspath("repos") - git_dir_abs = os.path.abspath(git_dir_rela) - try: - # with specific path - for path in (git_dir_rela, git_dir_abs): - r = Repo.init(path=path, bare=True) - assert isinstance(r, Repo) - assert r.bare == True - assert os.path.isdir(r.git_dir) - - self._test_empty_repo(r) - shutil.rmtree(git_dir_abs) - # END for each path - - os.makedirs(git_dir_rela) - os.chdir(git_dir_rela) - r = Repo.init(bare=False) - r.bare == False - - self._test_empty_repo(r) - finally: - try: - shutil.rmtree(del_dir_abs) - except OSError: - pass - os.chdir(prev_cwd) - # END restore previous state - - def test_bare_property(self): - self.rorepo.bare + def test_init(self): + prev_cwd = os.getcwd() + os.chdir(tempfile.gettempdir()) + git_dir_rela = "repos/foo/bar.git" + del_dir_abs = os.path.abspath("repos") + git_dir_abs = os.path.abspath(git_dir_rela) + try: + # with specific path + for path in (git_dir_rela, git_dir_abs): + r = Repo.init(path=path, bare=True) + assert isinstance(r, Repo) + assert r.bare == True + assert os.path.isdir(r.git_dir) + + self._test_empty_repo(r) + shutil.rmtree(git_dir_abs) + # END for each path + + os.makedirs(git_dir_rela) + os.chdir(git_dir_rela) + r = Repo.init(bare=False) + r.bare == False + + self._test_empty_repo(r) + finally: + try: + shutil.rmtree(del_dir_abs) + except OSError: + pass + os.chdir(prev_cwd) + # END restore previous state + + def test_bare_property(self): + self.rorepo.bare - @patch_object(Repo, '__init__') - @patch_object(Git, '_call_process') - def test_init_with_options(self, git, repo): - git.return_value = True - repo.return_value = None + @patch_object(Repo, '__init__') + @patch_object(Git, '_call_process') + def test_init_with_options(self, git, repo): + git.return_value = True + repo.return_value = None - r = Repo.init("repos/foo/bar.git", **{'bare' : True,'template': "/baz/sweet"}) - assert isinstance(r, Repo) + r = Repo.init("repos/foo/bar.git", **{'bare' : True,'template': "/baz/sweet"}) + assert isinstance(r, Repo) - assert_true(git.called) - assert_true(repo.called) + assert_true(git.called) + assert_true(repo.called) - @patch_object(Repo, '__init__') - @patch_object(Git, '_call_process') - def test_clone(self, git, repo): - git.return_value = None - repo.return_value = None + @patch_object(Repo, '__init__') + @patch_object(Git, '_call_process') + def test_clone(self, git, repo): + git.return_value = None + repo.return_value = None - self.rorepo.clone("repos/foo/bar.git") + self.rorepo.clone("repos/foo/bar.git") - assert_true(git.called) - path = os.path.join(absolute_project_path(), '.git') - assert_equal(git.call_args, (('clone', path, 'repos/foo/bar.git'), {})) - assert_true(repo.called) + assert_true(git.called) + path = os.path.join(absolute_project_path(), 
'.git') + assert_equal(git.call_args, (('clone', path, 'repos/foo/bar.git'), {})) + assert_true(repo.called) - @patch_object(Repo, '__init__') - @patch_object(Git, '_call_process') - def test_clone_with_options(self, git, repo): - git.return_value = None - repo.return_value = None + @patch_object(Repo, '__init__') + @patch_object(Git, '_call_process') + def test_clone_with_options(self, git, repo): + git.return_value = None + repo.return_value = None - self.rorepo.clone("repos/foo/bar.git", **{'template': '/awesome'}) + self.rorepo.clone("repos/foo/bar.git", **{'template': '/awesome'}) - assert_true(git.called) - path = os.path.join(absolute_project_path(), '.git') - assert_equal(git.call_args, (('clone', path, 'repos/foo/bar.git'), - { 'template': '/awesome'})) - assert_true(repo.called) + assert_true(git.called) + path = os.path.join(absolute_project_path(), '.git') + assert_equal(git.call_args, (('clone', path, 'repos/foo/bar.git'), + { 'template': '/awesome'})) + assert_true(repo.called) - def test_daemon_export(self): - orig_val = self.rorepo.daemon_export - self.rorepo.daemon_export = not orig_val - assert self.rorepo.daemon_export == ( not orig_val ) - self.rorepo.daemon_export = orig_val - assert self.rorepo.daemon_export == orig_val + def test_daemon_export(self): + orig_val = self.rorepo.daemon_export + self.rorepo.daemon_export = not orig_val + assert self.rorepo.daemon_export == ( not orig_val ) + self.rorepo.daemon_export = orig_val + assert self.rorepo.daemon_export == orig_val - def test_alternates(self): - cur_alternates = self.rorepo.alternates - # empty alternates - self.rorepo.alternates = [] - assert self.rorepo.alternates == [] - alts = [ "other/location", "this/location" ] - self.rorepo.alternates = alts - assert alts == self.rorepo.alternates - self.rorepo.alternates = cur_alternates + def test_alternates(self): + cur_alternates = self.rorepo.alternates + # empty alternates + self.rorepo.alternates = [] + assert self.rorepo.alternates == [] + alts = [ "other/location", "this/location" ] + self.rorepo.alternates = alts + assert alts == self.rorepo.alternates + self.rorepo.alternates = cur_alternates - def test_repr(self): - path = os.path.join(os.path.abspath(GIT_REPO), '.git') - assert_equal('' % path, repr(self.rorepo)) + def test_repr(self): + path = os.path.join(os.path.abspath(GIT_REPO), '.git') + assert_equal('' % path, repr(self.rorepo)) - def test_is_dirty_with_bare_repository(self): - self.rorepo._bare = True - assert_false(self.rorepo.is_dirty()) + def test_is_dirty_with_bare_repository(self): + self.rorepo._bare = True + assert_false(self.rorepo.is_dirty()) - def test_is_dirty(self): - self.rorepo._bare = False - for index in (0,1): - for working_tree in (0,1): - for untracked_files in (0,1): - assert self.rorepo.is_dirty(index, working_tree, untracked_files) in (True, False) - # END untracked files - # END working tree - # END index - self.rorepo._bare = True - assert self.rorepo.is_dirty() == False + def test_is_dirty(self): + self.rorepo._bare = False + for index in (0,1): + for working_tree in (0,1): + for untracked_files in (0,1): + assert self.rorepo.is_dirty(index, working_tree, untracked_files) in (True, False) + # END untracked files + # END working tree + # END index + self.rorepo._bare = True + assert self.rorepo.is_dirty() == False - def test_head(self): - assert self.rorepo.head.reference.object == self.rorepo.active_branch.object + def test_head(self): + assert self.rorepo.head.reference.object == self.rorepo.active_branch.object - def 
test_index(self): - index = self.rorepo.index - assert isinstance(index, IndexFile) - - def test_tag(self): - assert self.rorepo.tag('refs/tags/0.1.5').commit - - def test_archive(self): - tmpfile = os.tmpfile() - self.rorepo.archive(tmpfile, '0.1.5') - assert tmpfile.tell() - - @patch_object(Git, '_call_process') - def test_should_display_blame_information(self, git): - git.return_value = fixture('blame') - b = self.rorepo.blame( 'master', 'lib/git.py') - assert_equal(13, len(b)) - assert_equal( 2, len(b[0]) ) - # assert_equal(25, reduce(lambda acc, x: acc + len(x[-1]), b)) - assert_equal(hash(b[0][0]), hash(b[9][0])) - c = b[0][0] - assert_true(git.called) - assert_equal(git.call_args, (('blame', 'master', '--', 'lib/git.py'), {'p': True})) - - assert_equal('634396b2f541a9f2d58b00be1a07f0c358b999b3', c.sha) - assert_equal('Tom Preston-Werner', c.author.name) - assert_equal('tom@mojombo.com', c.author.email) - assert_equal(1191997100, c.authored_date) - assert_equal('Tom Preston-Werner', c.committer.name) - assert_equal('tom@mojombo.com', c.committer.email) - assert_equal(1191997100, c.committed_date) - assert_equal('initial grit setup', c.message) - - # test the 'lines per commit' entries - tlist = b[0][1] - assert_true( tlist ) - assert_true( isinstance( tlist[0], basestring ) ) - assert_true( len( tlist ) < sum( len(t) for t in tlist ) ) # test for single-char bug - - def test_untracked_files(self): - base = self.rorepo.working_tree_dir - files = ( join_path_native(base, "__test_myfile"), - join_path_native(base, "__test_other_file") ) - num_recently_untracked = 0 - try: - for fpath in files: - fd = open(fpath,"wb") - fd.close() - # END for each filename - untracked_files = self.rorepo.untracked_files - num_recently_untracked = len(untracked_files) - - # assure we have all names - they are relative to the git-dir - num_test_untracked = 0 - for utfile in untracked_files: - num_test_untracked += join_path_native(base, utfile) in files - assert len(files) == num_test_untracked - finally: - for fpath in files: - if os.path.isfile(fpath): - os.remove(fpath) - # END handle files - - assert len(self.rorepo.untracked_files) == (num_recently_untracked - len(files)) - - def test_config_reader(self): - reader = self.rorepo.config_reader() # all config files - assert reader.read_only - reader = self.rorepo.config_reader("repository") # single config file - assert reader.read_only - - def test_config_writer(self): - for config_level in self.rorepo.config_level: - try: - writer = self.rorepo.config_writer(config_level) - assert not writer.read_only - except IOError: - # its okay not to get a writer for some configuration files if we - # have no permissions - pass - # END for each config level - - def test_creation_deletion(self): - # just a very quick test to assure it generally works. 
There are - # specialized cases in the test_refs module - head = self.rorepo.create_head("new_head", "HEAD~1") - self.rorepo.delete_head(head) - - tag = self.rorepo.create_tag("new_tag", "HEAD~2") - self.rorepo.delete_tag(tag) - - remote = self.rorepo.create_remote("new_remote", "git@server:repo.git") - self.rorepo.delete_remote(remote) - - def test_comparison_and_hash(self): - # this is only a preliminary test, more testing done in test_index - assert self.rorepo == self.rorepo and not (self.rorepo != self.rorepo) - assert len(set((self.rorepo, self.rorepo))) == 1 + def test_index(self): + index = self.rorepo.index + assert isinstance(index, IndexFile) + + def test_tag(self): + assert self.rorepo.tag('refs/tags/0.1.5').commit + + def test_archive(self): + tmpfile = os.tmpfile() + self.rorepo.archive(tmpfile, '0.1.5') + assert tmpfile.tell() + + @patch_object(Git, '_call_process') + def test_should_display_blame_information(self, git): + git.return_value = fixture('blame') + b = self.rorepo.blame( 'master', 'lib/git.py') + assert_equal(13, len(b)) + assert_equal( 2, len(b[0]) ) + # assert_equal(25, reduce(lambda acc, x: acc + len(x[-1]), b)) + assert_equal(hash(b[0][0]), hash(b[9][0])) + c = b[0][0] + assert_true(git.called) + assert_equal(git.call_args, (('blame', 'master', '--', 'lib/git.py'), {'p': True})) + + assert_equal('634396b2f541a9f2d58b00be1a07f0c358b999b3', c.sha) + assert_equal('Tom Preston-Werner', c.author.name) + assert_equal('tom@mojombo.com', c.author.email) + assert_equal(1191997100, c.authored_date) + assert_equal('Tom Preston-Werner', c.committer.name) + assert_equal('tom@mojombo.com', c.committer.email) + assert_equal(1191997100, c.committed_date) + assert_equal('initial grit setup', c.message) + + # test the 'lines per commit' entries + tlist = b[0][1] + assert_true( tlist ) + assert_true( isinstance( tlist[0], basestring ) ) + assert_true( len( tlist ) < sum( len(t) for t in tlist ) ) # test for single-char bug + + def test_untracked_files(self): + base = self.rorepo.working_tree_dir + files = ( join_path_native(base, "__test_myfile"), + join_path_native(base, "__test_other_file") ) + num_recently_untracked = 0 + try: + for fpath in files: + fd = open(fpath,"wb") + fd.close() + # END for each filename + untracked_files = self.rorepo.untracked_files + num_recently_untracked = len(untracked_files) + + # assure we have all names - they are relative to the git-dir + num_test_untracked = 0 + for utfile in untracked_files: + num_test_untracked += join_path_native(base, utfile) in files + assert len(files) == num_test_untracked + finally: + for fpath in files: + if os.path.isfile(fpath): + os.remove(fpath) + # END handle files + + assert len(self.rorepo.untracked_files) == (num_recently_untracked - len(files)) + + def test_config_reader(self): + reader = self.rorepo.config_reader() # all config files + assert reader.read_only + reader = self.rorepo.config_reader("repository") # single config file + assert reader.read_only + + def test_config_writer(self): + for config_level in self.rorepo.config_level: + try: + writer = self.rorepo.config_writer(config_level) + assert not writer.read_only + except IOError: + # its okay not to get a writer for some configuration files if we + # have no permissions + pass + # END for each config level + + def test_creation_deletion(self): + # just a very quick test to assure it generally works. 
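The untracked-files test above doubles as usage documentation; a minimal sketch, assuming a repository at a placeholder path:

	import os
	from git import Repo

	repo = Repo("/path/to/repo")				# placeholder path
	scratch = os.path.join(repo.working_tree_dir, "__scratch")
	open(scratch, "wb").close()				# create an empty, unknown file
	assert "__scratch" in repo.untracked_files		# reported relative to the repo root
	os.remove(scratch)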
There are
+		# specialized cases in the test_refs module
+		head = self.rorepo.create_head("new_head", "HEAD~1")
+		self.rorepo.delete_head(head)
+		
+		tag = self.rorepo.create_tag("new_tag", "HEAD~2")
+		self.rorepo.delete_tag(tag)
+		
+		remote = self.rorepo.create_remote("new_remote", "git@server:repo.git")
+		self.rorepo.delete_remote(remote)
+	
+	def test_comparison_and_hash(self):
+		# this is only a preliminary test, more testing done in test_index
+		assert self.rorepo == self.rorepo and not (self.rorepo != self.rorepo)
+		assert len(set((self.rorepo, self.rorepo))) == 1
+	
+	def test_git_cmd(self):
+		# test CatFileContentStream, just to be very sure we have no fencepost errors
+		# last \n is the terminating newline that it expects
+		l1 = "0123456789\n"
+		l2 = "abcdefghijklmnopqrstxy\n"
+		l3 = "z\n"
+		d = "%s%s%s\n" % (l1, l2, l3)
+		
+		l1p = l1[:5]
+		
+		# full size
+		# size is without terminating newline
+		def mkfull():
+			return Git.CatFileContentStream(len(d)-1, StringIO(d))
+		
+		ts = 5
+		def mktiny():
+			return Git.CatFileContentStream(ts, StringIO(d))
+		
+		# readlines no limit
+		s = mkfull()
+		lines = s.readlines()
+		assert len(lines) == 3 and lines[-1].endswith('\n')
+		assert s._stream.tell() == len(d)	# must have scrubbed to the end
+		
+		# readlines line limit
+		s = mkfull()
+		lines = s.readlines(5)
+		assert len(lines) == 1
+		
+		# readlines on tiny sections
+		s = mktiny()
+		lines = s.readlines()
+		assert len(lines) == 1 and lines[0] == l1p
+		assert s._stream.tell() == ts+1
+		
+		# readline no limit
+		s = mkfull()
+		assert s.readline() == l1
+		assert s.readline() == l2
+		assert s.readline() == l3
+		assert s.readline() == ''
+		assert s._stream.tell() == len(d)
+		
+		# readline limit
+		s = mkfull()
+		assert s.readline(5) == l1p
+		assert s.readline() == l1[5:]
+		
+		# readline on tiny section
+		s = mktiny()
+		assert s.readline() == l1p
+		assert s.readline() == ''
+		assert s._stream.tell() == ts+1
+		
+		# read no limit
+		s = mkfull()
+		assert s.read() == d[:-1]
+		assert s.read() == ''
+		assert s._stream.tell() == len(d)
+		
+		# read limit
+		s = mkfull()
+		assert s.read(5) == l1p
+		assert s.read(6) == l1[5:]
+		assert s._stream.tell() == 5 + 6	# it's not yet done
+		
+		# read tiny
+		s = mktiny()
+		assert s.read(2) == l1[:2]
+		assert s._stream.tell() == 2
+		assert s.read() == l1[2:ts]
+		assert s._stream.tell() == ts+1
-- 
cgit v1.2.3


From 282018b79cc8df078381097cb3aeb29ff56e83c6 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Wed, 2 Jun 2010 20:11:00 +0200
Subject: Added first design and frame for object database.

In a first step, loose objects will be written using our utilities, and
certain object retrieval functionality moves into the GitObjectDatabase,
which is used by the repo instance.

Added a performance test for object database access, which shows quite
respectable tree parsing performance, and okay blob access. Nonetheless, it
will be hard to beat the C performance using a pure Python implementation,
but it can be a nice practice to write it anyway to allow more direct pack
manipulations.
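What the new read interface is meant to look like in use: a toy in-memory stand-in honouring the same contract (illustrative only; the concrete classes in this patch are still stubs). The part to note is the (type_string, size_in_bytes, stream) return shape:

	from cStringIO import StringIO

	class MemoryDBR(object):
		"""Toy reader following the iObjectDBR contract; not part of the patch."""
		def __init__(self):
			self._objects = dict()			# hexsha -> (type_string, data)

		def store(self, hexsha, type_string, data):
			self._objects[hexsha] = (type_string, data)

		def has_obj_hex(self, hexsha):
			return hexsha in self._objects

		def obj_hex(self, hexsha):
			type_string, data = self._objects[hexsha]
			return type_string, len(data), StringIO(data)

	db = MemoryDBR()
	db.store("ab" * 20, "blob", "hello")
	assert db.has_obj_hex("ab" * 20)
	type_string, size, stream = db.obj_hex("ab" * 20)
	assert (type_string, size, stream.read()) == ("blob", 5, "hello")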
Some could benefit from the ability to write packs, as these can serve as a
local cache if alternates are used.
---
 lib/git/errors.py                |   15 ++---
 lib/git/odb/__init__.py          |    2 +
 lib/git/odb/db.py                |  129 +++++++++++++++++++++++++++++++++++++++
 test/git/performance/test_odb.py |   61 ++++++++++++++++++
 test/git/test_odb.py             |   12 ++++
 5 files changed, 210 insertions(+), 9 deletions(-)
 create mode 100644 lib/git/odb/__init__.py
 create mode 100644 lib/git/odb/db.py
 create mode 100644 test/git/performance/test_odb.py
 create mode 100644 test/git/test_odb.py

diff --git a/lib/git/errors.py b/lib/git/errors.py
index f66fb528..ecb1c35b 100644
--- a/lib/git/errors.py
+++ b/lib/git/errors.py
@@ -8,19 +8,16 @@ Module containing all exceptions thrown througout the git package,
 """
 
 class InvalidGitRepositoryError(Exception):
-	"""
-	Thrown if the given repository appears to have an invalid format.
-	"""
+	""" Thrown if the given repository appears to have an invalid format. """
+
+class InvalidDBRoot(Exception):
+	"""Thrown if an object database cannot be initialized at the given path"""
 
 class NoSuchPathError(OSError):
-	"""
-	Thrown if a path could not be access by the system.
-	"""
+	""" Thrown if a path could not be accessed by the system. """
 
 class GitCommandError(Exception):
-	"""
-	Thrown if execution of the git command fails with non-zero status code.
-	"""
+	""" Thrown if execution of the git command fails with non-zero status code. """
 	def __init__(self, command, status, stderr=None):
 		self.stderr = stderr
 		self.status = status

diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py
new file mode 100644
index 00000000..17000244
--- /dev/null
+++ b/lib/git/odb/__init__.py
@@ -0,0 +1,2 @@
+"""Initialize the object database module"""
+

diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py
new file mode 100644
index 00000000..fd1b640a
--- /dev/null
+++ b/lib/git/odb/db.py
@@ -0,0 +1,129 @@
+"""Contains implementations of database retrieving objects"""
+import os
+from git.errors import InvalidDBRoot
+
+
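The interface below distinguishes 40-byte hex shas from 20-byte binary shas; these are just two encodings of the same SHA-1, convertible with binascii (the same calls the odb utilities use later in this series):

	import binascii

	hexsha = "282018b79cc8df078381097cb3aeb29ff56e83c6"	# 40 hex characters
	binsha = binascii.a2b_hex(hexsha)			# the same sha as 20 raw bytes
	assert len(hexsha) == 40 and len(binsha) == 20
	assert binascii.b2a_hex(binsha) == hexsha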
+class iObjectDBR(object):
+	"""Defines an interface for object database lookup.
+	Objects are identified either by hex-sha (40 bytes) or
+	by sha (20 bytes)"""
+	__slots__ = tuple()
+	
+	#{ Query Interface
+	def has_obj_hex(self, hexsha):
+		""":return: True if the object identified by the given 40 byte hexsha is
+		contained in the database"""
+		raise NotImplementedError("To be implemented in subclass")
+	
+	def has_obj_bin(self, sha):
+		""":return: as ``has_obj_hex``, but takes a 20 byte binary sha"""
+		raise NotImplementedError("To be implemented in subclass")
+	
+	def obj_hex(self, hexsha):
+		""":return: tuple(type_string, size_in_bytes, stream) a tuple with object
+		information including its type, its size as well as a stream from which its
+		contents can be read"""
+		raise NotImplementedError("To be implemented in subclass")
+	
+	def obj_bin(self, sha):
+		""":return: as in ``obj_hex``, but takes a binary sha"""
+		raise NotImplementedError("To be implemented in subclass")
+	
+	def obj_info_hex(self, hexsha):
+		""":return: tuple(type_string, size_in_bytes) tuple with the object's type
+		string as well as its size in bytes"""
+		raise NotImplementedError("To be implemented in subclass")
+	
+	#} END query interface
+
+class iObjectDBW(object):
+	"""Defines an interface to create objects in the database"""
+	__slots__ = tuple()
+	
+	#{ Edit Interface
+	
+	def to_obj(self, type, size, stream, dry_run=False, sha_as_hex=True):
+		"""Create a new object in the database
+		:return: the sha identifying the object in the database
+		:param type: type string identifying the object
+		:param size: size of the data to read from stream
+		:param stream: stream providing the data
+		:param dry_run: if True, the object database will not actually be changed
+		:param sha_as_hex: if True, the returned sha identifying the object will be
+		hex encoded, not binary"""
+		raise NotImplementedError("To be implemented in subclass")
+	
+	def to_objs(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0):
+		"""Create multiple new objects in the database
+		:return: sequence of shas identifying the created objects in the order in which
+		they were given.
+		:param iter_info: iterable yielding tuples containing the type_string,
+		size_in_bytes and the stream with the content data.
+		:param dry_run: see ``to_obj``
+		:param sha_as_hex: see ``to_obj``
+		:param max_threads: if < 1, any number of threads may be started while processing
+		the request, otherwise the given number of threads will be started."""
+		# a trivial implementation, ignoring the threads for now
+		# TODO: add configuration to the class to determine whether we may
+		# actually use multiple threads, default False of course.
If the add + shas = list() + for args in iter_info: + shas.append(self.to_obj(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + return shas + + #} END edit interface + + +class FileDBBase(object): + """Provides basic facilities to retrieve files of interest, including + caching facilities to help mapping hexsha's to objects""" + __slots__ = ('_root_path', ) + + def __init__(self, root_path): + """Initialize this instance to look for its files at the given root path + All subsequent operations will be relative to this path + :raise InvalidDBRoot: + :note: The base will perform basic checking for accessability, but the subclass + is required to verify that the root_path contains the database structure it needs""" + if not os.path.isdir(root_path): + raise InvalidDBRoot(root_path) + self._root_path = root_path + + + #{ Interface + def root_path(self): + """:return: path at which this db operates""" + return self._root_path + + #} END interface + + #{ Utiltities + def _root_rela_path(self, rela_path): + """:return: the given relative path relative to our database root""" + return os.path.join(self._root_path, rela_path) + + #} END utilities + + +class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): + """A database which operates on loose object files""" + + +class PackedDB(FileDBBase, iObjectDBR): + """A database operating on a set of object packs""" + + +class CompoundDB(iObjectDBR): + """A database which delegates calls to sub-databases""" + + +class ReferenceDB(CompoundDB): + """A database consisting of database referred to in a file""" + + +class GitObjectDB(CompoundDB, iObjectDBW): + """A database representing the default git object store, which includes loose + objects, pack files and an alternates file + + It will create objects only in the loose object database.""" + diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py new file mode 100644 index 00000000..0ad2ce33 --- /dev/null +++ b/test/git/performance/test_odb.py @@ -0,0 +1,61 @@ +"""Performance tests for object store""" + +from time import time +import sys +import stat + +from lib import ( + TestBigRepoReadOnly + ) + + +class TestObjDBPerformance(TestBigRepoReadOnly): + + def test_random_access(self): + + # GET COMMITS + # TODO: use the actual db for this + st = time() + root_commit = self.gitrepo.commit(self.head_sha_2k) + commits = list(root_commit.traverse()) + nc = len(commits) + elapsed = time() - st + + print >> sys.stderr, "Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (nc, elapsed, nc / elapsed) + + + # GET TREES + # walk all trees of all commits + st = time() + blobs_per_commit = list() + nt = 0 + for commit in commits: + tree = commit.tree + blobs = list() + for item in tree.traverse(): + nt += 1 + if item.type == 'blob': + blobs.append(item) + # direct access for speed + # END while trees are there for walking + blobs_per_commit.append(blobs) + # END for each commit + elapsed = time() - st + + print >> sys.stderr, "Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (nt, len(commits), elapsed, nt / elapsed) + + # GET BLOBS + st = time() + nb = 0 + too_many = 15000 + for blob_list in blobs_per_commit: + for blob in blob_list: + blob.data + # END for each blobsha + nb += len(blob_list) + if nb > too_many: + break + # END for each bloblist + elapsed = time() - st + + print >> sys.stderr, "Retrieved %i blob and their data in %g s ( %f blobs / s )" % (nb, elapsed, nb / elapsed) diff --git a/test/git/test_odb.py b/test/git/test_odb.py new file mode 100644 
index 00000000..6f92a5c1 --- /dev/null +++ b/test/git/test_odb.py @@ -0,0 +1,12 @@ +"""Test for object db""" + +from test.testlib import * +from git.odb.db import * + + +class TestDB(TestBase): + """Test the different db class implementations""" + + def test_loose_db(self): + self.fail("todo") + -- cgit v1.2.3 From 8b86f9b399a8f5af792a04025fdeefc02883f3e5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 22:40:52 +0200 Subject: initial version of loose object writing and simple cached object lookup appears to be working --- lib/git/index.py | 4 +- lib/git/odb/db.py | 148 ++++++++++++++++++++++++++++++++++++++++++--------- lib/git/odb/utils.py | 78 +++++++++++++++++++++++++++ lib/git/utils.py | 6 ++- test/git/test_odb.py | 31 ++++++++++- 5 files changed, 237 insertions(+), 30 deletions(-) create mode 100644 lib/git/odb/utils.py diff --git a/lib/git/index.py b/lib/git/index.py index 8ccc3fe3..36428315 100644 --- a/lib/git/index.py +++ b/lib/git/index.py @@ -21,7 +21,7 @@ import git.diff as diff from errors import GitCommandError from git.objects import Blob, Tree, Object, Commit -from git.utils import SHA1Writer, LazyMixin, ConcurrentWriteOperation, join_path_native +from git.utils import IndexFileSHA1Writer, LazyMixin, ConcurrentWriteOperation, join_path_native class CheckoutError( Exception ): @@ -461,7 +461,7 @@ class IndexFile(LazyMixin, diff.Diffable): write_op = ConcurrentWriteOperation(file_path or self._file_path) stream = write_op._begin_writing() - stream = SHA1Writer(stream) + stream = IndexFileSHA1Writer(stream) # header stream.write("DIRC") diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index fd1b640a..204da9ad 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -1,6 +1,21 @@ """Contains implementations of database retrieveing objects""" import os from git.errors import InvalidDBRoot +from git.utils import IndexFileSHA1Writer + +from utils import ( + to_hex_sha, + exists, + hex_to_bin, + FDCompressedSha1Writer, + isdir, + mkdir, + rename, + dirname, + join + ) + +import tempfile class iObjectDBR(object): @@ -9,29 +24,29 @@ class iObjectDBR(object): by sha (20 bytes)""" __slots__ = tuple() + def __contains__(self, sha): + return self.has_obj + #{ Query Interface - def has_obj_hex(self, hexsha): - """:return: True if the object identified by the given 40 byte hexsha is - contained in the database""" - raise NotImplementedError("To be implemented in subclass") - - def has_obj_bin(self, sha): - """:return: as ``has_obj_hex``, but takes a 20 byte binary sha""" - raise NotImplementedError("To be implemented in subclass") - - def obj_hex(self, hexsha): - """:return: tuple(type_string, size_in_bytes, stream) a tuple with object - information including its type, its size as well as a stream from which its - contents can be read""" + def has_object(self, sha): + """ + :return: True if the object identified by the given 40 byte hexsha or 20 bytes + binary sha is contained in the database""" raise NotImplementedError("To be implemented in subclass") - def obj_bin(self, sha): - """:return: as in ``obj_hex``, but takes a binary sha""" + def object(self, sha): + """ + :return: tuple(type_string, size_in_bytes, stream) a tuple with object + information including its type, its size as well as a stream from which its + contents can be read + :param sha: 40 bytes hexsha or 20 bytes binary sha """ raise NotImplementedError("To be implemented in subclass") - def obj_info_hex(self, hexsha): - """:return: tuple(type_string, size_in_bytes) tuple with the object's type - 
string as well as its size in bytes""" + def object_info(self, sha): + """ + :return: tuple(type_string, size_in_bytes) tuple with the object's type + string as well as its size in bytes + :param sha: 40 bytes hexsha or 20 bytes binary sha""" raise NotImplementedError("To be implemented in subclass") #} END query interface @@ -42,7 +57,7 @@ class iObjectDBW(object): #{ Edit Interface - def to_obj(self, type, size, stream, dry_run=False, sha_as_hex=True): + def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): """Create a new object in the database :return: the sha identifying the object in the database :param type: type string identifying the object @@ -53,7 +68,7 @@ class iObjectDBW(object): hex encoded, not binary""" raise NotImplementedError("To be implemented in subclass") - def to_objs(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): + def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): """Create multiple new objects in the database :return: sequence of shas identifying the created objects in the order in which they where given. @@ -68,7 +83,7 @@ class iObjectDBW(object): # actually use multiple threads, default False of course. If the add shas = list() for args in iter_info: - shas.append(self.to_obj(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + shas.append(self.to_object(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) return shas #} END edit interface @@ -95,18 +110,103 @@ class FileDBBase(object): """:return: path at which this db operates""" return self._root_path + def db_path(self, rela_path): + """ + :return: the given relative path relative to our database root, allowing + to pontentially access datafiles""" + return join(self._root_path, rela_path) #} END interface #{ Utiltities - def _root_rela_path(self, rela_path): - """:return: the given relative path relative to our database root""" - return os.path.join(self._root_path, rela_path) + #} END utilities class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): """A database which operates on loose object files""" + __slots__ = ('_hexsha_to_file', ) + + # CONFIGURATION + # chunks in which data will be copied between streams + stream_chunk_size = 1000*1000 + + def __init__(self, root_path): + super(LooseObjectDB, self).__init__(root_path) + self._hexsha_to_file = dict() + + #{ Interface + def hexsha_to_object_path(self, hexsha): + """ + :return: path at which the object with the given hexsha would be stored, + relative to the database root""" + return join(hexsha[:2], hexsha[2:]) + + #} END interface + + def has_object(self, sha): + sha = to_hex_sha(sha) + # try cache + if sha in self._hexsha_to_file: + return True + + # try filesystem + path = self.db_path(self.hexsha_to_object_path(sha)) + if exists(path): + self._hexsha_to_file[sha] = path + return True + # END handle cache + return False + + def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + # open a tmp file to write the data to + fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) + writer = FDCompressedSha1Writer(fd) + + # WRITE HEADER: type SP size NULL + writer.write("%s %i%s" % (type, size, chr(0))) + + # WRITE ALL DATA + chunksize = self.stream_chunk_size + try: + try: + while True: + data_len = writer.write(stream.read(chunksize)) + if data_len < chunksize: + # WRITE FOOTER + writer.write('\n') + break + # END check for stream end + # END duplicate data + finally: + writer.close() + # END assure file was closed + except: + os.remove(tmp_path) + raise + # END 
assure tmpfile removal on error + + + # in dry-run mode, we delete the file afterwards + sha = writer.sha(as_hex=True) + + if dry_run: + os.remove(tmp_path) + else: + # rename the file into place + obj_path = self.db_path(self.hexsha_to_object_path(sha)) + obj_dir = dirname(obj_path) + if not isdir(obj_dir): + mkdir(obj_dir) + # END handle destination directory + rename(tmp_path, obj_path) + # END handle dry_run + + if not sha_as_hex: + sha = hex_to_bin(sha) + # END handle sha format + + return sha class PackedDB(FileDBBase, iObjectDBR): diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py new file mode 100644 index 00000000..04d3eaba --- /dev/null +++ b/lib/git/odb/utils.py @@ -0,0 +1,78 @@ +import binascii +import os +import zlib +from git.utils import make_sha + +__all__ = ('FDSha1Writer', ) + +#{ Routines + +hex_to_bin = binascii.a2b_hex +bin_to_hex = binascii.b2a_hex + +def to_hex_sha(sha): + """:return: hexified version of sha""" + if len(sha) == 40: + return sha + return bin_to_hex(sha) + +def to_bin_sha(sha): + if len(sha) == 20: + return sha + return hex_to_bin(sha) + +# os shortcuts +exists = os.path.exists +mkdir = os.mkdir +isdir = os.path.isdir +rename = os.rename +dirname = os.path.dirname +join = os.path.join +read = os.read +write = os.write +close = os.close +#} END Routines + + +#{ Classes + +class FDCompressedSha1Writer(object): + """Digests data written to it, making the sha available, then compress the + data and write it to the file descriptor + :note: operates on raw file descriptors + :note: for this to work, you have to use the close-method of this instance""" + __slots__ = ("fd", "sha1", "zip") + + # default exception + exc = IOError("Failed to write all bytes to filedescriptor") + + def __init__(self, fd): + self.fd = fd + self.sha1 = make_sha("") + self.zip = zlib.compressobj() + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: lenght of incoming data""" + self.sha1.update(data) + cdata = self.zip.compress(data) + bytes_written = write(self.fd, cdata) + if bytes_written != len(cdata): + raise self.exc + return bytes_written + + def sha(self, as_hex = False): + """:return: sha so far + :param as_hex: if True, sha will be hex-encoded, binary otherwise""" + if as_hex: + return self.sha1.hexdigest() + return self.sha1.digest() + + def close(self): + remainder = self.zip.flush() + if write(self.fd, remainder) != len(remainder): + raise self.exc + return close(self.fd) + + +#} END classes diff --git a/lib/git/utils.py b/lib/git/utils.py index c21528b1..360c77c9 100644 --- a/lib/git/utils.py +++ b/lib/git/utils.py @@ -61,12 +61,14 @@ def join_path_native(a, *p): return to_native_path(join_path(a, *p)) -class SHA1Writer(object): +class IndexFileSHA1Writer(object): """ Wrapper around a file-like object that remembers the SHA1 of the data written to it. It will write a sha when the stream is closed or if the asked for explicitly usign write_sha. 
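For reference, git's canonical loose-object digest covers a 'type SP size NUL' header followed by the payload, and the same bytes are zlib-compressed on disk; a minimal sketch using only the standard library (note that to_object above additionally routes a trailing newline through the writer before closing):

	import hashlib
	import zlib

	data = "hello"
	header = "%s %i%s" % ("blob", len(data), chr(0))	# 'type SP size NUL', as written above
	sha = hashlib.sha1(header + data).hexdigest()		# the sha git assigns to this blob
	payload = zlib.compress(header + data)			# stored at objects/<sha[:2]>/<sha[2:]>
	assert sha == "b6fc4c620b67d95f953a5c1c1230aaab5db5a1b0"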
+ Only useful to the indexfile + Note: Based on the dulwich project """ @@ -78,7 +80,7 @@ class SHA1Writer(object): def write(self, data): self.sha1.update(data) - self.f.write(data) + return self.f.write(data) def write_sha(self): sha = self.sha1.digest() diff --git a/test/git/test_odb.py b/test/git/test_odb.py index 6f92a5c1..bc92a493 100644 --- a/test/git/test_odb.py +++ b/test/git/test_odb.py @@ -2,11 +2,38 @@ from test.testlib import * from git.odb.db import * +from git import Blob + +from cStringIO import StringIO +import os class TestDB(TestBase): """Test the different db class implementations""" - def test_loose_db(self): - self.fail("todo") + # data + two_lines = "1234\nhello world" + + all_data = (two_lines, ) + + def _assert_object_writing(self, db): + """General tests to verify object writing, compatible to iObjectDBW + :note: requires write access to the database""" + # start in dry-run mode + for dry_run in range(1, -1, -1): + for data in self.all_data: + for hex_sha in range(2): + sha = db.to_object(Blob.type, len(data), StringIO(data), dry_run, hex_sha) + assert db.has_object(sha) != dry_run + assert len(sha) == 20 + hex_sha * 20 + # END for each sha type + # END for each data set + # END for each dry_run mode + + @with_bare_rw_repo + def test_writing(self, rwrepo): + ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects')) + + # write data + self._assert_object_writing(ldb) -- cgit v1.2.3 From 6f8ce8901e21587cd2320562df412e05b5ab1731 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 23:53:29 +0200 Subject: added frame for object reading, including simple test --- lib/git/errors.py | 8 +++++- lib/git/odb/db.py | 76 ++++++++++++++++++++++++++++++++++++++++++---------- lib/git/odb/utils.py | 1 + test/git/test_odb.py | 13 +++++++++ 4 files changed, 83 insertions(+), 15 deletions(-) diff --git a/lib/git/errors.py b/lib/git/errors.py index ecb1c35b..956e007f 100644 --- a/lib/git/errors.py +++ b/lib/git/errors.py @@ -10,8 +10,14 @@ Module containing all exceptions thrown througout the git package, class InvalidGitRepositoryError(Exception): """ Thrown if the given repository appears to have an invalid format. """ -class InvalidDBRoot(Exception): +class ODBError(Exception): + """All errors thrown by the object database""" + +class InvalidDBRoot(ODBError): """Thrown if an object database cannot be initialized at the given path""" + +class BadObject(ODBError): + """The object with the given SHA does not exist""" class NoSuchPathError(OSError): """ Thrown if a path could not be access by the system. 
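With ODBError as the common base added above, callers can tell a missing object apart from other database failures; a hypothetical helper, where 'db' stands for any reader offering the object_info call sketched in this series:

	from git.errors import BadObject, ODBError	# ODBError is the new common base

	def describe(db, sha):
		"""Hypothetical helper, not part of the patch."""
		try:
			type_string, size = db.object_info(sha)
		except BadObject:
			return "no object for %s" % sha
		return "%s (%i bytes)" % (type_string, size)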
""" diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 204da9ad..1248a3f4 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -1,9 +1,13 @@ """Contains implementations of database retrieveing objects""" import os -from git.errors import InvalidDBRoot +from git.errors import ( + InvalidDBRoot, + BadObject + ) from git.utils import IndexFileSHA1Writer from utils import ( + getsize, to_hex_sha, exists, hex_to_bin, @@ -16,6 +20,7 @@ from utils import ( ) import tempfile +import mmap class iObjectDBR(object): @@ -136,27 +141,70 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): self._hexsha_to_file = dict() #{ Interface - def hexsha_to_object_path(self, hexsha): + def object_path(self, hexsha): """ :return: path at which the object with the given hexsha would be stored, relative to the database root""" return join(hexsha[:2], hexsha[2:]) - #} END interface - - def has_object(self, sha): - sha = to_hex_sha(sha) - # try cache - if sha in self._hexsha_to_file: - return True + def readable_db_object_path(self, hexsha): + """ + :return: readable object path to the object identified by hexsha + :raise BadObject: If the object file does not exist""" + try: + return self._hexsha_to_file[hexsha] + except KeyError: + pass + # END ignore cache misses # try filesystem - path = self.db_path(self.hexsha_to_object_path(sha)) + path = self.db_path(self.object_path(hexsha)) if exists(path): - self._hexsha_to_file[sha] = path - return True + self._hexsha_to_file[hexsha] = path + return path # END handle cache - return False + raise BadObject(hexsha) + + #} END interface + + def _object_header_info(self, mmap): + """:return: tuple(type_string, uncompressed_size_in_bytes + :param mmap: newly mapped memory map at position 0. It will be + seeked to the actual start of the object contents, which can be used + to initialize a zlib decompress object.""" + raise NotImplementedError("todo") + + def _map_object(self, sha): + """ + :return: tuple(file, mmap) tuple with an opened file for reading, and + a memory map of that file""" + db_path = self.readable_db_object_path(to_hex_sha(sha)) + f = open(db_path, 'rb') + m = mmap.mmap(f.fileno(), getsize(db_path), access=mmap.ACCESS_READ) + return (f, m) + + def object_info(self, sha): + f, m = self._map_object(sha) + try: + type, size = self._object_header_info(m) + finally: + f.close() + m.close() + # END assure release of system resources + + def object(self, sha): + f, m = self._map_object(sha) + type, size = self._object_header_info(m) + # TODO: init a dynamic decompress stream from our memory map + + + def has_object(self, sha): + try: + self.readable_db_object_path(to_hex_sha(sha)) + return True + except BadObject: + return False + # END check existance def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): # open a tmp file to write the data to @@ -194,7 +242,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): os.remove(tmp_path) else: # rename the file into place - obj_path = self.db_path(self.hexsha_to_object_path(sha)) + obj_path = self.db_path(self.object_path(sha)) obj_dir = dirname(obj_path) if not isdir(obj_dir): mkdir(obj_dir) diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 04d3eaba..d88dca1a 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -22,6 +22,7 @@ def to_bin_sha(sha): return hex_to_bin(sha) # os shortcuts +getsize = os.path.getsize exists = os.path.exists mkdir = os.mkdir isdir = os.path.isdir diff --git a/test/git/test_odb.py b/test/git/test_odb.py index bc92a493..b2840719 
100644 --- a/test/git/test_odb.py +++ b/test/git/test_odb.py @@ -3,6 +3,7 @@ from test.testlib import * from git.odb.db import * from git import Blob +from git.errors import BadObject from cStringIO import StringIO import os @@ -26,6 +27,18 @@ class TestDB(TestBase): sha = db.to_object(Blob.type, len(data), StringIO(data), dry_run, hex_sha) assert db.has_object(sha) != dry_run assert len(sha) == 20 + hex_sha * 20 + + # verify data - the slow way, we want to run code + if not dry_run: + type, size = db.object_info(sha) + assert Blob.type == type + assert size == len(data) + + type, size, stream = db.object(sha) + assert stream.read() == data + else: + self.failUnlessRaises(BadObject, db.object_info, sha) + self.failUnlessRaises(BadObject, db.object, sha) # END for each sha type # END for each data set # END for each dry_run mode -- cgit v1.2.3 From 38d59fc8ccccae8882fa48671377bf40a27915a7 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 16:35:35 +0200 Subject: odb: implemented loose object streaming, which is impossible to do efficiently considering that it copies string buffers all the time --- lib/git/errors.py | 3 + lib/git/objects/base.py | 4 +- lib/git/odb/db.py | 114 +++++++++++++++------------- lib/git/odb/fun.py | 114 ++++++++++++++++++++++++++++ lib/git/odb/utils.py | 147 ++++++++++++++++++++++++++++++++++++- test/git/performance/test_utils.py | 44 +++++++++++ 6 files changed, 371 insertions(+), 55 deletions(-) create mode 100644 lib/git/odb/fun.py create mode 100644 test/git/performance/test_utils.py diff --git a/lib/git/errors.py b/lib/git/errors.py index 956e007f..d8a35e02 100644 --- a/lib/git/errors.py +++ b/lib/git/errors.py @@ -18,6 +18,9 @@ class InvalidDBRoot(ODBError): class BadObject(ODBError): """The object with the given SHA does not exist""" + +class BadObjectType(ODBError): + """The object had an unsupported type""" class NoSuchPathError(OSError): """ Thrown if a path could not be accessed by the system.
""" diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index f7043199..64a5678e 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -125,8 +125,8 @@ class Object(LazyMixin): Returns File Object compatible stream to the uncompressed raw data of the object """ - proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) - return utils.ProcessStreamAdapter(proc, "stdout") + sha, type, size, stream = self.repo.git.stream_object_data(self.sha) + return stream def stream_data(self, ostream): """ diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 1248a3f4..5c50a512 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -1,17 +1,18 @@ """Contains implementations of database retrieveing objects""" -import os +from git.utils import IndexFileSHA1Writer from git.errors import ( InvalidDBRoot, - BadObject + BadObject, + BadObjectType ) -from git.utils import IndexFileSHA1Writer from utils import ( - getsize, + DecompressMemMapReader, + FDCompressedSha1Writer, + ENOENT, to_hex_sha, exists, hex_to_bin, - FDCompressedSha1Writer, isdir, mkdir, rename, @@ -19,8 +20,15 @@ from utils import ( join ) +from fun import ( + chunk_size, + loose_object_header_info, + write_object + ) + import tempfile import mmap +import os class iObjectDBR(object): @@ -36,7 +44,8 @@ class iObjectDBR(object): def has_object(self, sha): """ :return: True if the object identified by the given 40 byte hexsha or 20 bytes - binary sha is contained in the database""" + binary sha is contained in the database + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def object(self, sha): @@ -44,14 +53,16 @@ class iObjectDBR(object): :return: tuple(type_string, size_in_bytes, stream) a tuple with object information including its type, its size as well as a stream from which its contents can be read - :param sha: 40 bytes hexsha or 20 bytes binary sha """ + :param sha: 40 bytes hexsha or 20 bytes binary sha + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def object_info(self, sha): """ :return: tuple(type_string, size_in_bytes) tuple with the object's type string as well as its size in bytes - :param sha: 40 bytes hexsha or 20 bytes binary sha""" + :param sha: 40 bytes hexsha or 20 bytes binary sha + :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") #} END query interface @@ -70,7 +81,8 @@ class iObjectDBW(object): :param stream: stream providing the data :param dry_run: if True, the object database will not actually be changed :param sha_as_hex: if True, the returned sha identifying the object will be - hex encoded, not binary""" + hex encoded, not binary + :raise IOError: if data could not be written""" raise NotImplementedError("To be implemented in subclass") def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): @@ -82,7 +94,8 @@ class iObjectDBW(object): :param dry_run: see ``to_obj`` :param sha_as_hex: see ``to_obj`` :param max_threads: if < 1, any number of threads may be started while processing - the request, otherwise the given number of threads will be started.""" + the request, otherwise the given number of threads will be started. + :raise IOError: if data could not be written""" # a trivial implementation, ignoring the threads for now # TODO: add configuration to the class to determine whether we may # actually use multiple threads, default False of course. 
If the add @@ -130,15 +143,19 @@ class FileDBBase(object): class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): """A database which operates on loose object files""" - __slots__ = ('_hexsha_to_file', ) - + __slots__ = ('_hexsha_to_file', '_fd_open_flags') # CONFIGURATION # chunks in which data will be copied between streams - stream_chunk_size = 1000*1000 + stream_chunk_size = chunk_size + def __init__(self, root_path): super(LooseObjectDB, self).__init__(root_path) self._hexsha_to_file = dict() + # Additional Flags - might be set to 0 after the first failure + # Depending on the root, this might work for some mounts, for others not, which + # is why it is per instance + self._fd_open_flags = os.O_NOATIME #{ Interface def object_path(self, hexsha): @@ -167,36 +184,46 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): #} END interface - def _object_header_info(self, mmap): - """:return: tuple(type_string, uncompressed_size_in_bytes - :param mmap: newly mapped memory map at position 0. It will be - seeked to the actual start of the object contents, which can be used - to initialize a zlib decompress object.""" - raise NotImplementedError("todo") - - def _map_object(self, sha): + def _map_loose_object(self, sha): """ - :return: tuple(file, mmap) tuple with an opened file for reading, and - a memory map of that file""" - db_path = self.readable_db_object_path(to_hex_sha(sha)) - f = open(db_path, 'rb') - m = mmap.mmap(f.fileno(), getsize(db_path), access=mmap.ACCESS_READ) - return (f, m) + :return: memory map of that file to allow random read access + :raise BadObject: if object could not be located""" + db_path = self.db_path(self.object_path(to_hex_sha(sha))) + try: + fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags) + except OSError,e: + if e.errno != ENOENT: + # try again without noatime + try: + fd = os.open(db_path, os.O_RDONLY) + except OSError: + raise BadObject(to_hex_sha(sha)) + # didn't work because of our flag, don't try it again + self._fd_open_flags = 0 + else: + raise BadObject(to_hex_sha(sha)) + # END handle error + # END exception handling + try: + return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) + finally: + os.close(fd) + # END assure file is closed def object_info(self, sha): - f, m = self._map_object(sha) + m = self._map_loose_object(sha) try: - type, size = self._object_header_info(m) + return loose_object_header_info(m) finally: - f.close() m.close() # END assure release of system resources def object(self, sha): - f, m = self._map_object(sha) - type, size = self._object_header_info(m) - # TODO: init a dynamic decompress stream from our memory map + m = self._map_loose_object(sha) + reader = DecompressMemMapReader(m, close_on_deletion = True) + type, size = reader.initialize() + return type, size, reader def has_object(self, sha): try: @@ -210,25 +237,10 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): # open a tmp file to write the data to fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) writer = FDCompressedSha1Writer(fd) - - # WRITE HEADER: type SP size NULL - writer.write("%s %i%s" % (type, size, chr(0))) - - # WRITE ALL DATA - chunksize = self.stream_chunk_size + try: - try: - while True: - data_len = writer.write(stream.read(chunksize)) - if data_len < chunksize: - # WRITE FOOTER - writer.write('\n') - break - # END check for stream end - # END duplicate data - finally: - writer.close() - # END assure file was closed + write_object(type, size, stream, writer, + close_target_stream=True, 
chunk_size=self.stream_chunk_size) except: os.remove(tmp_path) raise diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py new file mode 100644 index 00000000..ee7144dd --- /dev/null +++ b/lib/git/odb/fun.py @@ -0,0 +1,114 @@ +"""Contains basic c-functions which usually contain performance critical code +Keeping this code separate from the beginning makes it easier to out-source +it into c later, if required""" + +from git.errors import ( + BadObjectType + ) + +import zlib +decompressobj = zlib.decompressobj + + +# INVARIANTS +type_id_to_type_map = { + 1 : "commit", + 2 : "tree", + 3 : "blob", + 4 : "tag" + } + +# used when dealing with larger streams +chunk_size = 1000*1000 + + +#{ Routines + +def is_loose_object(m): + """:return: True the file contained in memory map m appears to be a loose object. + Only the first two bytes are needed""" + b0, b1 = map(ord, m[:2]) + word = (b0 << 8) + b1 + return b0 == 0x78 and (word % 31) == 0 + +def loose_object_header_info(m): + """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the + object as well as its uncompressed size in bytes. + :param m: memory map from which to read the compressed object data""" + decompress_size = 8192 # is used in cgit as well + hdr = decompressobj().decompress(m, decompress_size) + type_name, size = hdr[:hdr.find("\0")].split(" ") + return type_name, int(size) + +def object_header_info(m): + """:return: tuple(type_string, uncompressed_size_in_bytes + :param mmap: mapped memory map. It will be + seeked to the actual start of the object contents, which can be used + to initialize a zlib decompress object. + :note: This routine can only handle new-style objects which are assumably contained + in packs + """ + assert not is_loose_object(m), "Use loose_object_header_info instead" + + c = b0 # first byte + i = 1 # next char to read + type_id = (c >> 4) & 7 # numeric type + size = c & 15 # starting size + s = 4 # starting bit-shift size + while c & 0x80: + c = ord(m[i]) + i += 1 + size += (c & 0x7f) << s + s += 7 + # END character loop + + # finally seek the map to the start of the data stream + m.seek(i) + try: + return (type_id_to_type_map[type_id], size) + except KeyError: + # invalid object type - we could try to be smart now and decode part + # of the stream to get the info, problem is that we had trouble finding + # the exact start of the content stream + raise BadObjectType(type_id) + # END handle exceptions + +def write_object(type, size, source_stream, target_stream, close_target_stream=True, + chunk_size=chunk_size): + """Write the object as identified by type, size and source_stream into the + target_stream + + :param type: type string of the object + :param size: amount of bytes to write from source_stream + :param source_stream: stream as file-like object providing at least size bytes + :param target_stream: stream as file-like object to receive the data + :param close_target_stream: if True, the target stream will be closed when + the routine exits, even if an error is thrown + :param chunk_size: size of chunks to read from source. 
Larger values can be beneficial + for io performance, but cost more memory as well + :return: The actual amount of bytes written to stream, which includes the header and a trailing newline""" + tbw = 0 # total num bytes written + dbw = 0 # num data bytes written + try: + # WRITE HEADER: type SP size NULL + tbw += target_stream.write("%s %i\0" % (type, size)) + + # WRITE ALL DATA UP TO SIZE + while True: + cs = min(chunk_size, size-dbw) + data_len = target_stream.write(source_stream.read(cs)) + dbw += data_len + if data_len < cs or dbw == size: + tbw += dbw + break + # END check for stream end + # END duplicate data + return tbw + finally: + if close_target_stream: + target_stream.close() + # END handle stream closing + # END assure file was closed + + +#} END routines diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index d88dca1a..8a054201 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -1,7 +1,10 @@ import binascii import os import zlib +from cStringIO import StringIO from git.utils import make_sha +import errno +from fun import chunk_size __all__ = ('FDSha1Writer', ) @@ -21,8 +24,10 @@ def to_bin_sha(sha): return sha return hex_to_bin(sha) +# errors +ENOENT = errno.ENOENT + # os shortcuts -getsize = os.path.getsize exists = os.path.exists mkdir = os.mkdir isdir = os.path.isdir @@ -32,6 +37,11 @@ join = os.path.join read = os.read write = os.write close = os.close + +# ZLIB configuration +# used when compressing objects +Z_BEST_SPEED = 1 + #} END Routines @@ -50,7 +60,7 @@ class FDCompressedSha1Writer(object): def __init__(self, fd): self.fd = fd self.sha1 = make_sha("") - self.zip = zlib.compressobj() + self.zip = zlib.compressobj(Z_BEST_SPEED) def write(self, data): """:raise IOError: If not all bytes could be written @@ -76,4 +86,137 @@ class FDCompressedSha1Writer(object): return close(self.fd) +class DecompressMemMapReader(object): + """Reads data in chunks from a memory map and decompresses it. The client sees + only the uncompressed data, respective file-like read calls are handling on-demand + buffered decompression accordingly + + A constraint on the total size of bytes is activated, simulating + a logical file within a possibly larger physical memory area + + To read efficiently, you clearly don't want to read individual bytes, instead, + read a few kilobytes at least. + + :note: The chunk-size should be carefully selected as it will involve quite a bit + of string copying due to the way the zlib is implemented. Its very wasteful, + hence we try to find a good tradeoff between allocation time and number of + times we actually allocate. An own zlib implementation would be good here + to better support streamed reading - it would only need to keep the mmap + and decompress it into chunks, thats all ... """ + __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_cs', '_close') + + def __init__(self, m, close_on_deletion, cs = 128*1024): + """Initialize with mmap and chunk_size for stream reading""" + self._m = m + self._zip = zlib.decompressobj() + self._buf = None # buffer of decompressed bytes + self._buflen = 0 # length of bytes in buffer + self._s = 0 # size of uncompressed data to read in total + self._br = 0 # num uncompressed bytes read + self._cws = 0 # start byte of compression window + self._cwe = 0 # end byte of compression window + self._cs = cs # chunk size (when reading from zip) + self._close = close_on_deletion # close the memmap on deletion ? 
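For orientation while reading the streamed read() implementation that follows: the non-streaming equivalent of what this class produces is only a few lines. A minimal sketch, assuming nothing beyond the standard zlib module; it inflates a complete loose object in one call instead of sliding a window over the memory map, which is exactly the full-size allocation the reader below is designed to avoid:

    import zlib

    def read_loose_object(data):
        # a loose object is the zlib-deflated bytes "<type> <size>\0<content>"
        raw = zlib.decompress(data)
        hdrend = raw.find("\0")
        type_name, size = raw[:hdrend].split(" ")
        return type_name, int(size), raw[hdrend + 1:]

    # round-trip check against a hand-built object
    assert read_loose_object(zlib.compress("blob 11\0hello world")) == ("blob", 11, "hello world")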
+ + def __del__(self): + if self._close: + self._m.close() + # END handle resource freeing + + def initialize(self, size=0): + """Initialize this instance for acting as a read-only stream for size bytes. + :param size: size in bytes to be decompresed before being depleted. + If 0, default object header information is parsed from the data, + returning a tuple of (type_string, uncompressed_size) + If not 0, the size will be used, and None is returned. + :note: must only be called exactly once""" + if size: + self._s = size + return + # END handle size + + # read header + maxb = 8192 + self._s = maxb + hdr = self.read(maxb) + hdrend = hdr.find("\0") + type, size = hdr[:hdrend].split(" ") + self._s = int(size) + + # adjust internal state to match actual header length that we ignore + # The buffer will be depleted first on future reads + self._br = 0 + hdrend += 1 # count terminating \0 + self._buf = StringIO(hdr[hdrend:]) + self._buflen = len(hdr) - hdrend + + return type, size + + def read(self, size=-1): + if size < 1: + size = self._s - self._br + else: + size = min(size, self._s - self._br) + # END clamp size + + if size == 0: + return str() + # END handle depletion + + # deplete the buffer, then just continue using the decompress object + # which has an own buffer. We just need this to transparently parse the + # header from the zlib stream + dat = str() + if self._buf: + if self._buflen >= size: + # have enough data + dat = self._buf.read(size) + self._buflen -= size + self._br += size + return dat + else: + dat = self._buf.getvalue() # ouch, duplicates data + size -= self._buflen + self._br += self._buflen + + self._buflen = 0 + self._buf = None + # END handle buffer len + # END handle buffer + + # decompress some data + # Abstract: zlib needs to operate on chunks of our memory map ( which may + # be large ), as it will otherwise and always fill in the 'unconsumed_tail' + # attribute which possible reads our whole map to the end, forcing + # everything to be read from disk even though just a portion was requested. + # As this would be a nogo, we workaround it by passing only chunks of data, + # moving the window into the memory map along as we decompress, which keeps + # the tail smaller than our chunk-size. This causes 'only' the chunk to be + # copied once, and another copy of a part of it when it creates the unconsumed + # tail. We have to use it to hand in the appropriate amount of bytes durin g + # the next read. + if self._zip.unconsumed_tail: + # move the window, make it as large as size demands. For code-clarity, + # we just take the chunk from our map again instead of reusing the unconsumed + # tail. The latter one would safe some memory copying, but we could end up + # with not getting enough data uncompressed, so we had to sort that out as well. + # Now we just assume the worst case, hence the data is uncompressed and the window + # needs to be as large as the uncompressed bytes we want to read. + self._cws = self._cwe - len(self._zip.unconsumed_tail) + self._cwe = self._cws + size + indata = self._m[self._cws:self._cwe] # another copy ... :( + else: + cws = self._cws + self._cws = self._cwe + self._cwe = cws + size + indata = self._m[self._cws:self._cwe] # ... 
copy it again :( + # END handle tail + + dcompdat = self._zip.decompress(indata, size) + self._br += len(dcompdat) + + if dat: + return dat + dcompdat + return dcompdat + #} END classes diff --git a/test/git/performance/test_utils.py b/test/git/performance/test_utils.py new file mode 100644 index 00000000..381d7c8b --- /dev/null +++ b/test/git/performance/test_utils.py @@ -0,0 +1,44 @@ +"""Performance of utilities""" +from time import time +import sys +import stat + +from lib import ( + TestBigRepoReadOnly + ) + + +class TestUtilPerformance(TestBigRepoReadOnly): + + def test_access(self): + # compare dict vs. slot access + class Slotty(object): + __slots__ = "attr" + def __init__(self): + self.attr = 1 + + class Dicty(object): + def __init__(self): + self.attr = 1 + + class BigSlotty(object): + __slots__ = ('attr', ) + tuple('abcdefghijk') + def __init__(self): + for attr in self.__slots__: + setattr(self, attr, 1) + + class BigDicty(object): + def __init__(self): + for attr in BigSlotty.__slots__: + setattr(self, attr, 1) + + ni = 1000000 + for cls in (Slotty, Dicty, BigSlotty, BigDicty): + cli = cls() + st = time() + for i in xrange(ni): + cli.attr + # END for each access + elapsed = time() - st + print >> sys.stderr, "Accessed %s.attr %i times in %s s ( %f acc / s)" % (cls.__name__, ni, elapsed, ni / elapsed) + # END for each class type -- cgit v1.2.3 From 26e138cb47dccc859ff219f108ce9b7d96cbcbcd Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 18:21:05 +0200 Subject: odb: fixed streamed decompression reader ( specific tests would still be missing ) and added performance tests which are extremely promising --- lib/git/odb/db.py | 4 +- lib/git/odb/utils.py | 22 +++++---- test/git/performance/test_streams.py | 91 ++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 10 deletions(-) create mode 100644 test/git/performance/test_streams.py diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 5c50a512..e656b2b5 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -91,8 +91,8 @@ class iObjectDBW(object): they where given. :param iter_info: iterable yielding tuples containing the type_string size_in_bytes and the steam with the content data. - :param dry_run: see ``to_obj`` - :param sha_as_hex: see ``to_obj`` + :param dry_run: see ``to_object`` + :param sha_as_hex: see ``to_object`` :param max_threads: if < 1, any number of threads may be started while processing the request, otherwise the given number of threads will be started. :raise IOError: if data could not be written""" diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 8a054201..1e4a8e9d 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -39,7 +39,7 @@ write = os.write close = os.close # ZLIB configuration -# used when compressing objects +# used when compressing objects - 1 to 9 ( slowest ) Z_BEST_SPEED = 1 #} END Routines @@ -70,7 +70,7 @@ class FDCompressedSha1Writer(object): bytes_written = write(self.fd, cdata) if bytes_written != len(cdata): raise self.exc - return bytes_written + return len(data) def sha(self, as_hex = False): """:return: sha so far @@ -175,7 +175,7 @@ class DecompressMemMapReader(object): self._br += size return dat else: - dat = self._buf.getvalue() # ouch, duplicates data + dat = self._buf.read() # ouch, duplicates data size -= self._buflen self._br += self._buflen @@ -195,28 +195,34 @@ class DecompressMemMapReader(object): # copied once, and another copy of a part of it when it creates the unconsumed # tail. 
We have to use it to hand in the appropriate amount of bytes durin g # the next read. - if self._zip.unconsumed_tail: + tail = self._zip.unconsumed_tail + if tail: # move the window, make it as large as size demands. For code-clarity, # we just take the chunk from our map again instead of reusing the unconsumed # tail. The latter one would safe some memory copying, but we could end up # with not getting enough data uncompressed, so we had to sort that out as well. # Now we just assume the worst case, hence the data is uncompressed and the window # needs to be as large as the uncompressed bytes we want to read. - self._cws = self._cwe - len(self._zip.unconsumed_tail) + self._cws = self._cwe - len(tail) self._cwe = self._cws + size + + indata = self._m[self._cws:self._cwe] # another copy ... :( + # get the actual window end to be sure we don't use it for computations + self._cwe = self._cws + len(indata) else: cws = self._cws self._cws = self._cwe self._cwe = cws + size indata = self._m[self._cws:self._cwe] # ... copy it again :( # END handle tail - + dcompdat = self._zip.decompress(indata, size) - self._br += len(dcompdat) + self._br += len(dcompdat) if dat: - return dat + dcompdat + dcompdat = dat + dcompdat + return dcompdat #} END classes diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py new file mode 100644 index 00000000..15924c08 --- /dev/null +++ b/test/git/performance/test_streams.py @@ -0,0 +1,91 @@ +"""Performance data streaming performance""" + +from test.testlib import * +from git.odb.db import * + +from array import array +from cStringIO import StringIO +from time import time +import os +import sys +import stat +import random + + +from lib import ( + TestBigRepoReadOnly + ) + + + +def make_memory_file(size_in_bytes, randomize=False): + """:return: tuple(size_of_stream, stream) + :param randomize: try to produce a very random stream""" + actual_size = size_in_bytes / 4 + producer = xrange(actual_size) + if randomize: + producer = list(producer) + random.shuffle(producer) + # END randomize + a = array('i', producer) + return actual_size*4, StringIO(a.tostring()) + + +class TestObjDBPerformance(TestBigRepoReadOnly): + + large_data_size_bytes = 1000*1000*10 # some MiB should do it + moderate_data_size_bytes = 1000*1000*1 # just 1 MiB + + @with_bare_rw_repo + def test_large_data_streaming(self, rwrepo): + ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects')) + + for randomize in range(2): + desc = (randomize and 'random ') or '' + print >> sys.stderr, "Creating %s data ..." 
% desc + st = time() + size, stream = make_memory_file(self.large_data_size_bytes, randomize) + elapsed = time() - st + print >> sys.stderr, "Done (in %f s)" % elapsed + + # writing - due to the compression it will seem faster than it is + st = time() + sha = ldb.to_object('blob', size, stream) + elapsed = time() - st + assert ldb.has_object(sha) + fsize_kib = os.path.getsize(ldb.readable_db_object_path(sha)) / 1000 + + + size_kib = size / 1000 + print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed, size_kib / elapsed) + + # reading all at once + st = time() + type, size, shastream = ldb.object(sha) + shadata = shastream.read() + elapsed = time() - st + + stream.seek(0) + assert shadata == stream.getvalue() + print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed, size_kib / elapsed) + + + # reading in chunks of 1 MiB + cs = 512*1000 + chunks = list() + st = time() + type, size, shastream = ldb.object(sha) + while True: + data = shastream.read(cs) + chunks.append(data) + if len(data) < cs: + break + # END read in chunks + elapsed = time() - st + + stream.seek(0) + assert ''.join(chunks) == stream.getvalue() + + cs_kib = cs / 1000 + print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed, size_kib / elapsed) + # END for each randomization factor -- cgit v1.2.3 From 4b4a514e51fbc7dc6ddcb27c188159d57b5d1fa9 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 19:04:18 +0200 Subject: Added performance comparison to cgit ... and yes, git-python is faster :) --- lib/git/odb/utils.py | 31 ++++++++++++++--- test/git/performance/test_streams.py | 67 ++++++++++++++++++++++++++++++++---- 2 files changed, 87 insertions(+), 11 deletions(-) diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 1e4a8e9d..94d1cea8 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -103,10 +103,12 @@ class DecompressMemMapReader(object): times we actually allocate. An own zlib implementation would be good here to better support streamed reading - it would only need to keep the mmap and decompress it into chunks, thats all ... """ - __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_cs', '_close') + __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') - def __init__(self, m, close_on_deletion, cs = 128*1024): - """Initialize with mmap and chunk_size for stream reading""" + max_read_size = 512*1024 + + def __init__(self, m, close_on_deletion): + """Initialize with mmap for stream reading""" self._m = m self._zip = zlib.decompressobj() self._buf = None # buffer of decompressed bytes @@ -115,7 +117,6 @@ class DecompressMemMapReader(object): self._br = 0 # num uncompressed bytes read self._cws = 0 # start byte of compression window self._cwe = 0 # end byte of compression window - self._cs = cs # chunk size (when reading from zip) self._close = close_on_deletion # close the memmap on deletion ? def __del__(self): @@ -163,6 +164,28 @@ class DecompressMemMapReader(object): return str() # END handle depletion + # protect from memory peaks + # If he tries to read large chunks, our memory patterns get really bad + # as we end up copying a possibly huge chunk from our memory map right into + # memory. This might not even be possible. 
Nonetheless, try to dampen the + # effect a bit by reading in chunks, returning a huge string in the end. + # Our performance now depends on StringIO. This way we don't need two large + # buffers in peak times, but only one large one in the end which is + # the return buffer + if size > self.max_read_size: + sio = StringIO() + while size: + read_size = min(self.max_read_size, size) + data = self.read(read_size) + sio.write(data) + size -= len(data) + if len(data) < read_size: + break + # END data loop + sio.seek(0) + return sio.getvalue() + # END handle maxread + # deplete the buffer, then just continue using the decompress object # which has an own buffer. We just need this to transparently parse the # header from the zlib stream diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py index 15924c08..6c2834b3 100644 --- a/test/git/performance/test_streams.py +++ b/test/git/performance/test_streams.py @@ -10,6 +10,7 @@ import os import sys import stat import random +import subprocess from lib import ( @@ -51,23 +52,24 @@ class TestObjDBPerformance(TestBigRepoReadOnly): # writing - due to the compression it will seem faster than it is st = time() sha = ldb.to_object('blob', size, stream) - elapsed = time() - st + elapsed_add = time() - st assert ldb.has_object(sha) - fsize_kib = os.path.getsize(ldb.readable_db_object_path(sha)) / 1000 + db_file = ldb.readable_db_object_path(sha) + fsize_kib = os.path.getsize(db_file) / 1000 size_kib = size / 1000 - print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed, size_kib / elapsed) + print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add) # reading all at once st = time() type, size, shastream = ldb.object(sha) shadata = shastream.read() - elapsed = time() - st + elapsed_readall = time() - st stream.seek(0) assert shadata == stream.getvalue() - print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed, size_kib / elapsed) + print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed_readall, size_kib / elapsed_readall) # reading in chunks of 1 MiB @@ -81,11 +83,62 @@ class TestObjDBPerformance(TestBigRepoReadOnly): if len(data) < cs: break # END read in chunks - elapsed = time() - st + elapsed_readchunks = time() - st stream.seek(0) assert ''.join(chunks) == stream.getvalue() cs_kib = cs / 1000 - print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed, size_kib / elapsed) + print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks) + + # del db file so git has something to do + os.remove(db_file) + + # VS. CGIT + ########## + # CGIT ! Can using the cgit programs be faster ? + proc = rwrepo.git.hash_object('-w', '--stdin', as_process=True, istream=subprocess.PIPE) + + # write file - pump everything in at once to be a fast as possible + data = stream.getvalue() # cache it + st = time() + proc.stdin.write(data) + proc.stdin.close() + gitsha = proc.stdout.read().strip() + proc.wait() + gelapsed_add = time() - st + del(data) + assert gitsha == sha # we do it the same way, right ? 
+ + # as its the same sha, we reuse our path + fsize_kib = os.path.getsize(db_file) / 1000 + print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to using git-hash-object in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, gelapsed_add, size_kib / gelapsed_add) + + # compare ... + print >> sys.stderr, "Git-Python is %f %% faster than git when adding big %s files" % (100.0 - (elapsed_add / gelapsed_add) * 100, desc) + + + # read all + st = time() + s, t, size, data = rwrepo.git.get_object_data(gitsha) + gelapsed_readall = time() - st + print >> sys.stderr, "Read %i KiB of %s data at once using git-cat-file in %f s ( %f Read KiB / s)" % (size_kib, desc, gelapsed_readall, size_kib / gelapsed_readall) + + # compare + print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %sfiles" % (100.0 - (elapsed_readall / gelapsed_readall) * 100, desc) + + + # read chunks + st = time() + s, t, size, stream = rwrepo.git.stream_object_data(gitsha) + while True: + data = stream.read(cs) + if len(data) < cs: + break + # END read stream + gelapsed_readchunks = time() - st + print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from git-cat-file in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, gelapsed_readchunks, size_kib / gelapsed_readchunks) + + # compare + print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %s files in chunks" % (100.0 - (elapsed_readchunks / gelapsed_readchunks) * 100, desc) # END for each randomization factor -- cgit v1.2.3 From 1e2b46138ba58033738a24dadccc265748fce2ca Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 23:20:34 +0200 Subject: commit.create_from_tree now uses pure python implementation, fixed message parsing which truncated newlines although it was ilegitimate. Its up to the reader to truncate therse, nowhere in the git code I could find anyone adding newlines to commits where it is written Added performance tests for serialization, it does about 5k commits per second if writing to tmpfs --- CHANGES | 1 + lib/git/cmd.py | 7 +---- lib/git/objects/base.py | 4 +-- lib/git/objects/commit.py | 42 +++++++++++++++++-------- lib/git/objects/utils.py | 25 ++++++++++----- lib/git/odb/utils.py | 32 ++++++++++--------- lib/git/repo.py | 43 +++++++++++++------------- lib/git/utils.py | 15 +++++++++ test/git/performance/lib.py | 25 +++++++++++++-- test/git/performance/test_commit.py | 36 ++++++++++++++++++++-- test/git/performance/test_odb.py | 6 ++-- test/git/performance/test_streams.py | 4 +-- test/git/performance/test_utils.py | 4 +-- test/git/test_commit.py | 59 ++++++++++++++++++++++++++++++++++-- test/git/test_utils.py | 9 ++++-- 15 files changed, 230 insertions(+), 82 deletions(-) diff --git a/CHANGES b/CHANGES index e24e723d..e9e1257e 100644 --- a/CHANGES +++ b/CHANGES @@ -5,6 +5,7 @@ CHANGES 0.2 Beta 2 =========== * Commit objects now carry the 'encoding' information of their message. 
It wasn't parsed previously, and defaults to UTF-8 + * Commit.create_from_tree now uses a pure-python implementation, mimicing git-commit-tree 0.2 ===== diff --git a/lib/git/cmd.py b/lib/git/cmd.py index aaa27adc..18d1c505 100644 --- a/lib/git/cmd.py +++ b/lib/git/cmd.py @@ -323,12 +323,7 @@ class Git(object): stdout_value = proc.stdout.read().rstrip() # strip trailing "\n" else: max_chunk_size = 1024*64 - while True: - chunk = proc.stdout.read(max_chunk_size) - output_stream.write(chunk) - if len(chunk) < max_chunk_size: - break - # END reading output stream + stream_copy(proc.stdout, output_stream, max_chunk_size) stdout_value = output_stream # END stdout handling stderr_value = proc.stderr.read().rstrip() # strip trailing "\n" diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 64a5678e..f7043199 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -125,8 +125,8 @@ class Object(LazyMixin): Returns File Object compatible stream to the uncompressed raw data of the object """ - sha, type, size, stream = self.repo.git.stream_object_data(self.sha) - return stream + proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) + return utils.ProcessStreamAdapter(proc, "stdout") def stream_data(self, ostream): """ diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 98aca360..d56ce306 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -91,15 +91,6 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri """ super(Commit,self).__init__(repo, sha) self._set_self_from_args_(locals()) - - if parents is not None: - cls = type(self) - self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls)) - # END for each parent to convert - - if self.sha and tree is not None: - self.tree = Tree(repo, tree, path='') - # END id to tree conversion @classmethod def _get_intermediate_items(cls, commit): @@ -350,7 +341,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri committer, committer_time, committer_offset, message, parent_commits, conf_encoding) - # serialize ! 
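The placeholder removed here gives way to a real serializer below; what it must emit is the exact payload git itself hashes once the "commit <len>\0" object header is prefixed. For reference, a serialized commit has this shape (shas, identity and dates are illustrative only; one parent line is written per parent, and an 'encoding <name>' line appears after the committer only when it differs from the UTF-8 default):

    tree 52b9962a0d8a48e33f3ae68b2cbfc863e2d13b88
    parent bf3b4f58b3f24f796b7ef16a4b328ed8702a2b94
    author A U Thor <author@example.com> 1275602554 +0200
    committer A U Thor <author@example.com> 1275602554 +0200

    the message follows the blank line, byte for byte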
+ stream = StringIO() + new_commit._serialize(stream) + streamlen = stream.tell() + stream.seek(0) + + new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True) if head: try: @@ -377,8 +373,28 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri #{ Serializable Implementation def _serialize(self, stream): - # for now, this is very inefficient and in fact shouldn't be used like this - return super(Commit, self)._serialize(stream) + write = stream.write + write("tree %s\n" % self.tree) + for p in self.parents: + write("parent %s\n" % p) + + a = self.author + c = self.committer + fmt = "%s %s <%s> %s %s\n" + write(fmt % ("author", a.name, a.email, + self.authored_date, + utils.altz_to_utctz_str(self.author_tz_offset))) + + write(fmt % ("committer", c.name, c.email, + self.committed_date, + utils.altz_to_utctz_str(self.committer_tz_offset))) + + if self.encoding != self.default_encoding: + write("encoding %s\n" % self.encoding) + + write("\n") + write(self.message) + return self def _deserialize(self, stream): """:param from_rev_list: if true, the stream format is coming from the rev-list command @@ -416,7 +432,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri # a stream from our data simply gives us the plain message # The end of our message stream is marked with a newline that we strip - self.message = stream.read()[:-1] + self.message = stream.read() return self #} END serializable implementation diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py index 6d378a72..c93f2091 100644 --- a/lib/git/objects/utils.py +++ b/lib/git/objects/utils.py @@ -16,7 +16,8 @@ import time import os __all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date', - 'ProcessStreamAdapter', 'Traversable') + 'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz', + 'verify_utctz') def get_object_type_by_name(object_type_name): """ @@ -57,14 +58,24 @@ def get_user_id(): return "%s@%s" % (username, platform.node()) -def _utc_tz_to_altz(utctz): +def utctz_to_altz(utctz): """we convert utctz to the timezone in seconds, it is the format time.altzone returns. Git stores it as UTC timezon which has the opposite sign as well, which explains the -1 * ( that was made explicit here ) :param utctz: git utc timezone string, i.e. 
+0200""" return -1 * int(float(utctz)/100*3600) + +def altz_to_utctz_str(altz): + """As above, but inverses the operation, returning a string that can be used + in commit objects""" + utci = -1 * int((altz / 3600)*100) + utcs = str(abs(utci)) + utcs = "0"*(4-len(utcs)) + utcs + prefix = (utci < 0 and '-') or '+' + return prefix + utcs + -def _verify_utctz(offset): +def verify_utctz(offset): """:raise ValueError: if offset is incorrect :return: offset""" fmt_exc = ValueError("Invalid timezone offset format: %s" % offset) @@ -97,11 +108,11 @@ def parse_date(string_date): if string_date.count(' ') == 1 and string_date.rfind(':') == -1: timestamp, offset = string_date.split() timestamp = int(timestamp) - return timestamp, _utc_tz_to_altz(_verify_utctz(offset)) + return timestamp, utctz_to_altz(verify_utctz(offset)) else: offset = "+0000" # local time by default if string_date[-5] in '-+': - offset = _verify_utctz(string_date[-5:]) + offset = verify_utctz(string_date[-5:]) string_date = string_date[:-6] # skip space as well # END split timezone info @@ -139,7 +150,7 @@ def parse_date(string_date): fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday, tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec, dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst)) - return int(time.mktime(fstruct)), _utc_tz_to_altz(offset) + return int(time.mktime(fstruct)), utctz_to_altz(offset) except ValueError: continue # END exception handling @@ -167,7 +178,7 @@ def parse_actor_and_date(line): """ m = _re_actor_epoch.search(line) actor, epoch, offset = m.groups() - return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset)) + return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset)) diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index 94d1cea8..fd340962 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -137,7 +137,7 @@ class DecompressMemMapReader(object): # END handle size # read header - maxb = 8192 + maxb = 512 # should really be enough, cgit uses 8192 I believe self._s = maxb hdr = self.read(maxb) hdrend = hdr.find("\0") @@ -172,20 +172,24 @@ class DecompressMemMapReader(object): # Our performance now depends on StringIO. This way we don't need two large # buffers in peak times, but only one large one in the end which is # the return buffer - if size > self.max_read_size: - sio = StringIO() - while size: - read_size = min(self.max_read_size, size) - data = self.read(read_size) - sio.write(data) - size -= len(data) - if len(data) < read_size: - break - # END data loop - sio.seek(0) - return sio.getvalue() - # END handle maxread + # NO: We don't do it - if the user thinks its best, he is right. If he + # has trouble, he will start reading in chunks. According to our tests + # its still faster if we read 10 Mb at once instead of chunking it. + # if size > self.max_read_size: + # sio = StringIO() + # while size: + # read_size = min(self.max_read_size, size) + # data = self.read(read_size) + # sio.write(data) + # size -= len(data) + # if len(data) < read_size: + # break + # # END data loop + # sio.seek(0) + # return sio.getvalue() + # # END handle maxread + # # deplete the buffer, then just continue using the decompress object # which has an own buffer. 
We just need this to transparently parse the # header from the zlib stream diff --git a/lib/git/repo.py b/lib/git/repo.py index f4caa3fb..0bd2249c 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -4,12 +4,6 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php -import os -import sys -import re -import gzip -import StringIO - from errors import InvalidGitRepositoryError, NoSuchPathError from cmd import Git from actor import Actor @@ -19,6 +13,15 @@ from objects import * from config import GitConfigParser from remote import Remote +from odb.db import LooseObjectDB + +import os +import sys +import re +import gzip +import StringIO + + def touch(filename): fp = open(filename, "a") fp.close() @@ -53,7 +56,7 @@ class Repo(object): 'git_dir' is the .git repository directoy, which is always set. """ DAEMON_EXPORT_FILE = 'git-daemon-export-ok' - __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" ) + __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" ) # precompiled regex re_whitespace = re.compile(r'\s+') @@ -65,27 +68,22 @@ class Repo(object): # represents the configuration level of a configuration file config_level = ("system", "global", "repository") - def __init__(self, path=None): - """ - Create a new Repo instance - - ``path`` - is the path to either the root git directory or the bare git repo + def __init__(self, path=None, odbt = LooseObjectDB): + """ Create a new Repo instance - Examples:: + :param path: is the path to either the root git directory or the bare git repo:: repo = Repo("/Users/mtrier/Development/git-python") repo = Repo("/Users/mtrier/Development/git-python.git") repo = Repo("~/Development/git-python.git") repo = Repo("$REPOSITORIES/Development/git-python.git") - - Raises - InvalidGitRepositoryError or NoSuchPathError - - Returns - ``git.Repo`` - """ - + + :param odbt: Object DataBase type - a type which is constructed by providing + the directory containing the database objects, i.e. .git/objects. 
It will + be used to access all object data + :raise InvalidGitRepositoryError: + :raise NoSuchPathError: + :return: git.Repo """ epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd()))) if not os.path.exists(epath): @@ -130,6 +128,7 @@ class Repo(object): self.working_dir = self._working_tree_dir or self.git_dir self.git = Git(self.working_dir) + self.odb = odbt(os.path.join(self.git_dir, 'objects')) def __eq__(self, rhs): if isinstance(rhs, Repo): diff --git a/lib/git/utils.py b/lib/git/utils.py index 360c77c9..60a7de48 100644 --- a/lib/git/utils.py +++ b/lib/git/utils.py @@ -27,6 +27,21 @@ def make_sha(source=''): sha1 = sha.sha(source) return sha1 +def stream_copy(source, destination, chunk_size=512*1024): + """Copy all data from the source stream into the destination stream in chunks + of size chunk_size + :return: amount of bytes written""" + br = 0 + while True: + chunk = source.read(chunk_size) + destination.write(chunk) + br += len(chunk) + if len(chunk) < chunk_size: + break + # END reading output stream + return br + + def join_path(a, *p): """Join path tokens together similar to os.path.join, but always use '/' instead of possibly '\' on windows.""" diff --git a/test/git/performance/lib.py b/test/git/performance/lib.py index 4b552b20..650bea82 100644 --- a/test/git/performance/lib.py +++ b/test/git/performance/lib.py @@ -1,6 +1,8 @@ """Contains library functions""" import os from test.testlib import * +import shutil +import tempfile from git import ( Repo @@ -25,7 +27,7 @@ def resolve_or_fail(env_var): #{ Base Classes -class TestBigRepoReadOnly(TestBase): +class TestBigRepoR(TestBase): """TestCase providing access to readonly 'big' repositories using the following member variables: @@ -40,7 +42,24 @@ class TestBigRepoReadOnly(TestBase): @classmethod def setUpAll(cls): - super(TestBigRepoReadOnly, cls).setUpAll() - cls.gitrepo = Repo(resolve_or_fail(k_env_git_repo)) + super(TestBigRepoR, cls).setUpAll() + cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo)) + +class TestBigRepoRW(TestBigRepoR): + """As above, but provides a big repository that we can write to. 
+ + Provides ``self.gitrwrepo``""" + + @classmethod + def setUpAll(cls): + super(TestBigRepoRW, cls).setUpAll() + dirname = tempfile.mktemp() + os.mkdir(dirname) + cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True) + + @classmethod + def tearDownAll(cls): + shutil.rmtree(cls.gitrwrepo.working_tree_dir) + #} END base classes diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py index b4a9d868..2398c93d 100644 --- a/test/git/performance/test_commit.py +++ b/test/git/performance/test_commit.py @@ -6,10 +6,12 @@ from lib import * from git import * +from test.git.test_commit import assert_commit_serialization +from cStringIO import StringIO from time import time import sys -class TestPerformance(TestBigRepoReadOnly): +class TestPerformance(TestBigRepoRW): # ref with about 100 commits in its history ref_100 = '0.1.6' @@ -48,7 +50,7 @@ class TestPerformance(TestBigRepoReadOnly): # bound to cat-file parsing performance nc = 0 st = time() - for c in self.gitrepo.commit(self.head_sha_2k).traverse(branch_first=False): + for c in self.gitrorepo.commit(self.head_sha_2k).traverse(branch_first=False): nc += 1 self._query_commit_info(c) # END for each traversed commit @@ -59,10 +61,38 @@ class TestPerformance(TestBigRepoReadOnly): # bound to stream parsing performance nc = 0 st = time() - for c in Commit.iter_items(self.gitrepo, self.head_sha_2k): + for c in Commit.iter_items(self.gitrorepo, self.head_sha_2k): nc += 1 self._query_commit_info(c) # END for each traversed commit elapsed_time = time() - st print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time) + def test_commit_serialization(self): + assert_commit_serialization(self.gitrwrepo, self.head_sha_2k, True) + + rwrepo = self.gitrwrepo + make_object = rwrepo.odb.to_object + # direct serialization - deserialization can be tested afterwards + # serialization is probably limited on IO + hc = rwrepo.commit(self.head_sha_2k) + + commits = list() + nc = 5000 + st = time() + for i in xrange(nc): + cm = Commit( rwrepo, Commit.NULL_HEX_SHA, hc.tree, + hc.author, hc.authored_date, hc.author_tz_offset, + hc.committer, hc.committed_date, hc.committer_tz_offset, + str(i), parents=hc.parents, encoding=hc.encoding) + + stream = StringIO() + cm._serialize(stream) + slen = stream.tell() + stream.seek(0) + + cm.sha = make_object(Commit.type, slen, stream) + # END commit creation + elapsed = time() - st + + print >> sys.stderr, "Serialized %i commits to loose objects in %f s ( %f commits / s )" % (nc, elapsed, nc / elapsed) diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py index 0ad2ce33..7b1ee838 100644 --- a/test/git/performance/test_odb.py +++ b/test/git/performance/test_odb.py @@ -5,18 +5,18 @@ import sys import stat from lib import ( - TestBigRepoReadOnly + TestBigRepoR ) -class TestObjDBPerformance(TestBigRepoReadOnly): +class TestObjDBPerformance(TestBigRepoR): def test_random_access(self): # GET COMMITS # TODO: use the actual db for this st = time() - root_commit = self.gitrepo.commit(self.head_sha_2k) + root_commit = self.gitrorepo.commit(self.head_sha_2k) commits = list(root_commit.traverse()) nc = len(commits) elapsed = time() - st diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py index 6c2834b3..d31bee14 100644 --- a/test/git/performance/test_streams.py +++ b/test/git/performance/test_streams.py @@ -14,7 +14,7 @@ import subprocess from lib import ( - TestBigRepoReadOnly + TestBigRepoR ) 
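The stream_copy helper added to git.utils earlier in this patch is the primitive these stream tests exercise end to end; typical use is a sketch like the following (buffer contents are arbitrary):

    from cStringIO import StringIO
    from git.utils import stream_copy

    source = StringIO("x" * 1000000)
    destination = StringIO()
    # copies in 64 KiB chunks and returns the number of bytes written
    assert stream_copy(source, destination, chunk_size=64 * 1024) == 1000000
    assert destination.getvalue() == source.getvalue()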
@@ -32,7 +32,7 @@ def make_memory_file(size_in_bytes, randomize=False): return actual_size*4, StringIO(a.tostring()) -class TestObjDBPerformance(TestBigRepoReadOnly): +class TestObjDBPerformance(TestBigRepoR): large_data_size_bytes = 1000*1000*10 # some MiB should do it moderate_data_size_bytes = 1000*1000*1 # just 1 MiB diff --git a/test/git/performance/test_utils.py b/test/git/performance/test_utils.py index 381d7c8b..47366d34 100644 --- a/test/git/performance/test_utils.py +++ b/test/git/performance/test_utils.py @@ -4,11 +4,11 @@ import sys import stat from lib import ( - TestBigRepoReadOnly + TestBigRepoR ) -class TestUtilPerformance(TestBigRepoReadOnly): +class TestUtilPerformance(TestBigRepoR): def test_access(self): # compare dict vs. slot access diff --git a/test/git/test_commit.py b/test/git/test_commit.py index ad7a0082..a5f184e6 100644 --- a/test/git/test_commit.py +++ b/test/git/test_commit.py @@ -7,6 +7,56 @@ from test.testlib import * from git import * +from cStringIO import StringIO +import time +import sys + + +def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False): + """traverse all commits in the history of commit identified by commit_id and check + if the serialization works. + :param print_performance_info: if True, we will show how fast we are""" + ns = 0 # num serializations + nds = 0 # num deserializations + + st = time.time() + for cm in rwrepo.commit(commit_id).traverse(): + nds += 1 + + # assert that we deserialize commits correctly, hence we get the same + # sha on serialization + stream = StringIO() + cm._serialize(stream) + ns += 1 + streamlen = stream.tell() + stream.seek(0) + + csha = rwrepo.odb.to_object(Commit.type, streamlen, stream) + assert csha == cm.sha + + nc = Commit(rwrepo, Commit.NULL_HEX_SHA, cm.tree.sha, + cm.author, cm.authored_date, cm.author_tz_offset, + cm.committer, cm.committed_date, cm.committer_tz_offset, + cm.message, cm.parents, cm.encoding) + + assert nc.parents == cm.parents + stream = StringIO() + nc._serialize(stream) + ns += 1 + streamlen = stream.tell() + stream.seek(0) + nc.sha = rwrepo.odb.to_object(Commit.type, streamlen, stream) + + # if it worked, we have exactly the same contents ! 
+ assert nc.sha == cm.sha + # END check commits + elapsed = time.time() - st + + if print_performance_info: + print >> sys.stderr, "Serialized %i and deserialized %i commits in %f s ( (%f, %f) commits / s" % (ns, nds, elapsed, ns/elapsed, nds/elapsed) + # END handle performance info + + class TestCommit(TestBase): def test_bake(self): @@ -19,7 +69,7 @@ class TestCommit(TestBase): assert commit.author == commit.committer assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int) assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int) - assert commit.message == "Added missing information to docstrings of commit and stats module" + assert commit.message == "Added missing information to docstrings of commit and stats module\n" def test_stats(self): @@ -49,7 +99,7 @@ class TestCommit(TestBase): assert commit.committed_date == 1210193388 assert commit.author_tz_offset == 14400, commit.author_tz_offset assert commit.committer_tz_offset == 14400, commit.committer_tz_offset - assert commit.message == "initial project" + assert commit.message == "initial project\n" def test_traversal(self): start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff") @@ -171,3 +221,8 @@ class TestCommit(TestBase): name_rev = self.rorepo.head.commit.name_rev assert isinstance(name_rev, basestring) + @with_bare_rw_repo + def test_serialization(self, rwrepo): + # create all commits of our repo + assert_commit_serialization(rwrepo, '0.1.6') + diff --git a/test/git/test_utils.py b/test/git/test_utils.py index 2c3c392b..83ef7e4b 100644 --- a/test/git/test_utils.py +++ b/test/git/test_utils.py @@ -116,8 +116,6 @@ class TestUtils(TestCase): os.remove(my_file) # END final cleanup - - def test_user_id(self): assert '@' in get_user_id() @@ -127,7 +125,12 @@ class TestUtils(TestCase): assert len(rval) == 2 assert isinstance(rval[0], int) and isinstance(rval[1], int) assert rval[0] == veri_time - assert rval[1] == offset + assert rval[1] == offset + + # now that we are here, test our conversion functions as well + utctz = altz_to_utctz_str(offset) + assert isinstance(utctz, basestring) + assert utctz_to_altz(verify_utctz(utctz)) == offset # END assert rval utility rfc = ("Thu, 07 Apr 2005 22:13:11 +0000", 0) -- cgit v1.2.3 From 1906ee4df9ae4e734288c5203cf79894dff76cab Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 3 Jun 2010 23:27:09 +0200 Subject: Fixed compatability issues with python 2.5, made sure all tests run --- lib/git/odb/db.py | 4 ++-- test/git/performance/lib.py | 2 +- test/git/test_repo.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index e656b2b5..c970410d 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -101,7 +101,7 @@ class iObjectDBW(object): # actually use multiple threads, default False of course. 
If the add shas = list() for args in iter_info: - shas.append(self.to_object(*args, dry_run=dry_run, sha_as_hex=sha_as_hex)) + shas.append(self.to_object(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) return shas #} END edit interface @@ -155,7 +155,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): # Additional Flags - might be set to 0 after the first failure # Depending on the root, this might work for some mounts, for others not, which # is why it is per instance - self._fd_open_flags = os.O_NOATIME + self._fd_open_flags = getattr(os, 'O_NOATIME', 0) #{ Interface def object_path(self, hexsha): diff --git a/test/git/performance/lib.py b/test/git/performance/lib.py index 650bea82..7d2a9f4a 100644 --- a/test/git/performance/lib.py +++ b/test/git/performance/lib.py @@ -60,6 +60,6 @@ class TestBigRepoRW(TestBigRepoR): @classmethod def tearDownAll(cls): - shutil.rmtree(cls.gitrwrepo.working_tree_dir) + shutil.rmtree(cls.gitrwrepo.working_dir) #} END base classes diff --git a/test/git/test_repo.py b/test/git/test_repo.py index 7dc645b6..d2c7c742 100644 --- a/test/git/test_repo.py +++ b/test/git/test_repo.py @@ -73,7 +73,7 @@ class TestRepo(TestBase): assert_equal("mtrier@gmail.com", c.committer.email) assert_equal(1232829715, c.committed_date) assert_equal(5*3600, c.committer_tz_offset) - assert_equal("Bumped version 0.1.6", c.message) + assert_equal("Bumped version 0.1.6\n", c.message) c = commits[1] assert isinstance(c.parents, tuple) -- cgit v1.2.3 From b01ca6a3e4ae9d944d799743c8ff774e2a7a82b6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 00:09:00 +0200 Subject: db: implemented GitObjectDB using the git command to make sure we can lookup everything. Next is to implement pack-file reading, then alternates which should allow to resolve everything --- lib/git/objects/base.py | 32 +++++++++++++------------------- lib/git/odb/db.py | 22 ++++++++++++++++++++-- lib/git/repo.py | 11 ++++++++--- 3 files changed, 41 insertions(+), 24 deletions(-) diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index f7043199..446c4406 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -4,7 +4,7 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php import os -from git.utils import LazyMixin, join_path_native +from git.utils import LazyMixin, join_path_native, stream_copy import utils _assertion_msg_format = "Created object %r whose python type %r disagrees with the acutal git object type %r" @@ -76,10 +76,11 @@ class Object(LazyMixin): Retrieve object information """ if attr == "size": - hexsha, typename, self.size = self.repo.git.get_object_header(self.sha) + typename, self.size = self.repo.odb.object_info(self.sha) assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) elif attr == "data": - hexsha, typename, self.size, self.data = self.repo.git.get_object_data(self.sha) + typename, self.size, stream = self.repo.odb.object(self.sha) + self.data = stream.read() # once we have an own odb, we can delay reading assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) else: super(Object,self)._set_cache_(attr) @@ -121,24 +122,17 @@ class Object(LazyMixin): @property def data_stream(self): - """ - Returns - File Object compatible stream to the uncompressed raw data of the object - """ - proc = self.repo.git.cat_file(self.type, self.sha, as_process=True) - return utils.ProcessStreamAdapter(proc, "stdout") + """ :return: 
File Object compatible stream to the uncompressed raw data of the object + :note: returned streams must be read in order""" + type, size, stream = self.repo.odb.object(self.sha) + return stream def stream_data(self, ostream): - """ - Writes our data directly to the given output stream - - ``ostream`` - File object compatible stream object. - - Returns - self - """ - self.repo.git.cat_file(self.type, self.sha, output_stream=ostream) + """Writes our data directly to the given output stream + :param ostream: File object compatible stream object. + :return: self""" + type, size, istream = self.repo.odb.object(self.sha) + stream_copy(istream, ostream) return self diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index c970410d..1d1d4c40 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -281,9 +281,27 @@ class ReferenceDB(CompoundDB): """A database consisting of database referred to in a file""" -class GitObjectDB(CompoundDB, iObjectDBW): +#class GitObjectDB(CompoundDB, iObjectDBW): +class GitObjectDB(LooseObjectDB): """A database representing the default git object store, which includes loose objects, pack files and an alternates file - It will create objects only in the loose object database.""" + It will create objects only in the loose object database. + :note: for now, we use the git command to do all the lookup, just until he + have packs and the other implementations + """ + __slots__ = ('_git', ) + def __init__(self, root_path, git): + """Initialize this instance with the root and a git command""" + super(GitObjectDB, self).__init__(root_path) + self._git = git + + def object_info(self, sha): + discard, type, size = self._git.get_object_header(sha) + return type, size + + def object(self, sha): + """For now, all lookup is done by git itself""" + discard, type, size, stream = self._git.stream_object_data(sha) + return type, size, stream diff --git a/lib/git/repo.py b/lib/git/repo.py index 0bd2249c..1afb1eb7 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -13,7 +13,7 @@ from objects import * from config import GitConfigParser from remote import Remote -from odb.db import LooseObjectDB +from odb.db import GitObjectDB import os import sys @@ -68,7 +68,7 @@ class Repo(object): # represents the configuration level of a configuration file config_level = ("system", "global", "repository") - def __init__(self, path=None, odbt = LooseObjectDB): + def __init__(self, path=None, odbt = GitObjectDB): """ Create a new Repo instance :param path: is the path to either the root git directory or the bare git repo:: @@ -128,7 +128,12 @@ class Repo(object): self.working_dir = self._working_tree_dir or self.git_dir self.git = Git(self.working_dir) - self.odb = odbt(os.path.join(self.git_dir, 'objects')) + + # special handling, in special times + args = [os.path.join(self.git_dir, 'objects')] + if issubclass(odbt, GitObjectDB): + args.append(self.git) + self.odb = odbt(*args) def __eq__(self, rhs): if isinstance(rhs, Repo): -- cgit v1.2.3 From a1e80445ad5cb6da4c0070d7cb8af89da3b0803b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 14:41:15 +0200 Subject: initial version of new odb design to facilitate a channel based multi-threading implementation of all odb functions --- lib/git/objects/base.py | 8 +- lib/git/objects/commit.py | 2 +- lib/git/odb/db.py | 114 ++++++---- lib/git/odb/stream.py | 388 +++++++++++++++++++++++++++++++++++ lib/git/odb/utils.py | 215 ------------------- test/git/performance/test_commit.py | 2 +- test/git/performance/test_streams.py | 6 +- 
test/git/test_commit.py | 4 +- test/git/test_odb.py | 10 +- 9 files changed, 476 insertions(+), 273 deletions(-) create mode 100644 lib/git/odb/stream.py diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 446c4406..76384888 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -76,10 +76,10 @@ class Object(LazyMixin): Retrieve object information """ if attr == "size": - typename, self.size = self.repo.odb.object_info(self.sha) + typename, self.size = self.repo.odb.info(self.sha) assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) elif attr == "data": - typename, self.size, stream = self.repo.odb.object(self.sha) + typename, self.size, stream = self.repo.odb.stream(self.sha) self.data = stream.read() # once we have an own odb, we can delay reading assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) else: @@ -124,14 +124,14 @@ class Object(LazyMixin): def data_stream(self): """ :return: File Object compatible stream to the uncompressed raw data of the object :note: returned streams must be read in order""" - type, size, stream = self.repo.odb.object(self.sha) + type, size, stream = self.repo.odb.stream(self.sha) return stream def stream_data(self, ostream): """Writes our data directly to the given output stream :param ostream: File object compatible stream object. :return: self""" - type, size, istream = self.repo.odb.object(self.sha) + type, size, istream = self.repo.odb.stream(self.sha) stream_copy(istream, ostream) return self diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index d56ce306..dbc0cf27 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -346,7 +346,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri streamlen = stream.tell() stream.seek(0) - new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True) + new_commit.sha = repo.odb.store(cls.type, streamlen, stream, sha_as_hex=True) if head: try: diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 1d1d4c40..7ae8f446 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -6,9 +6,12 @@ from git.errors import ( BadObjectType ) -from utils import ( +from stream import ( DecompressMemMapReader, - FDCompressedSha1Writer, + FDCompressedSha1Writer + ) + +from utils import ( ENOENT, to_hex_sha, exists, @@ -31,7 +34,7 @@ import mmap import os -class iObjectDBR(object): +class ObjectDBR(object): """Defines an interface for object database lookup. 
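# Aside: a hedged usage sketch, not part of the patch. At this stage the odb
# accessors still return plain tuples; the OInfo/OStream types only arrive in
# the follow-up commit. 'repo' and 'sha' are hypothetical stand-ins.
type, size = repo.odb.info(sha)
type, size, stream = repo.odb.stream(sha)
assert len(stream.read()) == size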
Objects are identified either by hex-sha (40 bytes) or by sha (20 bytes)""" @@ -48,62 +51,87 @@ class iObjectDBR(object): :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") - def object(self, sha): - """ - :return: tuple(type_string, size_in_bytes, stream) a tuple with object - information including its type, its size as well as a stream from which its - contents can be read + def info(self, sha): + """ :return: ODB_Info instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") - def object_info(self, sha): - """ - :return: tuple(type_string, size_in_bytes) tuple with the object's type - string as well as its size in bytes + def info_async(self, input_channel): + """Retrieve information of a multitude of objects asynchronously + :param input_channel: Channel yielding the sha's of the objects of interest + :return: Channel yielding ODB_Info|InvalidODB_Info, in any order""" + raise NotImplementedError("To be implemented in subclass") + + def stream(self, sha): + """:return: ODB_OStream instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") + + def stream_async(self, input_channel): + """Retrieve the ODB_OStream of multiple objects + :param input_channel: see ``info`` + :param max_threads: see ``ObjectDBW.store`` + :return: Channel yielding ODB_OStream|InvalidODB_OStream instances in any order""" + raise NotImplementedError("To be implemented in subclass") #} END query interface -class iObjectDBW(object): +class ObjectDBW(object): """Defines an interface to create objects in the database""" - __slots__ = tuple() + __slots__ = "_ostream" + + def __init__(self, *args, **kwargs): + self._ostream = None #{ Edit Interface + def set_ostream(self, stream): + """Adjusts the stream to which all data should be sent when storing new objects + :param stream: if not None, the stream to use, if None the default stream + will be used. + :return: previously installed stream, or None if there was no override + :raise TypeError: if the stream doesn't have the supported functionality""" + cstream = self._ostream + self._ostream = stream + return cstream + + def ostream(self): + """:return: overridden output stream this instance will write to, or None + if it will write to the default stream""" + return self._ostream - def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + def store(self, istream): """Create a new object in the database - :return: the sha identifying the object in the database - :param type: type string identifying the object - :param size: size of the data to read from stream - :param stream: stream providing the data - :param dry_run: if True, the object database will not actually be changed - :param sha_as_hex: if True, the returned sha identifying the object will be - hex encoded, not binary + :return: the input istream object with its sha set to its corresponding value + :param istream: ODB_IStream compatible instance. If its sha is already set + to a value, the object will just be stored in the our database format, + in which case the input stream is expected to be in object format ( header + contents ). 
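# Aside: a minimal sketch, not part of the patch, of the object format the
# store() docstring refers to - "<type> SP <size> NUL <contents>", the same
# header write_object() in lib/git/odb/fun.py emits. The helper name is made up.
def loose_object_data(type, data):
    # type string, a space, the decimal byte count, a NUL byte, then the payload
    return "%s %i\0%s" % (type, len(data), data)

assert loose_object_data("blob", "hello") == "blob 5\0hello"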
:raise IOError: if data could not be written""" raise NotImplementedError("To be implemented in subclass") - def to_objects(self, iter_info, dry_run=False, sha_as_hex=True, max_threads=0): - """Create multiple new objects in the database - :return: sequence of shas identifying the created objects in the order in which - they where given. - :param iter_info: iterable yielding tuples containing the type_string - size_in_bytes and the steam with the content data. - :param dry_run: see ``to_object`` - :param sha_as_hex: see ``to_object`` - :param max_threads: if < 1, any number of threads may be started while processing - the request, otherwise the given number of threads will be started. - :raise IOError: if data could not be written""" + def store_async(self, input_channel): + """Create multiple new objects in the database asynchronously. The method will + return right away, returning an output channel which receives the results as + they are computed. + + :return: Channel yielding your ODB_IStream which served as input, in any order. + The IStreams sha will be set to the sha it received during the process, + or its error attribute will be set to the exception informing about the error. + :param input_channel: Channel yielding ODB_IStream instance. + As the same instances will be used in the output channel, you can create a map + between the id(istream) -> istream + :note:As some ODB implementations implement this operation as atomic, they might + abort the whole operation if one item could not be processed. Hence check how + many items have actually been produced.""" # a trivial implementation, ignoring the threads for now # TODO: add configuration to the class to determine whether we may # actually use multiple threads, default False of course. If the add shas = list() for args in iter_info: - shas.append(self.to_object(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) + shas.append(self.store(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) return shas - + #} END edit interface @@ -118,6 +146,7 @@ class FileDBBase(object): :raise InvalidDBRoot: :note: The base will perform basic checking for accessability, but the subclass is required to verify that the root_path contains the database structure it needs""" + super(FileDBBase, self).__init__() if not os.path.isdir(root_path): raise InvalidDBRoot(root_path) self._root_path = root_path @@ -141,7 +170,7 @@ class FileDBBase(object): #} END utilities -class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): +class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): """A database which operates on loose object files""" __slots__ = ('_hexsha_to_file', '_fd_open_flags') # CONFIGURATION @@ -210,7 +239,7 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): os.close(fd) # END assure file is closed - def object_info(self, sha): + def info(self, sha): m = self._map_loose_object(sha) try: return loose_object_header_info(m) @@ -233,8 +262,9 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): return False # END check existance - def to_object(self, type, size, stream, dry_run=False, sha_as_hex=True): + def store(self, istream): # open a tmp file to write the data to + # todo: implement ostream properly fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) writer = FDCompressedSha1Writer(fd) @@ -269,11 +299,11 @@ class LooseObjectDB(FileDBBase, iObjectDBR, iObjectDBW): return sha -class PackedDB(FileDBBase, iObjectDBR): +class PackedDB(FileDBBase, ObjectDBR): """A database operating on a set of object packs""" -class 
CompoundDB(iObjectDBR): +class CompoundDB(ObjectDBR): """A database which delegates calls to sub-databases""" @@ -281,7 +311,7 @@ class ReferenceDB(CompoundDB): """A database consisting of database referred to in a file""" -#class GitObjectDB(CompoundDB, iObjectDBW): +#class GitObjectDB(CompoundDB, ObjectDBW): class GitObjectDB(LooseObjectDB): """A database representing the default git object store, which includes loose objects, pack files and an alternates file @@ -296,7 +326,7 @@ class GitObjectDB(LooseObjectDB): super(GitObjectDB, self).__init__(root_path) self._git = git - def object_info(self, sha): + def info(self, sha): discard, type, size = self._git.get_object_header(sha) return type, size diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py new file mode 100644 index 00000000..325c1444 --- /dev/null +++ b/lib/git/odb/stream.py @@ -0,0 +1,388 @@ +import zlib +from cStringIO import StringIO +from git.utils import make_sha +import errno + +from utils import ( + to_hex_sha, + to_bin_sha + ) + +__all__ = ('FDCompressedSha1Writer', 'DecompressMemMapReader') + + +# ZLIB configuration +# used when compressing objects - 1 to 9 ( slowest ) +Z_BEST_SPEED = 1 + + +#{ ODB Bases + +class ODB_Info(tuple): + """Carries information about an object in an ODB, provdiing information + about the sha of the object, the type_string as well as the uncompressed size + in bytes. + + It can be accessed using tuple notation and using attribute access notation:: + + assert dbi[0] == dbi.sha + assert dbi[1] == dbi.type + assert dbi[2] == dbi.size + + The type is designed to be as lighteight as possible.""" + __slots__ = tuple() + + def __new__(cls, sha, type, size): + return tuple.__new__(cls, (sha, type, size)) + + def __init__(self, sha, type, size): + pass + + #{ Interface + @property + def sha(self): + return self[0] + + @property + def type(self): + return self[1] + + @property + def size(self): + return self[2] + #} END interface + + +class ODB_OStream(ODB_Info): + """Base for object streams retrieved from the database, providing additional + information about the stream. + Generally, ODB streams are read-only as objects are immutable""" + __slots__ = tuple() + + def __new__(cls, sha, type, size, *args, **kwargs): + """Helps with the initialization of subclasses""" + return tuple.__new__(cls, (sha, type, size)) + + def is_compressed(self): + """:return: True if reads of this stream yield zlib compressed data. + :note: this does not imply anything about the actual internal storage. + Hence the data could be uncompressed, but read compressed, or vice versa""" + raise NotImplementedError("To be implemented by subclass") + + +class ODB_IStream(list): + """Represents an input content stream to be fed into the ODB. It is mutable to allow + the ODB to record information about the operations outcome right in this instance. + + It provides interfaces for the ODB_OStream and a StreamReader to allow the instance + to blend in without prior conversion. 
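# Aside: a hedged sketch, not part of the patch, of the tuple-with-properties
# pattern ODB_Info introduces above - index access and attribute access agree,
# and the empty __slots__ keeps instances lightweight. The values are made up.
info = ODB_Info("0" * 40, "blob", 42)
assert info[0] == info.sha
assert (info.type, info.size) == ("blob", 42)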
+ + The only method your content stream must support is 'read'""" + __slots__ = tuple() + + def __new__(cls, type, size, stream, sha=None, compressed=False): + list.__new__(cls, (sha, type, size, stream, compressed, None)) + + def __init__(cls, type, size, stream, sha=None, compressed=None): + pass + + #{ Interface + + def hexsha(self): + """:return: our sha, hex encoded, 40 bytes""" + return to_hex_sha(self[0]) + + def binsha(self): + """:return: our sha as binary, 20 bytes""" + return to_bin_sha(self[0]) + + def _error(self): + """:return: the error that occurred when processing the stream, or None""" + return self[5] + + def _set_error(self, exc): + """Set this input stream to the given exc, may be None to reset the error""" + self[5] = exc + + error = property(_error, _set_error) + + #} END interface + + #{ Stream Reader Interface + + def read(self, size=-1): + """Implements a simple stream reader interface, passing the read call on + to our internal stream""" + return self[3].read(size) + + #} END stream reader interface + + #{ interface + + def _set_sha(self, sha): + self[0] = sha + + def _sha(self): + return self[0] + + sha = property(_sha, _set_sha) + + @property + def type(self): + return self[1] + + @property + def size(self): + return self[2] + + #} END odb info interface + + #{ ODB_OStream interface + + def is_compressed(self): + return self[4] + + #} END ODB_OStream interface + + +class InvalidODB_Info(tuple): + """Carries information about a sha identifying an object which is invalid in + the queried database. The exception attribute provides more information about + the cause of the issue""" + __slots__ = tuple() + + def __new__(cls, sha, exc): + return tuple.__new__(cls, (sha, exc)) + + def __init__(self, sha, exc): + pass + + @property + def sha(self): + return self[0] + + @property + def error(self): + """:return: exception instance explaining the failure""" + return self[1] + +class InvalidODB_OStream(InvalidODB_Info): + """Carries information about an invalid ODB stream""" + __slots__ = tuple() + +#} END ODB Bases + + +#{ RO Streams + +class DecompressMemMapReader(ODB_OStream): + """Reads data in chunks from a memory map and decompresses it. The client sees + only the uncompressed data, respective file-like read calls are handling on-demand + buffered decompression accordingly + + A constraint on the total size of bytes is activated, simulating + a logical file within a possibly larger physical memory area + + To read efficiently, you clearly don't want to read individual bytes, instead, + read a few kilobytes at least. + + :note: The chunk-size should be carefully selected as it will involve quite a bit + of string copying due to the way the zlib is implemented. Its very wasteful, + hence we try to find a good tradeoff between allocation time and number of + times we actually allocate. An own zlib implementation would be good here + to better support streamed reading - it would only need to keep the mmap + and decompress it into chunks, thats all ... 
""" + # __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') + + max_read_size = 512*1024 + + def __init__(self, sha, type, size, m, close_on_deletion): + """Initialize with mmap for stream reading""" + self._m = m + self._zip = zlib.decompressobj() + self._buf = None # buffer of decompressed bytes + self._buflen = 0 # length of bytes in buffer + self._s = 0 # size of uncompressed data to read in total + self._br = 0 # num uncompressed bytes read + self._cws = 0 # start byte of compression window + self._cwe = 0 # end byte of compression window + self._close = close_on_deletion # close the memmap on deletion ? + + def __del__(self): + if self._close: + self._m.close() + # END handle resource freeing + + def initialize(self, size=0): + """Initialize this instance for acting as a read-only stream for size bytes. + :param size: size in bytes to be decompresed before being depleted. + If 0, default object header information is parsed from the data, + returning a tuple of (type_string, uncompressed_size) + If not 0, the size will be used, and None is returned. + :note: must only be called exactly once""" + if size: + self._s = size + return + # END handle size + + # read header + maxb = 512 # should really be enough, cgit uses 8192 I believe + self._s = maxb + hdr = self.read(maxb) + hdrend = hdr.find("\0") + type, size = hdr[:hdrend].split(" ") + self._s = int(size) + + # adjust internal state to match actual header length that we ignore + # The buffer will be depleted first on future reads + self._br = 0 + hdrend += 1 # count terminating \0 + self._buf = StringIO(hdr[hdrend:]) + self._buflen = len(hdr) - hdrend + + return type, size + + def read(self, size=-1): + if size < 1: + size = self._s - self._br + else: + size = min(size, self._s - self._br) + # END clamp size + + if size == 0: + return str() + # END handle depletion + + # protect from memory peaks + # If he tries to read large chunks, our memory patterns get really bad + # as we end up copying a possibly huge chunk from our memory map right into + # memory. This might not even be possible. Nonetheless, try to dampen the + # effect a bit by reading in chunks, returning a huge string in the end. + # Our performance now depends on StringIO. This way we don't need two large + # buffers in peak times, but only one large one in the end which is + # the return buffer + # NO: We don't do it - if the user thinks its best, he is right. If he + # has trouble, he will start reading in chunks. According to our tests + # its still faster if we read 10 Mb at once instead of chunking it. + + # if size > self.max_read_size: + # sio = StringIO() + # while size: + # read_size = min(self.max_read_size, size) + # data = self.read(read_size) + # sio.write(data) + # size -= len(data) + # if len(data) < read_size: + # break + # # END data loop + # sio.seek(0) + # return sio.getvalue() + # # END handle maxread + # + # deplete the buffer, then just continue using the decompress object + # which has an own buffer. 
We just need this to transparently parse the + # header from the zlib stream + dat = str() + if self._buf: + if self._buflen >= size: + # have enough data + dat = self._buf.read(size) + self._buflen -= size + self._br += size + return dat + else: + dat = self._buf.read() # ouch, duplicates data + size -= self._buflen + self._br += self._buflen + + self._buflen = 0 + self._buf = None + # END handle buffer len + # END handle buffer + + # decompress some data + # Abstract: zlib needs to operate on chunks of our memory map ( which may + # be large ), as it will otherwise and always fill in the 'unconsumed_tail' + # attribute which possible reads our whole map to the end, forcing + # everything to be read from disk even though just a portion was requested. + # As this would be a nogo, we workaround it by passing only chunks of data, + # moving the window into the memory map along as we decompress, which keeps + # the tail smaller than our chunk-size. This causes 'only' the chunk to be + # copied once, and another copy of a part of it when it creates the unconsumed + # tail. We have to use it to hand in the appropriate amount of bytes durin g + # the next read. + tail = self._zip.unconsumed_tail + if tail: + # move the window, make it as large as size demands. For code-clarity, + # we just take the chunk from our map again instead of reusing the unconsumed + # tail. The latter one would safe some memory copying, but we could end up + # with not getting enough data uncompressed, so we had to sort that out as well. + # Now we just assume the worst case, hence the data is uncompressed and the window + # needs to be as large as the uncompressed bytes we want to read. + self._cws = self._cwe - len(tail) + self._cwe = self._cws + size + + + indata = self._m[self._cws:self._cwe] # another copy ... :( + # get the actual window end to be sure we don't use it for computations + self._cwe = self._cws + len(indata) + else: + cws = self._cws + self._cws = self._cwe + self._cwe = cws + size + indata = self._m[self._cws:self._cwe] # ... 
copy it again :( + # END handle tail + + dcompdat = self._zip.decompress(indata, size) + + self._br += len(dcompdat) + if dat: + dcompdat = dat + dcompdat + + return dcompdat + +#} END RO streams + + +#{ W Streams + +class FDCompressedSha1Writer(object): + """Digests data written to it, making the sha available, then compress the + data and write it to the file descriptor + :note: operates on raw file descriptors + :note: for this to work, you have to use the close-method of this instance""" + __slots__ = ("fd", "sha1", "zip") + + # default exception + exc = IOError("Failed to write all bytes to filedescriptor") + + def __init__(self, fd): + self.fd = fd + self.sha1 = make_sha("") + self.zip = zlib.compressobj(Z_BEST_SPEED) + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: lenght of incoming data""" + self.sha1.update(data) + cdata = self.zip.compress(data) + bytes_written = write(self.fd, cdata) + if bytes_written != len(cdata): + raise self.exc + return len(data) + + def sha(self, as_hex = False): + """:return: sha so far + :param as_hex: if True, sha will be hex-encoded, binary otherwise""" + if as_hex: + return self.sha1.hexdigest() + return self.sha1.digest() + + def close(self): + remainder = self.zip.flush() + if write(self.fd, remainder) != len(remainder): + raise self.exc + return close(self.fd) + + +#} END W streams diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py index fd340962..61565ba9 100644 --- a/lib/git/odb/utils.py +++ b/lib/git/odb/utils.py @@ -1,10 +1,6 @@ import binascii import os -import zlib -from cStringIO import StringIO -from git.utils import make_sha import errno -from fun import chunk_size __all__ = ('FDSha1Writer', ) @@ -38,218 +34,7 @@ read = os.read write = os.write close = os.close -# ZLIB configuration -# used when compressing objects - 1 to 9 ( slowest ) -Z_BEST_SPEED = 1 #} END Routines -#{ Classes - -class FDCompressedSha1Writer(object): - """Digests data written to it, making the sha available, then compress the - data and write it to the file descriptor - :note: operates on raw file descriptors - :note: for this to work, you have to use the close-method of this instance""" - __slots__ = ("fd", "sha1", "zip") - - # default exception - exc = IOError("Failed to write all bytes to filedescriptor") - - def __init__(self, fd): - self.fd = fd - self.sha1 = make_sha("") - self.zip = zlib.compressobj(Z_BEST_SPEED) - - def write(self, data): - """:raise IOError: If not all bytes could be written - :return: lenght of incoming data""" - self.sha1.update(data) - cdata = self.zip.compress(data) - bytes_written = write(self.fd, cdata) - if bytes_written != len(cdata): - raise self.exc - return len(data) - - def sha(self, as_hex = False): - """:return: sha so far - :param as_hex: if True, sha will be hex-encoded, binary otherwise""" - if as_hex: - return self.sha1.hexdigest() - return self.sha1.digest() - - def close(self): - remainder = self.zip.flush() - if write(self.fd, remainder) != len(remainder): - raise self.exc - return close(self.fd) - - -class DecompressMemMapReader(object): - """Reads data in chunks from a memory map and decompresses it. 
The client sees - only the uncompressed data, respective file-like read calls are handling on-demand - buffered decompression accordingly - - A constraint on the total size of bytes is activated, simulating - a logical file within a possibly larger physical memory area - - To read efficiently, you clearly don't want to read individual bytes, instead, - read a few kilobytes at least. - - :note: The chunk-size should be carefully selected as it will involve quite a bit - of string copying due to the way the zlib is implemented. Its very wasteful, - hence we try to find a good tradeoff between allocation time and number of - times we actually allocate. An own zlib implementation would be good here - to better support streamed reading - it would only need to keep the mmap - and decompress it into chunks, thats all ... """ - __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') - - max_read_size = 512*1024 - - def __init__(self, m, close_on_deletion): - """Initialize with mmap for stream reading""" - self._m = m - self._zip = zlib.decompressobj() - self._buf = None # buffer of decompressed bytes - self._buflen = 0 # length of bytes in buffer - self._s = 0 # size of uncompressed data to read in total - self._br = 0 # num uncompressed bytes read - self._cws = 0 # start byte of compression window - self._cwe = 0 # end byte of compression window - self._close = close_on_deletion # close the memmap on deletion ? - - def __del__(self): - if self._close: - self._m.close() - # END handle resource freeing - - def initialize(self, size=0): - """Initialize this instance for acting as a read-only stream for size bytes. - :param size: size in bytes to be decompresed before being depleted. - If 0, default object header information is parsed from the data, - returning a tuple of (type_string, uncompressed_size) - If not 0, the size will be used, and None is returned. - :note: must only be called exactly once""" - if size: - self._s = size - return - # END handle size - - # read header - maxb = 512 # should really be enough, cgit uses 8192 I believe - self._s = maxb - hdr = self.read(maxb) - hdrend = hdr.find("\0") - type, size = hdr[:hdrend].split(" ") - self._s = int(size) - - # adjust internal state to match actual header length that we ignore - # The buffer will be depleted first on future reads - self._br = 0 - hdrend += 1 # count terminating \0 - self._buf = StringIO(hdr[hdrend:]) - self._buflen = len(hdr) - hdrend - - return type, size - - def read(self, size=-1): - if size < 1: - size = self._s - self._br - else: - size = min(size, self._s - self._br) - # END clamp size - - if size == 0: - return str() - # END handle depletion - - # protect from memory peaks - # If he tries to read large chunks, our memory patterns get really bad - # as we end up copying a possibly huge chunk from our memory map right into - # memory. This might not even be possible. Nonetheless, try to dampen the - # effect a bit by reading in chunks, returning a huge string in the end. - # Our performance now depends on StringIO. This way we don't need two large - # buffers in peak times, but only one large one in the end which is - # the return buffer - # NO: We don't do it - if the user thinks its best, he is right. If he - # has trouble, he will start reading in chunks. According to our tests - # its still faster if we read 10 Mb at once instead of chunking it. 
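# Aside: a hedged sketch, not part of the patch, of the caller-side chunking
# the comment above recommends instead of one huge read() - it mirrors the
# loop the performance tests use with a ~512kB chunk size.
def read_all_chunked(stream, chunk_size=512 * 1024):
    chunks = list()
    while True:
        data = stream.read(chunk_size)
        chunks.append(data)
        if len(data) < chunk_size:
            break
    # END read loop
    return "".join(chunks)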
- - # if size > self.max_read_size: - # sio = StringIO() - # while size: - # read_size = min(self.max_read_size, size) - # data = self.read(read_size) - # sio.write(data) - # size -= len(data) - # if len(data) < read_size: - # break - # # END data loop - # sio.seek(0) - # return sio.getvalue() - # # END handle maxread - # - # deplete the buffer, then just continue using the decompress object - # which has an own buffer. We just need this to transparently parse the - # header from the zlib stream - dat = str() - if self._buf: - if self._buflen >= size: - # have enough data - dat = self._buf.read(size) - self._buflen -= size - self._br += size - return dat - else: - dat = self._buf.read() # ouch, duplicates data - size -= self._buflen - self._br += self._buflen - - self._buflen = 0 - self._buf = None - # END handle buffer len - # END handle buffer - - # decompress some data - # Abstract: zlib needs to operate on chunks of our memory map ( which may - # be large ), as it will otherwise and always fill in the 'unconsumed_tail' - # attribute which possible reads our whole map to the end, forcing - # everything to be read from disk even though just a portion was requested. - # As this would be a nogo, we workaround it by passing only chunks of data, - # moving the window into the memory map along as we decompress, which keeps - # the tail smaller than our chunk-size. This causes 'only' the chunk to be - # copied once, and another copy of a part of it when it creates the unconsumed - # tail. We have to use it to hand in the appropriate amount of bytes durin g - # the next read. - tail = self._zip.unconsumed_tail - if tail: - # move the window, make it as large as size demands. For code-clarity, - # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up - # with not getting enough data uncompressed, so we had to sort that out as well. - # Now we just assume the worst case, hence the data is uncompressed and the window - # needs to be as large as the uncompressed bytes we want to read. - self._cws = self._cwe - len(tail) - self._cwe = self._cws + size - - - indata = self._m[self._cws:self._cwe] # another copy ... :( - # get the actual window end to be sure we don't use it for computations - self._cwe = self._cws + len(indata) - else: - cws = self._cws - self._cws = self._cwe - self._cwe = cws + size - indata = self._m[self._cws:self._cwe] # ... 
copy it again :( - # END handle tail - - dcompdat = self._zip.decompress(indata, size) - - self._br += len(dcompdat) - if dat: - dcompdat = dat + dcompdat - - return dcompdat - -#} END classes diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py index 2398c93d..bca3ad8b 100644 --- a/test/git/performance/test_commit.py +++ b/test/git/performance/test_commit.py @@ -72,7 +72,7 @@ class TestPerformance(TestBigRepoRW): assert_commit_serialization(self.gitrwrepo, self.head_sha_2k, True) rwrepo = self.gitrwrepo - make_object = rwrepo.odb.to_object + make_object = rwrepo.odb.store # direct serialization - deserialization can be tested afterwards # serialization is probably limited on IO hc = rwrepo.commit(self.head_sha_2k) diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py index d31bee14..30fd8048 100644 --- a/test/git/performance/test_streams.py +++ b/test/git/performance/test_streams.py @@ -51,7 +51,7 @@ class TestObjDBPerformance(TestBigRepoR): # writing - due to the compression it will seem faster than it is st = time() - sha = ldb.to_object('blob', size, stream) + sha = ldb.store('blob', size, stream) elapsed_add = time() - st assert ldb.has_object(sha) db_file = ldb.readable_db_object_path(sha) @@ -63,7 +63,7 @@ class TestObjDBPerformance(TestBigRepoR): # reading all at once st = time() - type, size, shastream = ldb.object(sha) + type, size, shastream = ldb.stream(sha) shadata = shastream.read() elapsed_readall = time() - st @@ -76,7 +76,7 @@ class TestObjDBPerformance(TestBigRepoR): cs = 512*1000 chunks = list() st = time() - type, size, shastream = ldb.object(sha) + type, size, shastream = ldb.stream(sha) while True: data = shastream.read(cs) chunks.append(data) diff --git a/test/git/test_commit.py b/test/git/test_commit.py index a5f184e6..e914b9a7 100644 --- a/test/git/test_commit.py +++ b/test/git/test_commit.py @@ -31,7 +31,7 @@ def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False) streamlen = stream.tell() stream.seek(0) - csha = rwrepo.odb.to_object(Commit.type, streamlen, stream) + csha = rwrepo.odb.store(Commit.type, streamlen, stream) assert csha == cm.sha nc = Commit(rwrepo, Commit.NULL_HEX_SHA, cm.tree.sha, @@ -45,7 +45,7 @@ def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False) ns += 1 streamlen = stream.tell() stream.seek(0) - nc.sha = rwrepo.odb.to_object(Commit.type, streamlen, stream) + nc.sha = rwrepo.odb.store(Commit.type, streamlen, stream) # if it worked, we have exactly the same contents !
assert nc.sha == cm.sha diff --git a/test/git/test_odb.py b/test/git/test_odb.py index b2840719..80597df6 100644 --- a/test/git/test_odb.py +++ b/test/git/test_odb.py @@ -18,26 +18,26 @@ class TestDB(TestBase): all_data = (two_lines, ) def _assert_object_writing(self, db): - """General tests to verify object writing, compatible to iObjectDBW + """General tests to verify object writing, compatible to ObjectDBW :note: requires write access to the database""" # start in dry-run mode for dry_run in range(1, -1, -1): for data in self.all_data: for hex_sha in range(2): - sha = db.to_object(Blob.type, len(data), StringIO(data), dry_run, hex_sha) + sha = db.store(Blob.type, len(data), StringIO(data), dry_run, hex_sha) assert db.has_object(sha) != dry_run assert len(sha) == 20 + hex_sha * 20 # verify data - the slow way, we want to run code if not dry_run: - type, size = db.object_info(sha) + type, size = db.info(sha) assert Blob.type == type assert size == len(data) - type, size, stream = db.object(sha) + type, size, stream = db.stream(sha) assert stream.read() == data else: - self.failUnlessRaises(BadObject, db.object_info, sha) + self.failUnlessRaises(BadObject, db.info, sha) self.failUnlessRaises(BadObject, db.object, sha) # END for each sha type # END for each data set -- cgit v1.2.3 From e746f96bcc29238b79118123028ca170adc4ff0f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 4 Jun 2010 17:22:08 +0200 Subject: Fixed implementation after design change to deal with it - all tests run, but next there will have to be more thorough testing --- lib/git/__init__.py | 3 + lib/git/objects/base.py | 17 ++-- lib/git/objects/commit.py | 4 +- lib/git/odb/__init__.py | 4 + lib/git/odb/db.py | 116 ++++++++++++------------ lib/git/odb/fun.py | 46 +++++----- lib/git/odb/stream.py | 168 +++++++++++++++++++++++------------ lib/git/odb/utils.py | 2 - lib/git/repo.py | 2 +- test/git/performance/test_commit.py | 2 +- test/git/performance/test_streams.py | 12 +-- test/git/performance/test_utils.py | 15 ++++ test/git/test_commit.py | 12 ++- test/git/test_odb.py | 52 +++++++---- 14 files changed, 275 insertions(+), 180 deletions(-) diff --git a/lib/git/__init__.py b/lib/git/__init__.py index aac539eb..2f17c55b 100644 --- a/lib/git/__init__.py +++ b/lib/git/__init__.py @@ -22,5 +22,8 @@ from git.remote import * from git.index import * from git.utils import LockFile, BlockingLockFile +# odb is NOT imported intentionally - if you really want it, you should get it +# yourself as its part of the core + __all__ = [ name for name, obj in locals().items() if not (name.startswith('_') or inspect.ismodule(obj)) ] diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py index 76384888..5a3a15a7 100644 --- a/lib/git/objects/base.py +++ b/lib/git/objects/base.py @@ -76,12 +76,14 @@ class Object(LazyMixin): Retrieve object information """ if attr == "size": - typename, self.size = self.repo.odb.info(self.sha) - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) + oinfo = self.repo.odb.info(self.sha) + self.size = oinfo.size + assert oinfo.type == self.type, _assertion_msg_format % (self.sha, oinfo.type, self.type) elif attr == "data": - typename, self.size, stream = self.repo.odb.stream(self.sha) - self.data = stream.read() # once we have an own odb, we can delay reading - assert typename == self.type, _assertion_msg_format % (self.sha, typename, self.type) + ostream = self.repo.odb.stream(self.sha) + self.size = ostream.size + self.data = ostream.read() + assert ostream.type ==
self.type, _assertion_msg_format % (self.sha, ostream.type, self.type) else: super(Object,self)._set_cache_(attr) @@ -124,14 +126,13 @@ class Object(LazyMixin): def data_stream(self): """ :return: File Object compatible stream to the uncompressed raw data of the object :note: returned streams must be read in order""" - type, size, stream = self.repo.odb.stream(self.sha) - return stream + return self.repo.odb.stream(self.sha) def stream_data(self, ostream): """Writes our data directly to the given output stream :param ostream: File object compatible stream object. :return: self""" - type, size, istream = self.repo.odb.stream(self.sha) + istream = self.repo.odb.stream(self.sha) stream_copy(istream, ostream) return self diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index dbc0cf27..9a3c2c95 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -9,6 +9,7 @@ import git.diff as diff import git.stats as stats from git.actor import Actor from tree import Tree +from git.odb import IStream from cStringIO import StringIO import base import utils @@ -346,7 +347,8 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri streamlen = stream.tell() stream.seek(0) - new_commit.sha = repo.odb.store(cls.type, streamlen, stream, sha_as_hex=True) + istream = repo.odb.store(IStream(cls.type, streamlen, stream)) + new_commit.sha = istream.sha if head: try: diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py index 17000244..5789d7eb 100644 --- a/lib/git/odb/__init__.py +++ b/lib/git/odb/__init__.py @@ -1,2 +1,6 @@ """Initialize the object database module""" +# default imports +from db import * +from stream import * + diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py index 7ae8f446..a8de28ec 100644 --- a/lib/git/odb/db.py +++ b/lib/git/odb/db.py @@ -8,7 +8,10 @@ from git.errors import ( from stream import ( DecompressMemMapReader, - FDCompressedSha1Writer + FDCompressedSha1Writer, + Sha1Writer, + OStream, + OInfo ) from utils import ( @@ -34,11 +37,13 @@ import mmap import os +__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'LooseObjectDB', 'PackedDB', + 'CompoundDB', 'ReferenceDB', 'GitObjectDB' ) + class ObjectDBR(object): """Defines an interface for object database lookup. 
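# Aside: a hedged sketch, not part of the patch, of the accessor protocol the
# base.py hunk above relies on after the design change - info() yields an
# OInfo, stream() an OStream that carries type and size alongside the data.
# 'repo' and 'sha' are hypothetical stand-ins.
oinfo = repo.odb.info(sha)
ostream = repo.odb.stream(sha)
assert oinfo.type == ostream.type and oinfo.size == ostream.size
assert len(ostream.read()) == ostream.size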
Objects are identified either by hex-sha (40 bytes) or by sha (20 bytes)""" - __slots__ = tuple() def __contains__(self, sha): return self.has_obj @@ -52,7 +57,7 @@ class ObjectDBR(object): raise NotImplementedError("To be implemented in subclass") def info(self, sha): - """ :return: ODB_Info instance + """ :return: OInfo instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") @@ -60,27 +65,26 @@ class ObjectDBR(object): def info_async(self, input_channel): """Retrieve information of a multitude of objects asynchronously :param input_channel: Channel yielding the sha's of the objects of interest - :return: Channel yielding ODB_Info|InvalidODB_Info, in any order""" + :return: Channel yielding OInfo|InvalidOInfo, in any order""" raise NotImplementedError("To be implemented in subclass") def stream(self, sha): - """:return: ODB_OStream instance + """:return: OStream instance :param sha: 40 bytes hexsha or 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def stream_async(self, input_channel): - """Retrieve the ODB_OStream of multiple objects + """Retrieve the OStream of multiple objects :param input_channel: see ``info`` :param max_threads: see ``ObjectDBW.store`` - :return: Channel yielding ODB_OStream|InvalidODB_OStream instances in any order""" + :return: Channel yielding OStream|InvalidOStream instances in any order""" raise NotImplementedError("To be implemented in subclass") #} END query interface class ObjectDBW(object): """Defines an interface to create objects in the database""" - __slots__ = "_ostream" def __init__(self, *args, **kwargs): self._ostream = None @@ -99,12 +103,12 @@ class ObjectDBW(object): def ostream(self): """:return: overridden output stream this instance will write to, or None if it will write to the default stream""" - return self._ostream + return self._ostream def store(self, istream): """Create a new object in the database :return: the input istream object with its sha set to its corresponding value - :param istream: ODB_IStream compatible instance. If its sha is already set + :param istream: IStream compatible instance. If its sha is already set to a value, the object will just be stored in the our database format, in which case the input stream is expected to be in object format ( header + contents ). :raise IOError: if data could not be written""" @@ -115,22 +119,16 @@ class ObjectDBW(object): return right away, returning an output channel which receives the results as they are computed. - :return: Channel yielding your ODB_IStream which served as input, in any order. + :return: Channel yielding your IStream which served as input, in any order. The IStreams sha will be set to the sha it received during the process, or its error attribute will be set to the exception informing about the error. - :param input_channel: Channel yielding ODB_IStream instance. + :param input_channel: Channel yielding IStream instance. As the same instances will be used in the output channel, you can create a map between the id(istream) -> istream :note:As some ODB implementations implement this operation as atomic, they might abort the whole operation if one item could not be processed. Hence check how many items have actually been produced.""" - # a trivial implementation, ignoring the threads for now - # TODO: add configuration to the class to determine whether we may - # actually use multiple threads, default False of course. 
If the add - shas = list() - for args in iter_info: - shas.append(self.store(dry_run=dry_run, sha_as_hex=sha_as_hex, *args)) - return shas + raise NotImplementedError("To be implemented in subclass") #} END edit interface @@ -138,7 +136,6 @@ class ObjectDBW(object): class FileDBBase(object): """Provides basic facilities to retrieve files of interest, including caching facilities to help mapping hexsha's to objects""" - __slots__ = ('_root_path', ) def __init__(self, root_path): """Initialize this instance to look for its files at the given root path @@ -164,15 +161,11 @@ class FileDBBase(object): return join(self._root_path, rela_path) #} END interface - #{ Utiltities - - - #} END utilities class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): """A database which operates on loose object files""" - __slots__ = ('_hexsha_to_file', '_fd_open_flags') + # CONFIGURATION # chunks in which data will be copied between streams stream_chunk_size = chunk_size @@ -238,21 +231,26 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): finally: os.close(fd) # END assure file is closed + + def set_ostream(self, stream): + """:raise TypeError: if the stream does not support the Sha1Writer interface""" + if stream is not None and not isinstance(stream, Sha1Writer): + raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) + return super(LooseObjectDB, self).set_ostream(stream) def info(self, sha): m = self._map_loose_object(sha) try: - return loose_object_header_info(m) + type, size = loose_object_header_info(m) + return OInfo(sha, type, size) finally: m.close() # END assure release of system resources - def object(self, sha): + def stream(self, sha): m = self._map_loose_object(sha) - reader = DecompressMemMapReader(m, close_on_deletion = True) - type, size = reader.initialize() - - return type, size, reader + type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True) + return OStream(sha, type, size, stream) def has_object(self, sha): try: @@ -263,27 +261,33 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): # END check existance def store(self, istream): - # open a tmp file to write the data to - # todo: implement ostream properly - fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) - writer = FDCompressedSha1Writer(fd) + """note: The sha we produce will be hex by nature""" + assert istream.sha is None, "Direct istream writing not yet implemented" + tmp_path = None + writer = self.ostream() + if writer is None: + # open a tmp file to write the data to + fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) + writer = FDCompressedSha1Writer(fd) + # END handle custom writer try: - write_object(type, size, stream, writer, - close_target_stream=True, chunk_size=self.stream_chunk_size) - except: - os.remove(tmp_path) - raise - # END assure tmpfile removal on error - + try: + write_object(istream.type, istream.size, istream.read, writer.write, + chunk_size=self.stream_chunk_size) + except: + if tmp_path: + os.remove(tmp_path) + raise + # END assure tmpfile removal on error + finally: + if tmp_path: + writer.close() + # END assure target stream is closed - # in dry-run mode, we delete the file afterwards sha = writer.sha(as_hex=True) - if dry_run: - os.remove(tmp_path) - else: - # rename the file into place + if tmp_path: obj_path = self.db_path(self.object_path(sha)) obj_dir = dirname(obj_path) if not isdir(obj_dir): @@ -292,11 +296,8 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): rename(tmp_path, obj_path) # END 
handle dry_run - if not sha_as_hex: - sha = hex_to_bin(sha) - # END handle sha format - - return sha + istream.sha = sha + return istream class PackedDB(FileDBBase, ObjectDBR): @@ -320,18 +321,17 @@ class GitObjectDB(LooseObjectDB): :note: for now, we use the git command to do all the lookup, just until he have packs and the other implementations """ - __slots__ = ('_git', ) def __init__(self, root_path, git): """Initialize this instance with the root and a git command""" super(GitObjectDB, self).__init__(root_path) self._git = git def info(self, sha): - discard, type, size = self._git.get_object_header(sha) - return type, size + t = self._git.get_object_header(sha) + return OInfo(t[0], t[1], t[2]) - def object(self, sha): + def stream(self, sha): """For now, all lookup is done by git itself""" - discard, type, size, stream = self._git.stream_object_data(sha) - return type, size, stream + t = self._git.stream_object_data(sha) + return OStream(t[0], t[1], t[2], t[3]) diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py index ee7144dd..870a6f02 100644 --- a/lib/git/odb/fun.py +++ b/lib/git/odb/fun.py @@ -21,6 +21,8 @@ type_id_to_type_map = { # used when dealing with larger streams chunk_size = 1000*1000 +__all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info', + 'write_object' ) #{ Routines @@ -73,42 +75,34 @@ def object_header_info(m): raise BadObjectType(type_id) # END handle exceptions -def write_object(type, size, source_stream, target_stream, close_target_stream=True, - chunk_size=chunk_size): +def write_object(type, size, read, write, chunk_size=chunk_size): """Write the object as identified by type, size and source_stream into the target_stream :param type: type string of the object :param size: amount of bytes to write from source_stream - :param source_stream: stream as file-like object providing at least size bytes - :param target_stream: stream as file-like object to receive the data + :param read: read method of a stream providing the content data + :param write: write method of the output stream :param close_target_stream: if True, the target stream will be closed when the routine exits, even if an error is thrown - :param chunk_size: size of chunks to read from source. 
Larger values can be beneficial - for io performance, but cost more memory as well :return: The actual amount of bytes written to stream, which includes the header and a trailing newline""" tbw = 0 # total num bytes written dbw = 0 # num data bytes written - try: - # WRITE HEADER: type SP size NULL - tbw += target_stream.write("%s %i\0" % (type, size)) - - # WRITE ALL DATA UP TO SIZE - while True: - cs = min(chunk_size, size-dbw) - data_len = target_stream.write(source_stream.read(cs)) - dbw += data_len - if data_len < cs or dbw == size: - tbw += dbw - break - # END check for stream end - # END duplicate data - return tbw - finally: - if close_target_stream: - target_stream.close() - # END handle stream closing - # END assure file was closed + # WRITE HEADER: type SP size NULL + tbw += write("%s %i\0" % (type, size)) + + # WRITE ALL DATA UP TO SIZE + while True: + cs = min(chunk_size, size-dbw) + data_len = write(read(cs)) + dbw += data_len + if data_len < cs or dbw == size: + tbw += dbw + break + # END check for stream end + # END duplicate data + return tbw + #} END routines diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py index 325c1444..d1181382 100644 --- a/lib/git/odb/stream.py +++ b/lib/git/odb/stream.py @@ -5,10 +5,13 @@ import errno from utils import ( to_hex_sha, - to_bin_sha + to_bin_sha, + write, + close ) -__all__ = ('FDCompressedSha1Writer', 'DecompressMemMapReader') +__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream', + 'DecompressMemMapReader', 'FDCompressedSha1Writer') # ZLIB configuration @@ -18,7 +21,7 @@ Z_BEST_SPEED = 1 #{ ODB Bases -class ODB_Info(tuple): +class OInfo(tuple): """Carries information about an object in an ODB, provdiing information about the sha of the object, the type_string as well as the uncompressed size in bytes. @@ -35,8 +38,8 @@ class ODB_Info(tuple): def __new__(cls, sha, type, size): return tuple.__new__(cls, (sha, type, size)) - def __init__(self, sha, type, size): - pass + def __init__(self, *args): + tuple.__init__(self) #{ Interface @property @@ -53,38 +56,52 @@ class ODB_Info(tuple): #} END interface -class ODB_OStream(ODB_Info): +class OStream(OInfo): """Base for object streams retrieved from the database, providing additional information about the stream. Generally, ODB streams are read-only as objects are immutable""" __slots__ = tuple() - def __new__(cls, sha, type, size, *args, **kwargs): + def __new__(cls, sha, type, size, stream, *args, **kwargs): """Helps with the initialization of subclasses""" - return tuple.__new__(cls, (sha, type, size)) + return tuple.__new__(cls, (sha, type, size, stream)) + + + def __init__(self, *args, **kwargs): + tuple.__init__(self) + #{ Interface def is_compressed(self): - """:return: True if reads of this stream yield zlib compressed data. + """:return: True if reads of this stream yield zlib compressed data. Default False :note: this does not imply anything about the actual internal storage. Hence the data could be uncompressed, but read compressed, or vice versa""" - raise NotImplementedError("To be implemented by subclass") + raise False + + #} END interface + + #{ Stream Reader Interface + + def read(self, size=-1): + return self[3].read(size) + + #} END stream reader interface -class ODB_IStream(list): +class IStream(list): """Represents an input content stream to be fed into the ODB. It is mutable to allow the ODB to record information about the operations outcome right in this instance. 
- It provides interfaces for the ODB_OStream and a StreamReader to allow the instance + It provides interfaces for the OStream and a StreamReader to allow the instance to blend in without prior conversion. The only method your content stream must support is 'read'""" __slots__ = tuple() def __new__(cls, type, size, stream, sha=None, compressed=False): - list.__new__(cls, (sha, type, size, stream, compressed, None)) + return list.__new__(cls, (sha, type, size, stream, compressed, None)) - def __init__(cls, type, size, stream, sha=None, compressed=None): - pass + def __init__(self, type, size, stream, sha=None, compressed=None): + list.__init__(self, (sha, type, size, stream, compressed, None)) #{ Interface @@ -127,25 +144,42 @@ class ODB_IStream(list): sha = property(_sha, _set_sha) - @property - def type(self): + + def _type(self): return self[1] + + def _set_type(self, type): + self[1] = type - @property - def size(self): + type = property(_type, _set_type) + + def _size(self): return self[2] + + def _set_size(self, size): + self[2] = size + + size = property(_size, _set_size) + + def _stream(self): + return self[3] + + def _set_stream(self, stream): + self[3] = stream + + stream = property(_stream, _set_stream) #} END odb info interface - #{ ODB_OStream interface + #{ OStream interface def is_compressed(self): return self[4] - #} END ODB_OStream interface + #} END OStream interface -class InvalidODB_Info(tuple): +class InvalidOInfo(tuple): """Carries information about a sha identifying an object which is invalid in the queried database. The exception attribute provides more information about the cause of the issue""" @@ -155,7 +189,7 @@ class InvalidODB_Info(tuple): return tuple.__new__(cls, (sha, exc)) def __init__(self, sha, exc): - pass + tuple.__init__(self, (sha, exc)) @property def sha(self): @@ -166,7 +200,8 @@ class InvalidODB_Info(tuple): """:return: exception instance explaining the failure""" return self[1] -class InvalidODB_OStream(InvalidODB_Info): + +class InvalidOStream(InvalidOInfo): """Carries information about an invalid ODB stream""" __slots__ = tuple() @@ -175,7 +210,7 @@ class InvalidODB_OStream(InvalidODB_Info): #{ RO Streams -class DecompressMemMapReader(ODB_OStream): +class DecompressMemMapReader(object): """Reads data in chunks from a memory map and decompresses it. The client sees only the uncompressed data, respective file-like read calls are handling on-demand buffered decompression accordingly @@ -192,17 +227,17 @@ class DecompressMemMapReader(ODB_OStream): times we actually allocate. An own zlib implementation would be good here to better support streamed reading - it would only need to keep the mmap and decompress it into chunks, thats all ... 
""" - # __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') + __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') max_read_size = 512*1024 - def __init__(self, sha, type, size, m, close_on_deletion): + def __init__(self, m, close_on_deletion, size): """Initialize with mmap for stream reading""" self._m = m self._zip = zlib.decompressobj() self._buf = None # buffer of decompressed bytes self._buflen = 0 # length of bytes in buffer - self._s = 0 # size of uncompressed data to read in total + self._s = size # size of uncompressed data to read in total self._br = 0 # num uncompressed bytes read self._cws = 0 # start byte of compression window self._cwe = 0 # end byte of compression window @@ -213,34 +248,33 @@ class DecompressMemMapReader(ODB_OStream): self._m.close() # END handle resource freeing - def initialize(self, size=0): - """Initialize this instance for acting as a read-only stream for size bytes. - :param size: size in bytes to be decompresed before being depleted. - If 0, default object header information is parsed from the data, - returning a tuple of (type_string, uncompressed_size) - If not 0, the size will be used, and None is returned. - :note: must only be called exactly once""" - if size: - self._s = size - return - # END handle size + @classmethod + def new(self, m, close_on_deletion=False): + """Create a new DecompressMemMapReader instance for acting as a read-only stream + This method parses the object header from m and returns the parsed + type and size, as well as the created stream instance. + :param m: memory map on which to oparate + :param close_on_deletion: if True, the memory map will be closed once we are + being deleted""" + inst = DecompressMemMapReader(m, close_on_deletion, 0) # read header maxb = 512 # should really be enough, cgit uses 8192 I believe - self._s = maxb - hdr = self.read(maxb) + inst._s = maxb + hdr = inst.read(maxb) hdrend = hdr.find("\0") type, size = hdr[:hdrend].split(" ") - self._s = int(size) + size = int(size) + inst._s = size # adjust internal state to match actual header length that we ignore # The buffer will be depleted first on future reads - self._br = 0 + inst._br = 0 hdrend += 1 # count terminating \0 - self._buf = StringIO(hdr[hdrend:]) - self._buflen = len(hdr) - hdrend + inst._buf = StringIO(hdr[hdrend:]) + inst._buflen = len(hdr) - hdrend - return type, size + return type, size, inst def read(self, size=-1): if size < 1: @@ -346,7 +380,35 @@ class DecompressMemMapReader(ODB_OStream): #{ W Streams -class FDCompressedSha1Writer(object): +class Sha1Writer(object): + """Simple stream writer which produces a sha whenever you like as it degests + everything it is supposed to write""" + + def __init__(self): + self.sha1 = make_sha("") + + #{ Stream Interface + + def write(self, data): + """:raise IOError: If not all bytes could be written + :return: lenght of incoming data""" + self.sha1.update(data) + return len(data) + + # END stream interface + + #{ Interface + + def sha(self, as_hex = False): + """:return: sha so far + :param as_hex: if True, sha will be hex-encoded, binary otherwise""" + if as_hex: + return self.sha1.hexdigest() + return self.sha1.digest() + + #} END interface + +class FDCompressedSha1Writer(Sha1Writer): """Digests data written to it, making the sha available, then compress the data and write it to the file descriptor :note: operates on raw file descriptors @@ -357,10 +419,12 @@ class FDCompressedSha1Writer(object): exc = IOError("Failed to write 
+class FDCompressedSha1Writer(Sha1Writer):
 	"""Digests data written to it, making the sha available, then compresses the
 	data and writes it to the file descriptor
 	:note: operates on raw file descriptors
@@ -357,10 +419,12 @@
 	exc = IOError("Failed to write all bytes to file descriptor")

 	def __init__(self, fd):
+		super(FDCompressedSha1Writer, self).__init__()
 		self.fd = fd
-		self.sha1 = make_sha("")
 		self.zip = zlib.compressobj(Z_BEST_SPEED)

+	#{ Stream Interface
+
 	def write(self, data):
 		""":raise IOError: If not all bytes could be written
 		:return: length of incoming data"""
@@ -371,18 +435,12 @@
 			raise self.exc
 		return len(data)

-	def sha(self, as_hex = False):
-		""":return: sha so far
-		:param as_hex: if True, sha will be hex-encoded, binary otherwise"""
-		if as_hex:
-			return self.sha1.hexdigest()
-		return self.sha1.digest()
-
 	def close(self):
 		remainder = self.zip.flush()
 		if write(self.fd, remainder) != len(remainder):
 			raise self.exc
 		return close(self.fd)

+	#} END stream interface

 #} END W streams
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
index 61565ba9..6863e97b 100644
--- a/lib/git/odb/utils.py
+++ b/lib/git/odb/utils.py
@@ -2,8 +2,6 @@ import binascii
 import os
 import errno

-__all__ = ('FDSha1Writer', )
-
 #{ Routines

 hex_to_bin = binascii.a2b_hex
diff --git a/lib/git/repo.py b/lib/git/repo.py
index 1afb1eb7..78e5f526 100644
--- a/lib/git/repo.py
+++ b/lib/git/repo.py
@@ -13,7 +13,7 @@ from objects import *
 from config import GitConfigParser
 from remote import Remote

-from odb.db import GitObjectDB
+from odb import GitObjectDB

 import os
 import sys
diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py
index bca3ad8b..0571d0d9 100644
--- a/test/git/performance/test_commit.py
+++ b/test/git/performance/test_commit.py
@@ -91,7 +91,7 @@ class TestPerformance(TestBigRepoRW):
 			slen = stream.tell()
 			stream.seek(0)

-			cm.sha = make_object(Commit.type, slen, stream)
+			cm.sha = make_object(IStream(Commit.type, slen, stream)).sha
 		# END commit creation
 		elapsed = time() - st
diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py
index 30fd8048..01ec9fc4 100644
--- a/test/git/performance/test_streams.py
+++ b/test/git/performance/test_streams.py
@@ -1,7 +1,7 @@
 """Performance data streaming performance"""
 from test.testlib import *
-from git.odb.db import *
+from git.odb import *

 from array import array
 from cStringIO import StringIO
@@ -51,7 +51,7 @@ class TestObjDBPerformance(TestBigRepoR):
 		# writing - due to the compression it will seem faster than it is
 		st = time()
-		sha = ldb.store('blob', size, stream)
+		sha = ldb.store(IStream('blob', size, stream)).sha
 		elapsed_add = time() - st
 		assert ldb.has_object(sha)
 		db_file = ldb.readable_db_object_path(sha)
@@ -63,8 +63,8 @@ class TestObjDBPerformance(TestBigRepoR):

 		# reading all at once
 		st = time()
-		type, size, shastream = ldb.stream(sha)
-		shadata = shastream.read()
+		ostream = ldb.stream(sha)
+		shadata = ostream.read()
 		elapsed_readall = time() - st

 		stream.seek(0)
@@ -76,9 +76,9 @@ class TestObjDBPerformance(TestBigRepoR):
 		cs = 512*1000
 		chunks = list()
 		st = time()
-		type, size, shastream = ldb.stream(sha)
+		ostream = ldb.stream(sha)
 		while True:
-			data = shastream.read(cs)
+			data = ostream.read(cs)
 			chunks.append(data)
 			if len(data) < cs:
 				break
diff --git a/test/git/performance/test_utils.py b/test/git/performance/test_utils.py
index 47366d34..76adffec 100644
--- a/test/git/performance/test_utils.py
+++ b/test/git/performance/test_utils.py
@@ -42,3 +42,18 @@ class TestUtilPerformance(TestBigRepoR):
 			elapsed = time() - st
 			print >> sys.stderr, "Accessed %s.attr %i times in %s s ( %f acc / s)" % (cls.__name__, ni, elapsed, ni / elapsed)
 		# END for each class type
+
+		# check num of sequence-accesses
+		for cls in (list, tuple):
+			x = 10
+			st = time()
+			s = cls(range(x))
+			for i in xrange(ni):
+				s[0]
+				s[1]
+				s[2]
+			# END for
+			elapsed = time() - st
+			na = ni * 3
+			print >> sys.stderr, "Accessed %s[x] %i times in %s s ( %f acc / s)" % (cls.__name__, na, elapsed, na / elapsed)
+		# END for each sequence
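The performance tests above now follow the same two-call protocol everywhere: store() takes an IStream and returns it with the sha filled in, while stream() hands back an OStream. A condensed sketch of that round trip, assuming ldb is a loose object database with write access:

    from cStringIO import StringIO
    from git.odb import IStream

    data = "x" * 1000                             # hypothetical payload
    sha = ldb.store(IStream('blob', len(data), StringIO(data))).sha

    ostream = ldb.stream(sha)
    assert (ostream.type, ostream.size) == ('blob', len(data))
    assert ostream.read() == data
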
diff --git a/test/git/test_commit.py b/test/git/test_commit.py
index e914b9a7..e65e2e59 100644
--- a/test/git/test_commit.py
+++ b/test/git/test_commit.py
@@ -6,6 +6,7 @@
 from test.testlib import *
 from git import *
+from git.odb import IStream
 from cStringIO import StringIO
 import time
@@ -31,8 +32,8 @@ def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False)
 	streamlen = stream.tell()
 	stream.seek(0)

-	csha = rwrepo.odb.store(Commit.type, streamlen, stream)
-	assert csha == cm.sha
+	istream = rwrepo.odb.store(IStream(Commit.type, streamlen, stream))
+	assert istream.sha == cm.sha

 	nc = Commit(rwrepo, Commit.NULL_HEX_SHA, cm.tree.sha,
 		cm.author, cm.authored_date, cm.author_tz_offset,
@@ -45,7 +46,12 @@ def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False)
 		ns += 1
 		streamlen = stream.tell()
 		stream.seek(0)
-		nc.sha = rwrepo.odb.store(Commit.type, streamlen, stream)
+
+		# reuse istream
+		istream.size = streamlen
+		istream.stream = stream
+		istream.sha = None
+		nc.sha = rwrepo.odb.store(istream).sha

 		# if it worked, we have exactly the same contents !
 		assert nc.sha == cm.sha
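Note the reuse pattern the serialization test introduces: instead of building a fresh IStream per commit, it resets the mutable slots and hands the same instance back to store(). A sketch of the idiom, assuming odb is a writable object database such as rwrepo.odb above:

    from cStringIO import StringIO
    from git.odb import IStream

    istream = IStream(Commit.type, 0, StringIO(""))
    for serialized in ("commit one", "commit two"):   # hypothetical payloads
        istream.size = len(serialized)
        istream.stream = StringIO(serialized)
        istream.sha = None                            # clear so store() re-hashes
        sha = odb.store(istream).sha
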
diff --git a/test/git/test_odb.py b/test/git/test_odb.py
index 80597df6..c3a03714 100644
--- a/test/git/test_odb.py
+++ b/test/git/test_odb.py
@@ -1,7 +1,8 @@
 """Test for object db"""
 from test.testlib import *
-from git.odb.db import *
+from git.odb import *
+from git.odb.stream import Sha1Writer
 from git import Blob
 from git.errors import BadObject
@@ -20,26 +21,39 @@ class TestDB(TestBase):
 	def _assert_object_writing(self, db):
 		"""General tests to verify object writing, compatible to ObjectDBW
 		:note: requires write access to the database"""
-		# start in dry-run mode
-		for dry_run in range(1, -1, -1):
+		# start in 'dry-run' mode, using a simple sha1 writer
+		ostreams = (Sha1Writer, None)
+		for ostreamcls in ostreams:
 			for data in self.all_data:
-				for hex_sha in range(2):
-					sha = db.store(Blob.type, len(data), StringIO(data), dry_run, hex_sha)
-					assert db.has_object(sha) != dry_run
-					assert len(sha) == 20 + hex_sha * 20
+				dry_run = ostreamcls is not None
+				ostream = None
+				if ostreamcls is not None:
+					ostream = ostreamcls()
+				# END create ostream
+
+				prev_ostream = db.set_ostream(ostream)
+				assert type(prev_ostream) in ostreams or prev_ostream in ostreams
+
+				istream = IStream(Blob.type, len(data), StringIO(data))
+				my_istream = db.store(istream)
+				sha = istream.sha
+				assert my_istream is istream
+				assert db.has_object(sha) != dry_run
+				assert len(sha) == 40		# for now we require 40-byte hex shas as default
+
+				# verify data - the slow way, we want to run code
+				if not dry_run:
+					info = db.info(sha)
+					assert Blob.type == info.type
+					assert info.size == len(data)

-					# verify data - the slow way, we want to run code
-					if not dry_run:
-						type, size = db.info(sha)
-						assert Blob.type == type
-						assert size == len(data)
-
-						type, size, stream = db.stream(sha)
-						assert stream.read() == data
-					else:
-						self.failUnlessRaises(BadObject, db.info, sha)
-						self.failUnlessRaises(BadObject, db.object, sha)
-					# END for each sha type
+					ostream = db.stream(sha)
+					assert ostream.read() == data
+					assert ostream.type == Blob.type
+					assert ostream.size == len(data)
+				else:
+					self.failUnlessRaises(BadObject, db.info, sha)
+					self.failUnlessRaises(BadObject, db.stream, sha)
 			# END for each data set
 		# END for each dry_run mode
--
cgit v1.2.3