From a5497c432fe8ab1415d633d5d4b68f00a2807c26 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 6 Jun 2011 20:29:03 +0200 Subject: Streams returned by git cmd db are now containing all the data right away. This could cause several copies to exist, and makes the cmd implementation a bad choice if big files are involved --- git/db/cmd/base.py | 13 ++++++++++--- git/db/complex.py | 5 ++++- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'git') diff --git a/git/db/cmd/base.py b/git/db/cmd/base.py index b3354b0a..735e71df 100644 --- a/git/db/cmd/base.py +++ b/git/db/cmd/base.py @@ -31,6 +31,7 @@ from git.refs import ( TagReference ) from git.objects.commit import Commit +from cStringIO import StringIO import re import os import sys @@ -305,9 +306,15 @@ class CmdObjectDBRMixin(object): return OInfo(hex_to_bin(hexsha), typename, size) def stream(self, sha): - """For now, all lookup is done by git itself""" - hexsha, typename, size, stream = self._git.stream_object_data(bin_to_hex(sha)) - return OStream(hex_to_bin(hexsha), typename, size, stream) + """For now, all lookup is done by git itself + :note: As we don't know when the stream is actually read (and if it is + stored for later use) we read the data right away and cache it. 
+ This has HUGE performance implications, both for memory and for + reading/deserializing objects, but we have no other choice in order + to make the database behaviour consistent with other implementations !""" + + hexsha, typename, size, data = self._git.get_object_data(bin_to_hex(sha)) + return OStream(hex_to_bin(hexsha), typename, size, StringIO(data)) def partial_to_complete_sha_hex(self, partial_hexsha): """:return: Full binary 20 byte sha from the given partial hexsha diff --git a/git/db/complex.py b/git/db/complex.py index 71a39c45..31b047a0 100644 --- a/git/db/complex.py +++ b/git/db/complex.py @@ -8,7 +8,10 @@ __all__ = ['CmdGitDB', 'PureGitDB', 'CmdCompatibilityGitDB', 'PureCompatibilityG class CmdGitDB(CmdPartialGitDB, PurePartialGitDB): """A database which uses primarily the git command implementation, but falls back - to pure python where it is more feasible""" + to pure python where it is more feasible + :note: To assure consistent behaviour across implementations, when calling the + ``stream()`` method a cache is created. This makes this implementation a bad + choice when reading big files as these are streamed from memory in all cases.""" class CmdCompatibilityGitDB(RepoCompatibilityInterface, CmdGitDB): """A database which fills in its missing implementation using the pure python -- cgit v1.2.3