Skip to content

Commit be97c45

Browse files
committed
Initial frame for implementing read_tree using pure Python. As git-read-tree can do much more than we can (and is presumably faster), the .new method is used to create new index instances from up to 3 trees.
Implemented multi-tree traversal to facilitate building a stage list more efficiently (although I am not sure whether it could be faster to use a dictionary together with some intensive lookup), including a test. Added performance tests to learn how fast certain operations are, and whether one approach should be preferred over another.
1 parent 778234d commit be97c45

File tree

8 files changed

+339
-9
lines changed

8 files changed

+339
-9
lines changed

lib/git/index/base.py

+21-3
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,6 @@ def _set_cache_(self, attr):
133133
def _index_path(self):
134134
return join_path_native(self.repo.git_dir, "index")
135135

136-
137136
@property
138137
def path(self):
139138
""" :return: Path to the index file we are representing """
@@ -240,6 +239,26 @@ def merge_tree(self, rhs, base=None):
240239
self.repo.git.read_tree(args)
241240
return self
242241

242+
@classmethod
def new(cls, repo, *tree_sha):
    """Merge the given treeish revisions into a new index which is returned.
    This method behaves like git-read-tree --aggressive when doing the merge.

    :param repo: The repository the treeish are located in.

    :param *tree_sha:
        see ``from_tree``

    :return:
        New IndexFile instance. Its path will be undefined.
        If you intend to write such a merged Index, supply an alternate file_path
        to its 'write' method."""
    base_entries = aggressive_tree_merge(repo.odb, tree_sha)

    # Fixed: `self` is undefined inside a classmethod - the repository comes
    # from the `repo` argument.
    inst = cls(repo)
    # TODO: convert base_entries (BaseIndexEntry) into full IndexEntry objects
    # and install them on `inst` before returning it.
    raise NotImplementedError("convert to entries")
260+
261+
243262
@classmethod
244263
def from_tree(cls, repo, *treeish, **kwargs):
245264
"""
@@ -275,8 +294,7 @@ def from_tree(cls, repo, *treeish, **kwargs):
275294
276295
As the underlying git-read-tree command takes into account the current index,
277296
it will be temporarily moved out of the way to assure there are no unsuspected
278-
interferences.
279-
"""
297+
interferences."""
280298
if len(treeish) == 0 or len(treeish) > 3:
281299
raise ValueError("Please specify between 1 and 3 treeish, got %i" % len(treeish))
282300

lib/git/index/fun.py

+25-4
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
)
1313

1414
from typ import (
15+
BaseIndexEntry,
1516
IndexEntry,
16-
CE_NAMEMASK
17+
CE_NAMEMASK,
18+
CE_STAGESHIFT
1719
)
1820

1921
from util import (
@@ -23,7 +25,6 @@
2325

2426
from gitdb.base import IStream
2527
from gitdb.typ import str_tree_type
26-
from binascii import a2b_hex
2728

2829
__all__ = ('write_cache', 'read_cache', 'write_tree_from_cache', 'entry_key' )
2930

@@ -150,6 +151,7 @@ def write_tree_from_cache(entries, odb, sl, si=0):
150151
:return: tuple(binsha, list(tree_entry, ...)) a tuple of a sha and a list of
151152
tree entries being a tuple of hexsha, mode, name"""
152153
tree_items = list()
154+
tree_items_append = tree_items.append
153155
ci = sl.start
154156
end = sl.stop
155157
while ci < end:
@@ -161,7 +163,7 @@ def write_tree_from_cache(entries, odb, sl, si=0):
161163
rbound = entry.path.find('/', si)
162164
if rbound == -1:
163165
# its not a tree
164-
tree_items.append((entry.binsha, entry.mode, entry.path[si:]))
166+
tree_items_append((entry.binsha, entry.mode, entry.path[si:]))
165167
else:
166168
# find common base range
167169
base = entry.path[si:rbound]
@@ -178,7 +180,7 @@ def write_tree_from_cache(entries, odb, sl, si=0):
178180
# enter recursion
179181
# ci - 1 as we want to count our current item as well
180182
sha, tree_entry_list = write_tree_from_cache(entries, odb, slice(ci-1, xi), rbound+1)
181-
tree_items.append((sha, S_IFDIR, base))
183+
tree_items_append((sha, S_IFDIR, base))
182184

183185
# skip ahead
184186
ci = xi
@@ -193,5 +195,24 @@ def write_tree_from_cache(entries, odb, sl, si=0):
193195
istream = odb.store(IStream(str_tree_type, len(sio.getvalue()), sio))
194196
return (istream.binsha, tree_items)
195197

198+
def _tree_entry_to_baseindexentry(tree_entry, stage):
    """Convert a (binsha, mode, path) tree entry into a BaseIndexEntry
    placed at the given merge stage."""
    binsha, mode, path = tree_entry
    return BaseIndexEntry(mode, binsha, stage << CE_STAGESHIFT, path)
196200

201+
def aggressive_tree_merge(odb, tree_shas):
    """
    :return: list of BaseIndexEntries representing the aggressive merge of the given
        trees. All valid entries are on stage 0, whereas the conflicting ones are left
        on stage 1, 2 or 3, whereas stage 1 corresponds to the common ancestor tree,
        2 to our tree and 3 to 'their' tree.
    :param odb: object database from which tree objects are read
    :param tree_shas: 1, 2 or 3 trees as identified by their shas
    :raise ValueError: if the number of trees is not supported"""
    out = list()
    out_append = out.append  # hoist attribute lookup out of the loop
    if len(tree_shas) == 1:
        # Single tree: every entry is valid, hence stage 0.
        # Fixed: traverse_tree_recursive requires a path_prefix argument;
        # '' starts paths at the working-tree root.
        for entry in traverse_tree_recursive(odb, tree_shas[0], ''):
            out_append(_tree_entry_to_baseindexentry(entry, 0))
        # END for each entry
    else:
        # NOTE(review): 2- and 3-tree merges are not implemented yet;
        # presumably they will assign stages 1-3 on conflict - confirm once added.
        raise ValueError("Cannot handle %i trees at once" % len(tree_shas))
    # END handle tree shas

    return out

lib/git/objects/fun.py

+118
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
__all__ = ('tree_to_stream', 'tree_entries_from_data')
44

5+
from stat import S_ISDIR
6+
7+
58
def tree_to_stream(entries, write):
69
"""Write the give list of entries into a stream using its write method
710
:param entries: **sorted** list of tuples with (binsha, mode, name)
@@ -64,3 +67,118 @@ def tree_entries_from_data(data):
6467
out.append((sha, mode, name))
6568
# END for each byte in data stream
6669
return out
70+
71+
72+
def _find_by_name(tree_data, name, is_dir, start_at):
73+
"""return data entry matching the given name and tree mode
74+
or None.
75+
Before the item is returned, the respective data item is set
76+
None in the tree_data list to mark it done"""
77+
try:
78+
item = tree_data[start_at]
79+
if item and item[2] == name and S_ISDIR(item[1]) == is_dir:
80+
tree_data[start_at] = None
81+
return item
82+
except IndexError:
83+
pass
84+
# END exception handling
85+
for index, item in enumerate(tree_data):
86+
if item and item[2] == name and S_ISDIR(item[1]) == is_dir:
87+
tree_data[index] = None
88+
return item
89+
# END if item matches
90+
# END for each item
91+
return None
92+
93+
def _to_full_path(item, path_prefix):
94+
"""Rebuild entry with given path prefix"""
95+
if not item:
96+
return item
97+
return (item[0], item[1], path_prefix+item[2])
98+
99+
def traverse_trees_recursive(odb, tree_shas, path_prefix):
    """
    :return: list with entries according to the given tree-shas.
        The result is encoded in a list
        of n tuple|None per blob/commit, (n == len(tree_shas)), where
        * [0] == 20 byte sha
        * [1] == mode as int
        * [2] == path relative to working tree root
        The entry tuple is None if the respective blob/commit did not
        exist in the given tree.
    :param tree_shas: iterable of shas pointing to trees. All trees must
        be on the same level. A tree-sha may be None in which case None
        entries will be produced for that tree.
    :param path_prefix: a prefix to be added to the returned paths on this level,
        set it '' for the first iteration
    :note: The ordering of the returned items will be partially lost"""
    trees_data = list()
    nt = len(tree_shas)
    for tree_sha in tree_shas:
        if tree_sha is None:
            data = list()
        else:
            data = tree_entries_from_data(odb.stream(tree_sha).read())
        # END handle muted trees
        trees_data.append(data)
    # END for each sha to get data for

    out = list()
    out_append = out.append

    # find all matching entries and recursively process them together if the match
    # is a tree. If the match is a non-tree item, put it into the result.
    # Processed items will be set None
    for ti, tree_data in enumerate(trees_data):
        for ii, item in enumerate(tree_data):
            if not item:
                continue
            # END skip already done items
            entries = [None for n in range(nt)]
            entries[ti] = item
            sha, mode, name = item  # its faster to unpack
            is_dir = S_ISDIR(mode)  # type mode bits

            # find this item in all other tree data items
            # wrap around, but stop one before our current index, hence
            # ti+nt, not ti+1+nt
            for tio in range(ti + 1, ti + nt):
                tio = tio % nt
                entries[tio] = _find_by_name(trees_data[tio], name, is_dir, ii)
            # END for each other item data

            # if we are a directory, enter recursion
            if is_dir:
                # Fixed: keep a None placeholder for trees that lack this
                # directory instead of dropping them - otherwise the recursion
                # returns tuples shorter than nt, breaking the documented
                # per-tree alignment of the result.
                out.extend(traverse_trees_recursive(
                    odb,
                    [ei[0] if ei else None for ei in entries],
                    path_prefix + name + '/'))
            else:
                out_append(tuple(_to_full_path(e, path_prefix) for e in entries))
            # END handle recursion

            # finally mark it done
            tree_data[ii] = None
        # END for each item

        # we are done with one tree, set all its data empty
        del(tree_data[:])
    # END for each tree_data chunk
    return out
164+
165+
def traverse_tree_recursive(odb, tree_sha, path_prefix):
    """Recursively flatten the tree pointed to by ``tree_sha``.
    :return: list of entries of the tree. An entry
        has the following format:
        * [0] 20 byte sha
        * [1] mode as int
        * [2] path relative to the repository
    :param path_prefix: prefix to prepend to the front of all returned paths"""
    out = list()
    # unpacking/packing is faster than accessing individual items
    for binsha, mode, rel_name in tree_entries_from_data(odb.stream(tree_sha).read()):
        if not S_ISDIR(mode):
            out.append((binsha, mode, path_prefix + rel_name))
        else:
            # descend into the subtree, extending the prefix with its name
            out.extend(traverse_tree_recursive(odb, binsha, path_prefix + rel_name + '/'))
    # END for each item
    return out

lib/git/repo.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class Repo(object):
7171
# represents the configuration level of a configuration file
7272
config_level = ("system", "global", "repository")
7373

74-
def __init__(self, path=None, odbt = GitCmdObjectDB):
74+
def __init__(self, path=None, odbt = GitDB):
7575
""" Create a new Repo instance
7676
7777
:param path: is the path to either the root git directory or the bare git repo::

test/git/performance/test_utils.py

+94
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,97 @@ def __init__(self):
5757
na = ni * 3
5858
print >> sys.stderr, "Accessed %s[x] %i times in %s s ( %f acc / s)" % (cls.__name__, na, elapsed, na / elapsed)
5959
# END for each sequence
60+
61+
def test_instantiation(self):
    """Benchmark tuple/list instantiation for small fixed sizes, plus
    literal-tuple vs tuple() construction; results go to stderr."""
    ni = 100000          # iterations per measurement
    max_num_items = 4    # sizes 0..4 are special-cased below
    for mni in range(max_num_items+1):
        for cls in (tuple, list):
            st = time()
            for i in xrange(ni):
                # dispatch on size so each case constructs from a literal
                if mni == 0:
                    cls()
                elif mni == 1:
                    cls((1,))
                elif mni == 2:
                    cls((1,2))
                elif mni == 3:
                    cls((1,2,3))
                elif mni == 4:
                    cls((1,2,3,4))
                else:
                    # unreachable with max_num_items == 4; generator fallback
                    cls(x for x in xrange(mni))
                # END handle empty cls
            # END for each item
            elapsed = time() - st
            print >> sys.stderr, "Created %i %ss of size %i in %f s ( %f inst / s)" % (ni, cls.__name__, mni, elapsed, ni / elapsed)
        # END for each type
    # END for each item count

    # tuple literal vs explicit tuple() call on the same data
    st = time()
    for i in xrange(ni):
        t = (1,2,3,4)
    # END for each item
    elapsed = time() - st
    print >> sys.stderr, "Created %i tuples (1,2,3,4) in %f s ( %f tuples / s)" % (ni, elapsed, ni / elapsed)

    st = time()
    for i in xrange(ni):
        t = tuple((1,2,3,4))
    # END for each item
    elapsed = time() - st
    print >> sys.stderr, "Created %i tuples tuple((1,2,3,4)) in %f s ( %f tuples / s)" % (ni, elapsed, ni / elapsed)
101+
102+
def test_unpacking_vs_indexing(self):
    """Benchmark sequence unpacking against per-item indexing for both
    lists and tuples; results go to stderr."""
    ni = 1000000
    list_items = [1,2,3,4]
    tuple_items = (1,2,3,4)

    for sequence in (list_items, tuple_items):
        # full unpacking in one statement
        st = time()
        for i in xrange(ni):
            one, two, three, four = sequence
        # END for each iteration
        elapsed = time() - st
        print >> sys.stderr, "Unpacked %i %ss of size %i in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed)

        # the same data fetched via four individual index accesses
        st = time()
        for i in xrange(ni):
            one, two, three, four = sequence[0], sequence[1], sequence[2], sequence[3]
        # END for each iteration
        elapsed = time() - st
        print >> sys.stderr, "Unpacked %i %ss of size %i individually in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed)

        # partial access: only the first two items
        st = time()
        for i in xrange(ni):
            one, two = sequence[0], sequence[1]
        # END for each iteration
        elapsed = time() - st
        print >> sys.stderr, "Unpacked %i %ss of size %i individually (2 of 4) in %f s ( %f acc / s)" % (ni, type(sequence).__name__, len(sequence), elapsed, ni / elapsed)
    # END for each sequence
129+
130+
def test_large_list_vs_iteration(self):
    """Benchmark iterating a materialized list against consuming a
    generator, for several sizes; results go to stderr."""
    # what costs more: alloc/realloc of lists, or the cpu strain of iterators ?
    def slow_iter(ni):
        # a Python-level generator, to be closer to real-world iterators
        for i in xrange(ni):
            yield i
    # END slow iter

    # alloc doesn't play a role here it seems
    for ni in (500, 1000, 10000, 20000, 40000):
        # list(...) materializes everything up front
        st = time()
        for i in list(xrange(ni)):
            i
        # END for each item
        elapsed = time() - st
        print >> sys.stderr, "Iterated %i items from list in %f s ( %f acc / s)" % (ni, elapsed, ni / elapsed)

        # generator yields items lazily, one at a time
        st = time()
        for i in slow_iter(ni):
            i
        # END for each item
        elapsed = time() - st
        print >> sys.stderr, "Iterated %i items from iterator in %f s ( %f acc / s)" % (ni, elapsed, ni / elapsed)
    # END for each number of iterations
153+

0 commit comments

Comments
 (0)