diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 71b6ef7..0000000 --- a/.coveragerc +++ /dev/null @@ -1,7 +0,0 @@ -[run] -source = gitdb - -; to make nosetests happy -[report] -include = */gitdb/* -omit = */gitdb/ext/* diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 2fe73ca..0000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,11 +0,0 @@ -version: 2 -updates: -- package-ecosystem: "github-actions" - directory: "/" - schedule: - interval: "weekly" - -- package-ecosystem: "gitsubmodule" - directory: "/" - schedule: - interval: "weekly" diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml deleted file mode 100644 index 907698d..0000000 --- a/.github/workflows/pythonpackage.yml +++ /dev/null @@ -1,49 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Python package - -on: [push, pull_request, workflow_dispatch] - -jobs: - build: - - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] - os: [ubuntu-latest] - experimental: [false] - include: - - python-version: "3.7" - os: ubuntu-22.04 - experimental: false - continue-on-error: ${{ matrix.experimental }} - - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - allow-prereleases: ${{ matrix.experimental }} - - name: Install project and dependencies - run: | - python -m pip install --upgrade pip - pip install . - - name: Lint with flake8 - run: | - pip install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pip install pytest - ulimit -n 48 - ulimit -n - pytest -v diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 8b7da92..0000000 --- a/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -MANIFEST -.coverage -build/ -dist/ -*.pyc -*.o -*.so -.noseids -*.sublime-workspace -*.egg-info diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index e73cded..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "smmap"] - path = gitdb/ext/smmap - url = https://github.com/gitpython-developers/smmap.git diff --git a/AUTHORS b/AUTHORS deleted file mode 100644 index 6c7e9b9..0000000 --- a/AUTHORS +++ /dev/null @@ -1,4 +0,0 @@ -Creator: Sebastian Thiel - -Contributors: - - Ram Rachum (@cool-RR) diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 0d6fe8b..0000000 --- a/LICENSE +++ /dev/null @@ -1,42 +0,0 @@ -Copyright (C) 2010, 2011 Sebastian Thiel and contributors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -* Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. 
- -* Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -* Neither the name of the GitDB project nor the names of -its contributors may be used to endorse or promote products derived -from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Additional Licenses -------------------- -The files at -gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx -and -gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack -are licensed under GNU GPL as part of the git source repository, -see http://en.wikipedia.org/wiki/Git_%28software%29 for more information. - -They are not required for the actual operation, which is why they are not found -in the distribution package. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index b939b5d..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,18 +0,0 @@ -include VERSION -include LICENSE -include CHANGES -include AUTHORS -include README -include MANIFEST.in - -include gitdb/_fun.c -include gitdb/_delta_apply.c -include gitdb/_delta_apply.h - -graft gitdb/test - -global-exclude .git* -global-exclude *.pyc -global-exclude *.so -global-exclude *.dll -global-exclude *.o diff --git a/Makefile b/Makefile deleted file mode 100644 index 20436bb..0000000 --- a/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -.PHONY: all clean release force_release - -all: - @grep -Ee '^[a-z].*:' Makefile | cut -d: -f1 | grep -vF all - -clean: - rm -rf build/ dist/ .eggs/ .tox/ - -force_release: clean - ./build-release.sh - twine upload dist/* - git push --tags origin master diff --git a/README.rst b/README.rst deleted file mode 100644 index 61ce28b..0000000 --- a/README.rst +++ /dev/null @@ -1,74 +0,0 @@ -GitDB -===== - -GitDB allows you to access bare git repositories for reading and writing. It aims at allowing full access to loose objects as well as packs with performance and scalability in mind. It operates exclusively on streams, allowing to handle large objects with a small memory footprint. - -Installation -============ - -.. image:: https://img.shields.io/pypi/v/gitdb.svg - :target: https://pypi.python.org/pypi/gitdb/ - :alt: Latest Version -.. image:: https://img.shields.io/pypi/pyversions/gitdb.svg - :target: https://pypi.python.org/pypi/gitdb/ - :alt: Supported Python versions -.. 
image:: https://readthedocs.org/projects/gitdb/badge/?version=latest - :target: https://readthedocs.org/projects/gitdb/?badge=latest - :alt: Documentation Status - -From `PyPI `_:: - - pip install gitdb - -SPEEDUPS -======== - -If you want to go up to 20% faster, you can install gitdb-speedups with:: - - pip install gitdb-speedups - -However, please note that gitdb-speedups is not currently maintained. - -REQUIREMENTS -============ - -* smmap - declared as a dependency, automatically installed -* pytest - for running the tests - -SOURCE -====== - -The source is available in a git repository on GitHub: - -https://github.com/gitpython-developers/gitdb - -Once the clone is complete, please be sure to initialize the submodule using:: - - cd gitdb - git submodule update --init - -Run the tests with:: - - pytest - -DEVELOPMENT -=========== - -.. image:: https://github.com/gitpython-developers/gitdb/workflows/Python%20package/badge.svg - :target: https://github.com/gitpython-developers/gitdb/actions - -The library is considered mature, and not under active development. Its primary (known) use is in GitPython. - -INFRASTRUCTURE -============== - -* Discussions - * https://github.com/gitpython-developers/GitPython/discussions - -* Issue Tracker - * https://github.com/gitpython-developers/gitdb/issues - -LICENSE -======= - -New BSD License diff --git a/SECURITY.md b/SECURITY.md deleted file mode 100644 index 95389ff..0000000 --- a/SECURITY.md +++ /dev/null @@ -1,3 +0,0 @@ -# Security Policy - -See [GitPython](https://github.com/gitpython-developers/GitPython/blob/main/SECURITY.md). Vulnerabilities found in `gitdb` can be reported there. diff --git a/build-release.sh b/build-release.sh deleted file mode 100755 index 5840e44..0000000 --- a/build-release.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# -# This script builds a release. If run in a venv, it auto-installs its tools. -# You may want to run "make release" instead of running this script directly. - -set -eEu - -function release_with() { - $1 -m build --sdist --wheel -} - -if test -n "${VIRTUAL_ENV:-}"; then - deps=(build twine) # Install twine along with build, as we need it later. - echo "Virtual environment detected. Adding packages: ${deps[*]}" - pip install --quiet --upgrade "${deps[@]}" - echo 'Starting the build.' - release_with python -else - function suggest_venv() { - venv_cmd='python -m venv env && source env/bin/activate' - printf "HELP: To avoid this error, use a virtual-env with '%s' instead.\n" "$venv_cmd" - } - trap suggest_venv ERR # This keeps the original exit (error) code. - echo 'Starting the build.' - release_with python3 # Outside a venv, use python3. -fi diff --git a/doc/.gitignore b/doc/.gitignore deleted file mode 100644 index 567609b..0000000 --- a/doc/.gitignore +++ /dev/null @@ -1 +0,0 @@ -build/ diff --git a/doc/Makefile b/doc/Makefile deleted file mode 100644 index b10926a..0000000 --- a/doc/Makefile +++ /dev/null @@ -1,89 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = build - -# Internal variables. 
-PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - -rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/GitDB.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/GitDB.qhc" - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ - "run these through (pdf)latex." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." diff --git a/doc/source/algorithm.rst b/doc/source/algorithm.rst deleted file mode 100644 index 2e01b3f..0000000 --- a/doc/source/algorithm.rst +++ /dev/null @@ -1,97 +0,0 @@ -######################## -Discussion of Algorithms -######################## - -************ -Introduction -************ -As you know, the pure-python object database support for GitPython is provided by the GitDB project. It is meant to be my backup plan to ensure that the DataVault (http://www.youtube.com/user/ByronBates99?feature=mhum#p/c/2A5C6EF5BDA8DB5C ) can handle reading huge files, especially those which were consolidated into packs. 
A nearly fully packed state is anticipated for the data-vaults repository, and reading these packs efficiently is an essential task. - -This document informs you about my findings in the struggle to improve the way packs are read to reduce memory usage required to handle huge files. It will try to conclude where future development could go to assure big delta-packed files can be read without the need of 8GB+ RAM. - -GitDB's main feature is the use of streams, hence the amount of memory used to read a database object is minimized, at the cost of some additional processing overhead to keep track of the stream state. This works great for legacy objects, which are essentially a zip-compressed byte-stream. - -Streaming data from delta-packed objects is far more difficult though, and only technically possible within certain limits, and at relatively high processing costs. My first observation was that there doesn't appear to be 'the one and only' algorithm which is generally superior. They all have their pros and cons, but fortunately this allows the implementation to choose the one most suited based on the amount of delta streams, as well as the size of the base, which allows an early and cheap estimate of the target size of the final data. - -********************************** -Traditional Delta-Apply-Algorithms -********************************** - -The brute-force CGit delta-apply algorithm -========================================== -CGit employs a simple and relatively brute-force algorithm, which resolves all delta streams recursively. When the recursion reaches the base level of the deltas, it will be decompressed into a buffer, then the first delta gets decompressed into a second buffer. From that, the target size of the delta can be extracted, to allocated a third buffer to hold the result of the operation, which consists of reading the delta stream byte-wise, to apply the operations in order, as described by single-byte opcodes. During recursion, each target buffer of the preceding delta-apply operation is used as base buffer for the next delta-apply operation, until the last delta was applied, leaving the final target buffer as result. - -About Delta-Opcodes -------------------- -There are only two kinds of opcodes, 'add-bytes' and 'copy-from-base'. One 'add-bytes' opcode can encode up to 7 bit of additional bytes to be copied from the delta stream into the target buffer. -A 'copy-from-base' opcode encodes a 32 bit offset into the base buffer, as well as the amount of bytes to be copied, which are up to 2^24 bytes. We may conclude that delta-bases may not be larger than 2^32+2^24 bytes in the current, extensible, implementation. -When generating the delta, git prefers copy operations over add operations, as they are much more efficient. Usually, the most recent, or biggest version of a file is used as base, whereas older and smaller versions of the file are expressed by copying only portions of the newest file. As it is not efficiently possible to represent all changes that way, add-bytes operations fill the gap where needed. All this explains why git can only add 128 bytes with one opcode, as it tries to minimize their use. -This implies that recent file history can usually be extracted faster than old history, which may involve many more deltas. 
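To make the preceding opcode description concrete, here is a minimal, illustrative pure-python sketch of how a single decompressed delta stream could be applied to a base buffer. It follows the opcode layout described above, but it is not gitdb's actual (optimized) implementation, and all names are placeholders::

    def parse_size(delta, pos):
        # Parse one little-endian, 7-bits-per-byte variable-length integer.
        size, shift = 0, 0
        while True:
            byte = delta[pos]
            pos += 1
            size |= (byte & 0x7f) << shift
            shift += 7
            if not byte & 0x80:
                return size, pos

    def apply_delta(base, delta):
        # Apply one decompressed delta stream (bytes) to a base buffer (bytes).
        pos = 0
        _base_size, pos = parse_size(delta, pos)   # announced size of the base buffer
        target_size, pos = parse_size(delta, pos)  # announced size of the result
        out = bytearray()
        while pos < len(delta):
            opcode = delta[pos]
            pos += 1
            if opcode & 0x80:
                # copy-from-base: the low bits flag which offset/size bytes follow
                offset = size = 0
                for i in range(4):
                    if opcode & (1 << i):
                        offset |= delta[pos] << (8 * i)
                        pos += 1
                for i in range(3):
                    if opcode & (1 << (4 + i)):
                        size |= delta[pos] << (8 * i)
                        pos += 1
                size = size or 0x10000             # a size of zero encodes 64 KiB
                out += base[offset:offset + size]
            elif opcode:
                # add-bytes: the 7 low bits give the number of literal bytes to copy
                out += delta[pos:pos + opcode]
                pos += opcode
            else:
                raise ValueError("delta opcode 0 is reserved")
        assert len(out) == target_size
        return bytes(out)

The brute-force algorithms in this chapter essentially repeat this step once per delta in the chain, feeding each result back in as the base buffer for the next apply.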
- -Performance considerations --------------------------- -The performance bottleneck of this algorithm appear to be the throughput of your RAM, as both opcodes will just trigger memcpy operations from one memory location to another, times the amount of deltas to apply. This in fact is very fast, even for big files above 100 MB. -Memory allocation could become an issue as you need the base buffer, the target buffer as well as the decompressed delta stream in memory at the same time. The continuous allocation and deallocation of possibly big buffers may support memory fragmentation. Whether it really kicks in, especially on 64 bit machines, is unproven though. -Nonetheless, the cgit implementation is currently the fastest one. - -The brute-force GitDB algorithm -=============================== -Despite of working essentially the same way as the CGit brute-force algorithm, GitDB minimizes the amount of allocations to 2 + num of deltas. The amount memory allocated peaks whiles the deltas are applied, as the base and target buffer, as well as the decompressed stream, are held in memory. -To achieve this, GitDB retrieves all delta-streams in advance, and peaks into their header information to determine the maximum size of the target buffer, just by reading 512 bytes of the compressed stream. If there is more than one delta to apply, the base buffer is set large enough to hold the biggest target buffer required by the delta streams. -Now it is possible to iterate all deltas, oldest first, newest last, and apply them using the buffers. At the end of each iteration, the buffers are swapped. - -Performance Results -------------------- -The performance test is performed on an aggressively packed repository with the history of cgit. 5000 sha's are extracted and read one after another. The delta-chains have a length of 0 to about 35. The pure-python implementation can stream the data of all objects (totaling 62,2 MiB) with an average rate of 8.1 MiB/s, which equals about 654 streams/s. -There are two bottlenecks: The major is the collection of the delta streams, which involves plenty of pack-lookup. This lookup is expensive in python, and is overly expensive. Its not overly critical though, as it only limits the amount of streams per second, not the actual data rate when applying the deltas. -Applying the deltas happens to be the second bottleneck, if the files to be processed get bigger. The more opcodes have to be processed, the more python slow function calls will dominate the result. As an example, it takes nearly 8 seconds to unpack a 125 MB file, where cgit only takes 2.4 s. - -To eliminate a few performance concerns, some key routines were rewritten in C. This changes the numbers of this particular test significantly, but not drastically, as the major bottleneck (delta collection) is still in place. Another performance reduction is due to the fact that plenty of other code related to the deltas is still pure-python. -Now all 5000 objects can be read at a rate of 11.1 MiB /s, or 892 streams/s. Fortunately, unpacking a big object is now done in 2.5s, which is just a tad slower than cgit, but with possibly less memory fragmentation. - - -************************************** -Paving the way towards delta streaming -************************************** - -GitDB's reverse delta aggregation algorithm -=========================================== -The idea of this algorithm is to merge all delta streams into one, which can then be applied in just one go. 
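Before the detailed description that follows, it may help to picture the merged result as nothing more than a list of chunks that together cover the target buffer, each chunk either carrying literal bytes or pointing into the original base. The sketch below is hypothetical; class and field names are illustrative and do not claim to match gitdb's internal DeltaChunk layout::

    class Chunk:
        """One contiguous region of the target buffer."""
        __slots__ = ('to', 'size', 'data', 'base_ofs')

        def __init__(self, to, size, data=None, base_ofs=None):
            self.to = to              # offset of this region in the target buffer
            self.size = size          # number of bytes this chunk produces
            self.data = data          # literal bytes (add-bytes), or None
            self.base_ofs = base_ofs  # offset into the base buffer (copy-from-base), or None

    def apply_chunks(base, chunks):
        """Materialize a fully merged chunk list against the original base buffer."""
        out = bytearray()
        for c in sorted(chunks, key=lambda c: c.to):
            if c.data is not None:
                out += c.data                                # add-bytes chunk
            else:
                out += base[c.base_ofs:c.base_ofs + c.size]  # copy-from-base chunk
        return bytes(out)

Merging then amounts to repeatedly replacing copy chunks of the newest stream with the matching slices of the next older stream's chunks, until only literal chunks and copies from the real base remain; the exact rules gitdb applies are described below.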
- -In the current implementation, delta streams are parsed into DeltaChunks (->**DC**). Each DC represents one copy-from-base operation, or one or multiple consecutive add-bytes operations. DeltaChunks know about their target offset in the target buffer, and their size. Their target offsets are consecutive, i.e. one chunk ends where the next one begins, regarding their logical extend in the target buffer. -Add-bytes DCs additional store their data to apply, copy-from-base DCs store the offset into the base buffer from which to copy bytes. - -During processing, one starts with the latest (i.e. topmost) delta stream (->**TDS**), and iterates through its ancestor delta streams (->ADS) to merge them into the growing toplevel delta stream.. - -The merging works by following a set of rules: - * Merge into the top-level delta from the youngest ancestor delta to the oldest one - * When merging one ADS, iterate from the first to the last chunk in TDS, then: - - * skip all add-bytes DCs. If bytes are added, these will always overwrite any operation coming from any ADS at the same offset. - * copy-from-base DCs will copy a slice of the respective portion of the ADS ( as defined by their base offset ) and use it to replace the original chunk. This acts as a 'virtual' copy-from-base operation. - - * Finish the merge once all ADS have been handled, or once the TDS only consists of add-byte DCs. The remaining copy-from-base DCs will copy from the original base buffer accordingly. - -Applying the TDS is as straightforward as applying any other DS. The base buffer is required to be kept in memory. In the current implementation, a full-size target buffer is allocated to hold the result of applying the chunk information. Here it is already possible to stream the result, which is feasible only if the memory of the base buffer + the memory of the TDS are smaller than a full size target buffer. Streaming will always make sense if the peak resulting from having the base, target and TDS buffers in memory together is unaffordable. - -The memory consumption during the TDS processing is only the condensed delta-bytes, for each ADS an additional index is required which costs 8 byte per DC. When applying the TDS, one requires an allocated base buffer too.The target buffer can be allocated, but may be a writer as well. - -Performance Results -------------------- -The benchmarking context was the same as for the brute-force GitDB algorithm. This implementation is far more complex than the said brute-force implementation, which clearly reflects in the numbers. It's pure-python throughput is at only 1.1 MiB/s, which equals 89 streams/s. -The biggest performance bottleneck is the slicing of the parsed delta streams, where the program spends most of its time due to hundred thousands of calls. - -To get a more usable version of the algorithm, it was implemented in C, such that python must do no more than two calls to get all the work done. The first prepares the TDS, the second applies it, writing it into a target buffer. -The throughput reaches 15.2 MiB/s, which equals 1221 streams/s, which makes it nearly 14 times faster than the pure python version, and amazingly even 1.35 times faster than the brute-force C implementation. As a comparison, cgit is able to stream about 20 MiB when controlling it through a pipe. GitDBs performance may still improve once pack access is reimplemented in C as well. 
- -A 125 MB file took 2.5 seconds to unpack for instance, which is only 20% slower than the c implementation of the brute-force algorithm. - - -Future work -=========== - -Another very promising option is that streaming of delta data is indeed possible. Depending on the configuration of the copy-from-base operations, different optimizations could be applied to reduce the amount of memory required for the final processed delta stream. Some configurations may even allow it to stream data from the base buffer, instead of pre-loading it for random access. - -The ability to stream files at reduced memory costs would only be feasible for big files, and would have to be paid with extra pre-processing time. - -A very first and simple implementation could avoid memory peaks by streaming the TDS in conjunction with a base buffer, instead of writing everything into a fully allocated target buffer. diff --git a/doc/source/api.rst b/doc/source/api.rst deleted file mode 100644 index 9dea957..0000000 --- a/doc/source/api.rst +++ /dev/null @@ -1,113 +0,0 @@ -.. _api-label: - -############# -API Reference -############# - -**************** -Database.Base -**************** - -.. automodule:: gitdb.db.base - :members: - :undoc-members: - -**************** -Database.Git -**************** - -.. automodule:: gitdb.db.git - :members: - :undoc-members: - -**************** -Database.Loose -**************** - -.. automodule:: gitdb.db.loose - :members: - :undoc-members: - -**************** -Database.Memory -**************** - -.. automodule:: gitdb.db.mem - :members: - :undoc-members: - -**************** -Database.Pack -**************** - -.. automodule:: gitdb.db.pack - :members: - :undoc-members: - -****************** -Database.Reference -****************** - -.. automodule:: gitdb.db.ref - :members: - :undoc-members: - -************ -Base -************ - -.. automodule:: gitdb.base - :members: - :undoc-members: - -************ -Functions -************ - -.. automodule:: gitdb.fun - :members: - :undoc-members: - -************ -Pack -************ - -.. automodule:: gitdb.pack - :members: - :undoc-members: - -************ -Streams -************ - -.. automodule:: gitdb.stream - :members: - :undoc-members: - -************ -Types -************ - -.. automodule:: gitdb.typ - :members: - :undoc-members: - - -************ -Utilities -************ - -.. automodule:: gitdb.util - :members: - :undoc-members: - - - - - - - - - - - diff --git a/doc/source/changes.rst b/doc/source/changes.rst deleted file mode 100644 index b4340e4..0000000 --- a/doc/source/changes.rst +++ /dev/null @@ -1,141 +0,0 @@ -######### -Changelog -######### - -****** -4.0.12 -****** - -- various improvements - please see the release on GitHub for details. - -****** -4.0.11 -****** - -- various improvements - please see the release on GitHub for details. - -****** -4.0.10 -****** - -- improvements to the way external packages are imported. - -***** -4.0.9 -***** - -- re-release of 4.0.8 to get a valid signature. 
- -***** -4.0.8 -***** - -* drop support for python 3.4 and 3.5 due to EOL -* Updated upper bound for smmap requirement in setup.py - (`#69 `_) - -***** -4.0.7 -***** - -* Updated upper bound for smmap requirement in setup.py - (`#69 `_) - -***** -4.0.6 -***** - -* Bumped upper bound for smmap requirement - (`#67 `_, - `#68 `_) - -***** -4.0.5 -***** - -* Re-release of 4.0.4, with known signature - -***** -4.0.4 -***** - -* Support for PyOxidizer - -***** -4.0.2 -***** - -* Updated to release as Pure Python Wheel rather than Universal Wheel - (`#62 `_) - -***** -4.0.1 -***** - -* Switched back to the gitdb package name on PyPI and fixed the gitdb2 mirror package - (`#59 `_) -* Switched back to require smmap package and fixed version requirement to >= 3.0.1, < 4 - (`#59 `_) -* Updated smmap submodule - -*********** -3.0.3.post1 -*********** - -* Fixed changelogs for v3.0.2 and v3.0.3 - -***** -3.0.3 -***** - -* Changed ``force_bytes`` to use UTF-8 encoding by default - (`#49 `_) -* Restricted smmap2 version requirement to < 3 -* Updated requirements.txt - -***** -3.0.2 -***** - -* Removed Python 2 compatibility shims - (`#56 `_) - -***** -0.6.1 -***** - -* Fixed possibly critical error, see https://github.com/gitpython-developers/GitPython/issues/220 - - - However, it only seems to occur on high-entropy data and didn't reoccour after the fix - -***** -0.6.0 -***** - -* Added support got python 3.X -* Removed all `async` dependencies and all `*_async` versions of methods with it. - -***** -0.5.4 -***** -* Adjusted implementation to use the SlidingMemoryManager by default in python 2.6 for efficiency reasons. In Python 2.4, the StaticMemoryManager will be used instead. - -***** -0.5.3 -***** -* Added support for smmap. SmartMMap allows resources to be managed and controlled. This brings the implementation closer to the way git handles memory maps, such that unused cached memory maps will automatically be freed once a resource limit is hit. The memory limit on 32 bit systems remains though as a sliding mmap implementation is not used for performance reasons. - -***** -0.5.2 -***** -* Improved performance of the c implementation, which now uses reverse-delta-aggregation to make a memory bound operation CPU bound. - -***** -0.5.1 -***** -* Restored most basic python 2.4 compatibility, such that gitdb can be imported within python 2.4, pack access cannot work though. This at least allows Super-Projects to provide their own workarounds, or use everything but pack support. - -***** -0.5.0 -***** -Initial Release diff --git a/doc/source/conf.py b/doc/source/conf.py deleted file mode 100644 index b387f60..0000000 --- a/doc/source/conf.py +++ /dev/null @@ -1,193 +0,0 @@ -# -# GitDB documentation build configuration file, created by -# sphinx-quickstart on Wed Jun 30 00:01:32 2010. -# -# This file is execfile()d with the current directory set to its containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-sys.path.append(os.path.abspath('../../')) - -# -- General configuration ----------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc'] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['.templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = 'GitDB' -copyright = '2011, Sebastian Thiel' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '0.5' -# The full version, including alpha/beta/rc tags. -release = '0.5.3' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of documents that shouldn't be included in the build. -#unused_docs = [] - -# List of directories, relative to source directory, that shouldn't be searched -# for source files. -exclude_trees = [] - -# The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - - -# -- Options for HTML output --------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. Major themes that come with -# Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = 'default' - -html_theme_options = { - "stickysidebar": "true" -} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = ['.static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. 
-#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_use_modindex = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = '' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'GitDBdoc' - - -# -- Options for LaTeX output -------------------------------------------------- - -# The paper size ('letter' or 'a4'). -#latex_paper_size = 'letter' - -# The font size ('10pt', '11pt' or '12pt'). -#latex_font_size = '10pt' - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). -latex_documents = [ - ('index', 'GitDB.tex', 'GitDB Documentation', - 'Sebastian Thiel', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# Additional stuff for the LaTeX preamble. -#latex_preamble = '' - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_use_modindex = True diff --git a/doc/source/index.rst b/doc/source/index.rst deleted file mode 100644 index 5223e6b..0000000 --- a/doc/source/index.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. GitDB documentation master file, created by - sphinx-quickstart on Wed Jun 30 00:01:32 2010. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to GitDB's documentation! -================================= - -Contents: - -.. toctree:: - :maxdepth: 2 - - intro - tutorial - api - algorithm - changes - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/doc/source/intro.rst b/doc/source/intro.rst deleted file mode 100644 index 4341386..0000000 --- a/doc/source/intro.rst +++ /dev/null @@ -1,38 +0,0 @@ -######## -Overview -######## - -The *GitDB* project implements interfaces to allow read and write access to git repositories. In its core lies the *db* package, which contains all database types necessary to read a complete git repository. These are the ``LooseObjectDB``, the ``PackedDB`` and the ``ReferenceDB`` which are combined into the ``GitDB`` to combine every aspect of the git database. - -For this to work, GitDB implements pack reading, as well as loose object reading and writing. 
Data is always encapsulated in streams, which allows huge files to be handled as well as small ones, usually only chunks of the stream are kept in memory for processing, never the whole stream at once. - -Interfaces are used to describe the API, making it easy to provide alternate implementations. - -================ -Installing GitDB -================ -Its easiest to install gitdb using the *pip* program:: - - $ pip install gitdb - -As the command will install gitdb in your respective python distribution, you will most likely need root permissions to authorize the required changes. - -If you have downloaded the source archive, the package can be installed by running the ``setup.py`` script:: - - $ python setup.py install - -=============== -Getting Started -=============== -It is advised to have a look at the :ref:`Usage Guide ` for a brief introduction on the different database implementations. - -================= -Source Repository -================= -The latest source can be cloned using git from github: - - * https://github.com/gitpython-developers/gitdb - -License Information -=================== -*GitDB* is licensed under the New BSD License. diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst deleted file mode 100644 index 55a737f..0000000 --- a/doc/source/tutorial.rst +++ /dev/null @@ -1,114 +0,0 @@ -.. _tutorial-label: - -########### -Usage Guide -########### -This text briefly introduces you to the basic design decisions and accompanying types. - -****** -Design -****** -The *GitDB* project models a standard git object database and implements it in pure python. This means that data, being classified by one of four types, can can be stored in the database and will in future be referred to by the generated SHA1 key, which is a 20 byte string within python. - -*GitDB* implements *RW* access to loose objects, as well as *RO* access to packed objects. Compound Databases allow to combine multiple object databases into one. - -All data is read and written using streams, which effectively prevents more than a chunk of the data being kept in memory at once mostly [#]_. - -******* -Streams -******* -In order to assure the object database can handle objects of any size, a stream interface is used for data retrieval as well as to fill data into the database. - -Basic Stream Types -================== -There are two fundamentally different types of streams, **IStream**\ s and **OStream**\ s. IStreams are mutable and are used to provide data streams to the database to create new objects. - -OStreams are immutable and are used to read data from the database. The base of this type, **OInfo**, contains only type and size information of the queried object, but no stream, which is slightly faster to retrieve depending on the database. - -OStreams are tuples, IStreams are lists. Both, OInfo and OStream, have the same member ordering which allows quick conversion from one type to another. - -**************************** -Data Query and Data Addition -**************************** -Databases support query and/or addition of objects using simple interfaces. They are called **ObjectDBR** for read-only access, and **ObjectDBW** for write access to create new objects. - -Both have two sets of methods, one of which allows interacting with single objects, the other one allowing to handle a stream of objects simultaneously and asynchronously. 
- -Acquiring information about an object from a database is easy if you have a SHA1 to refer to the object:: - - - ldb = LooseObjectDB(fixture_path("../../../.git/objects")) - - for sha1 in ldb.sha_iter(): - oinfo = ldb.info(sha1) - ostream = ldb.stream(sha1) - assert oinfo[:3] == ostream[:3] - - assert len(ostream.read()) == ostream.size - # END for each sha in database - -To store information, you prepare an *IStream* object with the required information. The provided stream will be read and converted into an object, and the respective 20 byte SHA1 identifier is stored in the IStream object:: - - data = "my data" - istream = IStream("blob", len(data), StringIO(data)) - - # the object does not yet have a sha - assert istream.binsha is None - ldb.store(istream) - # now the sha is set - assert len(istream.binsha) == 20 - assert ldb.has_object(istream.binsha) - -********************** -Asynchronous Operation -********************** -For each read or write method that allows a single-object to be handled, an *_async* version exists which reads items to be processed from a channel, and writes the operation's result into an output channel that is read by the caller or by other async methods, to support chaining. - -Using asynchronous operations is easy, but chaining multiple operations together to form a complex one would require you to read the docs of the *async* package. At the current time, due to the *GIL*, the *GitDB* can only achieve true concurrency during zlib compression and decompression if big objects, if the respective c modules where compiled in *async*. - -Asynchronous operations are scheduled by a *ThreadPool* which resides in the *gitdb.util* module:: - - from gitdb.util import pool - - # set the pool to use two threads - pool.set_size(2) - - # synchronize the mode of operation - pool.set_size(0) - - -Use async methods with readers, which supply items to be processed. The result is given through readers as well:: - - from async import IteratorReader - - # Create a reader from an iterator - reader = IteratorReader(ldb.sha_iter()) - - # get reader for object streams - info_reader = ldb.stream_async(reader) - - # read one - info = info_reader.read(1)[0] - - # read all the rest until depletion - ostreams = info_reader.read() - - - -********* -Databases -********* -A database implements different interfaces, one if which will always be the *ObjectDBR* interface to support reading of object information and streams. - -The *Loose Object Database* as well as the *Packed Object Database* are *File Databases*, hence they operate on a directory which contains files they can read. - -File databases implementing the *ObjectDBW* interface can also be forced to write their output into the specified stream, using the ``set_ostream`` method. This effectively allows you to redirect its output to anywhere you like. - -*Compound Databases* are not implementing their own access type, but instead combine multiple database implementations into one. Examples for this database type are the *Reference Database*, which reads object locations from a file, and the *GitDB* which combines loose, packed and referenced objects into one database interface. - -For more information about the individual database types, please see the :ref:`API Reference `, and the unittests for the respective types. - - ----- - -.. [#] When reading streams from packs, all deltas are currently applied and the result written into a memory map before the first byte is returned. 
Future versions of the delta-apply algorithm might improve on this. diff --git a/gitdb.pro b/gitdb.pro deleted file mode 100644 index a682615..0000000 --- a/gitdb.pro +++ /dev/null @@ -1,48 +0,0 @@ - -OTHER_FILES += \ - setup.py \ - README.rst \ - MANIFEST \ - Makefile \ - LICENSE \ - AUTHORS \ - gitdb/util.py \ - gitdb/typ.py \ - gitdb/stream.py \ - gitdb/pack.py \ - gitdb/__init__.py \ - gitdb/fun.py \ - gitdb/exc.py \ - gitdb/base.py \ - doc/source/tutorial.rst \ - doc/source/intro.rst \ - doc/source/index.rst \ - doc/source/conf.py \ - doc/source/changes.rst \ - doc/source/api.rst \ - doc/source/algorithm.rst \ - gitdb/db/ref.py \ - gitdb/db/pack.py \ - gitdb/db/mem.py \ - gitdb/db/loose.py \ - gitdb/db/__init__.py \ - gitdb/db/git.py \ - gitdb/db/base.py \ - gitdb/test/test_util.py \ - gitdb/test/test_stream.py \ - gitdb/test/test_pack.py \ - gitdb/test/test_example.py \ - gitdb/test/test_base.py \ - gitdb/test/lib.py \ - gitdb/test/__init__.py \ - gitdb/test/performance/test_stream.py \ - gitdb/test/performance/test_pack_streaming.py \ - gitdb/test/performance/test_pack.py \ - gitdb/test/performance/lib.py - -HEADERS += \ - gitdb/_delta_apply.h - -SOURCES += \ - gitdb/_fun.c \ - gitdb/_delta_apply.c diff --git a/gitdb.pro.user b/gitdb.pro.user deleted file mode 100644 index 3ca1e21..0000000 --- a/gitdb.pro.user +++ /dev/null @@ -1,266 +0,0 @@ - - - - ProjectExplorer.Project.ActiveTarget - 0 - - - ProjectExplorer.Project.EditorSettings - - Default - - - - ProjectExplorer.Project.Target.0 - - Desktop - - Qt4ProjectManager.Target.DesktopTarget - 0 - 0 - 1 - - - 0 - Build - - ProjectExplorer.BuildSteps.Build - - - - Make - - Qt4ProjectManager.MakeStep - true - - clean - - - - 1 - Clean - - ProjectExplorer.BuildSteps.Clean - - 2 - false - - Qt in PATH Release - - Qt4ProjectManager.Qt4BuildConfiguration - 0 - /home/byron/projects/git-python/git/ext/gitdb/gitdbpro-build-desktop - 3 - 0 - false - - - - - qmake - - QtProjectManager.QMakeBuildStep - - - - Make - - Qt4ProjectManager.MakeStep - false - - - - 2 - Build - - ProjectExplorer.BuildSteps.Build - - - - Make - - Qt4ProjectManager.MakeStep - true - - clean - - - - 1 - Clean - - ProjectExplorer.BuildSteps.Clean - - 2 - false - - Qt in PATH Debug - - Qt4ProjectManager.Qt4BuildConfiguration - 2 - /home/byron/projects/git-python/git/ext/gitdb/gitdbpro-build-desktop - 3 - 0 - true - - - - - qmake - - QtProjectManager.QMakeBuildStep - - - - Make - - Qt4ProjectManager.MakeStep - false - - - - 2 - Build - - ProjectExplorer.BuildSteps.Build - - - - Make - - Qt4ProjectManager.MakeStep - true - - clean - - - - 1 - Clean - - ProjectExplorer.BuildSteps.Clean - - 2 - false - - Qt 4.6.2 OpenSource Release - - Qt4ProjectManager.Qt4BuildConfiguration - 0 - /home/byron/projects/git-python/git/ext/gitdb/gitdbpro-build-desktop - 2 - 0 - true - - - - - qmake - - QtProjectManager.QMakeBuildStep - - - - Make - - Qt4ProjectManager.MakeStep - false - - - - 2 - Build - - ProjectExplorer.BuildSteps.Build - - - - Make - - Qt4ProjectManager.MakeStep - true - - clean - - - - 1 - Clean - - ProjectExplorer.BuildSteps.Clean - - 2 - false - - Qt 4.6.2 OpenSource Debug - - Qt4ProjectManager.Qt4BuildConfiguration - 2 - /home/byron/projects/git-python/git/ext/gitdb/gitdbpro-build-desktop - 2 - 0 - true - - 4 - - - 0 - Deploy - - ProjectExplorer.BuildSteps.Deploy - - 1 - No deployment - - ProjectExplorer.DefaultDeployConfiguration - - 1 - - gitdb - - Qt4ProjectManager.Qt4RunConfiguration - 2 - - gitdb.pro - false - false - - false - - 3768 - true - false - - - - 
/usr/bin/pytest - gitdb/test/test_pack.py - - 2 - python - false - - $BUILDDIR - Run python - test-pack - ProjectExplorer.CustomExecutableRunConfiguration - 3768 - true - false - - 2 - - - - ProjectExplorer.Project.TargetCount - 1 - - - ProjectExplorer.Project.Updater.EnvironmentId - {d38778e3-6b24-4419-8fc3-c8cc320f55e0} - - - ProjectExplorer.Project.Updater.FileVersion - 8 - - diff --git a/gitdb/__init__.py b/gitdb/__init__.py deleted file mode 100644 index 1fb7df8..0000000 --- a/gitdb/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Initialize the object database module""" - -__author__ = "Sebastian Thiel" -__contact__ = "byronimo@gmail.com" -__homepage__ = "https://github.com/gitpython-developers/gitdb" -version_info = (4, 0, 12) -__version__ = '.'.join(str(i) for i in version_info) - -# default imports -from gitdb.base import * -from gitdb.db import * -from gitdb.stream import * diff --git a/gitdb/base.py b/gitdb/base.py deleted file mode 100644 index 9a23a4f..0000000 --- a/gitdb/base.py +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Module with basic data structures - they are designed to be lightweight and fast""" -from gitdb.util import bin_to_hex - -from gitdb.fun import ( - type_id_to_type_map, - type_to_type_id_map -) - -__all__ = ('OInfo', 'OPackInfo', 'ODeltaPackInfo', - 'OStream', 'OPackStream', 'ODeltaPackStream', - 'IStream', 'InvalidOInfo', 'InvalidOStream') - -#{ ODB Bases - - -class OInfo(tuple): - - """Carries information about an object in an ODB, providing information - about the binary sha of the object, the type_string as well as the uncompressed size - in bytes. - - It can be accessed using tuple notation and using attribute access notation:: - - assert dbi[0] == dbi.binsha - assert dbi[1] == dbi.type - assert dbi[2] == dbi.size - - The type is designed to be as lightweight as possible.""" - __slots__ = tuple() - - def __new__(cls, sha, type, size): - return tuple.__new__(cls, (sha, type, size)) - - def __init__(self, *args): - tuple.__init__(self) - - #{ Interface - @property - def binsha(self): - """:return: our sha as binary, 20 bytes""" - return self[0] - - @property - def hexsha(self): - """:return: our sha, hex encoded, 40 bytes""" - return bin_to_hex(self[0]) - - @property - def type(self): - return self[1] - - @property - def type_id(self): - return type_to_type_id_map[self[1]] - - @property - def size(self): - return self[2] - #} END interface - - -class OPackInfo(tuple): - - """As OInfo, but provides a type_id property to retrieve the numerical type id, and - does not include a sha. - - Additionally, the pack_offset is the absolute offset into the packfile at which - all object information is located. 
The data_offset property points to the absolute - location in the pack at which that actual data stream can be found.""" - __slots__ = tuple() - - def __new__(cls, packoffset, type, size): - return tuple.__new__(cls, (packoffset, type, size)) - - def __init__(self, *args): - tuple.__init__(self) - - #{ Interface - - @property - def pack_offset(self): - return self[0] - - @property - def type(self): - return type_id_to_type_map[self[1]] - - @property - def type_id(self): - return self[1] - - @property - def size(self): - return self[2] - - #} END interface - - -class ODeltaPackInfo(OPackInfo): - - """Adds delta specific information, - Either the 20 byte sha which points to some object in the database, - or the negative offset from the pack_offset, so that pack_offset - delta_info yields - the pack offset of the base object""" - __slots__ = tuple() - - def __new__(cls, packoffset, type, size, delta_info): - return tuple.__new__(cls, (packoffset, type, size, delta_info)) - - #{ Interface - @property - def delta_info(self): - return self[3] - #} END interface - - -class OStream(OInfo): - - """Base for object streams retrieved from the database, providing additional - information about the stream. - Generally, ODB streams are read-only as objects are immutable""" - __slots__ = tuple() - - def __new__(cls, sha, type, size, stream, *args, **kwargs): - """Helps with the initialization of subclasses""" - return tuple.__new__(cls, (sha, type, size, stream)) - - def __init__(self, *args, **kwargs): - tuple.__init__(self) - - #{ Stream Reader Interface - - def read(self, size=-1): - return self[3].read(size) - - @property - def stream(self): - return self[3] - - #} END stream reader interface - - -class ODeltaStream(OStream): - - """Uses size info of its stream, delaying reads""" - - def __new__(cls, sha, type, size, stream, *args, **kwargs): - """Helps with the initialization of subclasses""" - return tuple.__new__(cls, (sha, type, size, stream)) - - #{ Stream Reader Interface - - @property - def size(self): - return self[3].size - - #} END stream reader interface - - -class OPackStream(OPackInfo): - - """Next to pack object information, a stream outputting an undeltified base object - is provided""" - __slots__ = tuple() - - def __new__(cls, packoffset, type, size, stream, *args): - """Helps with the initialization of subclasses""" - return tuple.__new__(cls, (packoffset, type, size, stream)) - - #{ Stream Reader Interface - def read(self, size=-1): - return self[3].read(size) - - @property - def stream(self): - return self[3] - #} END stream reader interface - - -class ODeltaPackStream(ODeltaPackInfo): - - """Provides a stream outputting the uncompressed offset delta information""" - __slots__ = tuple() - - def __new__(cls, packoffset, type, size, delta_info, stream): - return tuple.__new__(cls, (packoffset, type, size, delta_info, stream)) - - #{ Stream Reader Interface - def read(self, size=-1): - return self[4].read(size) - - @property - def stream(self): - return self[4] - #} END stream reader interface - - -class IStream(list): - - """Represents an input content stream to be fed into the ODB. It is mutable to allow - the ODB to record information about the operations outcome right in this instance. - - It provides interfaces for the OStream and a StreamReader to allow the instance - to blend in without prior conversion. 
- - The only method your content stream must support is 'read'""" - __slots__ = tuple() - - def __new__(cls, type, size, stream, sha=None): - return list.__new__(cls, (sha, type, size, stream, None)) - - def __init__(self, type, size, stream, sha=None): - list.__init__(self, (sha, type, size, stream, None)) - - #{ Interface - @property - def hexsha(self): - """:return: our sha, hex encoded, 40 bytes""" - return bin_to_hex(self[0]) - - def _error(self): - """:return: the error that occurred when processing the stream, or None""" - return self[4] - - def _set_error(self, exc): - """Set this input stream to the given exc, may be None to reset the error""" - self[4] = exc - - error = property(_error, _set_error) - - #} END interface - - #{ Stream Reader Interface - - def read(self, size=-1): - """Implements a simple stream reader interface, passing the read call on - to our internal stream""" - return self[3].read(size) - - #} END stream reader interface - - #{ interface - - def _set_binsha(self, binsha): - self[0] = binsha - - def _binsha(self): - return self[0] - - binsha = property(_binsha, _set_binsha) - - def _type(self): - return self[1] - - def _set_type(self, type): - self[1] = type - - type = property(_type, _set_type) - - def _size(self): - return self[2] - - def _set_size(self, size): - self[2] = size - - size = property(_size, _set_size) - - def _stream(self): - return self[3] - - def _set_stream(self, stream): - self[3] = stream - - stream = property(_stream, _set_stream) - - #} END odb info interface - - -class InvalidOInfo(tuple): - - """Carries information about a sha identifying an object which is invalid in - the queried database. The exception attribute provides more information about - the cause of the issue""" - __slots__ = tuple() - - def __new__(cls, sha, exc): - return tuple.__new__(cls, (sha, exc)) - - def __init__(self, sha, exc): - tuple.__init__(self, (sha, exc)) - - @property - def binsha(self): - return self[0] - - @property - def hexsha(self): - return bin_to_hex(self[0]) - - @property - def error(self): - """:return: exception instance explaining the failure""" - return self[1] - - -class InvalidOStream(InvalidOInfo): - - """Carries information about an invalid ODB stream""" - __slots__ = tuple() - -#} END ODB Bases diff --git a/gitdb/const.py b/gitdb/const.py deleted file mode 100644 index 6391d79..0000000 --- a/gitdb/const.py +++ /dev/null @@ -1,4 +0,0 @@ -BYTE_SPACE = b' ' -NULL_BYTE = b'\0' -NULL_HEX_SHA = "0" * 40 -NULL_BIN_SHA = NULL_BYTE * 20 diff --git a/gitdb/db/__init__.py b/gitdb/db/__init__.py deleted file mode 100644 index 20fd228..0000000 --- a/gitdb/db/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ - -from gitdb.db.base import * -from gitdb.db.loose import * -from gitdb.db.mem import * -from gitdb.db.pack import * -from gitdb.db.git import * -from gitdb.db.ref import * diff --git a/gitdb/db/base.py b/gitdb/db/base.py deleted file mode 100644 index 7312fe0..0000000 --- a/gitdb/db/base.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Contains implementations of database retrieveing objects""" -from gitdb.util import ( - join, - LazyMixin, - hex_to_bin -) 
- -from gitdb.utils.encoding import force_text -from gitdb.exc import ( - BadObject, - AmbiguousObjectName -) - -from itertools import chain -from functools import reduce - - -__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'CompoundDB', 'CachingDB') - - -class ObjectDBR: - - """Defines an interface for object database lookup. - Objects are identified either by their 20 byte bin sha""" - - def __contains__(self, sha): - return self.has_obj - - #{ Query Interface - def has_object(self, sha): - """ - Whether the object identified by the given 20 bytes - binary sha is contained in the database - - :return: True if the object identified by the given 20 bytes - binary sha is contained in the database""" - raise NotImplementedError("To be implemented in subclass") - - def info(self, sha): - """ :return: OInfo instance - :param sha: bytes binary sha - :raise BadObject:""" - raise NotImplementedError("To be implemented in subclass") - - def stream(self, sha): - """:return: OStream instance - :param sha: 20 bytes binary sha - :raise BadObject:""" - raise NotImplementedError("To be implemented in subclass") - - def size(self): - """:return: amount of objects in this database""" - raise NotImplementedError() - - def sha_iter(self): - """Return iterator yielding 20 byte shas for all objects in this data base""" - raise NotImplementedError() - - #} END query interface - - -class ObjectDBW: - - """Defines an interface to create objects in the database""" - - def __init__(self, *args, **kwargs): - self._ostream = None - - #{ Edit Interface - def set_ostream(self, stream): - """ - Adjusts the stream to which all data should be sent when storing new objects - - :param stream: if not None, the stream to use, if None the default stream - will be used. - :return: previously installed stream, or None if there was no override - :raise TypeError: if the stream doesn't have the supported functionality""" - cstream = self._ostream - self._ostream = stream - return cstream - - def ostream(self): - """ - Return the output stream - - :return: overridden output stream this instance will write to, or None - if it will write to the default stream""" - return self._ostream - - def store(self, istream): - """ - Create a new object in the database - :return: the input istream object with its sha set to its corresponding value - - :param istream: IStream compatible instance. If its sha is already set - to a value, the object will just be stored in the our database format, - in which case the input stream is expected to be in object format ( header + contents ). 
- :raise IOError: if data could not be written""" - raise NotImplementedError("To be implemented in subclass") - - #} END edit interface - - -class FileDBBase: - - """Provides basic facilities to retrieve files of interest, including - caching facilities to help mapping hexsha's to objects""" - - def __init__(self, root_path): - """Initialize this instance to look for its files at the given root path - All subsequent operations will be relative to this path - :raise InvalidDBRoot: - **Note:** The base will not perform any accessablity checking as the base - might not yet be accessible, but become accessible before the first - access.""" - super().__init__() - self._root_path = root_path - - #{ Interface - def root_path(self): - """:return: path at which this db operates""" - return self._root_path - - def db_path(self, rela_path): - """ - :return: the given relative path relative to our database root, allowing - to pontentially access datafiles""" - return join(self._root_path, force_text(rela_path)) - #} END interface - - -class CachingDB: - - """A database which uses caches to speed-up access""" - - #{ Interface - def update_cache(self, force=False): - """ - Call this method if the underlying data changed to trigger an update - of the internal caching structures. - - :param force: if True, the update must be performed. Otherwise the implementation - may decide not to perform an update if it thinks nothing has changed. - :return: True if an update was performed as something change indeed""" - - # END interface - - -def _databases_recursive(database, output): - """Fill output list with database from db, in order. Deals with Loose, Packed - and compound databases.""" - if isinstance(database, CompoundDB): - dbs = database.databases() - output.extend(db for db in dbs if not isinstance(db, CompoundDB)) - for cdb in (db for db in dbs if isinstance(db, CompoundDB)): - _databases_recursive(cdb, output) - else: - output.append(database) - # END handle database type - - -class CompoundDB(ObjectDBR, LazyMixin, CachingDB): - - """A database which delegates calls to sub-databases. - - Databases are stored in the lazy-loaded _dbs attribute. 
- Define _set_cache_ to update it with your databases""" - - def _set_cache_(self, attr): - if attr == '_dbs': - self._dbs = list() - elif attr == '_db_cache': - self._db_cache = dict() - else: - super()._set_cache_(attr) - - def _db_query(self, sha): - """:return: database containing the given 20 byte sha - :raise BadObject:""" - # most databases use binary representations, prevent converting - # it every time a database is being queried - try: - return self._db_cache[sha] - except KeyError: - pass - # END first level cache - - for db in self._dbs: - if db.has_object(sha): - self._db_cache[sha] = db - return db - # END for each database - raise BadObject(sha) - - #{ ObjectDBR interface - - def has_object(self, sha): - try: - self._db_query(sha) - return True - except BadObject: - return False - # END handle exceptions - - def info(self, sha): - return self._db_query(sha).info(sha) - - def stream(self, sha): - return self._db_query(sha).stream(sha) - - def size(self): - """:return: total size of all contained databases""" - return reduce(lambda x, y: x + y, (db.size() for db in self._dbs), 0) - - def sha_iter(self): - return chain(*(db.sha_iter() for db in self._dbs)) - - #} END object DBR Interface - - #{ Interface - - def databases(self): - """:return: tuple of database instances we use for lookups""" - return tuple(self._dbs) - - def update_cache(self, force=False): - # something might have changed, clear everything - self._db_cache.clear() - stat = False - for db in self._dbs: - if isinstance(db, CachingDB): - stat |= db.update_cache(force) - # END if is caching db - # END for each database to update - return stat - - def partial_to_complete_sha_hex(self, partial_hexsha): - """ - :return: 20 byte binary sha1 from the given less-than-40 byte hexsha (bytes or str) - :param partial_hexsha: hexsha with less than 40 byte - :raise AmbiguousObjectName: """ - databases = list() - _databases_recursive(self, databases) - partial_hexsha = force_text(partial_hexsha) - len_partial_hexsha = len(partial_hexsha) - if len_partial_hexsha % 2 != 0: - partial_binsha = hex_to_bin(partial_hexsha + "0") - else: - partial_binsha = hex_to_bin(partial_hexsha) - # END assure successful binary conversion - - candidate = None - for db in databases: - full_bin_sha = None - try: - if hasattr(db, 'partial_to_complete_sha_hex'): - full_bin_sha = db.partial_to_complete_sha_hex(partial_hexsha) - else: - full_bin_sha = db.partial_to_complete_sha(partial_binsha, len_partial_hexsha) - # END handle database type - except BadObject: - continue - # END ignore bad objects - if full_bin_sha: - if candidate and candidate != full_bin_sha: - raise AmbiguousObjectName(partial_hexsha) - candidate = full_bin_sha - # END handle candidate - # END for each db - if not candidate: - raise BadObject(partial_binsha) - return candidate - - #} END interface diff --git a/gitdb/db/git.py b/gitdb/db/git.py deleted file mode 100644 index a1ed142..0000000 --- a/gitdb/db/git.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -from gitdb.db.base import ( - CompoundDB, - ObjectDBW, - FileDBBase -) - -from gitdb.db.loose import LooseObjectDB -from gitdb.db.pack import PackedDB -from gitdb.db.ref import ReferenceDB - -from gitdb.exc import InvalidDBRoot - -import os - -__all__ = ('GitDB', ) - - -class GitDB(FileDBBase, ObjectDBW, CompoundDB): - - """A git-style 
object database, which contains all objects in the 'objects' - subdirectory - - ``IMPORTANT``: The usage of this implementation is highly discouraged as it fails to release file-handles. - This can be a problem with long-running processes and/or big repositories. - """ - # Configuration - PackDBCls = PackedDB - LooseDBCls = LooseObjectDB - ReferenceDBCls = ReferenceDB - - # Directories - packs_dir = 'pack' - loose_dir = '' - alternates_dir = os.path.join('info', 'alternates') - - def __init__(self, root_path): - """Initialize ourselves on a git objects directory""" - super().__init__(root_path) - - def _set_cache_(self, attr): - if attr == '_dbs' or attr == '_loose_db': - self._dbs = list() - loose_db = None - for subpath, dbcls in ((self.packs_dir, self.PackDBCls), - (self.loose_dir, self.LooseDBCls), - (self.alternates_dir, self.ReferenceDBCls)): - path = self.db_path(subpath) - if os.path.exists(path): - self._dbs.append(dbcls(path)) - if dbcls is self.LooseDBCls: - loose_db = self._dbs[-1] - # END remember loose db - # END check path exists - # END for each db type - - # should have at least one subdb - if not self._dbs: - raise InvalidDBRoot(self.root_path()) - # END handle error - - # we the first one should have the store method - assert loose_db is not None and hasattr(loose_db, 'store'), "First database needs store functionality" - - # finally set the value - self._loose_db = loose_db - else: - super()._set_cache_(attr) - # END handle attrs - - #{ ObjectDBW interface - - def store(self, istream): - return self._loose_db.store(istream) - - def ostream(self): - return self._loose_db.ostream() - - def set_ostream(self, ostream): - return self._loose_db.set_ostream(ostream) - - #} END objectdbw interface diff --git a/gitdb/db/loose.py b/gitdb/db/loose.py deleted file mode 100644 index e6765cd..0000000 --- a/gitdb/db/loose.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -from contextlib import suppress - -from gitdb.db.base import ( - FileDBBase, - ObjectDBR, - ObjectDBW -) - -from gitdb.exc import ( - BadObject, - AmbiguousObjectName -) - -from gitdb.stream import ( - DecompressMemMapReader, - FDCompressedSha1Writer, - FDStream, - Sha1Writer -) - -from gitdb.base import ( - OStream, - OInfo -) - -from gitdb.util import ( - file_contents_ro_filepath, - ENOENT, - hex_to_bin, - bin_to_hex, - exists, - chmod, - isfile, - remove, - rename, - dirname, - basename, - join -) - -from gitdb.fun import ( - chunk_size, - loose_object_header_info, - write_object, - stream_copy -) - -from gitdb.utils.encoding import force_bytes - -import tempfile -import os -import sys -import time - - -__all__ = ('LooseObjectDB', ) - - -class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): - - """A database which operates on loose object files""" - - # CONFIGURATION - # chunks in which data will be copied between streams - stream_chunk_size = chunk_size - - # On windows we need to keep it writable, otherwise it cannot be removed - # either - new_objects_mode = int("444", 8) - if os.name == 'nt': - new_objects_mode = int("644", 8) - - def __init__(self, root_path): - super().__init__(root_path) - self._hexsha_to_file = dict() - # Additional Flags - might be set to 0 after the first failure - # Depending on the root, this might work for some mounts, for others not, which - # is why it is per instance - self._fd_open_flags = 
getattr(os, 'O_NOATIME', 0) - - #{ Interface - def object_path(self, hexsha): - """ - :return: path at which the object with the given hexsha would be stored, - relative to the database root""" - return join(hexsha[:2], hexsha[2:]) - - def readable_db_object_path(self, hexsha): - """ - :return: readable object path to the object identified by hexsha - :raise BadObject: If the object file does not exist""" - with suppress(KeyError): - return self._hexsha_to_file[hexsha] - # END ignore cache misses - - # try filesystem - path = self.db_path(self.object_path(hexsha)) - if exists(path): - self._hexsha_to_file[hexsha] = path - return path - # END handle cache - raise BadObject(hexsha) - - def partial_to_complete_sha_hex(self, partial_hexsha): - """:return: 20 byte binary sha1 string which matches the given name uniquely - :param name: hexadecimal partial name (bytes or ascii string) - :raise AmbiguousObjectName: - :raise BadObject: """ - candidate = None - for binsha in self.sha_iter(): - if bin_to_hex(binsha).startswith(force_bytes(partial_hexsha)): - # it can't ever find the same object twice - if candidate is not None: - raise AmbiguousObjectName(partial_hexsha) - candidate = binsha - # END for each object - if candidate is None: - raise BadObject(partial_hexsha) - return candidate - - #} END interface - - def _map_loose_object(self, sha): - """ - :return: memory map of that file to allow random read access - :raise BadObject: if object could not be located""" - db_path = self.db_path(self.object_path(bin_to_hex(sha))) - try: - return file_contents_ro_filepath(db_path, flags=self._fd_open_flags) - except OSError as e: - if e.errno != ENOENT: - # try again without noatime - try: - return file_contents_ro_filepath(db_path) - except OSError as new_e: - raise BadObject(sha) from new_e - # didn't work because of our flag, don't try it again - self._fd_open_flags = 0 - else: - raise BadObject(sha) from e - # END handle error - # END exception handling - - def set_ostream(self, stream): - """:raise TypeError: if the stream does not support the Sha1Writer interface""" - if stream is not None and not isinstance(stream, Sha1Writer): - raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) - return super().set_ostream(stream) - - def info(self, sha): - m = self._map_loose_object(sha) - try: - typ, size = loose_object_header_info(m) - return OInfo(sha, typ, size) - finally: - if hasattr(m, 'close'): - m.close() - # END assure release of system resources - - def stream(self, sha): - m = self._map_loose_object(sha) - type, size, stream = DecompressMemMapReader.new(m, close_on_deletion=True) - return OStream(sha, type, size, stream) - - def has_object(self, sha): - try: - self.readable_db_object_path(bin_to_hex(sha)) - return True - except BadObject: - return False - # END check existence - - def store(self, istream): - """note: The sha we produce will be hex by nature""" - tmp_path = None - writer = self.ostream() - if writer is None: - # open a tmp file to write the data to - fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) - - if istream.binsha is None: - writer = FDCompressedSha1Writer(fd) - else: - writer = FDStream(fd) - # END handle direct stream copies - # END handle custom writer - - try: - try: - if istream.binsha is not None: - # copy as much as possible, the actual uncompressed item size might - # be smaller than the compressed version - stream_copy(istream.read, writer.write, sys.maxsize, self.stream_chunk_size) - else: - # write object with 
header, we have to make a new one - write_object(istream.type, istream.size, istream.read, writer.write, - chunk_size=self.stream_chunk_size) - # END handle direct stream copies - finally: - if tmp_path: - writer.close() - # END assure target stream is closed - except: - if tmp_path: - remove(tmp_path) - raise - # END assure tmpfile removal on error - - hexsha = None - if istream.binsha: - hexsha = istream.hexsha - else: - hexsha = writer.sha(as_hex=True) - # END handle sha - - if tmp_path: - obj_path = self.db_path(self.object_path(hexsha)) - obj_dir = dirname(obj_path) - os.makedirs(obj_dir, exist_ok=True) - # END handle destination directory - # rename onto existing doesn't work on NTFS - if isfile(obj_path): - remove(tmp_path) - else: - rename(tmp_path, obj_path) - # end rename only if needed - - # Ensure rename is actually done and file is stable - # Retry up to 14 times - quadratic wait & retry in ms. - # The total maximum wait time is 1000ms, which should be vastly enough for the - # OS to return and commit the file to disk. - for backoff_ms in [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 181]: - with suppress(PermissionError): - # make sure its readable for all ! It started out as rw-- tmp file - # but needs to be rwrr - chmod(obj_path, self.new_objects_mode) - break - time.sleep(backoff_ms / 1000.0) - else: - raise PermissionError( - "Impossible to apply `chmod` to file {}".format(obj_path) - ) - - # END handle dry_run - - istream.binsha = hex_to_bin(hexsha) - return istream - - def sha_iter(self): - # find all files which look like an object, extract sha from there - for root, dirs, files in os.walk(self.root_path()): - root_base = basename(root) - if len(root_base) != 2: - continue - - for f in files: - if len(f) != 38: - continue - yield hex_to_bin(root_base + f) - # END for each file - # END for each walk iteration - - def size(self): - return len(tuple(self.sha_iter())) diff --git a/gitdb/db/mem.py b/gitdb/db/mem.py deleted file mode 100644 index d4772fd..0000000 --- a/gitdb/db/mem.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Contains the MemoryDatabase implementation""" -from gitdb.db.loose import LooseObjectDB -from gitdb.db.base import ( - ObjectDBR, - ObjectDBW -) - -from gitdb.base import ( - OStream, - IStream, -) - -from gitdb.exc import ( - BadObject, - UnsupportedOperation -) - -from gitdb.stream import ( - ZippedStoreShaWriter, - DecompressMemMapReader, -) - -from io import BytesIO - -__all__ = ("MemoryDB", ) - - -class MemoryDB(ObjectDBR, ObjectDBW): - - """A memory database stores everything to memory, providing fast IO and object - retrieval. 
It should be used to buffer results and obtain SHAs before writing - it to the actual physical storage, as it allows to query whether object already - exists in the target storage before introducing actual IO""" - - def __init__(self): - super().__init__() - self._db = LooseObjectDB("path/doesnt/matter") - - # maps 20 byte shas to their OStream objects - self._cache = dict() - - def set_ostream(self, stream): - raise UnsupportedOperation("MemoryDB's always stream into memory") - - def store(self, istream): - zstream = ZippedStoreShaWriter() - self._db.set_ostream(zstream) - - istream = self._db.store(istream) - zstream.close() # close to flush - zstream.seek(0) - - # don't provide a size, the stream is written in object format, hence the - # header needs decompression - decomp_stream = DecompressMemMapReader(zstream.getvalue(), close_on_deletion=False) - self._cache[istream.binsha] = OStream(istream.binsha, istream.type, istream.size, decomp_stream) - - return istream - - def has_object(self, sha): - return sha in self._cache - - def info(self, sha): - # we always return streams, which are infos as well - return self.stream(sha) - - def stream(self, sha): - try: - ostream = self._cache[sha] - # rewind stream for the next one to read - ostream.stream.seek(0) - return ostream - except KeyError as e: - raise BadObject(sha) from e - # END exception handling - - def size(self): - return len(self._cache) - - def sha_iter(self): - return self._cache.keys() - - #{ Interface - def stream_copy(self, sha_iter, odb): - """Copy the streams as identified by sha's yielded by sha_iter into the given odb - The streams will be copied directly - **Note:** the object will only be written if it did not exist in the target db - - :return: amount of streams actually copied into odb. 
If smaller than the amount - of input shas, one or more objects did already exist in odb""" - count = 0 - for sha in sha_iter: - if odb.has_object(sha): - continue - # END check object existence - - ostream = self.stream(sha) - # compressed data including header - sio = BytesIO(ostream.stream.data()) - istream = IStream(ostream.type, ostream.size, sio, sha) - - odb.store(istream) - count += 1 - # END for each sha - return count - #} END interface diff --git a/gitdb/db/pack.py b/gitdb/db/pack.py deleted file mode 100644 index 274ea59..0000000 --- a/gitdb/db/pack.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Module containing a database to deal with packs""" -from gitdb.db.base import ( - FileDBBase, - ObjectDBR, - CachingDB -) - -from gitdb.util import LazyMixin - -from gitdb.exc import ( - BadObject, - UnsupportedOperation, - AmbiguousObjectName -) - -from gitdb.pack import PackEntity - -from functools import reduce - -import os -import glob - -__all__ = ('PackedDB', ) - -#{ Utilities - - -class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin): - - """A database operating on a set of object packs""" - - # sort the priority list every N queries - # Higher values are better, performance tests don't show this has - # any effect, but it should have one - _sort_interval = 500 - - def __init__(self, root_path): - super().__init__(root_path) - # list of lists with three items: - # * hits - number of times the pack was hit with a request - # * entity - Pack entity instance - # * sha_to_index - PackIndexFile.sha_to_index method for direct cache query - # self._entities = list() # lazy loaded list - self._hit_count = 0 # amount of hits - self._st_mtime = 0 # last modification data of our root path - - def _set_cache_(self, attr): - if attr == '_entities': - self._entities = list() - self.update_cache(force=True) - # END handle entities initialization - - def _sort_entities(self): - self._entities.sort(key=lambda l: l[0], reverse=True) - - def _pack_info(self, sha): - """:return: tuple(entity, index) for an item at the given sha - :param sha: 20 or 40 byte sha - :raise BadObject: - **Note:** This method is not thread-safe, but may be hit in multi-threaded - operation. The worst thing that can happen though is a counter that - was not incremented, or the list being in wrong order. So we safe - the time for locking here, lets see how that goes""" - # presort ? 
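        # (a descriptive sketch of the lookup strategy) _entities holds
        # [hits, entity, sha_to_index] triples; every _sort_interval queries
        # they are re-sorted by hit count so the most frequently hit packs are
        # probed first. As the docstring above notes, the counters are bumped
        # without locking - a missed increment or a briefly stale ordering is
        # the worst that can happen.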
- if self._hit_count % self._sort_interval == 0: - self._sort_entities() - # END update sorting - - for item in self._entities: - index = item[2](sha) - if index is not None: - item[0] += 1 # one hit for you - self._hit_count += 1 # general hit count - return (item[1], index) - # END index found in pack - # END for each item - - # no hit, see whether we have to update packs - # NOTE: considering packs don't change very often, we safe this call - # and leave it to the super-caller to trigger that - raise BadObject(sha) - - #{ Object DB Read - - def has_object(self, sha): - try: - self._pack_info(sha) - return True - except BadObject: - return False - # END exception handling - - def info(self, sha): - entity, index = self._pack_info(sha) - return entity.info_at_index(index) - - def stream(self, sha): - entity, index = self._pack_info(sha) - return entity.stream_at_index(index) - - def sha_iter(self): - for entity in self.entities(): - index = entity.index() - sha_by_index = index.sha - for index in range(index.size()): - yield sha_by_index(index) - # END for each index - # END for each entity - - def size(self): - sizes = [item[1].index().size() for item in self._entities] - return reduce(lambda x, y: x + y, sizes, 0) - - #} END object db read - - #{ object db write - - def store(self, istream): - """Storing individual objects is not feasible as a pack is designed to - hold multiple objects. Writing or rewriting packs for single objects is - inefficient""" - raise UnsupportedOperation() - - #} END object db write - - #{ Interface - - def update_cache(self, force=False): - """ - Update our cache with the actually existing packs on disk. Add new ones, - and remove deleted ones. We keep the unchanged ones - - :param force: If True, the cache will be updated even though the directory - does not appear to have changed according to its modification timestamp. - :return: True if the packs have been updated so there is new information, - False if there was no change to the pack database""" - stat = os.stat(self.root_path()) - if not force and stat.st_mtime <= self._st_mtime: - return False - # END abort early on no change - self._st_mtime = stat.st_mtime - - # packs are supposed to be prefixed with pack- by git-convention - # get all pack files, figure out what changed - pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack"))) - our_pack_files = {item[1].pack().path() for item in self._entities} - - # new packs - for pack_file in (pack_files - our_pack_files): - # init the hit-counter/priority with the size, a good measure for hit- - # probability. 
Its implemented so that only 12 bytes will be read - entity = PackEntity(pack_file) - self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index]) - # END for each new packfile - - # removed packs - for pack_file in (our_pack_files - pack_files): - del_index = -1 - for i, item in enumerate(self._entities): - if item[1].pack().path() == pack_file: - del_index = i - break - # END found index - # END for each entity - assert del_index != -1 - del(self._entities[del_index]) - # END for each removed pack - - # reinitialize prioritiess - self._sort_entities() - return True - - def entities(self): - """:return: list of pack entities operated upon by this database""" - return [item[1] for item in self._entities] - - def partial_to_complete_sha(self, partial_binsha, canonical_length): - """:return: 20 byte sha as inferred by the given partial binary sha - :param partial_binsha: binary sha with less than 20 bytes - :param canonical_length: length of the corresponding canonical representation. - It is required as binary sha's cannot display whether the original hex sha - had an odd or even number of characters - :raise AmbiguousObjectName: - :raise BadObject: """ - candidate = None - for item in self._entities: - item_index = item[1].index().partial_sha_to_index(partial_binsha, canonical_length) - if item_index is not None: - sha = item[1].index().sha(item_index) - if candidate and candidate != sha: - raise AmbiguousObjectName(partial_binsha) - candidate = sha - # END handle full sha could be found - # END for each entity - - if candidate: - return candidate - - # still not found ? - raise BadObject(partial_binsha) - - #} END interface diff --git a/gitdb/db/ref.py b/gitdb/db/ref.py deleted file mode 100644 index bd30156..0000000 --- a/gitdb/db/ref.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -import codecs -from gitdb.db.base import ( - CompoundDB, -) - -__all__ = ('ReferenceDB', ) - - -class ReferenceDB(CompoundDB): - - """A database consisting of database referred to in a file""" - - # Configuration - # Specifies the object database to use for the paths found in the alternates - # file. 
If None, it defaults to the GitDB - ObjectDBCls = None - - def __init__(self, ref_file): - super().__init__() - self._ref_file = ref_file - - def _set_cache_(self, attr): - if attr == '_dbs': - self._dbs = list() - self._update_dbs_from_ref_file() - else: - super()._set_cache_(attr) - # END handle attrs - - def _update_dbs_from_ref_file(self): - dbcls = self.ObjectDBCls - if dbcls is None: - # late import - from gitdb.db.git import GitDB - dbcls = GitDB - # END get db type - - # try to get as many as possible, don't fail if some are unavailable - ref_paths = list() - try: - with codecs.open(self._ref_file, 'r', encoding="utf-8") as f: - ref_paths = [l.strip() for l in f] - except OSError: - pass - # END handle alternates - - ref_paths_set = set(ref_paths) - cur_ref_paths_set = {db.root_path() for db in self._dbs} - - # remove existing - for path in (cur_ref_paths_set - ref_paths_set): - for i, db in enumerate(self._dbs[:]): - if db.root_path() == path: - del(self._dbs[i]) - continue - # END del matching db - # END for each path to remove - - # add new - # sort them to maintain order - added_paths = sorted(ref_paths_set - cur_ref_paths_set, key=lambda p: ref_paths.index(p)) - for path in added_paths: - try: - db = dbcls(path) - # force an update to verify path - if isinstance(db, CompoundDB): - db.databases() - # END verification - self._dbs.append(db) - except Exception: - # ignore invalid paths or issues - pass - # END for each path to add - - def update_cache(self, force=False): - # re-read alternates and update databases - self._update_dbs_from_ref_file() - return super().update_cache(force) diff --git a/gitdb/exc.py b/gitdb/exc.py deleted file mode 100644 index 752dafd..0000000 --- a/gitdb/exc.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Module with common exceptions""" -from gitdb.util import to_hex_sha - -__all__ = [ - 'AmbiguousObjectName', - 'BadName', - 'BadObject', - 'BadObjectType', - 'InvalidDBRoot', - 'ODBError', - 'ParseError', - 'UnsupportedOperation', - 'to_hex_sha', -] - -class ODBError(Exception): - """All errors thrown by the object database""" - - -class InvalidDBRoot(ODBError): - """Thrown if an object database cannot be initialized at the given path""" - - -class BadObject(ODBError): - """The object with the given SHA does not exist. 
Instantiate with the - failed sha""" - - def __str__(self): - return "BadObject: %s" % to_hex_sha(self.args[0]) - - -class BadName(ODBError): - """A name provided to rev_parse wasn't understood""" - - def __str__(self): - return "Ref '%s' did not resolve to an object" % self.args[0] - - -class ParseError(ODBError): - """Thrown if the parsing of a file failed due to an invalid format""" - - -class AmbiguousObjectName(ODBError): - """Thrown if a possibly shortened name does not uniquely represent a single object - in the database""" - - -class BadObjectType(ODBError): - """The object had an unsupported type""" - - -class UnsupportedOperation(ODBError): - """Thrown if the given operation cannot be supported by the object database""" diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap deleted file mode 160000 index 8f82e6c..0000000 --- a/gitdb/ext/smmap +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8f82e6c19661f9b735cc55cc89031a189e408894 diff --git a/gitdb/fun.py b/gitdb/fun.py deleted file mode 100644 index a272e5c..0000000 --- a/gitdb/fun.py +++ /dev/null @@ -1,704 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Contains basic c-functions which usually contain performance critical code -Keeping this code separate from the beginning makes it easier to out-source -it into c later, if required""" - -import zlib -from gitdb.util import byte_ord -decompressobj = zlib.decompressobj - -import mmap -from itertools import islice -from functools import reduce - -from gitdb.const import NULL_BYTE, BYTE_SPACE -from gitdb.utils.encoding import force_text -from gitdb.typ import ( - str_blob_type, - str_commit_type, - str_tree_type, - str_tag_type, -) - -from io import StringIO - -# INVARIANTS -OFS_DELTA = 6 -REF_DELTA = 7 -delta_types = (OFS_DELTA, REF_DELTA) - -type_id_to_type_map = { - 0: b'', # EXT 1 - 1: str_commit_type, - 2: str_tree_type, - 3: str_blob_type, - 4: str_tag_type, - 5: b'', # EXT 2 - OFS_DELTA: "OFS_DELTA", # OFFSET DELTA - REF_DELTA: "REF_DELTA" # REFERENCE DELTA -} - -type_to_type_id_map = { - str_commit_type: 1, - str_tree_type: 2, - str_blob_type: 3, - str_tag_type: 4, - "OFS_DELTA": OFS_DELTA, - "REF_DELTA": REF_DELTA, -} - -# used when dealing with larger streams -chunk_size = 1000 * mmap.PAGESIZE - -__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info', - 'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data', - 'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header') - - -#{ Structures - -def _set_delta_rbound(d, size): - """Truncate the given delta to the given size - :param size: size relative to our target offset, may not be 0, must be smaller or equal - to our size - :return: d""" - d.ts = size - - # NOTE: data is truncated automatically when applying the delta - # MUST NOT DO THIS HERE - return d - - -def _move_delta_lbound(d, bytes): - """Move the delta by the given amount of bytes, reducing its size so that its - right bound stays static - :param bytes: amount of bytes to move, must be smaller than delta size - :return: d""" - if bytes == 0: - return - - d.to += bytes - d.so += bytes - d.ts -= bytes - if d.data is not None: - d.data = d.data[bytes:] - # END handle data - - return d - - -def delta_duplicate(src): - return DeltaChunk(src.to, src.ts, src.so, src.data) - - -def delta_chunk_apply(dc, bbuf, write): - 
"""Apply own data to the target buffer - :param bbuf: buffer providing source bytes for copy operations - :param write: write method to call with data to write""" - if dc.data is None: - # COPY DATA FROM SOURCE - write(bbuf[dc.so:dc.so + dc.ts]) - else: - # APPEND DATA - # what's faster: if + 4 function calls or just a write with a slice ? - # Considering data can be larger than 127 bytes now, it should be worth it - if dc.ts < len(dc.data): - write(dc.data[:dc.ts]) - else: - write(dc.data) - # END handle truncation - # END handle chunk mode - - -class DeltaChunk: - - """Represents a piece of a delta, it can either add new data, or copy existing - one from a source buffer""" - __slots__ = ( - 'to', # start offset in the target buffer in bytes - 'ts', # size of this chunk in the target buffer in bytes - 'so', # start offset in the source buffer in bytes or None - 'data', # chunk of bytes to be added to the target buffer, - # DeltaChunkList to use as base, or None - ) - - def __init__(self, to, ts, so, data): - self.to = to - self.ts = ts - self.so = so - self.data = data - - def __repr__(self): - return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "") - - #{ Interface - - def rbound(self): - return self.to + self.ts - - def has_data(self): - """:return: True if the instance has data to add to the target stream""" - return self.data is not None - - #} END interface - - -def _closest_index(dcl, absofs): - """:return: index at which the given absofs should be inserted. The index points - to the DeltaChunk with a target buffer absofs that equals or is greater than - absofs. - **Note:** global method for performance only, it belongs to DeltaChunkList""" - lo = 0 - hi = len(dcl) - while lo < hi: - mid = (lo + hi) / 2 - dc = dcl[mid] - if dc.to > absofs: - hi = mid - elif dc.rbound() > absofs or dc.to == absofs: - return mid - else: - lo = mid + 1 - # END handle bound - # END for each delta absofs - return len(dcl) - 1 - - -def delta_list_apply(dcl, bbuf, write): - """Apply the chain's changes and write the final result using the passed - write function. - :param bbuf: base buffer containing the base of all deltas contained in this - list. It will only be used if the chunk in question does not have a base - chain. - :param write: function taking a string of bytes to write to the output""" - for dc in dcl: - delta_chunk_apply(dc, bbuf, write) - # END for each dc - - -def delta_list_slice(dcl, absofs, size, ndcl): - """:return: Subsection of this list at the given absolute offset, with the given - size in bytes. - :return: None""" - cdi = _closest_index(dcl, absofs) # delta start index - cd = dcl[cdi] - slen = len(dcl) - lappend = ndcl.append - - if cd.to != absofs: - tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data) - _move_delta_lbound(tcd, absofs - cd.to) - tcd.ts = min(tcd.ts, size) - lappend(tcd) - size -= tcd.ts - cdi += 1 - # END lbound overlap handling - - while cdi < slen and size: - # are we larger than the current block - cd = dcl[cdi] - if cd.ts <= size: - lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data)) - size -= cd.ts - else: - tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data) - tcd.ts = size - lappend(tcd) - size -= tcd.ts - break - # END hadle size - cdi += 1 - # END for each chunk - - -class DeltaChunkList(list): - - """List with special functionality to deal with DeltaChunks. - There are two types of lists we represent. 
The one was created bottom-up, working - towards the latest delta, the other kind was created top-down, working from the - latest delta down to the earliest ancestor. This attribute is queryable - after all processing with is_reversed.""" - - __slots__ = tuple() - - def rbound(self): - """:return: rightmost extend in bytes, absolute""" - if len(self) == 0: - return 0 - return self[-1].rbound() - - def lbound(self): - """:return: leftmost byte at which this chunklist starts""" - if len(self) == 0: - return 0 - return self[0].to - - def size(self): - """:return: size of bytes as measured by our delta chunks""" - return self.rbound() - self.lbound() - - def apply(self, bbuf, write): - """Only used by public clients, internally we only use the global routines - for performance""" - return delta_list_apply(self, bbuf, write) - - def compress(self): - """Alter the list to reduce the amount of nodes. Currently we concatenate - add-chunks - :return: self""" - slen = len(self) - if slen < 2: - return self - i = 0 - - first_data_index = None - while i < slen: - dc = self[i] - i += 1 - if dc.data is None: - if first_data_index is not None and i - 2 - first_data_index > 1: - # if first_data_index is not None: - nd = StringIO() # new data - so = self[first_data_index].to # start offset in target buffer - for x in range(first_data_index, i - 1): - xdc = self[x] - nd.write(xdc.data[:xdc.ts]) - # END collect data - - del(self[first_data_index:i - 1]) - buf = nd.getvalue() - self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf)) - - slen = len(self) - i = first_data_index + 1 - - # END concatenate data - first_data_index = None - continue - # END skip non-data chunks - - if first_data_index is None: - first_data_index = i - 1 - # END iterate list - - # if slen_orig != len(self): - # print "INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100) - return self - - def check_integrity(self, target_size=-1): - """Verify the list has non-overlapping chunks only, and the total size matches - target_size - :param target_size: if not -1, the total size of the chain must be target_size - :raise AssertionError: if the size doesn't match""" - if target_size > -1: - assert self[-1].rbound() == target_size - assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size - # END target size verification - - if len(self) < 2: - return - - # check data - for dc in self: - assert dc.ts > 0 - if dc.has_data(): - assert len(dc.data) >= dc.ts - # END for each dc - - left = islice(self, 0, len(self) - 1) - right = iter(self) - right.next() - # this is very pythonic - we might have just use index based access here, - # but this could actually be faster - for lft, rgt in zip(left, right): - assert lft.rbound() == rgt.to - assert lft.to + lft.ts == rgt.to - # END for each pair - - -class TopdownDeltaChunkList(DeltaChunkList): - - """Represents a list which is generated by feeding its ancestor streams one by - one""" - __slots__ = tuple() - - def connect_with_next_base(self, bdcl): - """Connect this chain with the next level of our base delta chunklist. - The goal in this game is to mark as many of our chunks rigid, hence they - cannot be changed by any of the upcoming bases anymore. Once all our - chunks are marked like that, we can stop all processing - :param bdcl: data chunk list being one of our bases. They must be fed in - consecutively and in order, towards the earliest ancestor delta - :return: True if processing was done. 
Use it to abort processing of - remaining streams if False is returned""" - nfc = 0 # number of frozen chunks - dci = 0 # delta chunk index - slen = len(self) # len of self - ccl = list() # temporary list - while dci < slen: - dc = self[dci] - dci += 1 - - # all add-chunks which are already topmost don't need additional processing - if dc.data is not None: - nfc += 1 - continue - # END skip add chunks - - # copy chunks - # integrate the portion of the base list into ourselves. Lists - # dont support efficient insertion ( just one at a time ), but for now - # we live with it. Internally, its all just a 32/64bit pointer, and - # the portions of moved memory should be smallish. Maybe we just rebuild - # ourselves in order to reduce the amount of insertions ... - del(ccl[:]) - delta_list_slice(bdcl, dc.so, dc.ts, ccl) - - # move the target bounds into place to match with our chunk - ofs = dc.to - dc.so - for cdc in ccl: - cdc.to += ofs - # END update target bounds - - if len(ccl) == 1: - self[dci - 1] = ccl[0] - else: - # maybe try to compute the expenses here, and pick the right algorithm - # It would normally be faster than copying everything physically though - # TODO: Use a deque here, and decide by the index whether to extend - # or extend left ! - post_dci = self[dci:] - del(self[dci - 1:]) # include deletion of dc - self.extend(ccl) - self.extend(post_dci) - - slen = len(self) - dci += len(ccl) - 1 # deleted dc, added rest - - # END handle chunk replacement - # END for each chunk - - if nfc == slen: - return False - # END handle completeness - return True - - -#} END structures - -#{ Routines - -def is_loose_object(m): - """ - :return: True the file contained in memory map m appears to be a loose object. - Only the first two bytes are needed""" - b0, b1 = map(ord, m[:2]) - word = (b0 << 8) + b1 - return b0 == 0x78 and (word % 31) == 0 - - -def loose_object_header_info(m): - """ - :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the - object as well as its uncompressed size in bytes. - :param m: memory map from which to read the compressed object data""" - decompress_size = 8192 # is used in cgit as well - hdr = decompressobj().decompress(m, decompress_size) - type_name, size = hdr[:hdr.find(NULL_BYTE)].split(BYTE_SPACE) - - return type_name, int(size) - - -def pack_object_header_info(data): - """ - :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset) - The type_id should be interpreted according to the ``type_id_to_type_map`` map - The byte-offset specifies the start of the actual zlib compressed datastream - :param m: random-access memory, like a string or memory map""" - c = byte_ord(data[0]) # first byte - i = 1 # next char to read - type_id = (c >> 4) & 7 # numeric type - size = c & 15 # starting size - s = 4 # starting bit-shift size - while c & 0x80: - c = byte_ord(data[i]) - i += 1 - size += (c & 0x7f) << s - s += 7 - # END character loop - # end performance at expense of maintenance ... 
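    # Worked example (illustrative values): for header bytes 0x95 0x0a the
    # first byte gives type_id == (0x95 >> 4) & 7 == 1 (a commit) and the low
    # size nibble 0x5; its MSB is set, so the second byte adds
    # (0x0a & 0x7f) << 4 == 160, yielding size == 165 and data offset i == 2.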
- return (type_id, size, i) - - -def create_pack_object_header(obj_type, obj_size): - """ - :return: string defining the pack header comprised of the object type - and its incompressed size in bytes - - :param obj_type: pack type_id of the object - :param obj_size: uncompressed size in bytes of the following object stream""" - c = 0 # 1 byte - hdr = bytearray() # output string - - c = (obj_type << 4) | (obj_size & 0xf) - obj_size >>= 4 - while obj_size: - hdr.append(c | 0x80) - c = obj_size & 0x7f - obj_size >>= 7 - # END until size is consumed - hdr.append(c) - # end handle interpreter - return hdr - - -def msb_size(data, offset=0): - """ - :return: tuple(read_bytes, size) read the msb size from the given random - access data starting at the given byte offset""" - size = 0 - i = 0 - l = len(data) - hit_msb = False - while i < l: - c = data[i + offset] - size |= (c & 0x7f) << i * 7 - i += 1 - if not c & 0x80: - hit_msb = True - break - # END check msb bit - # END while in range - # end performance ... - if not hit_msb: - raise AssertionError("Could not find terminating MSB byte in data stream") - return i + offset, size - - -def loose_object_header(type, size): - """ - :return: bytes representing the loose object header, which is immediately - followed by the content stream of size 'size'""" - return ('%s %i\0' % (force_text(type), size)).encode('ascii') - - -def write_object(type, size, read, write, chunk_size=chunk_size): - """ - Write the object as identified by type, size and source_stream into the - target_stream - - :param type: type string of the object - :param size: amount of bytes to write from source_stream - :param read: read method of a stream providing the content data - :param write: write method of the output stream - :param close_target_stream: if True, the target stream will be closed when - the routine exits, even if an error is thrown - :return: The actual amount of bytes written to stream, which includes the header and a trailing newline""" - tbw = 0 # total num bytes written - - # WRITE HEADER: type SP size NULL - tbw += write(loose_object_header(type, size)) - tbw += stream_copy(read, write, size, chunk_size) - - return tbw - - -def stream_copy(read, write, size, chunk_size): - """ - Copy a stream up to size bytes using the provided read and write methods, - in chunks of chunk_size - - **Note:** its much like stream_copy utility, but operates just using methods""" - dbw = 0 # num data bytes written - - # WRITE ALL DATA UP TO SIZE - while True: - cs = min(chunk_size, size - dbw) - # NOTE: not all write methods return the amount of written bytes, like - # mmap.write. Its bad, but we just deal with it ... 
perhaps its not - # even less efficient - # data_len = write(read(cs)) - # dbw += data_len - data = read(cs) - data_len = len(data) - dbw += data_len - write(data) - if data_len < cs or dbw == size: - break - # END check for stream end - # END duplicate data - return dbw - - -def connect_deltas(dstreams): - """ - Read the condensed delta chunk information from dstream and merge its information - into a list of existing delta chunks - - :param dstreams: iterable of delta stream objects, the delta to be applied last - comes first, then all its ancestors in order - :return: DeltaChunkList, containing all operations to apply""" - tdcl = None # topmost dcl - - dcl = tdcl = TopdownDeltaChunkList() - for dsi, ds in enumerate(dstreams): - # print "Stream", dsi - db = ds.read() - delta_buf_size = ds.size - - # read header - i, base_size = msb_size(db) - i, target_size = msb_size(db, i) - - # interpret opcodes - tbw = 0 # amount of target bytes written - while i < delta_buf_size: - c = ord(db[i]) - i += 1 - if c & 0x80: - cp_off, cp_size = 0, 0 - if (c & 0x01): - cp_off = ord(db[i]) - i += 1 - if (c & 0x02): - cp_off |= (ord(db[i]) << 8) - i += 1 - if (c & 0x04): - cp_off |= (ord(db[i]) << 16) - i += 1 - if (c & 0x08): - cp_off |= (ord(db[i]) << 24) - i += 1 - if (c & 0x10): - cp_size = ord(db[i]) - i += 1 - if (c & 0x20): - cp_size |= (ord(db[i]) << 8) - i += 1 - if (c & 0x40): - cp_size |= (ord(db[i]) << 16) - i += 1 - - if not cp_size: - cp_size = 0x10000 - - rbound = cp_off + cp_size - if (rbound < cp_size or - rbound > base_size): - break - - dcl.append(DeltaChunk(tbw, cp_size, cp_off, None)) - tbw += cp_size - elif c: - # NOTE: in C, the data chunks should probably be concatenated here. - # In python, we do it as a post-process - dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c])) - i += c - tbw += c - else: - raise ValueError("unexpected delta opcode 0") - # END handle command byte - # END while processing delta data - - dcl.compress() - - # merge the lists ! 
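        # (a descriptive note) dstreams are fed latest-delta-first, so tdcl is
        # built from the first stream and every later, older delta list is
        # folded into it here. connect_with_next_base returns False once all
        # chunks in tdcl are frozen add-chunks; at that point the remaining
        # ancestor deltas cannot change the result any more and we stop early.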
- if dsi > 0: - if not tdcl.connect_with_next_base(dcl): - break - # END handle merge - - # prepare next base - dcl = DeltaChunkList() - # END for each delta stream - - return tdcl - - -def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write): - """ - Apply data from a delta buffer using a source buffer to the target file - - :param src_buf: random access data from which the delta was created - :param src_buf_size: size of the source buffer in bytes - :param delta_buf_size: size for the delta buffer in bytes - :param delta_buf: random access delta data - :param write: write method taking a chunk of bytes - - **Note:** transcribed to python from the similar routine in patch-delta.c""" - i = 0 - db = delta_buf - while i < delta_buf_size: - c = db[i] - i += 1 - if c & 0x80: - cp_off, cp_size = 0, 0 - if (c & 0x01): - cp_off = db[i] - i += 1 - if (c & 0x02): - cp_off |= (db[i] << 8) - i += 1 - if (c & 0x04): - cp_off |= (db[i] << 16) - i += 1 - if (c & 0x08): - cp_off |= (db[i] << 24) - i += 1 - if (c & 0x10): - cp_size = db[i] - i += 1 - if (c & 0x20): - cp_size |= (db[i] << 8) - i += 1 - if (c & 0x40): - cp_size |= (db[i] << 16) - i += 1 - - if not cp_size: - cp_size = 0x10000 - - rbound = cp_off + cp_size - if (rbound < cp_size or - rbound > src_buf_size): - break - write(src_buf[cp_off:cp_off + cp_size]) - elif c: - write(db[i:i + c]) - i += c - else: - raise ValueError("unexpected delta opcode 0") - # END handle command byte - # END while processing delta data - - # yes, lets use the exact same error message that git uses :) - assert i == delta_buf_size, "delta replay has gone wild" - - -def is_equal_canonical_sha(canonical_length, match, sha1): - """ - :return: True if the given lhs and rhs 20 byte binary shas - The comparison will take the canonical_length of the match sha into account, - hence the comparison will only use the last 4 bytes for uneven canonical representations - :param match: less than 20 byte sha - :param sha1: 20 byte sha""" - binary_length = canonical_length // 2 - if match[:binary_length] != sha1[:binary_length]: - return False - - if canonical_length - binary_length and \ - (byte_ord(match[-1]) ^ byte_ord(sha1[len(match) - 1])) & 0xf0: - return False - # END handle uneven canonnical length - return True - -#} END routines - - -try: - from gitdb_speedups._perf import connect_deltas -except ImportError: - pass diff --git a/gitdb/pack.py b/gitdb/pack.py deleted file mode 100644 index e559e11..0000000 --- a/gitdb/pack.py +++ /dev/null @@ -1,1031 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Contains PackIndexFile and PackFile implementations""" -import zlib - -from gitdb.exc import ( - BadObject, - AmbiguousObjectName, - UnsupportedOperation, - ParseError -) - -from gitdb.util import ( - mman, - LazyMixin, - unpack_from, - bin_to_hex, - byte_ord, -) - -from gitdb.fun import ( - create_pack_object_header, - pack_object_header_info, - is_equal_canonical_sha, - type_id_to_type_map, - write_object, - stream_copy, - chunk_size, - delta_types, - OFS_DELTA, - REF_DELTA, - msb_size -) - -try: - from gitdb_speedups._perf import PackIndexFile_sha_to_index -except ImportError: - pass -# END try c module - -from gitdb.base import ( # Amazing ! 
- OInfo, - OStream, - OPackInfo, - OPackStream, - ODeltaStream, - ODeltaPackInfo, - ODeltaPackStream, -) - -from gitdb.stream import ( - DecompressMemMapReader, - DeltaApplyReader, - Sha1Writer, - NullStream, - FlexibleSha1Writer -) - -from struct import pack -from binascii import crc32 - -from gitdb.const import NULL_BYTE - -import tempfile -import array -import os -import sys - -__all__ = ('PackIndexFile', 'PackFile', 'PackEntity') - - -#{ Utilities - -def pack_object_at(cursor, offset, as_stream): - """ - :return: Tuple(abs_data_offset, PackInfo|PackStream) - an object of the correct type according to the type_id of the object. - If as_stream is True, the object will contain a stream, allowing the - data to be read decompressed. - :param data: random accessible data containing all required information - :parma offset: offset in to the data at which the object information is located - :param as_stream: if True, a stream object will be returned that can read - the data, otherwise you receive an info object only""" - data = cursor.use_region(offset).buffer() - type_id, uncomp_size, data_rela_offset = pack_object_header_info(data) - total_rela_offset = None # set later, actual offset until data stream begins - delta_info = None - - # OFFSET DELTA - if type_id == OFS_DELTA: - i = data_rela_offset - c = byte_ord(data[i]) - i += 1 - delta_offset = c & 0x7f - while c & 0x80: - c = byte_ord(data[i]) - i += 1 - delta_offset += 1 - delta_offset = (delta_offset << 7) + (c & 0x7f) - # END character loop - delta_info = delta_offset - total_rela_offset = i - # REF DELTA - elif type_id == REF_DELTA: - total_rela_offset = data_rela_offset + 20 - delta_info = data[data_rela_offset:total_rela_offset] - # BASE OBJECT - else: - # assume its a base object - total_rela_offset = data_rela_offset - # END handle type id - abs_data_offset = offset + total_rela_offset - if as_stream: - stream = DecompressMemMapReader(data[total_rela_offset:], False, uncomp_size) - if delta_info is None: - return abs_data_offset, OPackStream(offset, type_id, uncomp_size, stream) - else: - return abs_data_offset, ODeltaPackStream(offset, type_id, uncomp_size, delta_info, stream) - else: - if delta_info is None: - return abs_data_offset, OPackInfo(offset, type_id, uncomp_size) - else: - return abs_data_offset, ODeltaPackInfo(offset, type_id, uncomp_size, delta_info) - # END handle info - # END handle stream - - -def write_stream_to_pack(read, write, zstream, base_crc=None): - """Copy a stream as read from read function, zip it, and write the result. - Count the number of written bytes and return it - :param base_crc: if not None, the crc will be the base for all compressed data - we consecutively write and generate a crc32 from. 
If None, no crc will be generated - :return: tuple(no bytes read, no bytes written, crc32) crc might be 0 if base_crc - was false""" - br = 0 # bytes read - bw = 0 # bytes written - want_crc = base_crc is not None - crc = 0 - if want_crc: - crc = base_crc - # END initialize crc - - while True: - chunk = read(chunk_size) - br += len(chunk) - compressed = zstream.compress(chunk) - bw += len(compressed) - write(compressed) # cannot assume return value - - if want_crc: - crc = crc32(compressed, crc) - # END handle crc - - if len(chunk) != chunk_size: - break - # END copy loop - - compressed = zstream.flush() - bw += len(compressed) - write(compressed) - if want_crc: - crc = crc32(compressed, crc) - # END handle crc - - return (br, bw, crc) - - -#} END utilities - - -class IndexWriter: - - """Utility to cache index information, allowing to write all information later - in one go to the given stream - **Note:** currently only writes v2 indices""" - __slots__ = '_objs' - - def __init__(self): - self._objs = list() - - def append(self, binsha, crc, offset): - """Append one piece of object information""" - self._objs.append((binsha, crc, offset)) - - def write(self, pack_sha, write): - """Write the index file using the given write method - :param pack_sha: binary sha over the whole pack that we index - :return: sha1 binary sha over all index file contents""" - # sort for sha1 hash - self._objs.sort(key=lambda o: o[0]) - - sha_writer = FlexibleSha1Writer(write) - sha_write = sha_writer.write - sha_write(PackIndexFile.index_v2_signature) - sha_write(pack(">L", PackIndexFile.index_version_default)) - - # fanout - tmplist = list((0,) * 256) # fanout or list with 64 bit offsets - for t in self._objs: - tmplist[byte_ord(t[0][0])] += 1 - # END prepare fanout - for i in range(255): - v = tmplist[i] - sha_write(pack('>L', v)) - tmplist[i + 1] += v - # END write each fanout entry - sha_write(pack('>L', tmplist[255])) - - # sha1 ordered - # save calls, that is push them into c - sha_write(b''.join(t[0] for t in self._objs)) - - # crc32 - for t in self._objs: - sha_write(pack('>L', t[1] & 0xffffffff)) - # END for each crc - - tmplist = list() - # offset 32 - for t in self._objs: - ofs = t[2] - if ofs > 0x7fffffff: - tmplist.append(ofs) - ofs = 0x80000000 + len(tmplist) - 1 - # END handle 64 bit offsets - sha_write(pack('>L', ofs & 0xffffffff)) - # END for each offset - - # offset 64 - for ofs in tmplist: - sha_write(pack(">Q", ofs)) - # END for each offset - - # trailer - assert(len(pack_sha) == 20) - sha_write(pack_sha) - sha = sha_writer.sha(as_hex=False) - write(sha) - return sha - - -class PackIndexFile(LazyMixin): - - """A pack index provides offsets into the corresponding pack, allowing to find - locations for offsets faster.""" - - # Dont use slots as we dynamically bind functions for each version, need a dict for this - # The slots you see here are just to keep track of our instance variables - # __slots__ = ('_indexpath', '_fanout_table', '_cursor', '_version', - # '_sha_list_offset', '_crc_list_offset', '_pack_offset', '_pack_64_offset') - - # used in v2 indices - _sha_list_offset = 8 + 1024 - index_v2_signature = b'\xfftOc' - index_version_default = 2 - - def __init__(self, indexpath): - super().__init__() - self._indexpath = indexpath - - def close(self): - mman.force_map_handle_removal_win(self._indexpath) - self._cursor = None - - def _set_cache_(self, attr): - if attr == "_packfile_checksum": - self._packfile_checksum = self._cursor.map()[-40:-20] - elif attr == "_packfile_checksum": - 
self._packfile_checksum = self._cursor.map()[-20:] - elif attr == "_cursor": - # Note: We don't lock the file when reading as we cannot be sure - # that we can actually write to the location - it could be a read-only - # alternate for instance - self._cursor = mman.make_cursor(self._indexpath).use_region() - # We will assume that the index will always fully fit into memory ! - if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size(): - raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % ( - self._indexpath, self._cursor.file_size(), mman.window_size())) - # END assert window size - else: - # now its time to initialize everything - if we are here, someone wants - # to access the fanout table or related properties - - # CHECK VERSION - mmap = self._cursor.map() - self._version = (mmap[:4] == self.index_v2_signature and 2) or 1 - if self._version == 2: - version_id = unpack_from(">L", mmap, 4)[0] - assert version_id == self._version, "Unsupported index version: %i" % version_id - # END assert version - - # SETUP FUNCTIONS - # setup our functions according to the actual version - for fname in ('entry', 'offset', 'sha', 'crc'): - setattr(self, fname, getattr(self, "_%s_v%i" % (fname, self._version))) - # END for each function to initialize - - # INITIALIZE DATA - # byte offset is 8 if version is 2, 0 otherwise - self._initialize() - # END handle attributes - - #{ Access V1 - - def _entry_v1(self, i): - """:return: tuple(offset, binsha, 0)""" - return unpack_from(">L20s", self._cursor.map(), 1024 + i * 24) + (0, ) - - def _offset_v1(self, i): - """see ``_offset_v2``""" - return unpack_from(">L", self._cursor.map(), 1024 + i * 24)[0] - - def _sha_v1(self, i): - """see ``_sha_v2``""" - base = 1024 + (i * 24) + 4 - return self._cursor.map()[base:base + 20] - - def _crc_v1(self, i): - """unsupported""" - return 0 - - #} END access V1 - - #{ Access V2 - def _entry_v2(self, i): - """:return: tuple(offset, binsha, crc)""" - return (self._offset_v2(i), self._sha_v2(i), self._crc_v2(i)) - - def _offset_v2(self, i): - """:return: 32 or 64 byte offset into pack files. 64 byte offsets will only - be returned if the pack is larger than 4 GiB, or 2^32""" - offset = unpack_from(">L", self._cursor.map(), self._pack_offset + i * 4)[0] - - # if the high-bit is set, this indicates that we have to lookup the offset - # in the 64 bit region of the file. 
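# Editor's note: a hedged, self-contained sketch of the v2 "large offset"
# indirection handled right below. A 32-bit table entry with its most
# significant bit set does not hold a pack offset itself; its lower 31 bits
# index a trailing table of 8-byte big-endian offsets. The tables are made up.
import struct

offsets_32 = struct.pack(">LL", 1234, 0x80000000 | 1)   # second entry is "large"
offsets_64 = struct.pack(">QQ", 7 * 2**32, 9 * 2**32)   # 64 bit offset table

def resolve_offset(i):
    value = struct.unpack_from(">L", offsets_32, i * 4)[0]
    if value & 0x80000000:
        return struct.unpack_from(">Q", offsets_64, (value & 0x7FFFFFFF) * 8)[0]
    return value

assert resolve_offset(0) == 1234
assert resolve_offset(1) == 9 * 2**32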
The current offset ( lower 31 bits ) - # are the index into it - if offset & 0x80000000: - offset = unpack_from(">Q", self._cursor.map(), self._pack_64_offset + (offset & ~0x80000000) * 8)[0] - # END handle 64 bit offset - - return offset - - def _sha_v2(self, i): - """:return: sha at the given index of this file index instance""" - base = self._sha_list_offset + i * 20 - return self._cursor.map()[base:base + 20] - - def _crc_v2(self, i): - """:return: 4 bytes crc for the object at index i""" - return unpack_from(">L", self._cursor.map(), self._crc_list_offset + i * 4)[0] - - #} END access V2 - - #{ Initialization - - def _initialize(self): - """initialize base data""" - self._fanout_table = self._read_fanout((self._version == 2) * 8) - - if self._version == 2: - self._crc_list_offset = self._sha_list_offset + self.size() * 20 - self._pack_offset = self._crc_list_offset + self.size() * 4 - self._pack_64_offset = self._pack_offset + self.size() * 4 - # END setup base - - def _read_fanout(self, byte_offset): - """Generate a fanout table from our data""" - d = self._cursor.map() - out = list() - append = out.append - for i in range(256): - append(unpack_from('>L', d, byte_offset + i * 4)[0]) - # END for each entry - return out - - #} END initialization - - #{ Properties - def version(self): - return self._version - - def size(self): - """:return: amount of objects referred to by this index""" - return self._fanout_table[255] - - def path(self): - """:return: path to the packindexfile""" - return self._indexpath - - def packfile_checksum(self): - """:return: 20 byte sha representing the sha1 hash of the pack file""" - return self._cursor.map()[-40:-20] - - def indexfile_checksum(self): - """:return: 20 byte sha representing the sha1 hash of this index file""" - return self._cursor.map()[-20:] - - def offsets(self): - """:return: sequence of all offsets in the order in which they were written - - **Note:** return value can be random accessed, but may be immmutable""" - if self._version == 2: - # read stream to array, convert to tuple - a = array.array('I') # 4 byte unsigned int, long are 8 byte on 64 bit it appears - a.frombytes(self._cursor.map()[self._pack_offset:self._pack_64_offset]) - - # networkbyteorder to something array likes more - if sys.byteorder == 'little': - a.byteswap() - return a - else: - return tuple(self.offset(index) for index in range(self.size())) - # END handle version - - def sha_to_index(self, sha): - """ - :return: index usable with the ``offset`` or ``entry`` method, or None - if the sha was not found in this pack index - :param sha: 20 byte sha to lookup""" - first_byte = byte_ord(sha[0]) - get_sha = self.sha - lo = 0 # lower index, the left bound of the bisection - if first_byte != 0: - lo = self._fanout_table[first_byte - 1] - hi = self._fanout_table[first_byte] # the upper, right bound of the bisection - - # bisect until we have the sha - while lo < hi: - mid = (lo + hi) // 2 - mid_sha = get_sha(mid) - if sha < mid_sha: - hi = mid - elif sha == mid_sha: - return mid - else: - lo = mid + 1 - # END handle midpoint - # END bisect - return None - - def partial_sha_to_index(self, partial_bin_sha, canonical_length): - """ - :return: index as in `sha_to_index` or None if the sha was not found in this - index file - :param partial_bin_sha: an at least two bytes of a partial binary sha as bytes - :param canonical_length: length of the original hexadecimal representation of the - given partial binary sha - :raise AmbiguousObjectName:""" - if len(partial_bin_sha) < 2: - 
raise ValueError("Require at least 2 bytes of partial sha") - - assert isinstance(partial_bin_sha, bytes), "partial_bin_sha must be bytes" - first_byte = byte_ord(partial_bin_sha[0]) - - get_sha = self.sha - lo = 0 # lower index, the left bound of the bisection - if first_byte != 0: - lo = self._fanout_table[first_byte - 1] - hi = self._fanout_table[first_byte] # the upper, right bound of the bisection - - # fill the partial to full 20 bytes - filled_sha = partial_bin_sha + NULL_BYTE * (20 - len(partial_bin_sha)) - - # find lowest - while lo < hi: - mid = (lo + hi) // 2 - mid_sha = get_sha(mid) - if filled_sha < mid_sha: - hi = mid - elif filled_sha == mid_sha: - # perfect match - lo = mid - break - else: - lo = mid + 1 - # END handle midpoint - # END bisect - - if lo < self.size(): - cur_sha = get_sha(lo) - if is_equal_canonical_sha(canonical_length, partial_bin_sha, cur_sha): - next_sha = None - if lo + 1 < self.size(): - next_sha = get_sha(lo + 1) - if next_sha and next_sha == cur_sha: - raise AmbiguousObjectName(partial_bin_sha) - return lo - # END if we have a match - # END if we found something - return None - - if 'PackIndexFile_sha_to_index' in globals(): - # NOTE: Its just about 25% faster, the major bottleneck might be the attr - # accesses - def sha_to_index(self, sha): - return PackIndexFile_sha_to_index(self, sha) - # END redefine heavy-hitter with c version - - #} END properties - - -class PackFile(LazyMixin): - - """A pack is a file written according to the Version 2 for git packs - - As we currently use memory maps, it could be assumed that the maximum size of - packs therefore is 32 bit on 32 bit systems. On 64 bit systems, this should be - fine though. - - **Note:** at some point, this might be implemented using streams as well, or - streams are an alternate path in the case memory maps cannot be created - for some reason - one clearly doesn't want to read 10GB at once in that - case""" - - __slots__ = ('_packpath', '_cursor', '_size', '_version') - pack_signature = 0x5041434b # 'PACK' - pack_version_default = 2 - - # offset into our data at which the first object starts - first_object_offset = 3 * 4 # header bytes - footer_size = 20 # final sha - - def __init__(self, packpath): - self._packpath = packpath - - def close(self): - mman.force_map_handle_removal_win(self._packpath) - self._cursor = None - - def _set_cache_(self, attr): - # we fill the whole cache, whichever attribute gets queried first - self._cursor = mman.make_cursor(self._packpath).use_region() - - # read the header information - type_id, self._version, self._size = unpack_from(">LLL", self._cursor.map(), 0) - - # TODO: figure out whether we should better keep the lock, or maybe - # add a .keep file instead ? 
- if type_id != self.pack_signature: - raise ParseError("Invalid pack signature: %i" % type_id) - - def _iter_objects(self, start_offset, as_stream=True): - """Handle the actual iteration of objects within this pack""" - c = self._cursor - content_size = c.file_size() - self.footer_size - cur_offset = start_offset or self.first_object_offset - - null = NullStream() - while cur_offset < content_size: - data_offset, ostream = pack_object_at(c, cur_offset, True) - # scrub the stream to the end - this decompresses the object, but yields - # the amount of compressed bytes we need to get to the next offset - - stream_copy(ostream.read, null.write, ostream.size, chunk_size) - assert ostream.stream._br == ostream.size - cur_offset += (data_offset - ostream.pack_offset) + ostream.stream.compressed_bytes_read() - - # if a stream is requested, reset it beforehand - # Otherwise return the Stream object directly, its derived from the - # info object - if as_stream: - ostream.stream.seek(0) - yield ostream - # END until we have read everything - - #{ Pack Information - - def size(self): - """:return: The amount of objects stored in this pack""" - return self._size - - def version(self): - """:return: the version of this pack""" - return self._version - - def data(self): - """ - :return: read-only data of this pack. It provides random access and usually - is a memory map. - :note: This method is unsafe as it returns a window into a file which might be larger than than the actual window size""" - # can use map as we are starting at offset 0. Otherwise we would have to use buffer() - return self._cursor.use_region().map() - - def checksum(self): - """:return: 20 byte sha1 hash on all object sha's contained in this file""" - return self._cursor.use_region(self._cursor.file_size() - 20).buffer()[:] - - def path(self): - """:return: path to the packfile""" - return self._packpath - #} END pack information - - #{ Pack Specific - - def collect_streams(self, offset): - """ - :return: list of pack streams which are required to build the object - at the given offset. The first entry of the list is the object at offset, - the last one is either a full object, or a REF_Delta stream. The latter - type needs its reference object to be locked up in an ODB to form a valid - delta chain. - If the object at offset is no delta, the size of the list is 1. - :param offset: specifies the first byte of the object within this pack""" - out = list() - c = self._cursor - while True: - ostream = pack_object_at(c, offset, True)[1] - out.append(ostream) - if ostream.type_id == OFS_DELTA: - offset = ostream.pack_offset - ostream.delta_info - else: - # the only thing we can lookup are OFFSET deltas. 
Everything - # else is either an object, or a ref delta, in the latter - # case someone else has to find it - break - # END handle type - # END while chaining streams - return out - - #} END pack specific - - #{ Read-Database like Interface - - def info(self, offset): - """Retrieve information about the object at the given file-absolute offset - - :param offset: byte offset - :return: OPackInfo instance, the actual type differs depending on the type_id attribute""" - return pack_object_at(self._cursor, offset or self.first_object_offset, False)[1] - - def stream(self, offset): - """Retrieve an object at the given file-relative offset as stream along with its information - - :param offset: byte offset - :return: OPackStream instance, the actual type differs depending on the type_id attribute""" - return pack_object_at(self._cursor, offset or self.first_object_offset, True)[1] - - def stream_iter(self, start_offset=0): - """ - :return: iterator yielding OPackStream compatible instances, allowing - to access the data in the pack directly. - :param start_offset: offset to the first object to iterate. If 0, iteration - starts at the very first object in the pack. - - **Note:** Iterating a pack directly is costly as the datastream has to be decompressed - to determine the bounds between the objects""" - return self._iter_objects(start_offset, as_stream=True) - - #} END Read-Database like Interface - - -class PackEntity(LazyMixin): - - """Combines the PackIndexFile and the PackFile into one, allowing the - actual objects to be resolved and iterated""" - - __slots__ = ('_index', # our index file - '_pack', # our pack file - '_offset_map' # on demand dict mapping one offset to the next consecutive one - ) - - IndexFileCls = PackIndexFile - PackFileCls = PackFile - - def __init__(self, pack_or_index_path): - """Initialize ourselves with the path to the respective pack or index file""" - basename, ext = os.path.splitext(pack_or_index_path) - self._index = self.IndexFileCls("%s.idx" % basename) # PackIndexFile instance - self._pack = self.PackFileCls("%s.pack" % basename) # corresponding PackFile instance - - def close(self): - self._index.close() - self._pack.close() - - def _set_cache_(self, attr): - # currently this can only be _offset_map - # TODO: make this a simple sorted offset array which can be bisected - # to find the respective entry, from which we can take a +1 easily - # This might be slower, but should also be much lighter in memory ! 
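# Editor's note: a tiny standalone sketch of the "next consecutive offset" map
# that the code right below builds from the sorted index offsets; the numbers
# here are made up.
offsets_sorted_demo = [12, 151, 298, 407]
pack_end = 500                               # file size minus the 20 byte footer
offset_map_demo = dict(zip(offsets_sorted_demo, offsets_sorted_demo[1:]))
offset_map_demo[offsets_sorted_demo[-1]] = pack_end
assert offset_map_demo == {12: 151, 151: 298, 298: 407, 407: 500}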
- offsets_sorted = sorted(self._index.offsets()) - last_offset = len(self._pack.data()) - self._pack.footer_size - assert offsets_sorted, "Cannot handle empty indices" - - offset_map = None - if len(offsets_sorted) == 1: - offset_map = {offsets_sorted[0]: last_offset} - else: - iter_offsets = iter(offsets_sorted) - iter_offsets_plus_one = iter(offsets_sorted) - next(iter_offsets_plus_one) - consecutive = zip(iter_offsets, iter_offsets_plus_one) - - offset_map = dict(consecutive) - - # the last offset is not yet set - offset_map[offsets_sorted[-1]] = last_offset - # END handle offset amount - self._offset_map = offset_map - - def _sha_to_index(self, sha): - """:return: index for the given sha, or raise""" - index = self._index.sha_to_index(sha) - if index is None: - raise BadObject(sha) - return index - - def _iter_objects(self, as_stream): - """Iterate over all objects in our index and yield their OInfo or OStream instences""" - _sha = self._index.sha - _object = self._object - for index in range(self._index.size()): - yield _object(_sha(index), as_stream, index) - # END for each index - - def _object(self, sha, as_stream, index=-1): - """:return: OInfo or OStream object providing information about the given sha - :param index: if not -1, its assumed to be the sha's index in the IndexFile""" - # its a little bit redundant here, but it needs to be efficient - if index < 0: - index = self._sha_to_index(sha) - if sha is None: - sha = self._index.sha(index) - # END assure sha is present ( in output ) - offset = self._index.offset(index) - type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer()) - if as_stream: - if type_id not in delta_types: - packstream = self._pack.stream(offset) - return OStream(sha, packstream.type, packstream.size, packstream.stream) - # END handle non-deltas - - # produce a delta stream containing all info - # To prevent it from applying the deltas when querying the size, - # we extract it from the delta stream ourselves - streams = self.collect_streams_at_offset(offset) - dstream = DeltaApplyReader.new(streams) - - return ODeltaStream(sha, dstream.type, None, dstream) - else: - if type_id not in delta_types: - return OInfo(sha, type_id_to_type_map[type_id], uncomp_size) - # END handle non-deltas - - # deltas are a little tougher - unpack the first bytes to obtain - # the actual target size, as opposed to the size of the delta data - streams = self.collect_streams_at_offset(offset) - buf = streams[0].read(512) - offset, src_size = msb_size(buf) - offset, target_size = msb_size(buf, offset) - - # collect the streams to obtain the actual object type - if streams[-1].type_id in delta_types: - raise BadObject(sha, "Could not resolve delta object") - return OInfo(sha, streams[-1].type, target_size) - # END handle stream - - #{ Read-Database like Interface - - def info(self, sha): - """Retrieve information about the object identified by the given sha - - :param sha: 20 byte sha1 - :raise BadObject: - :return: OInfo instance, with 20 byte sha""" - return self._object(sha, False) - - def stream(self, sha): - """Retrieve an object stream along with its information as identified by the given sha - - :param sha: 20 byte sha1 - :raise BadObject: - :return: OStream instance, with 20 byte sha""" - return self._object(sha, True) - - def info_at_index(self, index): - """As ``info``, but uses a PackIndexFile compatible index to refer to the object""" - return self._object(None, False, index) - - def stream_at_index(self, index): - 
"""As ``stream``, but uses a PackIndexFile compatible index to refer to the - object""" - return self._object(None, True, index) - - #} END Read-Database like Interface - - #{ Interface - - def pack(self): - """:return: the underlying pack file instance""" - return self._pack - - def index(self): - """:return: the underlying pack index file instance""" - return self._index - - def is_valid_stream(self, sha, use_crc=False): - """ - Verify that the stream at the given sha is valid. - - :param use_crc: if True, the index' crc is run over the compressed stream of - the object, which is much faster than checking the sha1. It is also - more prone to unnoticed corruption or manipulation. - :param sha: 20 byte sha1 of the object whose stream to verify - whether the compressed stream of the object is valid. If it is - a delta, this only verifies that the delta's data is valid, not the - data of the actual undeltified object, as it depends on more than - just this stream. - If False, the object will be decompressed and the sha generated. It must - match the given sha - - :return: True if the stream is valid - :raise UnsupportedOperation: If the index is version 1 only - :raise BadObject: sha was not found""" - if use_crc: - if self._index.version() < 2: - raise UnsupportedOperation("Version 1 indices do not contain crc's, verify by sha instead") - # END handle index version - - index = self._sha_to_index(sha) - offset = self._index.offset(index) - next_offset = self._offset_map[offset] - crc_value = self._index.crc(index) - - # create the current crc value, on the compressed object data - # Read it in chunks, without copying the data - crc_update = zlib.crc32 - pack_data = self._pack.data() - cur_pos = offset - this_crc_value = 0 - while cur_pos < next_offset: - rbound = min(cur_pos + chunk_size, next_offset) - size = rbound - cur_pos - this_crc_value = crc_update(pack_data[cur_pos:cur_pos + size], this_crc_value) - cur_pos += size - # END window size loop - - # crc returns signed 32 bit numbers, the AND op forces it into unsigned - # mode ... wow, sneaky, from dulwich. - return (this_crc_value & 0xffffffff) == crc_value - else: - shawriter = Sha1Writer() - stream = self._object(sha, as_stream=True) - # write a loose object, which is the basis for the sha - write_object(stream.type, stream.size, stream.read, shawriter.write) - - assert shawriter.sha(as_hex=False) == sha - return shawriter.sha(as_hex=False) == sha - # END handle crc/sha verification - return True - - def info_iter(self): - """ - :return: Iterator over all objects in this pack. The iterator yields - OInfo instances""" - return self._iter_objects(as_stream=False) - - def stream_iter(self): - """ - :return: iterator over all objects in this pack. The iterator yields - OStream instances""" - return self._iter_objects(as_stream=True) - - def collect_streams_at_offset(self, offset): - """ - As the version in the PackFile, but can resolve REF deltas within this pack - For more info, see ``collect_streams`` - - :param offset: offset into the pack file at which the object can be found""" - streams = self._pack.collect_streams(offset) - - # try to resolve the last one if needed. It is assumed to be either - # a REF delta, or a base object, as OFFSET deltas are resolved by the pack - if streams[-1].type_id == REF_DELTA: - stream = streams[-1] - while stream.type_id in delta_types: - if stream.type_id == REF_DELTA: - # smmap can return memory view objects, which can't be compared as buffers/bytes can ... 
- if isinstance(stream.delta_info, memoryview): - sindex = self._index.sha_to_index(stream.delta_info.tobytes()) - else: - sindex = self._index.sha_to_index(stream.delta_info) - if sindex is None: - break - stream = self._pack.stream(self._index.offset(sindex)) - streams.append(stream) - else: - # must be another OFS DELTA - this could happen if a REF - # delta we resolve previously points to an OFS delta. Who - # would do that ;) ? We can handle it though - stream = self._pack.stream(stream.delta_info) - streams.append(stream) - # END handle ref delta - # END resolve ref streams - # END resolve streams - - return streams - - def collect_streams(self, sha): - """ - As ``PackFile.collect_streams``, but takes a sha instead of an offset. - Additionally, ref_delta streams will be resolved within this pack. - If this is not possible, the stream will be left alone, hence it is adivsed - to check for unresolved ref-deltas and resolve them before attempting to - construct a delta stream. - - :param sha: 20 byte sha1 specifying the object whose related streams you want to collect - :return: list of streams, first being the actual object delta, the last being - a possibly unresolved base object. - :raise BadObject:""" - return self.collect_streams_at_offset(self._index.offset(self._sha_to_index(sha))) - - @classmethod - def write_pack(cls, object_iter, pack_write, index_write=None, - object_count=None, zlib_compression=zlib.Z_BEST_SPEED): - """ - Create a new pack by putting all objects obtained by the object_iterator - into a pack which is written using the pack_write method. - The respective index is produced as well if index_write is not Non. - - :param object_iter: iterator yielding odb output objects - :param pack_write: function to receive strings to write into the pack stream - :param indx_write: if not None, the function writes the index file corresponding - to the pack. - :param object_count: if you can provide the amount of objects in your iteration, - this would be the place to put it. Otherwise we have to pre-iterate and store - all items into a list to get the number, which uses more memory than necessary. - :param zlib_compression: the zlib compression level to use - :return: tuple(pack_sha, index_binsha) binary sha over all the contents of the pack - and over all contents of the index. If index_write was None, index_binsha will be None - - **Note:** The destination of the write functions is up to the user. 
It could - be a socket, or a file for instance - - **Note:** writes only undeltified objects""" - objs = object_iter - if not object_count: - if not isinstance(object_iter, (tuple, list)): - objs = list(object_iter) - # END handle list type - object_count = len(objs) - # END handle object - - pack_writer = FlexibleSha1Writer(pack_write) - pwrite = pack_writer.write - ofs = 0 # current offset into the pack file - index = None - wants_index = index_write is not None - - # write header - pwrite(pack('>LLL', PackFile.pack_signature, PackFile.pack_version_default, object_count)) - ofs += 12 - - if wants_index: - index = IndexWriter() - # END handle index header - - actual_count = 0 - for obj in objs: - actual_count += 1 - crc = 0 - - # object header - hdr = create_pack_object_header(obj.type_id, obj.size) - if index_write: - crc = crc32(hdr) - else: - crc = None - # END handle crc - pwrite(hdr) - - # data stream - zstream = zlib.compressobj(zlib_compression) - ostream = obj.stream - br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, base_crc=crc) - assert(br == obj.size) - if wants_index: - index.append(obj.binsha, crc, ofs) - # END handle index - - ofs += len(hdr) + bw - if actual_count == object_count: - break - # END abort once we are done - # END for each object - - if actual_count != object_count: - raise ValueError( - "Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count)) - # END count assertion - - # write footer - pack_sha = pack_writer.sha(as_hex=False) - assert len(pack_sha) == 20 - pack_write(pack_sha) - ofs += len(pack_sha) # just for completeness ;) - - index_sha = None - if wants_index: - index_sha = index.write(pack_sha, index_write) - # END handle index - - return pack_sha, index_sha - - @classmethod - def create(cls, object_iter, base_dir, object_count=None, zlib_compression=zlib.Z_BEST_SPEED): - """Create a new on-disk entity comprised of a properly named pack file and a properly named - and corresponding index file. The pack contains all OStream objects contained in object iter. 
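# Editor's note: a hedged sketch of driving PackEntity.write_pack() above with
# plain in-memory sinks. The _DemoObject stand-in only models the attributes
# write_pack actually touches (type_id, size, binsha, stream); real callers
# would pass database output streams instead.
from io import BytesIO
import hashlib

class _DemoObject:
    def __init__(self, data):
        self.type_id = 3                                  # blob, per the git pack type ids
        self.size = len(data)
        self.binsha = hashlib.sha1(b"blob %d\0" % len(data) + data).digest()
        self.stream = BytesIO(data)

pack_buf, index_buf = BytesIO(), BytesIO()
pack_sha, index_sha = PackEntity.write_pack(
    [_DemoObject(b"hello"), _DemoObject(b"world")],
    pack_buf.write, index_buf.write)
assert len(pack_sha) == 20 and len(index_sha) == 20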
- :param base_dir: directory which is to contain the files - :return: PackEntity instance initialized with the new pack - - **Note:** for more information on the other parameters see the write_pack method""" - pack_fd, pack_path = tempfile.mkstemp('', 'pack', base_dir) - index_fd, index_path = tempfile.mkstemp('', 'index', base_dir) - pack_write = lambda d: os.write(pack_fd, d) - index_write = lambda d: os.write(index_fd, d) - - pack_binsha, index_binsha = cls.write_pack(object_iter, pack_write, index_write, object_count, zlib_compression) - os.close(pack_fd) - os.close(index_fd) - - fmt = "pack-%s.%s" - new_pack_path = os.path.join(base_dir, fmt % (bin_to_hex(pack_binsha), 'pack')) - new_index_path = os.path.join(base_dir, fmt % (bin_to_hex(pack_binsha), 'idx')) - os.rename(pack_path, new_pack_path) - os.rename(index_path, new_index_path) - - return cls(new_pack_path) - - #} END interface diff --git a/gitdb/stream.py b/gitdb/stream.py deleted file mode 100644 index 1e0be84..0000000 --- a/gitdb/stream.py +++ /dev/null @@ -1,730 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ - -from io import BytesIO - -import mmap -import os -import sys -import zlib - -from gitdb.fun import ( - msb_size, - stream_copy, - apply_delta_data, - connect_deltas, - delta_types -) - -from gitdb.util import ( - allocate_memory, - LazyMixin, - make_sha, - write, - close, -) - -from gitdb.const import NULL_BYTE, BYTE_SPACE -from gitdb.utils.encoding import force_bytes - -has_perf_mod = False -try: - from gitdb_speedups._perf import apply_delta as c_apply_delta - has_perf_mod = True -except ImportError: - pass - -__all__ = ('DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader', - 'Sha1Writer', 'FlexibleSha1Writer', 'ZippedStoreShaWriter', 'FDCompressedSha1Writer', - 'FDStream', 'NullStream') - - -#{ RO Streams - -class DecompressMemMapReader(LazyMixin): - - """Reads data in chunks from a memory map and decompresses it. The client sees - only the uncompressed data, respective file-like read calls are handling on-demand - buffered decompression accordingly - - A constraint on the total size of bytes is activated, simulating - a logical file within a possibly larger physical memory area - - To read efficiently, you clearly don't want to read individual bytes, instead, - read a few kilobytes at least. - - **Note:** The chunk-size should be carefully selected as it will involve quite a bit - of string copying due to the way the zlib is implemented. Its very wasteful, - hence we try to find a good tradeoff between allocation time and number of - times we actually allocate. An own zlib implementation would be good here - to better support streamed reading - it would only need to keep the mmap - and decompress it into chunks, that's all ... 
""" - __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close', - '_cbr', '_phi') - - max_read_size = 512 * 1024 # currently unused - - def __init__(self, m, close_on_deletion, size=None): - """Initialize with mmap for stream reading - :param m: must be content data - use new if you have object data and no size""" - self._m = m - self._zip = zlib.decompressobj() - self._buf = None # buffer of decompressed bytes - self._buflen = 0 # length of bytes in buffer - if size is not None: - self._s = size # size of uncompressed data to read in total - self._br = 0 # num uncompressed bytes read - self._cws = 0 # start byte of compression window - self._cwe = 0 # end byte of compression window - self._cbr = 0 # number of compressed bytes read - self._phi = False # is True if we parsed the header info - self._close = close_on_deletion # close the memmap on deletion ? - - def _set_cache_(self, attr): - assert attr == '_s' - # only happens for size, which is a marker to indicate we still - # have to parse the header from the stream - self._parse_header_info() - - def __del__(self): - self.close() - - def _parse_header_info(self): - """If this stream contains object data, parse the header info and skip the - stream to a point where each read will yield object content - - :return: parsed type_string, size""" - # read header - # should really be enough, cgit uses 8192 I believe - # And for good reason !! This needs to be that high for the header to be read correctly in all cases - maxb = 8192 - self._s = maxb - hdr = self.read(maxb) - hdrend = hdr.find(NULL_BYTE) - typ, size = hdr[:hdrend].split(BYTE_SPACE) - size = int(size) - self._s = size - - # adjust internal state to match actual header length that we ignore - # The buffer will be depleted first on future reads - self._br = 0 - hdrend += 1 - self._buf = BytesIO(hdr[hdrend:]) - self._buflen = len(hdr) - hdrend - - self._phi = True - - return typ, size - - #{ Interface - - @classmethod - def new(self, m, close_on_deletion=False): - """Create a new DecompressMemMapReader instance for acting as a read-only stream - This method parses the object header from m and returns the parsed - type and size, as well as the created stream instance. - - :param m: memory map on which to operate. It must be object data ( header + contents ) - :param close_on_deletion: if True, the memory map will be closed once we are - being deleted""" - inst = DecompressMemMapReader(m, close_on_deletion, 0) - typ, size = inst._parse_header_info() - return typ, size, inst - - def data(self): - """:return: random access compatible data we are working on""" - return self._m - - def close(self): - """Close our underlying stream of compressed bytes if this was allowed during initialization - :return: True if we closed the underlying stream - :note: can be called safely - """ - if self._close: - if hasattr(self._m, 'close'): - self._m.close() - self._close = False - # END handle resource freeing - - def compressed_bytes_read(self): - """ - :return: number of compressed bytes read. This includes the bytes it - took to decompress the header ( if there was one )""" - # ABSTRACT: When decompressing a byte stream, it can be that the first - # x bytes which were requested match the first x bytes in the loosely - # compressed datastream. This is the worst-case assumption that the reader - # does, it assumes that it will get at least X bytes from X compressed bytes - # in call cases. 
- # The caveat is that the object, according to our known uncompressed size, - # is already complete, but there are still some bytes left in the compressed - # stream that contribute to the amount of compressed bytes. - # How can we know that we are truly done, and have read all bytes we need - # to read ? - # Without help, we cannot know, as we need to obtain the status of the - # decompression. If it is not finished, we need to decompress more data - # until it is finished, to yield the actual number of compressed bytes - # belonging to the decompressed object - # We are using a custom zlib module for this, if its not present, - # we try to put in additional bytes up for decompression if feasible - # and check for the unused_data. - - # Only scrub the stream forward if we are officially done with the - # bytes we were to have. - if self._br == self._s and not self._zip.unused_data: - # manipulate the bytes-read to allow our own read method to continue - # but keep the window at its current position - self._br = 0 - if hasattr(self._zip, 'status'): - while self._zip.status == zlib.Z_OK: - self.read(mmap.PAGESIZE) - # END scrub-loop custom zlib - else: - # pass in additional pages, until we have unused data - while not self._zip.unused_data and self._cbr != len(self._m): - self.read(mmap.PAGESIZE) - # END scrub-loop default zlib - # END handle stream scrubbing - - # reset bytes read, just to be sure - self._br = self._s - # END handle stream scrubbing - - # unused data ends up in the unconsumed tail, which was removed - # from the count already - return self._cbr - - #} END interface - - def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): - """Allows to reset the stream to restart reading - :raise ValueError: If offset and whence are not 0""" - if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): - raise ValueError("Can only seek to position 0") - # END handle offset - - self._zip = zlib.decompressobj() - self._br = self._cws = self._cwe = self._cbr = 0 - if self._phi: - self._phi = False - del(self._s) # trigger header parsing on first access - # END skip header - - def read(self, size=-1): - if size < 1: - size = self._s - self._br - else: - size = min(size, self._s - self._br) - # END clamp size - - if size == 0: - return b'' - # END handle depletion - - # deplete the buffer, then just continue using the decompress object - # which has an own buffer. We just need this to transparently parse the - # header from the zlib stream - dat = b'' - if self._buf: - if self._buflen >= size: - # have enough data - dat = self._buf.read(size) - self._buflen -= size - self._br += size - return dat - else: - dat = self._buf.read() # ouch, duplicates data - size -= self._buflen - self._br += self._buflen - - self._buflen = 0 - self._buf = None - # END handle buffer len - # END handle buffer - - # decompress some data - # Abstract: zlib needs to operate on chunks of our memory map ( which may - # be large ), as it will otherwise and always fill in the 'unconsumed_tail' - # attribute which possible reads our whole map to the end, forcing - # everything to be read from disk even though just a portion was requested. - # As this would be a nogo, we workaround it by passing only chunks of data, - # moving the window into the memory map along as we decompress, which keeps - # the tail smaller than our chunk-size. This causes 'only' the chunk to be - # copied once, and another copy of a part of it when it creates the unconsumed - # tail. 
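# Editor's note: a standalone sketch of the zlib behaviour the window logic
# here is built around: when decompress() is capped with max_length, input it
# did not consume yet is handed back in unconsumed_tail and must be passed in
# again on the next call.
import os
import zlib

demo_data = os.urandom(8192)                     # incompressible demo payload
d = zlib.decompressobj()
compressed = zlib.compress(demo_data)
out = d.decompress(compressed, 1024)             # cap the output, like read(size)
assert len(out) == 1024 and d.unconsumed_tail != b""
out += d.decompress(d.unconsumed_tail)           # uncapped: drain the rest
assert out == demo_data and d.unused_data == b""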
We have to use it to hand in the appropriate amount of bytes during - # the next read. - tail = self._zip.unconsumed_tail - if tail: - # move the window, make it as large as size demands. For code-clarity, - # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up - # with not getting enough data uncompressed, so we had to sort that out as well. - # Now we just assume the worst case, hence the data is uncompressed and the window - # needs to be as large as the uncompressed bytes we want to read. - self._cws = self._cwe - len(tail) - self._cwe = self._cws + size - else: - cws = self._cws - self._cws = self._cwe - self._cwe = cws + size - # END handle tail - - # if window is too small, make it larger so zip can decompress something - if self._cwe - self._cws < 8: - self._cwe = self._cws + 8 - # END adjust winsize - - # takes a slice, but doesn't copy the data, it says ... - indata = self._m[self._cws:self._cwe] - - # get the actual window end to be sure we don't use it for computations - self._cwe = self._cws + len(indata) - dcompdat = self._zip.decompress(indata, size) - # update the amount of compressed bytes read - # We feed possibly overlapping chunks, which is why the unconsumed tail - # has to be taken into consideration, as well as the unused data - # if we hit the end of the stream - # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly. - # They are thorough, and I assume it is truly working. - # Why is this logic as convoluted as it is ? Please look at the table in - # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results. - # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch - # will be the one that works. - # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the - # table in the github issue. This is it ... it was the only way I could make this work everywhere. - # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . - if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin': - unused_datalen = len(self._zip.unconsumed_tail) - else: - unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data) - # # end handle very special case ... - - self._cbr += len(indata) - unused_datalen - self._br += len(dcompdat) - - if dat: - dcompdat = dat + dcompdat - # END prepend our cached data - - # it can happen, depending on the compression, that we get less bytes - # than ordered as it needs the final portion of the data as well. - # Recursively resolve that. - # Note: dcompdat can be empty even though we still appear to have bytes - # to read, if we are called by compressed_bytes_read - it manipulates - # us to empty the stream - if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s: - dcompdat += self.read(size - len(dcompdat)) - # END handle special case - return dcompdat - - -class DeltaApplyReader(LazyMixin): - - """A reader which dynamically applies pack deltas to a base object, keeping the - memory demands to a minimum. - - The size of the final object is only obtainable once all deltas have been - applied, unless it is retrieved from a pack index. 
- - The uncompressed Delta has the following layout (MSB being a most significant - bit encoded dynamic size): - - * MSB Source Size - the size of the base against which the delta was created - * MSB Target Size - the size of the resulting data after the delta was applied - * A list of one byte commands (cmd) which are followed by a specific protocol: - - * cmd & 0x80 - copy delta_data[offset:offset+size] - - * Followed by an encoded offset into the delta data - * Followed by an encoded size of the chunk to copy - - * cmd & 0x7f - insert - - * insert cmd bytes from the delta buffer into the output stream - - * cmd == 0 - invalid operation ( or error in delta stream ) - """ - __slots__ = ( - "_bstream", # base stream to which to apply the deltas - "_dstreams", # tuple of delta stream readers - "_mm_target", # memory map of the delta-applied data - "_size", # actual number of bytes in _mm_target - "_br" # number of bytes read - ) - - #{ Configuration - k_max_memory_move = 250 * 1000 * 1000 - #} END configuration - - def __init__(self, stream_list): - """Initialize this instance with a list of streams, the first stream being - the delta to apply on top of all following deltas, the last stream being the - base object onto which to apply the deltas""" - assert len(stream_list) > 1, "Need at least one delta and one base stream" - - self._bstream = stream_list[-1] - self._dstreams = tuple(stream_list[:-1]) - self._br = 0 - - def _set_cache_too_slow_without_c(self, attr): - # the direct algorithm is fastest and most direct if there is only one - # delta. Also, the extra overhead might not be worth it for items smaller - # than X - definitely the case in python, every function call costs - # huge amounts of time - # if len(self._dstreams) * self._bstream.size < self.k_max_memory_move: - if len(self._dstreams) == 1: - return self._set_cache_brute_(attr) - - # Aggregate all deltas into one delta in reverse order. Hence we take - # the last delta, and reverse-merge its ancestor delta, until we receive - # the final delta data stream. - dcl = connect_deltas(self._dstreams) - - # call len directly, as the (optional) c version doesn't implement the sequence - # protocol - if dcl.rbound() == 0: - self._size = 0 - self._mm_target = allocate_memory(0) - return - # END handle empty list - - self._size = dcl.rbound() - self._mm_target = allocate_memory(self._size) - - bbuf = allocate_memory(self._bstream.size) - stream_copy(self._bstream.read, bbuf.write, self._bstream.size, 256 * mmap.PAGESIZE) - - # APPLY CHUNKS - write = self._mm_target.write - dcl.apply(bbuf, write) - - self._mm_target.seek(0) - - def _set_cache_brute_(self, attr): - """If we are here, we apply the actual deltas""" - # TODO: There should be a special case if there is only one stream - # Then the default-git algorithm should perform a tad faster, as the - # delta is not peaked into, causing less overhead. 
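# Editor's note: a standalone sketch of the MSB-encoded sizes described in the
# class docstring above (and parsed by msb_size() just below): little-endian
# base 128, 7 bits per byte, high bit set on every byte except the last. The
# (offset, size) return order mirrors how msb_size is used here; sample bytes
# are made up.
def decode_msb_size(data, offset=0):
    size, shift = 0, 0
    while True:
        c = data[offset]
        offset += 1
        size |= (c & 0x7F) << shift
        shift += 7
        if not c & 0x80:
            return offset, size

# 0x96 -> low 7 bits 0x16 (22), continue; 0x01 -> 1 << 7 = 128; total 150
assert decode_msb_size(b"\x96\x01") == (2, 150)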
- buffer_info_list = list() - max_target_size = 0 - for dstream in self._dstreams: - buf = dstream.read(512) # read the header information + X - offset, src_size = msb_size(buf) - offset, target_size = msb_size(buf, offset) - buffer_info_list.append((buf[offset:], offset, src_size, target_size)) - max_target_size = max(max_target_size, target_size) - # END for each delta stream - - # sanity check - the first delta to apply should have the same source - # size as our actual base stream - base_size = self._bstream.size - target_size = max_target_size - - # if we have more than 1 delta to apply, we will swap buffers, hence we must - # assure that all buffers we use are large enough to hold all the results - if len(self._dstreams) > 1: - base_size = target_size = max(base_size, max_target_size) - # END adjust buffer sizes - - # Allocate private memory map big enough to hold the first base buffer - # We need random access to it - bbuf = allocate_memory(base_size) - stream_copy(self._bstream.read, bbuf.write, base_size, 256 * mmap.PAGESIZE) - - # allocate memory map large enough for the largest (intermediate) target - # We will use it as scratch space for all delta ops. If the final - # target buffer is smaller than our allocated space, we just use parts - # of it upon return. - tbuf = allocate_memory(target_size) - - # for each delta to apply, memory map the decompressed delta and - # work on the op-codes to reconstruct everything. - # For the actual copying, we use a seek and write pattern of buffer - # slices. - final_target_size = None - for (dbuf, offset, src_size, target_size), dstream in zip(reversed(buffer_info_list), reversed(self._dstreams)): - # allocate a buffer to hold all delta data - fill in the data for - # fast access. We do this as we know that reading individual bytes - # from our stream would be slower than necessary ( although possible ) - # The dbuf buffer contains commands after the first two MSB sizes, the - # offset specifies the amount of bytes read to get the sizes. - ddata = allocate_memory(dstream.size - offset) - ddata.write(dbuf) - # read the rest from the stream. The size we give is larger than necessary - stream_copy(dstream.read, ddata.write, dstream.size, 256 * mmap.PAGESIZE) - - ####################################################################### - if 'c_apply_delta' in globals(): - c_apply_delta(bbuf, ddata, tbuf) - else: - apply_delta_data(bbuf, src_size, ddata, len(ddata), tbuf.write) - ####################################################################### - - # finally, swap out source and target buffers. The target is now the - # base for the next delta to apply - bbuf, tbuf = tbuf, bbuf - bbuf.seek(0) - tbuf.seek(0) - final_target_size = target_size - # END for each delta to apply - - # its already seeked to 0, constrain it to the actual size - # NOTE: in the end of the loop, it swaps buffers, hence our target buffer - # is not tbuf, but bbuf ! 
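# Editor's note: a minimal pure-Python sketch of the copy/insert opcode
# protocol documented in the class docstring above. It is not gitdb's
# optimized implementation (see apply_delta_data and the C extension); the
# delta bytes in the demo are hand-assembled.
def apply_delta_sketch(base, delta_ops):
    out, i = bytearray(), 0
    while i < len(delta_ops):
        cmd = delta_ops[i]
        i += 1
        if cmd & 0x80:                       # copy from the base buffer
            offset = size = 0
            for bit in range(4):             # up to 4 little-endian offset bytes
                if cmd & (1 << bit):
                    offset |= delta_ops[i] << (bit * 8)
                    i += 1
            for bit in range(3):             # up to 3 little-endian size bytes
                if cmd & (1 << (4 + bit)):
                    size |= delta_ops[i] << (bit * 8)
                    i += 1
            out += base[offset:offset + (size or 0x10000)]
        elif cmd:                            # insert the next cmd literal bytes
            out += delta_ops[i:i + cmd]
            i += cmd
        else:
            raise ValueError("delta opcode 0 is invalid")
    return bytes(out)

# copy 5 bytes from base offset 6, then insert b", delta"
assert apply_delta_sketch(b"hello world", b"\x91\x06\x05\x07, delta") == b"world, delta"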
- self._mm_target = bbuf - self._size = final_target_size - - #{ Configuration - if not has_perf_mod: - _set_cache_ = _set_cache_brute_ - else: - _set_cache_ = _set_cache_too_slow_without_c - - #} END configuration - - def read(self, count=0): - bl = self._size - self._br # bytes left - if count < 1 or count > bl: - count = bl - # NOTE: we could check for certain size limits, and possibly - # return buffers instead of strings to prevent byte copying - data = self._mm_target.read(count) - self._br += len(data) - return data - - def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): - """Allows to reset the stream to restart reading - - :raise ValueError: If offset and whence are not 0""" - if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): - raise ValueError("Can only seek to position 0") - # END handle offset - self._br = 0 - self._mm_target.seek(0) - - #{ Interface - - @classmethod - def new(cls, stream_list): - """ - Convert the given list of streams into a stream which resolves deltas - when reading from it. - - :param stream_list: two or more stream objects, first stream is a Delta - to the object that you want to resolve, followed by N additional delta - streams. The list's last stream must be a non-delta stream. - - :return: Non-Delta OPackStream object whose stream can be used to obtain - the decompressed resolved data - :raise ValueError: if the stream list cannot be handled""" - if len(stream_list) < 2: - raise ValueError("Need at least two streams") - # END single object special handling - - if stream_list[-1].type_id in delta_types: - raise ValueError( - "Cannot resolve deltas if there is no base object stream, last one was type: %s" % stream_list[-1].type) - # END check stream - return cls(stream_list) - - #} END interface - - #{ OInfo like Interface - - @property - def type(self): - return self._bstream.type - - @property - def type_id(self): - return self._bstream.type_id - - @property - def size(self): - """:return: number of uncompressed bytes in the stream""" - return self._size - - #} END oinfo like interface - - -#} END RO streams - - -#{ W Streams - -class Sha1Writer: - - """Simple stream writer which produces a sha whenever you like as it degests - everything it is supposed to write""" - __slots__ = "sha1" - - def __init__(self): - self.sha1 = make_sha() - - #{ Stream Interface - - def write(self, data): - """:raise IOError: If not all bytes could be written - :param data: byte object - :return: length of incoming data""" - - self.sha1.update(data) - - return len(data) - - # END stream interface - - #{ Interface - - def sha(self, as_hex=False): - """:return: sha so far - :param as_hex: if True, sha will be hex-encoded, binary otherwise""" - if as_hex: - return self.sha1.hexdigest() - return self.sha1.digest() - - #} END interface - - -class FlexibleSha1Writer(Sha1Writer): - - """Writer producing a sha1 while passing on the written bytes to the given - write function""" - __slots__ = 'writer' - - def __init__(self, writer): - Sha1Writer.__init__(self) - self.writer = writer - - def write(self, data): - Sha1Writer.write(self, data) - self.writer(data) - - -class ZippedStoreShaWriter(Sha1Writer): - - """Remembers everything someone writes to it and generates a sha""" - __slots__ = ('buf', 'zip') - - def __init__(self): - Sha1Writer.__init__(self) - self.buf = BytesIO() - self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) - - def __getattr__(self, attr): - return getattr(self.buf, attr) - - def write(self, data): - alen = Sha1Writer.write(self, data) - 
self.buf.write(self.zip.compress(data)) - - return alen - - def close(self): - self.buf.write(self.zip.flush()) - - def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): - """Seeking currently only supports to rewind written data - Multiple writes are not supported""" - if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): - raise ValueError("Can only seek to position 0") - # END handle offset - self.buf.seek(0) - - def getvalue(self): - """:return: string value from the current stream position to the end""" - return self.buf.getvalue() - - -class FDCompressedSha1Writer(Sha1Writer): - - """Digests data written to it, making the sha available, then compress the - data and write it to the file descriptor - - **Note:** operates on raw file descriptors - **Note:** for this to work, you have to use the close-method of this instance""" - __slots__ = ("fd", "sha1", "zip") - - # default exception - exc = IOError("Failed to write all bytes to filedescriptor") - - def __init__(self, fd): - super().__init__() - self.fd = fd - self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) - - #{ Stream Interface - - def write(self, data): - """:raise IOError: If not all bytes could be written - :return: length of incoming data""" - self.sha1.update(data) - cdata = self.zip.compress(data) - bytes_written = write(self.fd, cdata) - - if bytes_written != len(cdata): - raise self.exc - - return len(data) - - def close(self): - remainder = self.zip.flush() - if write(self.fd, remainder) != len(remainder): - raise self.exc - return close(self.fd) - - #} END stream interface - - -class FDStream: - - """A simple wrapper providing the most basic functions on a file descriptor - with the fileobject interface. Cannot use os.fdopen as the resulting stream - takes ownership""" - __slots__ = ("_fd", '_pos') - - def __init__(self, fd): - self._fd = fd - self._pos = 0 - - def write(self, data): - self._pos += len(data) - os.write(self._fd, data) - - def read(self, count=0): - if count == 0: - count = os.path.getsize(self._filepath) - # END handle read everything - - bytes = os.read(self._fd, count) - self._pos += len(bytes) - return bytes - - def fileno(self): - return self._fd - - def tell(self): - return self._pos - - def close(self): - close(self._fd) - - -class NullStream: - - """A stream that does nothing but providing a stream interface. 
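# Editor's note: a tiny usage sketch of the writers above. Sha1Writer only
# digests what it is fed; FlexibleSha1Writer additionally forwards every
# chunk to the callable it was given.
import hashlib

chunks = []
w = FlexibleSha1Writer(chunks.append)
w.write(b"blob 4\x00abcd")
assert b"".join(chunks) == b"blob 4\x00abcd"
assert w.sha(as_hex=True) == hashlib.sha1(b"blob 4\x00abcd").hexdigest()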
- Use it like /dev/null""" - __slots__ = tuple() - - def read(self, size=0): - return '' - - def close(self): - pass - - def write(self, data): - return len(data) - - -#} END W streams diff --git a/gitdb/test/__init__.py b/gitdb/test/__init__.py deleted file mode 100644 index 03bd406..0000000 --- a/gitdb/test/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ diff --git a/gitdb/test/db/__init__.py b/gitdb/test/db/__init__.py deleted file mode 100644 index 03bd406..0000000 --- a/gitdb/test/db/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ diff --git a/gitdb/test/db/lib.py b/gitdb/test/db/lib.py deleted file mode 100644 index 408dd8c..0000000 --- a/gitdb/test/db/lib.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Base classes for object db testing""" -from gitdb.test.lib import ( - with_rw_directory, - with_packs_rw, - fixture_path, - TestBase -) - -from gitdb.stream import ( - Sha1Writer, - ZippedStoreShaWriter -) - -from gitdb.base import ( - IStream, - OStream, - OInfo -) - -from gitdb.exc import BadObject -from gitdb.typ import str_blob_type - -from io import BytesIO - -from struct import pack - - -__all__ = ('TestDBBase', 'with_rw_directory', 'with_packs_rw', 'fixture_path') - - -class TestDBBase(TestBase): - - """Base class providing testing routines on databases""" - - # data - two_lines = b'1234\nhello world' - all_data = (two_lines, ) - - def _assert_object_writing_simple(self, db): - # write a bunch of objects and query their streams and info - null_objs = db.size() - ni = 250 - for i in range(ni): - data = pack(">L", i) - istream = IStream(str_blob_type, len(data), BytesIO(data)) - new_istream = db.store(istream) - assert new_istream is istream - assert db.has_object(istream.binsha) - - info = db.info(istream.binsha) - assert isinstance(info, OInfo) - assert info.type == istream.type and info.size == istream.size - - stream = db.stream(istream.binsha) - assert isinstance(stream, OStream) - assert stream.binsha == info.binsha and stream.type == info.type - assert stream.read() == data - # END for each item - - assert db.size() == null_objs + ni - shas = list(db.sha_iter()) - assert len(shas) == db.size() - assert len(shas[0]) == 20 - - def _assert_object_writing(self, db): - """General tests to verify object writing, compatible to ObjectDBW - **Note:** requires write access to the database""" - # start in 'dry-run' mode, using a simple sha1 writer - ostreams = (ZippedStoreShaWriter, None) - for ostreamcls in ostreams: - for data in self.all_data: - dry_run = ostreamcls is not None - ostream = None - if ostreamcls is not None: - ostream = ostreamcls() - assert isinstance(ostream, Sha1Writer) - # END create ostream - - prev_ostream = db.set_ostream(ostream) - assert type(prev_ostream) in ostreams or prev_ostream in ostreams - istream = IStream(str_blob_type, len(data), BytesIO(data)) - - # store returns same istream instance, with new sha set - my_istream = db.store(istream) - sha = 
istream.binsha - assert my_istream is istream - assert db.has_object(sha) != dry_run - assert len(sha) == 20 - - # verify data - the slow way, we want to run code - if not dry_run: - info = db.info(sha) - assert str_blob_type == info.type - assert info.size == len(data) - - ostream = db.stream(sha) - assert ostream.read() == data - assert ostream.type == str_blob_type - assert ostream.size == len(data) - else: - self.assertRaises(BadObject, db.info, sha) - self.assertRaises(BadObject, db.stream, sha) - - # DIRECT STREAM COPY - # our data hase been written in object format to the StringIO - # we passed as output stream. No physical database representation - # was created. - # Test direct stream copy of object streams, the result must be - # identical to what we fed in - ostream.seek(0) - istream.stream = ostream - assert istream.binsha is not None - prev_sha = istream.binsha - - db.set_ostream(ZippedStoreShaWriter()) - db.store(istream) - assert istream.binsha == prev_sha - new_ostream = db.ostream() - - # note: only works as long our store write uses the same compression - # level, which is zip_best - assert ostream.getvalue() == new_ostream.getvalue() - # END for each data set - # END for each dry_run mode diff --git a/gitdb/test/db/test_git.py b/gitdb/test/db/test_git.py deleted file mode 100644 index 73ac1a0..0000000 --- a/gitdb/test/db/test_git.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -import os -from gitdb.test.db.lib import ( - TestDBBase, - with_rw_directory -) -from gitdb.exc import BadObject -from gitdb.db import GitDB -from gitdb.base import OStream, OInfo -from gitdb.util import bin_to_hex - - -class TestGitDB(TestDBBase): - - def test_reading(self): - gdb = GitDB(os.path.join(self.gitrepopath, 'objects')) - - # we have packs and loose objects, alternates doesn't necessarily exist - assert 1 < len(gdb.databases()) < 4 - - # access should be possible - gitdb_sha = next(gdb.sha_iter()) - assert isinstance(gdb.info(gitdb_sha), OInfo) - assert isinstance(gdb.stream(gitdb_sha), OStream) - ni = 50 - assert gdb.size() >= ni - sha_list = list(gdb.sha_iter()) - assert len(sha_list) == gdb.size() - sha_list = sha_list[:ni] # speed up tests ... 
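# Editor's note: a hedged round-trip sketch of the store/read cycle these
# tests exercise, written against a LooseObjectDB in a throwaway directory.
import tempfile
from io import BytesIO

from gitdb.base import IStream
from gitdb.db import LooseObjectDB
from gitdb.typ import str_blob_type

db = LooseObjectDB(tempfile.mkdtemp())
data = b"my data"
istream = db.store(IStream(str_blob_type, len(data), BytesIO(data)))
assert db.has_object(istream.binsha)
assert db.stream(istream.binsha).read() == data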
- - # This is actually a test for compound functionality, but it doesn't - # have a separate test module - # test partial shas - # this one as uneven and quite short - gitdb_sha_hex = bin_to_hex(gitdb_sha) - assert gdb.partial_to_complete_sha_hex(gitdb_sha_hex[:5]) == gitdb_sha - - # mix even/uneven hexshas - for i, binsha in enumerate(sha_list): - assert gdb.partial_to_complete_sha_hex(bin_to_hex(binsha)[:8 - (i % 2)]) == binsha - # END for each sha - - self.assertRaises(BadObject, gdb.partial_to_complete_sha_hex, "0000") - - @with_rw_directory - def test_writing(self, path): - gdb = GitDB(path) - - # its possible to write objects - self._assert_object_writing(gdb) diff --git a/gitdb/test/db/test_loose.py b/gitdb/test/db/test_loose.py deleted file mode 100644 index 295e2ee..0000000 --- a/gitdb/test/db/test_loose.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -from gitdb.test.db.lib import ( - TestDBBase, - with_rw_directory -) -from gitdb.db import LooseObjectDB -from gitdb.exc import BadObject -from gitdb.util import bin_to_hex - - -class TestLooseDB(TestDBBase): - - @with_rw_directory - def test_basics(self, path): - ldb = LooseObjectDB(path) - - # write data - self._assert_object_writing(ldb) - - # verify sha iteration and size - shas = list(ldb.sha_iter()) - assert shas and len(shas[0]) == 20 - - assert len(shas) == ldb.size() - - # verify find short object - long_sha = bin_to_hex(shas[-1]) - for short_sha in (long_sha[:20], long_sha[:5]): - assert bin_to_hex(ldb.partial_to_complete_sha_hex(short_sha)) == long_sha - # END for each sha - - self.assertRaises(BadObject, ldb.partial_to_complete_sha_hex, '0000') - # raises if no object could be found diff --git a/gitdb/test/db/test_mem.py b/gitdb/test/db/test_mem.py deleted file mode 100644 index 882e54f..0000000 --- a/gitdb/test/db/test_mem.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -from gitdb.test.db.lib import ( - TestDBBase, - with_rw_directory -) -from gitdb.db import ( - MemoryDB, - LooseObjectDB -) - - -class TestMemoryDB(TestDBBase): - - @with_rw_directory - def test_writing(self, path): - mdb = MemoryDB() - - # write data - self._assert_object_writing_simple(mdb) - - # test stream copy - ldb = LooseObjectDB(path) - assert ldb.size() == 0 - num_streams_copied = mdb.stream_copy(mdb.sha_iter(), ldb) - assert num_streams_copied == mdb.size() - - assert ldb.size() == mdb.size() - for sha in mdb.sha_iter(): - assert ldb.has_object(sha) - assert ldb.stream(sha).read() == mdb.stream(sha).read() - # END verify objects where copied and are equal diff --git a/gitdb/test/db/test_pack.py b/gitdb/test/db/test_pack.py deleted file mode 100644 index bd07906..0000000 --- a/gitdb/test/db/test_pack.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -from gitdb.test.db.lib import ( - TestDBBase, - with_rw_directory, - with_packs_rw -) -from gitdb.db import PackedDB - -from gitdb.exc import BadObject, AmbiguousObjectName -from gitdb.util import mman - -import os -import random 
-import sys - -import pytest - -class TestPackDB(TestDBBase): - - @with_rw_directory - @with_packs_rw - def test_writing(self, path): - if sys.platform == "win32": - pytest.skip("FIXME: Currently fail on windows") - - pdb = PackedDB(path) - - # on demand, we init our pack cache - num_packs = len(pdb.entities()) - assert pdb._st_mtime != 0 - - # test pack directory changed: - # packs removed - rename a file, should affect the glob - pack_path = pdb.entities()[0].pack().path() - new_pack_path = pack_path + "renamed" - if sys.platform == "win32": - # While using this function, we are not allowed to have any handle - # to this path, which is currently not the case. The pack caching - # does still have a handle :-( - mman.force_map_handle_removal_win(pack_path) - os.rename(pack_path, new_pack_path) - - pdb.update_cache(force=True) - assert len(pdb.entities()) == num_packs - 1 - - # packs added - os.rename(new_pack_path, pack_path) - pdb.update_cache(force=True) - assert len(pdb.entities()) == num_packs - - # bang on the cache - # access the Entities directly, as there is no iteration interface - # yet ( or required for now ) - sha_list = list(pdb.sha_iter()) - assert len(sha_list) == pdb.size() - - # hit all packs in random order - random.shuffle(sha_list) - - for sha in sha_list: - pdb.info(sha) - pdb.stream(sha) - # END for each sha to query - - # test short finding - be a bit more brutal here - max_bytes = 19 - min_bytes = 2 - num_ambiguous = 0 - for i, sha in enumerate(sha_list): - short_sha = sha[:max((i % max_bytes), min_bytes)] - try: - assert pdb.partial_to_complete_sha(short_sha, len(short_sha) * 2) == sha - except AmbiguousObjectName: - num_ambiguous += 1 - pass # valid, we can have short objects - # END exception handling - # END for each sha to find - - # we should have at least one ambiguous, considering the small sizes - # but in our pack, there is no ambiguous ... - # assert num_ambiguous - - # non-existing - self.assertRaises(BadObject, pdb.partial_to_complete_sha, b'\0\0', 4) diff --git a/gitdb/test/db/test_ref.py b/gitdb/test/db/test_ref.py deleted file mode 100644 index 0816e64..0000000 --- a/gitdb/test/db/test_ref.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -from gitdb.test.db.lib import ( - TestDBBase, - with_rw_directory, -) -from gitdb.db import ReferenceDB - -from gitdb.util import ( - NULL_BIN_SHA, - hex_to_bin -) - -import os - - -class TestReferenceDB(TestDBBase): - - def make_alt_file(self, alt_path, alt_list): - """Create an alternates file which contains the given alternates. 
- The list can be empty""" - with open(alt_path, "wb") as alt_file: - for alt in alt_list: - alt_file.write(alt.encode("utf-8") + b"\n") - - @with_rw_directory - def test_writing(self, path): - alt_path = os.path.join(path, 'alternates') - rdb = ReferenceDB(alt_path) - assert len(rdb.databases()) == 0 - assert rdb.size() == 0 - assert len(list(rdb.sha_iter())) == 0 - - # try empty, non-existing - assert not rdb.has_object(NULL_BIN_SHA) - - # setup alternate file - # add two, one is invalid - own_repo_path = os.path.join(self.gitrepopath, 'objects') # use own repo - self.make_alt_file(alt_path, [own_repo_path, "invalid/path"]) - rdb.update_cache() - assert len(rdb.databases()) == 1 - - # we should now find a default revision of ours - gitdb_sha = next(rdb.sha_iter()) - assert rdb.has_object(gitdb_sha) - - # remove valid - self.make_alt_file(alt_path, ["just/one/invalid/path"]) - rdb.update_cache() - assert len(rdb.databases()) == 0 - - # add valid - self.make_alt_file(alt_path, [own_repo_path]) - rdb.update_cache() - assert len(rdb.databases()) == 1 diff --git a/gitdb/test/fixtures/objects/7b/b839852ed5e3a069966281bb08d50012fb309b b/gitdb/test/fixtures/objects/7b/b839852ed5e3a069966281bb08d50012fb309b deleted file mode 100644 index 021c2db..0000000 Binary files a/gitdb/test/fixtures/objects/7b/b839852ed5e3a069966281bb08d50012fb309b and /dev/null differ diff --git a/gitdb/test/fixtures/objects/88/8401851f15db0eed60eb1bc29dec5ddcace911 b/gitdb/test/fixtures/objects/88/8401851f15db0eed60eb1bc29dec5ddcace911 deleted file mode 100644 index d60aeef..0000000 Binary files a/gitdb/test/fixtures/objects/88/8401851f15db0eed60eb1bc29dec5ddcace911 and /dev/null differ diff --git a/gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx b/gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx deleted file mode 100644 index fda5969..0000000 Binary files a/gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx and /dev/null differ diff --git a/gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack b/gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack deleted file mode 100644 index a3209d2..0000000 Binary files a/gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack and /dev/null differ diff --git a/gitdb/test/fixtures/packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx b/gitdb/test/fixtures/packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx deleted file mode 100644 index a7d6c71..0000000 Binary files a/gitdb/test/fixtures/packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx and /dev/null differ diff --git a/gitdb/test/fixtures/packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack b/gitdb/test/fixtures/packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack deleted file mode 100644 index 955c424..0000000 Binary files a/gitdb/test/fixtures/packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack and /dev/null differ diff --git a/gitdb/test/fixtures/packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx b/gitdb/test/fixtures/packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx deleted file mode 100644 index 87c635f..0000000 Binary files a/gitdb/test/fixtures/packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx and /dev/null differ diff --git a/gitdb/test/fixtures/packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack b/gitdb/test/fixtures/packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack deleted file mode 100644 index a69b28a..0000000 Binary files 
a/gitdb/test/fixtures/packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack and /dev/null differ diff --git a/gitdb/test/lib.py b/gitdb/test/lib.py deleted file mode 100644 index 8e60234..0000000 --- a/gitdb/test/lib.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Utilities used in ODB testing""" -from gitdb import OStream - -import sys -import random -from array import array - -from io import BytesIO - -import glob -import unittest -import tempfile -import shutil -import os -import gc -import logging -from functools import wraps - - -#{ Bases - -class TestBase(unittest.TestCase): - """Base class for all tests - - TestCase providing access to readonly repositories using the following member variables. - - * gitrepopath - - * read-only base path of the git source repository, i.e. .../git/.git - """ - - #{ Invvariants - k_env_git_repo = "GITDB_TEST_GIT_REPO_BASE" - #} END invariants - - @classmethod - def setUpClass(cls): - try: - super().setUpClass() - except AttributeError: - pass - - cls.gitrepopath = os.environ.get(cls.k_env_git_repo) - if not cls.gitrepopath: - logging.info( - "You can set the %s environment variable to a .git repository of your choice - defaulting to the gitdb repository", cls.k_env_git_repo) - ospd = os.path.dirname - cls.gitrepopath = os.path.join(ospd(ospd(ospd(__file__))), '.git') - # end assure gitrepo is set - assert cls.gitrepopath.endswith('.git') - - -#} END bases - -#{ Decorators - -def with_rw_directory(func): - """Create a temporary directory which can be written to, remove it if the - test succeeds, but leave it otherwise to aid additional debugging""" - - def wrapper(self): - path = tempfile.mktemp(prefix=func.__name__) - os.mkdir(path) - keep = False - try: - try: - return func(self, path) - except Exception: - sys.stderr.write(f"Test {type(self).__name__}.{func.__name__} failed, output is at {path!r}\n") - keep = True - raise - finally: - # Need to collect here to be sure all handles have been closed. It appears - # a windows-only issue. In fact things should be deleted, as well as - # memory maps closed, once objects go out of scope. For some reason - # though this is not the case here unless we collect explicitly. - if not keep: - gc.collect() - shutil.rmtree(path) - # END handle exception - # END wrapper - - wrapper.__name__ = func.__name__ - return wrapper - - -def with_packs_rw(func): - """Function that provides a path into which the packs for testing should be - copied. Will pass on the path to the actual function afterwards""" - - def wrapper(self, path): - src_pack_glob = fixture_path('packs/*') - copy_files_globbed(src_pack_glob, path, hard_link_ok=True) - return func(self, path) - # END wrapper - - wrapper.__name__ = func.__name__ - return wrapper - -#} END decorators - -#{ Routines - - -def fixture_path(relapath=''): - """:return: absolute path into the fixture directory - :param relapath: relative path into the fixtures directory, or '' - to obtain the fixture directory itself""" - return os.path.join(os.path.dirname(__file__), 'fixtures', relapath) - - -def copy_files_globbed(source_glob, target_dir, hard_link_ok=False): - """Copy all files found according to the given source glob into the target directory - :param hard_link_ok: if True, hard links will be created if possible. 
Otherwise - the files will be copied""" - for src_file in glob.glob(source_glob): - if hard_link_ok and hasattr(os, 'link'): - target = os.path.join(target_dir, os.path.basename(src_file)) - try: - os.link(src_file, target) - except OSError: - shutil.copy(src_file, target_dir) - # END handle cross device links ( and resulting failure ) - else: - shutil.copy(src_file, target_dir) - # END try hard link - # END for each file to copy - - -def make_bytes(size_in_bytes, randomize=False): - """:return: string with given size in bytes - :param randomize: try to produce a very random stream""" - actual_size = size_in_bytes // 4 - producer = range(actual_size) - if randomize: - producer = list(producer) - random.shuffle(producer) - # END randomize - a = array('i', producer) - return a.tobytes() - - -def make_object(type, data): - """:return: bytes resembling an uncompressed object""" - odata = "blob %i\0" % len(data) - return odata.encode("ascii") + data - - -def make_memory_file(size_in_bytes, randomize=False): - """:return: tuple(size_of_stream, stream) - :param randomize: try to produce a very random stream""" - d = make_bytes(size_in_bytes, randomize) - return len(d), BytesIO(d) - -#} END routines - -#{ Stream Utilities - - -class DummyStream: - - def __init__(self): - self.was_read = False - self.bytes = 0 - self.closed = False - - def read(self, size): - self.was_read = True - self.bytes = size - - def close(self): - self.closed = True - - def _assert(self): - assert self.was_read - - -class DeriveTest(OStream): - - def __init__(self, sha, type, size, stream, *args, **kwargs): - self.myarg = kwargs.pop('myarg') - self.args = args - - def _assert(self): - assert self.args - assert self.myarg - -#} END stream utilitiess diff --git a/gitdb/test/performance/__init__.py b/gitdb/test/performance/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gitdb/test/performance/lib.py b/gitdb/test/performance/lib.py deleted file mode 100644 index 36916ed..0000000 --- a/gitdb/test/performance/lib.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Contains library functions""" -from gitdb.test.lib import TestBase - - - -#{ Base Classes - -class TestBigRepoR(TestBase): - """A placeholder in case we want to add additional functionality to all performance test-cases - """ - - -#} END base classes diff --git a/gitdb/test/performance/test_pack.py b/gitdb/test/performance/test_pack.py deleted file mode 100644 index fc3d334..0000000 --- a/gitdb/test/performance/test_pack.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Performance tests for object store""" - -from gitdb.test.performance.lib import ( - TestBigRepoR -) - -from gitdb import ( - MemoryDB, - GitDB, - IStream, -) -from gitdb.typ import str_blob_type -from gitdb.exc import UnsupportedOperation -from gitdb.db.pack import PackedDB - -import sys -import os -from time import time - - -class TestPackedDBPerformance(TestBigRepoR): - - def test_pack_random_access(self): - pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) - - # sha lookup - st = time() - sha_list = list(pdb.sha_iter()) - elapsed = time() - st - ns = len(sha_list) - print("PDB: 
looked up %i shas by index in %f s ( %f shas/s )" % (ns, elapsed, ns / (elapsed or 1)), file=sys.stderr) - - # sha lookup: best-case and worst case access - pdb_pack_info = pdb._pack_info - # END shuffle shas - st = time() - for sha in sha_list: - pdb_pack_info(sha) - # END for each sha to look up - elapsed = time() - st - - # discard cache - del(pdb._entities) - pdb.entities() - print("PDB: looked up %i sha in %i packs in %f s ( %f shas/s )" % - (ns, len(pdb.entities()), elapsed, ns / (elapsed or 1)), file=sys.stderr) - # END for each random mode - - # query info and streams only - max_items = 10000 # can wait longer when testing memory - for pdb_fun in (pdb.info, pdb.stream): - st = time() - for sha in sha_list[:max_items]: - pdb_fun(sha) - elapsed = time() - st - print("PDB: Obtained %i object %s by sha in %f s ( %f items/s )" % - (max_items, pdb_fun.__name__.upper(), elapsed, max_items / (elapsed or 1)), file=sys.stderr) - # END for each function - - # retrieve stream and read all - max_items = 5000 - pdb_stream = pdb.stream - total_size = 0 - st = time() - for sha in sha_list[:max_items]: - stream = pdb_stream(sha) - read_len = len(stream.read()) - assert read_len == stream.size - total_size += stream.size - elapsed = time() - st - total_kib = total_size / 1000 - print("PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % - (max_items, total_kib, total_kib / (elapsed or 1), elapsed, max_items / (elapsed or 1)), file=sys.stderr) - - def test_loose_correctness(self): - """based on the pack(s) of our packed object DB, we will just copy and verify all objects in the back - into the loose object db (memory). - This should help finding dormant issues like this one https://github.com/gitpython-developers/GitPython/issues/220 - faster - :note: It doesn't seem this test can find the issue unless the given pack contains highly compressed - data files, like archives.""" - from gitdb.util import bin_to_hex - pdb = GitDB(os.path.join(self.gitrepopath, 'objects')) - mdb = MemoryDB() - for c, sha in enumerate(pdb.sha_iter()): - ostream = pdb.stream(sha) - # the issue only showed on larger files which are hardly compressible ... - if ostream.type != str_blob_type: - continue - istream = IStream(ostream.type, ostream.size, ostream.stream) - mdb.store(istream) - assert istream.binsha == sha, "Failed on object %s" % bin_to_hex(sha).decode('ascii') - # this can fail ... 
sometimes, so the packs dataset should be huge - assert len(mdb.stream(sha).read()) == ostream.size - - if c and c % 1000 == 0: - print("Verified %i loose object compression/decompression cycles" % c, file=sys.stderr) - mdb._cache.clear() - # end for each sha to copy - - def test_correctness(self): - pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) - # disabled for now as it used to work perfectly, checking big repositories takes a long time - print("Endurance run: verify streaming of objects (crc and sha)", file=sys.stderr) - for crc in range(2): - count = 0 - st = time() - for entity in pdb.entities(): - pack_verify = entity.is_valid_stream - sha_by_index = entity.index().sha - for index in range(entity.index().size()): - try: - assert pack_verify(sha_by_index(index), use_crc=crc) - count += 1 - except UnsupportedOperation: - pass - # END ignore old indices - # END for each index - # END for each entity - elapsed = time() - st - print("PDB: verified %i objects (crc=%i) in %f s ( %f objects/s )" % - (count, crc, elapsed, count / (elapsed or 1)), file=sys.stderr) - # END for each verify mode diff --git a/gitdb/test/performance/test_pack_streaming.py b/gitdb/test/performance/test_pack_streaming.py deleted file mode 100644 index 80c798b..0000000 --- a/gitdb/test/performance/test_pack_streaming.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Specific test for pack streams only""" - -from gitdb.test.performance.lib import ( - TestBigRepoR -) - -from gitdb.db.pack import PackedDB -from gitdb.stream import NullStream -from gitdb.pack import PackEntity - -import os -import sys -from time import time - - -class CountedNullStream(NullStream): - __slots__ = '_bw' - - def __init__(self): - self._bw = 0 - - def bytes_written(self): - return self._bw - - def write(self, d): - self._bw += NullStream.write(self, d) - - -class TestPackStreamingPerformance(TestBigRepoR): - - def test_pack_writing(self): - # see how fast we can write a pack from object streams. 
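# For illustration only: a hedged sketch of the streaming pack-write API that
# the test below times. Paths are hypothetical; PackEntity.write_pack needs an
# iterable of object streams, a write callable and the object count.
from gitdb.db.pack import PackedDB
from gitdb.pack import PackEntity

pdb = PackedDB("/path/to/repo/.git/objects/pack")    # hypothetical pack dir
shas = list(pdb.sha_iter())[:100]                    # small sample of objects
with open("/tmp/sample.pack", "wb") as pack_file:    # hypothetical output file
    pack_sha, index_sha = PackEntity.write_pack(
        (pdb.stream(sha) for sha in shas),           # fully resolved streams
        pack_file.write,
        None,                                        # no index writer here
        object_count=len(shas))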
- # This will not be fast, as we take time for decompressing the streams as well - ostream = CountedNullStream() - pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) - - ni = 1000 - count = 0 - st = time() - for sha in pdb.sha_iter(): - count += 1 - pdb.stream(sha) - if count == ni: - break - # END gather objects for pack-writing - elapsed = time() - st - print("PDB Streaming: Got %i streams by sha in in %f s ( %f streams/s )" % - (ni, elapsed, ni / (elapsed or 1)), file=sys.stderr) - - st = time() - PackEntity.write_pack((pdb.stream(sha) for sha in pdb.sha_iter()), ostream.write, object_count=ni) - elapsed = time() - st - total_kb = ostream.bytes_written() / 1000 - print(sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" % - (total_kb, elapsed, total_kb / (elapsed or 1)), sys.stderr) - - def test_stream_reading(self): - # raise SkipTest() - pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) - - # streaming only, meant for --with-profile runs - ni = 5000 - count = 0 - pdb_stream = pdb.stream - total_size = 0 - st = time() - for sha in pdb.sha_iter(): - if count == ni: - break - stream = pdb_stream(sha) - stream.read() - total_size += stream.size - count += 1 - elapsed = time() - st - total_kib = total_size / 1000 - print(sys.stderr, "PDB Streaming: Got %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % - (ni, total_kib, total_kib / (elapsed or 1), elapsed, ni / (elapsed or 1)), sys.stderr) diff --git a/gitdb/test/performance/test_stream.py b/gitdb/test/performance/test_stream.py deleted file mode 100644 index fb10871..0000000 --- a/gitdb/test/performance/test_stream.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Performance data streaming performance""" - -from gitdb.test.performance.lib import TestBigRepoR -from gitdb.db import LooseObjectDB -from gitdb import IStream - -from gitdb.util import bin_to_hex, remove -from gitdb.fun import chunk_size - -from time import time -import os -import sys - - -from gitdb.test.lib import ( - make_memory_file, - with_rw_directory, -) - - -#{ Utilities -def read_chunked_stream(stream): - total = 0 - while True: - chunk = stream.read(chunk_size) - total += len(chunk) - if len(chunk) < chunk_size: - break - # END read stream loop - assert total == stream.size - return stream - - -#} END utilities - -class TestObjDBPerformance(TestBigRepoR): - - large_data_size_bytes = 1000 * 1000 * 50 # some MiB should do it - moderate_data_size_bytes = 1000 * 1000 * 1 # just 1 MiB - - @with_rw_directory - def test_large_data_streaming(self, path): - ldb = LooseObjectDB(path) - string_ios = list() # list of streams we previously created - - # serial mode - for randomize in range(2): - desc = (randomize and 'random ') or '' - print("Creating %s data ..." 
% desc, file=sys.stderr) - st = time() - size, stream = make_memory_file(self.large_data_size_bytes, randomize) - elapsed = time() - st - print("Done (in %f s)" % elapsed, file=sys.stderr) - string_ios.append(stream) - - # writing - due to the compression it will seem faster than it is - st = time() - sha = ldb.store(IStream('blob', size, stream)).binsha - elapsed_add = time() - st - assert ldb.has_object(sha) - db_file = ldb.readable_db_object_path(bin_to_hex(sha)) - fsize_kib = os.path.getsize(db_file) / 1000 - - size_kib = size / 1000 - print("Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % - (size_kib, fsize_kib, desc, elapsed_add, size_kib / (elapsed_add or 1)), file=sys.stderr) - - # reading all at once - st = time() - ostream = ldb.stream(sha) - shadata = ostream.read() - elapsed_readall = time() - st - - stream.seek(0) - assert shadata == stream.getvalue() - print("Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % - (size_kib, desc, elapsed_readall, size_kib / (elapsed_readall or 1)), file=sys.stderr) - - # reading in chunks of 1 MiB - cs = 512 * 1000 - chunks = list() - st = time() - ostream = ldb.stream(sha) - while True: - data = ostream.read(cs) - chunks.append(data) - if len(data) < cs: - break - # END read in chunks - elapsed_readchunks = time() - st - - stream.seek(0) - assert b''.join(chunks) == stream.getvalue() - - cs_kib = cs / 1000 - print("Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % - (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / (elapsed_readchunks or 1)), file=sys.stderr) - - # del db file so we keep something to do - ostream = None # To release the file handle (win) - remove(db_file) - # END for each randomization factor diff --git a/gitdb/test/test_base.py b/gitdb/test/test_base.py deleted file mode 100644 index 17906c9..0000000 --- a/gitdb/test/test_base.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Test for object db""" -from gitdb.test.lib import ( - TestBase, - DummyStream, - DeriveTest, -) - -from gitdb import ( - OInfo, - OPackInfo, - ODeltaPackInfo, - OStream, - OPackStream, - ODeltaPackStream, - IStream -) -from gitdb.util import ( - NULL_BIN_SHA -) - -from gitdb.typ import ( - str_blob_type -) - - -class TestBaseTypes(TestBase): - - def test_streams(self): - # test info - sha = NULL_BIN_SHA - s = 20 - blob_id = 3 - - info = OInfo(sha, str_blob_type, s) - assert info.binsha == sha - assert info.type == str_blob_type - assert info.type_id == blob_id - assert info.size == s - - # test pack info - # provides type_id - pinfo = OPackInfo(0, blob_id, s) - assert pinfo.type == str_blob_type - assert pinfo.type_id == blob_id - assert pinfo.pack_offset == 0 - - dpinfo = ODeltaPackInfo(0, blob_id, s, sha) - assert dpinfo.type == str_blob_type - assert dpinfo.type_id == blob_id - assert dpinfo.delta_info == sha - assert dpinfo.pack_offset == 0 - - # test ostream - stream = DummyStream() - ostream = OStream(*(info + (stream, ))) - assert ostream.stream is stream - ostream.read(15) - stream._assert() - assert stream.bytes == 15 - ostream.read(20) - assert stream.bytes == 20 - - # test packstream - postream = OPackStream(*(pinfo + (stream, ))) - assert postream.stream is stream - postream.read(10) - stream._assert() - assert stream.bytes == 10 - - # test 
deltapackstream - dpostream = ODeltaPackStream(*(dpinfo + (stream, ))) - assert dpostream.stream is stream - dpostream.read(5) - stream._assert() - assert stream.bytes == 5 - - # derive with own args - DeriveTest(sha, str_blob_type, s, stream, 'mine', myarg=3)._assert() - - # test istream - istream = IStream(str_blob_type, s, stream) - assert istream.binsha == None - istream.binsha = sha - assert istream.binsha == sha - - assert len(istream.binsha) == 20 - assert len(istream.hexsha) == 40 - - assert istream.size == s - istream.size = s * 2 - assert istream.size == s * 2 - assert istream.type == str_blob_type - istream.type = "something" - assert istream.type == "something" - assert istream.stream is stream - istream.stream = None - assert istream.stream is None - - assert istream.error is None - istream.error = Exception() - assert isinstance(istream.error, Exception) diff --git a/gitdb/test/test_example.py b/gitdb/test/test_example.py deleted file mode 100644 index 3b4c908..0000000 --- a/gitdb/test/test_example.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Module with examples from the tutorial section of the docs""" -import os -from gitdb.test.lib import TestBase -from gitdb import IStream -from gitdb.db import LooseObjectDB - -from io import BytesIO - - -class TestExamples(TestBase): - - def test_base(self): - ldb = LooseObjectDB(os.path.join(self.gitrepopath, 'objects')) - - for sha1 in ldb.sha_iter(): - oinfo = ldb.info(sha1) - ostream = ldb.stream(sha1) - assert oinfo[:3] == ostream[:3] - - assert len(ostream.read()) == ostream.size - assert ldb.has_object(oinfo.binsha) - # END for each sha in database - # assure we close all files - try: - del(ostream) - del(oinfo) - except UnboundLocalError: - pass - # END ignore exception if there are no loose objects - - data = b"my data" - istream = IStream("blob", len(data), BytesIO(data)) - - # the object does not yet have a sha - assert istream.binsha is None - ldb.store(istream) - # now the sha is set - assert len(istream.binsha) == 20 - assert ldb.has_object(istream.binsha) diff --git a/gitdb/test/test_pack.py b/gitdb/test/test_pack.py deleted file mode 100644 index e723482..0000000 --- a/gitdb/test/test_pack.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Test everything about packs reading and writing""" -from gitdb.test.lib import ( - TestBase, - with_rw_directory, - fixture_path -) - -from gitdb.stream import DeltaApplyReader - -from gitdb.pack import ( - PackEntity, - PackIndexFile, - PackFile -) - -from gitdb.base import ( - OInfo, - OStream, -) - -from gitdb.fun import delta_types -from gitdb.exc import UnsupportedOperation -from gitdb.util import to_bin_sha - -import pytest - -import os -import tempfile - - -#{ Utilities -def bin_sha_from_filename(filename): - return to_bin_sha(os.path.splitext(os.path.basename(filename))[0][5:]) -#} END utilities - - -class TestPack(TestBase): - - packindexfile_v1 = (fixture_path('packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx'), 1, 67) - packindexfile_v2 = (fixture_path('packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx'), 2, 30) - packindexfile_v2_3_ascii = 
(fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx'), 2, 42) - packfile_v2_1 = (fixture_path('packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack'), 2, packindexfile_v1[2]) - packfile_v2_2 = (fixture_path('packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack'), 2, packindexfile_v2[2]) - packfile_v2_3_ascii = ( - fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack'), 2, packindexfile_v2_3_ascii[2]) - - def _assert_index_file(self, index, version, size): - assert index.packfile_checksum() != index.indexfile_checksum() - assert len(index.packfile_checksum()) == 20 - assert len(index.indexfile_checksum()) == 20 - assert index.version() == version - assert index.size() == size - assert len(index.offsets()) == size - - # get all data of all objects - for oidx in range(index.size()): - sha = index.sha(oidx) - assert oidx == index.sha_to_index(sha) - - entry = index.entry(oidx) - assert len(entry) == 3 - - assert entry[0] == index.offset(oidx) - assert entry[1] == sha - assert entry[2] == index.crc(oidx) - - # verify partial sha - for l in (4, 8, 11, 17, 20): - assert index.partial_sha_to_index(sha[:l], l * 2) == oidx - - # END for each object index in indexfile - self.assertRaises(ValueError, index.partial_sha_to_index, "\0", 2) - - def _assert_pack_file(self, pack, version, size): - assert pack.version() == 2 - assert pack.size() == size - assert len(pack.checksum()) == 20 - - num_obj = 0 - for obj in pack.stream_iter(): - num_obj += 1 - info = pack.info(obj.pack_offset) - stream = pack.stream(obj.pack_offset) - - assert info.pack_offset == stream.pack_offset - assert info.type_id == stream.type_id - assert hasattr(stream, 'read') - - # it should be possible to read from both streams - assert obj.read() == stream.read() - - streams = pack.collect_streams(obj.pack_offset) - assert streams - - # read the stream - try: - dstream = DeltaApplyReader.new(streams) - except ValueError: - # ignore these, old git versions use only ref deltas, - # which we haven't resolved ( as we are without an index ) - # Also ignore non-delta streams - continue - # END get deltastream - - # read all - data = dstream.read() - assert len(data) == dstream.size - - # test seek - dstream.seek(0) - assert dstream.read() == data - - # read chunks - # NOTE: the current implementation is safe, it basically transfers - # all calls to the underlying memory map - - # END for each object - assert num_obj == size - - def test_pack_index(self): - # check version 1 and 2 - for indexfile, version, size in (self.packindexfile_v1, self.packindexfile_v2): - index = PackIndexFile(indexfile) - self._assert_index_file(index, version, size) - # END run tests - - def test_pack(self): - # there is this special version 3, but apparently its like 2 ... 
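# For illustration only: a short sketch of reading a pack through its index,
# mirroring the assertions above. The .idx/.pack paths are hypothetical; any
# matching pair from a repository's objects/pack directory would do.
from gitdb.pack import PackIndexFile, PackFile
from gitdb.util import bin_to_hex

index = PackIndexFile("/path/to/pack-abc.idx")       # hypothetical index file
pack = PackFile("/path/to/pack-abc.pack")            # its companion pack file
assert index.size() == pack.size()                   # same number of objects
for oidx in range(index.size()):
    binsha = index.sha(oidx)                         # binary sha of the object
    offset = index.offset(oidx)                      # byte offset into the pack
    info = pack.info(offset)                         # type/size info at offset
    print(bin_to_hex(binsha).decode("ascii"), info.type_id, info.size)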
- for packfile, version, size in (self.packfile_v2_3_ascii, self.packfile_v2_1, self.packfile_v2_2): - pack = PackFile(packfile) - self._assert_pack_file(pack, version, size) - # END for each pack to test - - @with_rw_directory - def test_pack_entity(self, rw_dir): - pack_objs = list() - for packinfo, indexinfo in ((self.packfile_v2_1, self.packindexfile_v1), - (self.packfile_v2_2, self.packindexfile_v2), - (self.packfile_v2_3_ascii, self.packindexfile_v2_3_ascii)): - packfile, version, size = packinfo - indexfile, version, size = indexinfo - entity = PackEntity(packfile) - assert entity.pack().path() == packfile - assert entity.index().path() == indexfile - pack_objs.extend(entity.stream_iter()) - - count = 0 - for info, stream in zip(entity.info_iter(), entity.stream_iter()): - count += 1 - assert info.binsha == stream.binsha - assert len(info.binsha) == 20 - assert info.type_id == stream.type_id - assert info.size == stream.size - - # we return fully resolved items, which is implied by the sha centric access - assert not info.type_id in delta_types - - # try all calls - assert len(entity.collect_streams(info.binsha)) - oinfo = entity.info(info.binsha) - assert isinstance(oinfo, OInfo) - assert oinfo.binsha is not None - ostream = entity.stream(info.binsha) - assert isinstance(ostream, OStream) - assert ostream.binsha is not None - - # verify the stream - try: - assert entity.is_valid_stream(info.binsha, use_crc=True) - except UnsupportedOperation: - pass - # END ignore version issues - assert entity.is_valid_stream(info.binsha, use_crc=False) - # END for each info, stream tuple - assert count == size - - # END for each entity - - # pack writing - write all packs into one - # index path can be None - pack_path1 = tempfile.mktemp('', "pack1", rw_dir) - pack_path2 = tempfile.mktemp('', "pack2", rw_dir) - index_path = tempfile.mktemp('', 'index', rw_dir) - iteration = 0 - - def rewind_streams(): - for obj in pack_objs: - obj.stream.seek(0) - # END utility - for ppath, ipath, num_obj in zip((pack_path1, pack_path2), - (index_path, None), - (len(pack_objs), None)): - iwrite = None - if ipath: - ifile = open(ipath, 'wb') - iwrite = ifile.write - # END handle ip - - # make sure we rewind the streams ... 
we work on the same objects over and over again - if iteration > 0: - rewind_streams() - # END rewind streams - iteration += 1 - - with open(ppath, 'wb') as pfile: - pack_sha, index_sha = PackEntity.write_pack(pack_objs, pfile.write, iwrite, object_count=num_obj) - assert os.path.getsize(ppath) > 100 - - # verify pack - pf = PackFile(ppath) - assert pf.size() == len(pack_objs) - assert pf.version() == PackFile.pack_version_default - assert pf.checksum() == pack_sha - pf.close() - - # verify index - if ipath is not None: - ifile.close() - assert os.path.getsize(ipath) > 100 - idx = PackIndexFile(ipath) - assert idx.version() == PackIndexFile.index_version_default - assert idx.packfile_checksum() == pack_sha - assert idx.indexfile_checksum() == index_sha - assert idx.size() == len(pack_objs) - idx.close() - # END verify files exist - # END for each packpath, indexpath pair - - # verify the packs thoroughly - rewind_streams() - entity = PackEntity.create(pack_objs, rw_dir) - count = 0 - for info in entity.info_iter(): - count += 1 - for use_crc in range(2): - assert entity.is_valid_stream(info.binsha, use_crc) - # END for each crc mode - # END for each info - assert count == len(pack_objs) - entity.close() - - def test_pack_64(self): - # TODO: hex-edit a pack helping us to verify that we can handle 64 byte offsets - # of course without really needing such a huge pack - pytest.skip('not implemented') diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py deleted file mode 100644 index 1e7e941..0000000 --- a/gitdb/test/test_stream.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Test for object db""" - -from gitdb.test.lib import ( - TestBase, - DummyStream, - make_bytes, - make_object, - fixture_path -) - -from gitdb import ( - DecompressMemMapReader, - FDCompressedSha1Writer, - LooseObjectDB, - Sha1Writer, - MemoryDB, - IStream, -) -from gitdb.util import hex_to_bin - -import zlib -from gitdb.typ import ( - str_blob_type -) - -import tempfile -import os -from io import BytesIO - - -class TestStream(TestBase): - - """Test stream classes""" - - data_sizes = (15, 10000, 1000 * 1024 + 512) - - def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None): - """Make stream tests - the orig_stream is seekable, allowing it to be - rewound and reused - :param cdata: the data we expect to read from stream, the contents - :param rewind_stream: function called to rewind the stream to make it ready - for reuse""" - ns = 10 - assert len(cdata) > ns - 1, "Data must be larger than %i, was %i" % (ns, len(cdata)) - - # read in small steps - ss = len(cdata) // ns - for i in range(ns): - data = stream.read(ss) - chunk = cdata[i * ss:(i + 1) * ss] - assert data == chunk - # END for each step - rest = stream.read() - if rest: - assert rest == cdata[-len(rest):] - # END handle rest - - if isinstance(stream, DecompressMemMapReader): - assert len(stream.data()) == stream.compressed_bytes_read() - # END handle special type - - rewind_stream(stream) - - # read everything - rdata = stream.read() - assert rdata == cdata - - if isinstance(stream, DecompressMemMapReader): - assert len(stream.data()) == stream.compressed_bytes_read() - # END handle special type - - def test_decompress_reader(self): - for close_on_deletion in range(2): - for with_size in range(2): - for ds in self.data_sizes: - cdata = 
make_bytes(ds, randomize=False) - - # zdata = zipped actual data - # cdata = original content data - - # create reader - if with_size: - # need object data - zdata = zlib.compress(make_object(str_blob_type, cdata)) - typ, size, reader = DecompressMemMapReader.new(zdata, close_on_deletion) - assert size == len(cdata) - assert typ == str_blob_type - - # even if we don't set the size, it will be set automatically on first read - test_reader = DecompressMemMapReader(zdata, close_on_deletion=False) - assert test_reader._s == len(cdata) - else: - # here we need content data - zdata = zlib.compress(cdata) - reader = DecompressMemMapReader(zdata, close_on_deletion, len(cdata)) - assert reader._s == len(cdata) - # END get reader - - self._assert_stream_reader(reader, cdata, lambda r: r.seek(0)) - - # put in a dummy stream for closing - dummy = DummyStream() - reader._m = dummy - - assert not dummy.closed - del(reader) - assert dummy.closed == close_on_deletion - # END for each datasize - # END whether size should be used - # END whether stream should be closed when deleted - - def test_sha_writer(self): - writer = Sha1Writer() - assert 2 == writer.write(b"hi") - assert len(writer.sha(as_hex=1)) == 40 - assert len(writer.sha(as_hex=0)) == 20 - - # make sure it does something ;) - prev_sha = writer.sha() - writer.write(b"hi again") - assert writer.sha() != prev_sha - - def test_compressed_writer(self): - for ds in self.data_sizes: - fd, path = tempfile.mkstemp() - ostream = FDCompressedSha1Writer(fd) - data = make_bytes(ds, randomize=False) - - # for now, just a single write, code doesn't care about chunking - assert len(data) == ostream.write(data) - ostream.close() - - # its closed already - self.assertRaises(OSError, os.close, fd) - - # read everything back, compare to data we zip - fd = os.open(path, os.O_RDONLY | getattr(os, 'O_BINARY', 0)) - written_data = os.read(fd, os.path.getsize(path)) - assert len(written_data) == os.path.getsize(path) - os.close(fd) - assert written_data == zlib.compress(data, 1) # best speed - - os.remove(path) - # END for each os - - def test_decompress_reader_special_case(self): - odb = LooseObjectDB(fixture_path('objects')) - mdb = MemoryDB() - for sha in (b'888401851f15db0eed60eb1bc29dec5ddcace911', - b'7bb839852ed5e3a069966281bb08d50012fb309b',): - ostream = odb.stream(hex_to_bin(sha)) - - # if there is a bug, we will be missing one byte exactly ! 
- data = ostream.read() - assert len(data) == ostream.size - - # Putting it back in should yield nothing new - after all, we have - dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data))) - assert dump.hexsha == sha - # end for each loose object sha to test diff --git a/gitdb/test/test_util.py b/gitdb/test/test_util.py deleted file mode 100644 index 166b33c..0000000 --- a/gitdb/test/test_util.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Test for object db""" -import tempfile -import os - -from gitdb.test.lib import TestBase -from gitdb.util import ( - to_hex_sha, - to_bin_sha, - NULL_HEX_SHA, - LockedFD -) - - -class TestUtils(TestBase): - - def test_basics(self): - assert to_hex_sha(NULL_HEX_SHA) == NULL_HEX_SHA - assert len(to_bin_sha(NULL_HEX_SHA)) == 20 - assert to_hex_sha(to_bin_sha(NULL_HEX_SHA)) == NULL_HEX_SHA.encode("ascii") - - def _cmp_contents(self, file_path, data): - # raise if data from file at file_path - # does not match data string - with open(file_path, "rb") as fp: - assert fp.read() == data.encode("ascii") - - def test_lockedfd(self): - my_file = tempfile.mktemp() - orig_data = "hello" - new_data = "world" - with open(my_file, "wb") as my_file_fp: - my_file_fp.write(orig_data.encode("ascii")) - - try: - lfd = LockedFD(my_file) - lockfilepath = lfd._lockfilepath() - - # cannot end before it was started - self.assertRaises(AssertionError, lfd.rollback) - self.assertRaises(AssertionError, lfd.commit) - - # open for writing - assert not os.path.isfile(lockfilepath) - wfd = lfd.open(write=True) - assert lfd._fd is wfd - assert os.path.isfile(lockfilepath) - - # write data and fail - os.write(wfd, new_data.encode("ascii")) - lfd.rollback() - assert lfd._fd is None - self._cmp_contents(my_file, orig_data) - assert not os.path.isfile(lockfilepath) - - # additional call doesn't fail - lfd.commit() - lfd.rollback() - - # test reading - lfd = LockedFD(my_file) - rfd = lfd.open(write=False) - assert os.read(rfd, len(orig_data)) == orig_data.encode("ascii") - - assert os.path.isfile(lockfilepath) - # deletion rolls back - del(lfd) - assert not os.path.isfile(lockfilepath) - - # write data - concurrently - lfd = LockedFD(my_file) - olfd = LockedFD(my_file) - assert not os.path.isfile(lockfilepath) - wfdstream = lfd.open(write=True, stream=True) # this time as stream - assert os.path.isfile(lockfilepath) - # another one fails - self.assertRaises(IOError, olfd.open) - - wfdstream.write(new_data.encode("ascii")) - lfd.commit() - assert not os.path.isfile(lockfilepath) - self._cmp_contents(my_file, new_data) - - # could test automatic _end_writing on destruction - finally: - os.remove(my_file) - # END final cleanup - - # try non-existing file for reading - lfd = LockedFD(tempfile.mktemp()) - try: - lfd.open(write=False) - except OSError: - assert not os.path.exists(lfd._lockfilepath()) - else: - self.fail("expected OSError") - # END handle exceptions diff --git a/gitdb/typ.py b/gitdb/typ.py deleted file mode 100644 index 314db50..0000000 --- a/gitdb/typ.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -"""Module containing information about types known to the database""" - 
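# For illustration only: the byte strings defined below are the raw type tags
# that appear in git's loose-object header, "<type> <size>\0<payload>". A
# minimal sketch of such a header, matching what the removed test helper
# make_object() in gitdb/test/lib.py above builds for blobs:
payload = b"hello"
header = b"blob %d\x00" % len(payload)               # e.g. b"blob 5\x00"
loose_object_data = header + payload                 # uncompressed object bytes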
-str_blob_type = b'blob' -str_commit_type = b'commit' -str_tree_type = b'tree' -str_tag_type = b'tag' diff --git a/gitdb/util.py b/gitdb/util.py deleted file mode 100644 index bb6d879..0000000 --- a/gitdb/util.py +++ /dev/null @@ -1,398 +0,0 @@ -# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors -# -# This module is part of GitDB and is released under -# the New BSD License: https://opensource.org/license/bsd-3-clause/ -import binascii -import os -import mmap -import sys -import time -import errno - -from io import BytesIO - -from smmap import ( - StaticWindowMapManager, - SlidingWindowMapManager, - SlidingWindowMapBuffer -) - -# initialize our global memory manager instance -# Use it to free cached (and unused) resources. -mman = SlidingWindowMapManager() -# END handle mman - -import hashlib - -try: - from struct import unpack_from -except ImportError: - from struct import unpack, calcsize - __calcsize_cache = dict() - - def unpack_from(fmt, data, offset=0): - try: - size = __calcsize_cache[fmt] - except KeyError: - size = calcsize(fmt) - __calcsize_cache[fmt] = size - # END exception handling - return unpack(fmt, data[offset: offset + size]) - # END own unpack_from implementation - - -#{ Aliases - -hex_to_bin = binascii.a2b_hex -bin_to_hex = binascii.b2a_hex - -# errors -ENOENT = errno.ENOENT - -# os shortcuts -exists = os.path.exists -mkdir = os.mkdir -chmod = os.chmod -isdir = os.path.isdir -isfile = os.path.isfile -rename = os.rename -dirname = os.path.dirname -basename = os.path.basename -join = os.path.join -read = os.read -write = os.write -close = os.close -fsync = os.fsync - - -def _retry(func, *args, **kwargs): - # Wrapper around functions, that are problematic on "Windows". Sometimes - # the OS or someone else has still a handle to the file - if sys.platform == "win32": - for _ in range(10): - try: - return func(*args, **kwargs) - except Exception: - time.sleep(0.1) - return func(*args, **kwargs) - else: - return func(*args, **kwargs) - - -def remove(*args, **kwargs): - return _retry(os.remove, *args, **kwargs) - - -# Backwards compatibility imports -from gitdb.const import ( - NULL_BIN_SHA, - NULL_HEX_SHA -) - -#} END Aliases - -#{ compatibility stuff ... - - -class _RandomAccessBytesIO: - - """Wrapper to provide required functionality in case memory maps cannot or may - not be used. This is only really required in python 2.4""" - __slots__ = '_sio' - - def __init__(self, buf=''): - self._sio = BytesIO(buf) - - def __getattr__(self, attr): - return getattr(self._sio, attr) - - def __len__(self): - return len(self.getvalue()) - - def __getitem__(self, i): - return self.getvalue()[i] - - def __getslice__(self, start, end): - return self.getvalue()[start:end] - - -def byte_ord(b): - """ - Return the integer representation of the byte string. This supports Python - 3 byte arrays as well as standard strings. - """ - try: - return ord(b) - except TypeError: - return b - -#} END compatibility stuff ... 
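# For illustration only: the sha helpers around here are thin binascii
# wrappers -- hex_to_bin/bin_to_hex convert between the 40-character hex form
# and the 20-byte binary form, while to_hex_sha/to_bin_sha (just below) merely
# dispatch on the length of their argument.
import binascii

hexsha = b"a" * 40                                   # 40 hex characters
binsha = binascii.a2b_hex(hexsha)                    # what hex_to_bin() does
assert len(binsha) == 20
assert binascii.b2a_hex(binsha) == hexsha            # what bin_to_hex() does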
- -#{ Routines - - -def make_sha(source=b''): - """A python2.4 workaround for the sha/hashlib module fiasco - - **Note** From the dulwich project """ - try: - return hashlib.sha1(source) - except NameError: - import sha - sha1 = sha.sha(source) - return sha1 - - -def allocate_memory(size): - """:return: a file-protocol accessible memory block of the given size""" - if size == 0: - return _RandomAccessBytesIO(b'') - # END handle empty chunks gracefully - - try: - return mmap.mmap(-1, size) # read-write by default - except OSError: - # setup real memory instead - # this of course may fail if the amount of memory is not available in - # one chunk - would only be the case in python 2.4, being more likely on - # 32 bit systems. - return _RandomAccessBytesIO(b"\0" * size) - # END handle memory allocation - - -def file_contents_ro(fd, stream=False, allow_mmap=True): - """:return: read-only contents of the file represented by the file descriptor fd - - :param fd: file descriptor opened for reading - :param stream: if False, random access is provided, otherwise the stream interface - is provided. - :param allow_mmap: if True, its allowed to map the contents into memory, which - allows large files to be handled and accessed efficiently. The file-descriptor - will change its position if this is False""" - try: - if allow_mmap: - # supports stream and random access - try: - return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) - except OSError: - # python 2.4 issue, 0 wants to be the actual size - return mmap.mmap(fd, os.fstat(fd).st_size, access=mmap.ACCESS_READ) - # END handle python 2.4 - except OSError: - pass - # END exception handling - - # read manually - contents = os.read(fd, os.fstat(fd).st_size) - if stream: - return _RandomAccessBytesIO(contents) - return contents - - -def file_contents_ro_filepath(filepath, stream=False, allow_mmap=True, flags=0): - """Get the file contents at filepath as fast as possible - - :return: random access compatible memory of the given filepath - :param stream: see ``file_contents_ro`` - :param allow_mmap: see ``file_contents_ro`` - :param flags: additional flags to pass to os.open - :raise OSError: If the file could not be opened - - **Note** for now we don't try to use O_NOATIME directly as the right value needs to be - shared per database in fact. It only makes a real difference for loose object - databases anyway, and they use it with the help of the ``flags`` parameter""" - fd = os.open(filepath, os.O_RDONLY | getattr(os, 'O_BINARY', 0) | flags) - try: - return file_contents_ro(fd, stream, allow_mmap) - finally: - close(fd) - # END assure file is closed - - -def sliding_ro_buffer(filepath, flags=0): - """ - :return: a buffer compatible object which uses our mapped memory manager internally - ready to read the whole given filepath""" - return SlidingWindowMapBuffer(mman.make_cursor(filepath), flags=flags) - - -def to_hex_sha(sha): - """:return: hexified version of sha""" - if len(sha) == 40: - return sha - return bin_to_hex(sha) - - -def to_bin_sha(sha): - if len(sha) == 20: - return sha - return hex_to_bin(sha) - - -#} END routines - - -#{ Utilities - -class LazyMixin: - - """ - Base class providing an interface to lazily retrieve attribute values upon - first access. If slots are used, memory will only be reserved once the attribute - is actually accessed and retrieved the first time. All future accesses will - return the cached value as stored in the Instance's dict or slot. 
- """ - - __slots__ = tuple() - - def __getattr__(self, attr): - """ - Whenever an attribute is requested that we do not know, we allow it - to be created and set. Next time the same attribute is requested, it is simply - returned from our dict/slots. """ - self._set_cache_(attr) - # will raise in case the cache was not created - return object.__getattribute__(self, attr) - - def _set_cache_(self, attr): - """ - This method should be overridden in the derived class. - It should check whether the attribute named by attr can be created - and cached. Do nothing if you do not know the attribute or call your subclass - - The derived class may create as many additional attributes as it deems - necessary in case a git command returns more information than represented - in the single attribute.""" - pass - - -class LockedFD: - - """ - This class facilitates a safe read and write operation to a file on disk. - If we write to 'file', we obtain a lock file at 'file.lock' and write to - that instead. If we succeed, the lock file will be renamed to overwrite - the original file. - - When reading, we obtain a lock file, but to prevent other writers from - succeeding while we are reading the file. - - This type handles error correctly in that it will assure a consistent state - on destruction. - - **note** with this setup, parallel reading is not possible""" - __slots__ = ("_filepath", '_fd', '_write') - - def __init__(self, filepath): - """Initialize an instance with the givne filepath""" - self._filepath = filepath - self._fd = None - self._write = None # if True, we write a file - - def __del__(self): - # will do nothing if the file descriptor is already closed - if self._fd is not None: - self.rollback() - - def _lockfilepath(self): - return "%s.lock" % self._filepath - - def open(self, write=False, stream=False): - """ - Open the file descriptor for reading or writing, both in binary mode. - - :param write: if True, the file descriptor will be opened for writing. Other - wise it will be opened read-only. - :param stream: if True, the file descriptor will be wrapped into a simple stream - object which supports only reading or writing - :return: fd to read from or write to. 
It is still maintained by this instance - and must not be closed directly - :raise IOError: if the lock could not be retrieved - :raise OSError: If the actual file could not be opened for reading - - **note** must only be called once""" - if self._write is not None: - raise AssertionError("Called %s multiple times" % self.open) - - self._write = write - - # try to open the lock file - binary = getattr(os, 'O_BINARY', 0) - lockmode = os.O_WRONLY | os.O_CREAT | os.O_EXCL | binary - try: - fd = os.open(self._lockfilepath(), lockmode, int("600", 8)) - if not write: - os.close(fd) - else: - self._fd = fd - # END handle file descriptor - except OSError as e: - raise OSError("Lock at %r could not be obtained" % self._lockfilepath()) from e - # END handle lock retrieval - - # open actual file if required - if self._fd is None: - # we could specify exclusive here, as we obtained the lock anyway - try: - self._fd = os.open(self._filepath, os.O_RDONLY | binary) - except: - # assure we release our lockfile - remove(self._lockfilepath()) - raise - # END handle lockfile - # END open descriptor for reading - - if stream: - # need delayed import - from gitdb.stream import FDStream - return FDStream(self._fd) - else: - return self._fd - # END handle stream - - def commit(self): - """When done writing, call this function to commit your changes into the - actual file. - The file descriptor will be closed, and the lockfile handled. - - **Note** can be called multiple times""" - self._end_writing(successful=True) - - def rollback(self): - """Abort your operation without any changes. The file descriptor will be - closed, and the lock released. - - **Note** can be called multiple times""" - self._end_writing(successful=False) - - def _end_writing(self, successful=True): - """Handle the lock according to the write mode """ - if self._write is None: - raise AssertionError("Cannot end operation if it wasn't started yet") - - if self._fd is None: - return - - os.close(self._fd) - self._fd = None - - lockfile = self._lockfilepath() - if self._write and successful: - # on windows, rename does not silently overwrite the existing one - if sys.platform == "win32": - if isfile(self._filepath): - remove(self._filepath) - # END remove if exists - # END win32 special handling - os.rename(lockfile, self._filepath) - - # assure others can at least read the file - the tmpfile left it at rw-- - # We may also write that file, on windows that boils down to a remove- - # protection as well - chmod(self._filepath, int("644", 8)) - else: - # just delete the file so far, we failed - remove(lockfile) - # END successful handling - -#} END utilities diff --git a/gitdb/utils/__init__.py b/gitdb/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gitdb/utils/encoding.py b/gitdb/utils/encoding.py deleted file mode 100644 index b534ef7..0000000 --- a/gitdb/utils/encoding.py +++ /dev/null @@ -1,18 +0,0 @@ -def force_bytes(data, encoding="utf-8"): - if isinstance(data, bytes): - return data - - if isinstance(data, str): - return data.encode(encoding) - - return data - - -def force_text(data, encoding="utf-8"): - if isinstance(data, str): - return data - - if isinstance(data, bytes): - return data.decode(encoding) - - return str(data, encoding) diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1b2e11d..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -smmap>=3.0.1,<6 diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index 3a91543..bced0ea --- a/setup.py +++ 
b/setup.py
@@ -1,47 +1,12 @@
 from setuptools import setup
 
-# NOTE: This is currently duplicated from the gitdb.__init__ module, because
-# that's just how you write a setup.py (nobody reads this stuff out of the
-# module)
-
-__author__ = "Sebastian Thiel"
-__contact__ = "byronimo@gmail.com"
-__homepage__ = "https://github.com/gitpython-developers/gitdb"
-version_info = (4, 0, 12)
-__version__ = '.'.join(str(i) for i in version_info)
-
 setup(
-    name="gitdb",
-    version=__version__,
-    description="Git Object Database",
-    author=__author__,
-    author_email=__contact__,
-    url=__homepage__,
-    packages=('gitdb', 'gitdb.db', 'gitdb.utils', 'gitdb.test'),
-    license="BSD License",
-    zip_safe=False,
-    install_requires=['smmap>=3.0.1,<6'],
-    long_description="""GitDB is a pure-Python git object database""",
-    python_requires='>=3.7',
-    # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Environment :: Console",
-        "Intended Audience :: Developers",
-        "License :: OSI Approved :: BSD License",
-        "Operating System :: OS Independent",
-        "Operating System :: POSIX",
-        "Operating System :: Microsoft :: Windows",
-        "Operating System :: MacOS :: MacOS X",
-        "Programming Language :: Python",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-        "Programming Language :: Python :: 3.12",
-        "Programming Language :: Python :: 3.13",
-        "Programming Language :: Python :: 3 :: Only",
-    ]
+    name="gitdb2",
+    version="4.0.2",
+    author="Sebastian Thiel",
+    author_email="byronimo@gmail.com",
+    description="A mirror package for gitdb",
+    long_description="This is a mirror package for `gitdb `_. Consider installing it directly instead.",
+    url="https://github.com/gitpython-developers/gitdb",
+    install_requires=["gitdb>=4.0.1"],
 )
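# For illustration only: with the replacement setup.py above, this repository
# now builds a thin mirror distribution -- installing it merely pulls in the
# real gitdb package via install_requires, and the importable module remains
# gitdb. A hypothetical sanity check after `pip install gitdb2`:
import gitdb
print(gitdb.__version__)                             # supplied by the real gitdb dependency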