diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..71b6ef7 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,7 @@ +[run] +source = gitdb + +; to make nosetests happy +[report] +include = */gitdb/* +omit = */gitdb/ext/* diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..2fe73ca --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: +- package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + +- package-ecosystem: "gitsubmodule" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml new file mode 100644 index 0000000..907698d --- /dev/null +++ b/.github/workflows/pythonpackage.yml @@ -0,0 +1,49 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: [push, pull_request, workflow_dispatch] + +jobs: + build: + + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + os: [ubuntu-latest] + experimental: [false] + include: + - python-version: "3.7" + os: ubuntu-22.04 + experimental: false + continue-on-error: ${{ matrix.experimental }} + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + allow-prereleases: ${{ matrix.experimental }} + - name: Install project and dependencies + run: | + python -m pip install --upgrade pip + pip install . + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pip install pytest + ulimit -n 48 + ulimit -n + pytest -v diff --git a/.gitignore b/.gitignore index c6247db..8b7da92 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ MANIFEST +.coverage build/ dist/ *.pyc *.o *.so .noseids -*.sublime-workspace \ No newline at end of file +*.sublime-workspace +*.egg-info diff --git a/.gitmodules b/.gitmodules index d85b15c..e73cded 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "smmap"] path = gitdb/ext/smmap - url = https://github.com/Byron/smmap.git + url = https://github.com/gitpython-developers/smmap.git diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 761edc1..0000000 --- a/.travis.yml +++ /dev/null @@ -1,18 +0,0 @@ -language: python -python: - - "2.6" - - "2.7" - - "3.3" - - "3.4" - # - "pypy" - won't work as smmap doesn't work (see smmap/.travis.yml for details) - -git: - # a higher depth is needed for one of the tests - lets fet - depth: 1000 -install: - - pip install coveralls -script: - - nosetests -v -after_success: - - coveralls - diff --git a/AUTHORS b/AUTHORS index 490baad..6c7e9b9 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1 +1,4 @@ Creator: Sebastian Thiel + +Contributors: + - Ram Rachum (@cool-RR) diff --git a/MANIFEST.in b/MANIFEST.in index b14aed9..b939b5d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,12 +3,13 @@ include LICENSE include CHANGES include AUTHORS include README +include MANIFEST.in include gitdb/_fun.c include gitdb/_delta_apply.c include gitdb/_delta_apply.h -prune gitdb/test +graft gitdb/test global-exclude .git* global-exclude *.pyc diff --git a/Makefile b/Makefile index c6c159b..20436bb 100644 --- a/Makefile +++ b/Makefile @@ -1,27 +1,12 @@ -PYTHON = python -SETUP = $(PYTHON) setup.py -TESTRUNNER = $(shell which nosetests) -TESTFLAGS = +.PHONY: all clean release force_release -all: build +all: + @grep -Ee '^[a-z].*:' Makefile | cut -d: -f1 | grep -vF all -doc:: - make -C doc/ html - -build:: - $(SETUP) build - $(SETUP) build_ext -i - -build_ext:: - $(SETUP) build_ext -i - -install:: - $(SETUP) install - -clean:: - $(SETUP) clean --all - rm -f *.so - -coverage:: build - PYTHONPATH=. $(PYTHON) $(TESTRUNNER) --cover-package=gitdb --with-coverage --cover-erase --cover-inclusive gitdb +clean: + rm -rf build/ dist/ .eggs/ .tox/ +force_release: clean + ./build-release.sh + twine upload dist/* + git push --tags origin master diff --git a/README.rst b/README.rst index 194e246..61ce28b 100644 --- a/README.rst +++ b/README.rst @@ -6,56 +6,67 @@ GitDB allows you to access bare git repositories for reading and writing. It aim Installation ============ -.. image:: https://pypip.in/version/gitdb/badge.svg +.. image:: https://img.shields.io/pypi/v/gitdb.svg :target: https://pypi.python.org/pypi/gitdb/ :alt: Latest Version -.. image:: https://pypip.in/py_versions/gitdb/badge.svg +.. image:: https://img.shields.io/pypi/pyversions/gitdb.svg :target: https://pypi.python.org/pypi/gitdb/ :alt: Supported Python versions +.. image:: https://readthedocs.org/projects/gitdb/badge/?version=latest + :target: https://readthedocs.org/projects/gitdb/?badge=latest + :alt: Documentation Status -From `PyPI `_ +From `PyPI `_:: pip install gitdb +SPEEDUPS +======== + +If you want to go up to 20% faster, you can install gitdb-speedups with:: + + pip install gitdb-speedups + +However, please note that gitdb-speedups is not currently maintained. 
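
As a quick orientation for readers of this patch, the sketch below shows how the installed package is typically driven. It is illustrative only: the repository path and object hash are placeholders, and ``GitDB`` is the composite database defined in ``gitdb/db/git.py`` (whose docstring this patch marks as discouraged for long-running processes because it does not release file handles)::

    import binascii

    from gitdb import GitDB

    db = GitDB("/path/to/repo/.git/objects")    # placeholder path
    binsha = binascii.unhexlify("0" * 40)       # placeholder 20-byte binary sha

    if db.has_object(binsha):
        info = db.info(binsha)                  # OInfo: binsha, type, size
        print(info.type, info.size)
        data = db.stream(binsha).read()         # OStream adds a read() interface
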
+ REQUIREMENTS ============ -* Python Nose - for running the tests +* smmap - declared as a dependency, automatically installed +* pytest - for running the tests SOURCE ====== -The source is available in a git repository at gitorious and github: + +The source is available in a git repository on GitHub: https://github.com/gitpython-developers/gitdb -Once the clone is complete, please be sure to initialize the submodules using +Once the clone is complete, please be sure to initialize the submodule using:: cd gitdb git submodule update --init -Run the tests with - - nosetests +Run the tests with:: + + pytest DEVELOPMENT =========== -.. image:: https://travis-ci.org/gitpython-developers/gitdb.svg?branch=master - :target: https://travis-ci.org/gitpython-developers/gitdb - -.. image:: https://coveralls.io/repos/gitpython-developers/gitdb/badge.png - :target: https://coveralls.io/r/gitpython-developers/gitdb +.. image:: https://github.com/gitpython-developers/gitdb/workflows/Python%20package/badge.svg + :target: https://github.com/gitpython-developers/gitdb/actions -The library is considered mature, and not under active development. It's primary (known) use is in git-python. +The library is considered mature, and not under active development. Its primary (known) use is in GitPython. INFRASTRUCTURE ============== -* Mailing List - * http://groups.google.com/group/git-python +* Discussions + * https://github.com/gitpython-developers/GitPython/discussions * Issue Tracker - * https://github.com/gitpython-developers/gitdb/issues + * https://github.com/gitpython-developers/gitdb/issues LICENSE ======= diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..95389ff --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,3 @@ +# Security Policy + +See [GitPython](https://github.com/gitpython-developers/GitPython/blob/main/SECURITY.md). Vulnerabilities found in `gitdb` can be reported there. diff --git a/build-release.sh b/build-release.sh new file mode 100755 index 0000000..5840e44 --- /dev/null +++ b/build-release.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# +# This script builds a release. If run in a venv, it auto-installs its tools. +# You may want to run "make release" instead of running this script directly. + +set -eEu + +function release_with() { + $1 -m build --sdist --wheel +} + +if test -n "${VIRTUAL_ENV:-}"; then + deps=(build twine) # Install twine along with build, as we need it later. + echo "Virtual environment detected. Adding packages: ${deps[*]}" + pip install --quiet --upgrade "${deps[@]}" + echo 'Starting the build.' + release_with python +else + function suggest_venv() { + venv_cmd='python -m venv env && source env/bin/activate' + printf "HELP: To avoid this error, use a virtual-env with '%s' instead.\n" "$venv_cmd" + } + trap suggest_venv ERR # This keeps the original exit (error) code. + echo 'Starting the build.' + release_with python3 # Outside a venv, use python3. +fi diff --git a/doc/source/algorithm.rst b/doc/source/algorithm.rst index 4374cb8..2e01b3f 100644 --- a/doc/source/algorithm.rst +++ b/doc/source/algorithm.rst @@ -92,6 +92,6 @@ Future work Another very promising option is that streaming of delta data is indeed possible. Depending on the configuration of the copy-from-base operations, different optimizations could be applied to reduce the amount of memory required for the final processed delta stream. Some configurations may even allow it to stream data from the base buffer, instead of pre-loading it for random access. 
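
The copy-from-base operations discussed above come from git's delta encoding: a delta stream begins with two base-128 varints (base size, then target size), followed by the opcodes. A rough Python sketch of decoding that header, mirroring the ``msb_size`` helper in the C extension this patch removes — names and the two-value return shape are illustrative, not a published API::

    def msb_size(data, offset=0):
        """Decode a little-endian base-128 varint; return (value, new_offset)."""
        size = 0
        shift = 0
        while True:
            byte = data[offset]
            offset += 1
            size |= (byte & 0x7F) << shift
            shift += 7
            if not (byte & 0x80):
                break
        return size, offset

    def delta_header(delta):
        base_size, ofs = msb_size(delta)
        target_size, ofs = msb_size(delta, ofs)
        return base_size, target_size, ofs   # ofs points at the first opcode
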
-The ability to stream files at reduced memory costs would only be feasible for big files, and would have to be payed with extra pre-processing time. +The ability to stream files at reduced memory costs would only be feasible for big files, and would have to be paid with extra pre-processing time. A very first and simple implementation could avoid memory peaks by streaming the TDS in conjunction with a base buffer, instead of writing everything into a fully allocated target buffer. diff --git a/doc/source/changes.rst b/doc/source/changes.rst index f544f76..b4340e4 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -2,6 +2,112 @@ Changelog ######### +****** +4.0.12 +****** + +- various improvements - please see the release on GitHub for details. + +****** +4.0.11 +****** + +- various improvements - please see the release on GitHub for details. + +****** +4.0.10 +****** + +- improvements to the way external packages are imported. + +***** +4.0.9 +***** + +- re-release of 4.0.8 to get a valid signature. + +***** +4.0.8 +***** + +* drop support for python 3.4 and 3.5 due to EOL +* Updated upper bound for smmap requirement in setup.py + (`#69 `_) + +***** +4.0.7 +***** + +* Updated upper bound for smmap requirement in setup.py + (`#69 `_) + +***** +4.0.6 +***** + +* Bumped upper bound for smmap requirement + (`#67 `_, + `#68 `_) + +***** +4.0.5 +***** + +* Re-release of 4.0.4, with known signature + +***** +4.0.4 +***** + +* Support for PyOxidizer + +***** +4.0.2 +***** + +* Updated to release as Pure Python Wheel rather than Universal Wheel + (`#62 `_) + +***** +4.0.1 +***** + +* Switched back to the gitdb package name on PyPI and fixed the gitdb2 mirror package + (`#59 `_) +* Switched back to require smmap package and fixed version requirement to >= 3.0.1, < 4 + (`#59 `_) +* Updated smmap submodule + +*********** +3.0.3.post1 +*********** + +* Fixed changelogs for v3.0.2 and v3.0.3 + +***** +3.0.3 +***** + +* Changed ``force_bytes`` to use UTF-8 encoding by default + (`#49 `_) +* Restricted smmap2 version requirement to < 3 +* Updated requirements.txt + +***** +3.0.2 +***** + +* Removed Python 2 compatibility shims + (`#56 `_) + +***** +0.6.1 +***** + +* Fixed possibly critical error, see https://github.com/gitpython-developers/GitPython/issues/220 + + - However, it only seems to occur on high-entropy data and didn't reoccour after the fix + ***** 0.6.0 ***** diff --git a/doc/source/conf.py b/doc/source/conf.py index 723a345..b387f60 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # GitDB documentation build configuration file, created by # sphinx-quickstart on Wed Jun 30 00:01:32 2010. @@ -11,7 +10,8 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os +import sys +import os # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -37,8 +37,8 @@ master_doc = 'index' # General information about the project. -project = u'GitDB' -copyright = u'2011, Sebastian Thiel' +project = 'GitDB' +copyright = '2011, Sebastian Thiel' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -119,7 +119,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['.static'] +#html_static_path = ['.static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. @@ -171,8 +171,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'GitDB.tex', u'GitDB Documentation', - u'Sebastian Thiel', 'manual'), + ('index', 'GitDB.tex', 'GitDB Documentation', + 'Sebastian Thiel', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of diff --git a/etc/sublime-text/gitdb.sublime-project b/etc/sublime-text/gitdb.sublime-project deleted file mode 100644 index bc0e37f..0000000 --- a/etc/sublime-text/gitdb.sublime-project +++ /dev/null @@ -1,54 +0,0 @@ -{ - "folders": - [ - // GITDB - //////// - { - "follow_symlinks": true, - "path": "../..", - "file_exclude_patterns" : [ - "*.sublime-workspace", - ".git", - ".noseids", - ".coverage" - ], - "folder_exclude_patterns" : [ - ".git", - "cover", - "gitdb/ext" - ] - }, - // SMMAP - //////// - { - "follow_symlinks": true, - "path": "../../gitdb/ext/smmap", - "file_exclude_patterns" : [ - "*.sublime-workspace", - ".git", - ".noseids", - ".coverage" - ], - "folder_exclude_patterns" : [ - ".git", - "cover", - ] - }, - // ASYNC - //////// - { - "follow_symlinks": true, - "path": "../../gitdb/ext/async", - "file_exclude_patterns" : [ - "*.sublime-workspace", - ".git", - ".noseids", - ".coverage" - ], - "folder_exclude_patterns" : [ - ".git", - "cover", - ] - }, - ] -} diff --git a/gitdb.pro.user b/gitdb.pro.user index 398cb70..3ca1e21 100644 --- a/gitdb.pro.user +++ b/gitdb.pro.user @@ -233,8 +233,7 @@ - /usr/bin/nosetests - -s + /usr/bin/pytest gitdb/test/test_pack.py 2 diff --git a/gitdb/__init__.py b/gitdb/__init__.py index 165993f..1fb7df8 100644 --- a/gitdb/__init__.py +++ b/gitdb/__init__.py @@ -1,36 +1,15 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Initialize the object database module""" -import sys -import os - -#{ Initialization -def _init_externals(): - """Initialize external projects by putting them into the path""" - for module in ('smmap',): - sys.path.append(os.path.join(os.path.dirname(__file__), 'ext', module)) - - try: - __import__(module) - except ImportError: - raise ImportError("'%s' could not be imported, assure it is located in your PYTHONPATH" % module) - #END verify import - #END handel imports - -#} END initialization - -_init_externals() - __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/gitpython-developers/gitdb" -version_info = (0, 6, 0) +version_info = (4, 0, 12) __version__ = '.'.join(str(i) for i in version_info) - # default imports from gitdb.base import * from gitdb.db import * diff --git a/gitdb/_delta_apply.c b/gitdb/_delta_apply.c deleted file mode 100644 index 8b0f8e0..0000000 --- a/gitdb/_delta_apply.c +++ /dev/null @@ -1,1154 +0,0 @@ -#include <_delta_apply.h> -#include -#include -#include -#include -#include - - - -typedef unsigned long long ull; -typedef unsigned int uint; -typedef unsigned char uchar; -typedef unsigned short ushort; -typedef 
uchar bool; - -// Constants -const ull gDIV_grow_by = 100; - - - -// DELTA STREAM ACCESS -/////////////////////// -inline -ull msb_size(const uchar** datap, const uchar* top) -{ - const uchar *data = *datap; - ull cmd, size = 0; - uint i = 0; - do { - cmd = *data++; - size |= (cmd & 0x7f) << i; - i += 7; - } while (cmd & 0x80 && data < top); - *datap = data; - return size; -} - - -// TOP LEVEL STREAM INFO -///////////////////////////// -typedef struct { - const uchar *tds; // Toplevel delta stream - const uchar *cstart; // start of the chunks - Py_ssize_t tdslen; // size of tds in bytes - Py_ssize_t target_size; // size of the target buffer which can hold all data - uint num_chunks; // amount of chunks in the delta stream - PyObject *parent_object; -} ToplevelStreamInfo; - - -void TSI_init(ToplevelStreamInfo* info) -{ - info->tds = NULL; - info->cstart = NULL; - info->tdslen = 0; - info->num_chunks = 0; - info->target_size = 0; - info->parent_object = 0; -} - -void TSI_destroy(ToplevelStreamInfo* info) -{ -#ifdef DEBUG - fprintf(stderr, "TSI_destroy: %p\n", info); -#endif - - if (info->parent_object){ - Py_DECREF(info->parent_object); - info->parent_object = NULL; - } else if (info->tds){ - PyMem_Free((void*)info->tds); - } - info->tds = NULL; - info->cstart = NULL; - info->tdslen = 0; - info->num_chunks = 0; -} - -inline -const uchar* TSI_end(ToplevelStreamInfo* info) -{ - return info->tds + info->tdslen; -} - -inline -const uchar* TSI_first(ToplevelStreamInfo* info) -{ - return info->cstart; -} - -// set the stream, and initialize it -// initialize our set stream to point to the first chunk -// Fill in the header information, which is the base and target size -inline -void TSI_set_stream(ToplevelStreamInfo* info, const uchar* stream) -{ - info->tds = stream; - info->cstart = stream; - - assert(info->tds && info->tdslen); - - // init stream - const uchar* tdsend = TSI_end(info); - msb_size(&info->cstart, tdsend); // base size - info->target_size = msb_size(&info->cstart, tdsend); -} - - - -// duplicate the data currently owned by the parent object drop its refcount -// return 1 on success -bool TSI_copy_stream_from_object(ToplevelStreamInfo* info) -{ - assert(info->parent_object); - - uchar* ptmp = PyMem_Malloc(info->tdslen); - if (!ptmp){ - return 0; - } - uint ofs = (uint)(info->cstart - info->tds); - memcpy((void*)ptmp, info->tds, info->tdslen); - - info->tds = ptmp; - info->cstart = ptmp + ofs; - - Py_DECREF(info->parent_object); - info->parent_object = 0; - - return 1; -} - -// Transfer ownership of the given stream into our instance. 
The amount of chunks -// remains the same, and needs to be set by the caller -void TSI_replace_stream(ToplevelStreamInfo* info, const uchar* stream, uint streamlen) -{ - assert(info->parent_object == 0); - - uint ofs = (uint)(info->cstart - info->tds); - if (info->tds){ - PyMem_Free((void*)info->tds); - } - info->tds = stream; - info->cstart = info->tds + ofs; - info->tdslen = streamlen; - -} - -// DELTA CHUNK -//////////////// -// Internal Delta Chunk Objects -// They are just used to keep information parsed from a stream -// The data pointer is always shared -typedef struct { - ull to; - uint ts; - uint so; - const uchar* data; -} DeltaChunk; - -// forward declarations -const uchar* next_delta_info(const uchar*, DeltaChunk*); - -inline -void DC_init(DeltaChunk* dc, ull to, ull ts, ull so, const uchar* data) -{ - dc->to = to; - dc->ts = ts; - dc->so = so; - dc->data = NULL; -} - - -inline -ull DC_rbound(const DeltaChunk* dc) -{ - return dc->to + dc->ts; -} - -inline -void DC_print(const DeltaChunk* dc, const char* prefix) -{ - fprintf(stderr, "%s-dc: to = %i, ts = %i, so = %i, data = %p\n", prefix, (int)dc->to, dc->ts, dc->so, dc->data); -} - -// Apply -inline -void DC_apply(const DeltaChunk* dc, const uchar* base, PyObject* writer, PyObject* tmpargs) -{ - PyObject* buffer = 0; - if (dc->data){ - buffer = PyBuffer_FromMemory((void*)dc->data, dc->ts); - } else { - buffer = PyBuffer_FromMemory((void*)(base + dc->so), dc->ts); - } - - if (PyTuple_SetItem(tmpargs, 0, buffer)){ - assert(0); - } - - - // tuple steals reference, and will take care about the deallocation - PyObject_Call(writer, tmpargs, NULL); - -} - -// Encode the information in the given delta chunk and write the byte-stream -// into the given output stream -// It will be copied into the given bounds, the given size must be the final size -// and work with the given relative offset - hence the bounds are assumed to be -// correct and to fit within the unaltered dc -inline -void DC_encode_to(const DeltaChunk* dc, uchar** pout, uint ofs, uint size) -{ - uchar* out = *pout; - if (dc->data){ - *out++ = (uchar)size; - memcpy(out, dc->data+ofs, size); - out += size; - } else { - uchar i = 0x80; - uchar* op = out++; - uint moff = dc->so+ofs; - - if (moff & 0x000000ff) - *out++ = moff >> 0, i |= 0x01; - if (moff & 0x0000ff00) - *out++ = moff >> 8, i |= 0x02; - if (moff & 0x00ff0000) - *out++ = moff >> 16, i |= 0x04; - if (moff & 0xff000000) - *out++ = moff >> 24, i |= 0x08; - - if (size & 0x00ff) - *out++ = size >> 0, i |= 0x10; - if (size & 0xff00) - *out++ = size >> 8, i |= 0x20; - - *op = i; - } - - *pout = out; -} - -// Return: amount of bytes one would need to encode dc -inline -ushort DC_count_encode_bytes(const DeltaChunk* dc) -{ - if (dc->data){ - return 1 + dc->ts; // cmd byte + actual data bytes - } else { - ushort c = 1; // cmd byte - uint ts = dc->ts; - ull so = dc->so; - - // offset - c += (so & 0x000000FF) > 0; - c += (so & 0x0000FF00) > 0; - c += (so & 0x00FF0000) > 0; - c += (so & 0xFF000000) > 0; - - // size - max size is 0x10000, its encoded with 0 size bits - c += (ts & 0x000000FF) > 0; - c += (ts & 0x0000FF00) > 0; - - return c; - } -} - - - -// DELTA INFO -///////////// -typedef struct { - uint dso; // delta stream offset, relative to the very start of the stream - uint to; // target offset (cache) -} DeltaInfo; - - -// DELTA INFO VECTOR -////////////////////// - -typedef struct { - DeltaInfo *mem; // Memory for delta infos - uint di_last_size; // size of the last element - we can't compute it using the next bound 
- const uchar *dstream; // borrowed ointer to delta stream we index - Py_ssize_t size; // Amount of DeltaInfos - Py_ssize_t reserved_size; // Reserved amount of DeltaInfos -} DeltaInfoVector; - - - -// Reserve enough memory to hold the given amount of delta chunks -// Return 1 on success -// NOTE: added a minimum allocation to assure reallocation is not done -// just for a single additional entry. DIVs change often, and reallocs are expensive -inline -int DIV_reserve_memory(DeltaInfoVector* vec, uint num_dc) -{ - if (num_dc <= vec->reserved_size){ - return 1; - } - -#ifdef DEBUG - bool was_null = vec->mem == NULL; -#endif - - if (vec->mem == NULL){ - vec->mem = PyMem_Malloc(num_dc * sizeof(DeltaInfo)); - } else { - vec->mem = PyMem_Realloc(vec->mem, num_dc * sizeof(DeltaInfo)); - } - - if (vec->mem == NULL){ - Py_FatalError("Could not allocate memory for append operation"); - } - - vec->reserved_size = num_dc; - -#ifdef DEBUG - const char* format = "Allocated %i bytes at %p, to hold up to %i chunks\n"; - if (!was_null) - format = "Re-allocated %i bytes at %p, to hold up to %i chunks\n"; - fprintf(stderr, format, (int)(vec->reserved_size * sizeof(DeltaInfo)), vec->mem, (int)vec->reserved_size); -#endif - - return vec->mem != NULL; -} - -/* -Grow the delta chunk list by the given amount of bytes. -This may trigger a realloc, but will do nothing if the reserved size is already -large enough. -Return 1 on success, 0 on failure -*/ -inline -int DIV_grow_by(DeltaInfoVector* vec, uint num_dc) -{ - return DIV_reserve_memory(vec, vec->reserved_size + num_dc); -} - -int DIV_init(DeltaInfoVector* vec, ull initial_size) -{ - vec->mem = NULL; - vec->dstream = NULL; - vec->size = 0; - vec->reserved_size = 0; - vec->di_last_size = 0; - - return DIV_grow_by(vec, initial_size); -} - -inline -Py_ssize_t DIV_len(const DeltaInfoVector* vec) -{ - return vec->size; -} - -inline -uint DIV_lbound(const DeltaInfoVector* vec) -{ - assert(vec->size && vec->mem); - return vec->mem->to; -} - -// Return item at index -inline -DeltaInfo* DIV_get(const DeltaInfoVector* vec, Py_ssize_t i) -{ - assert(i < vec->size && vec->mem); - return &vec->mem[i]; -} - -// Return last item -inline -DeltaInfo* DIV_last(const DeltaInfoVector* vec) -{ - return DIV_get(vec, vec->size-1); -} - -inline -int DIV_empty(const DeltaInfoVector* vec) -{ - return vec->size == 0; -} - -// Return end pointer of the vector -inline -const DeltaInfo* DIV_end(const DeltaInfoVector* vec) -{ - assert(!DIV_empty(vec)); - return vec->mem + vec->size; -} - -// return first item in vector -inline -DeltaInfo* DIV_first(const DeltaInfoVector* vec) -{ - assert(!DIV_empty(vec)); - return vec->mem; -} - -// return rbound offset in bytes. 
We use information contained in the -// vec to do that -inline -uint DIV_info_rbound(const DeltaInfoVector* vec, const DeltaInfo* di) -{ - if (DIV_last(vec) == di){ - return di->to + vec->di_last_size; - } else { - return (di+1)->to; - } -} - -// return size of the given delta info item -inline -uint DIV_info_size2(const DeltaInfoVector* vec, const DeltaInfo* di, const DeltaInfo const* veclast) -{ - if (veclast == di){ - return vec->di_last_size; - } else { - return (di+1)->to - di->to; - } -} - -// return size of the given delta info item -inline -uint DIV_info_size(const DeltaInfoVector* vec, const DeltaInfo* di) -{ - return DIV_info_size2(vec, di, DIV_last(vec)); -} - -void DIV_destroy(DeltaInfoVector* vec) -{ - if (vec->mem){ -#ifdef DEBUG - fprintf(stderr, "DIV_destroy: %p\n", (void*)vec->mem); -#endif - PyMem_Free(vec->mem); - vec->size = 0; - vec->reserved_size = 0; - vec->mem = 0; - } -} - -// Reset this vector so that its existing memory can be filled again. -// Memory will be kept, but not cleaned up -inline -void DIV_forget_members(DeltaInfoVector* vec) -{ - vec->size = 0; -} - -// Reset the vector so that its size will be zero -// It will keep its memory though, and hence can be filled again -inline -void DIV_reset(DeltaInfoVector* vec) -{ - if (vec->size == 0) - return; - vec->size = 0; -} - - -// Append one chunk to the end of the list, and return a pointer to it -// It will not have been initialized ! -inline -DeltaInfo* DIV_append(DeltaInfoVector* vec) -{ - if (vec->size + 1 > vec->reserved_size){ - DIV_grow_by(vec, gDIV_grow_by); - } - - DeltaInfo* next = vec->mem + vec->size; - vec->size += 1; - return next; -} - -// Return delta chunk being closest to the given absolute offset -inline -DeltaInfo* DIV_closest_chunk(const DeltaInfoVector* vec, ull ofs) -{ - assert(vec->mem); - - ull lo = 0; - ull hi = vec->size; - ull mid; - DeltaInfo* di; - - while (lo < hi) - { - mid = (lo + hi) / 2; - di = vec->mem + mid; - if (di->to > ofs){ - hi = mid; - } else if ((DIV_info_rbound(vec, di) > ofs) | (di->to == ofs)) { - return di; - } else { - lo = mid + 1; - } - } - - return DIV_last(vec); -} - - -// Return the amount of chunks a slice at the given spot would have, as well as -// its size in bytes it would have if the possibly partial chunks would be encoded -// and added to the spot marked by sdc -uint DIV_count_slice_bytes(const DeltaInfoVector* src, uint ofs, uint size) -{ - uint num_bytes = 0; - DeltaInfo* cdi = DIV_closest_chunk(src, ofs); - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - // partial overlap - if (cdi->to != ofs) { - const ull relofs = ofs - cdi->to; - const uint cdisize = DIV_info_size(src, cdi); - const uint max_size = cdisize - relofs < size ? cdisize - relofs : size; - size -= max_size; - - // get the size in bytes the info would have - next_delta_info(src->dstream + cdi->dso, &dc); - dc.so += relofs; - dc.ts = max_size; - num_bytes += DC_count_encode_bytes(&dc); - - cdi += 1; - - if (size == 0){ - return num_bytes; - } - } - - const DeltaInfo const* vecend = DIV_end(src); - const uchar* nstream; - for( ;cdi < vecend; ++cdi){ - nstream = next_delta_info(src->dstream + cdi->dso, &dc); - - if (dc.ts < size) { - num_bytes += nstream - (src->dstream + cdi->dso); - size -= dc.ts; - } else { - dc.ts = size; - num_bytes += DC_count_encode_bytes(&dc); - size = 0; - break; - } - } - - assert(size == 0); - return num_bytes; -} - -// Write a slice as defined by its absolute offset in bytes and its size into the given -// destination memory. 
The individual chunks written will be a byte copy of the source -// data chunk stream -// Return: number of chunks in the slice -uint DIV_copy_slice_to(const DeltaInfoVector* src, uchar** dest, ull tofs, uint size) -{ - assert(DIV_lbound(src) <= tofs); - assert((tofs + size) <= DIV_info_rbound(src, DIV_last(src))); - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - DeltaInfo* cdi = DIV_closest_chunk(src, tofs); - uint num_chunks = 0; - - // partial overlap - if (cdi->to != tofs) { - const uint relofs = tofs - cdi->to; - next_delta_info(src->dstream + cdi->dso, &dc); - const uint max_size = dc.ts - relofs < size ? dc.ts - relofs : size; - - size -= max_size; - - // adjust dc proportions - DC_encode_to(&dc, dest, relofs, max_size); - - num_chunks += 1; - cdi += 1; - - if (size == 0){ - return num_chunks; - } - } - - const uchar* dstream = src->dstream + cdi->dso; - const uchar* nstream = dstream; - for( ; nstream; dstream = nstream) - { - num_chunks += 1; - nstream = next_delta_info(dstream, &dc); - if (dc.ts < size) { - memcpy(*dest, dstream, nstream - dstream); - *dest += nstream - dstream; - size -= dc.ts; - } else { - DC_encode_to(&dc, dest, 0, size); - size = 0; - break; - } - } - - assert(size == 0); - return num_chunks; -} - - -// Take slices of div into the corresponding area of the tsi, which is the topmost -// delta to apply. -bool DIV_connect_with_base(ToplevelStreamInfo* tsi, DeltaInfoVector* div) -{ - assert(tsi->num_chunks); - - - uint num_bytes = 0; - const uchar* data = TSI_first(tsi); - const uchar* dend = TSI_end(tsi); - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - - // COMPUTE SIZE OF TARGET STREAM - ///////////////////////////////// - for (;data < dend;) - { - data = next_delta_info(data, &dc); - - // Data chunks don't need processing - if (dc.data){ - num_bytes += 1 + dc.ts; - continue; - } - - num_bytes += DIV_count_slice_bytes(div, dc.so, dc.ts); - } - assert(DC_rbound(&dc) == tsi->target_size); - - - // GET NEW DELTA BUFFER - //////////////////////// - uchar *const dstream = PyMem_Malloc(num_bytes); - if (!dstream){ - return 0; - } - - - data = TSI_first(tsi); - const uchar *ndata = data; - dend = TSI_end(tsi); - - uint num_chunks = 0; - uchar* ds = dstream; - DC_init(&dc, 0, 0, 0, NULL); - - // pick slices from the delta and put them into the new stream - for (; data < dend; data = ndata) - { - ndata = next_delta_info(data, &dc); - - // Data chunks don't need processing - if (dc.data){ - // just copy it over - memcpy((void*)ds, (void*)data, ndata - data); - ds += ndata - data; - num_chunks += 1; - continue; - } - - // Copy Chunks - num_chunks += DIV_copy_slice_to(div, &ds, dc.so, dc.ts); - } - assert(ds - dstream == num_bytes); - assert(num_chunks >= tsi->num_chunks); - assert(DC_rbound(&dc) == tsi->target_size); - - // finally, replace the streams - TSI_replace_stream(tsi, dstream, num_bytes); - tsi->cstart = dstream; // we have NO header ! 
- assert(tsi->tds == dstream); - tsi->num_chunks = num_chunks; - - - return 1; - -} - -// DELTA CHUNK LIST (PYTHON) -///////////////////////////// -// Internally, it has nothing to do with a ChunkList anymore though -typedef struct { - PyObject_HEAD - // ----------- - ToplevelStreamInfo istream; - -} DeltaChunkList; - - - -int DCL_init(DeltaChunkList*self, PyObject *args, PyObject *kwds) -{ - if(args && PySequence_Size(args) > 0){ - PyErr_SetString(PyExc_ValueError, "Too many arguments"); - return -1; - } - - TSI_init(&self->istream); - return 0; -} - - -void DCL_dealloc(DeltaChunkList* self) -{ - TSI_destroy(&(self->istream)); -} - - -PyObject* DCL_py_rbound(DeltaChunkList* self) -{ - return PyLong_FromUnsignedLongLong(self->istream.target_size); -} - -// Write using a write function, taking remaining bytes from a base buffer - -PyObject* DCL_apply(DeltaChunkList* self, PyObject* args) -{ - PyObject* pybuf = 0; - PyObject* writeproc = 0; - if (!PyArg_ParseTuple(args, "OO", &pybuf, &writeproc)){ - PyErr_BadArgument(); - return NULL; - } - - if (!PyObject_CheckReadBuffer(pybuf)){ - PyErr_SetString(PyExc_ValueError, "First argument must be a buffer-compatible object, like a string, or a memory map"); - return NULL; - } - - if (!PyCallable_Check(writeproc)){ - PyErr_SetString(PyExc_ValueError, "Second argument must be a writer method with signature write(buf)"); - return NULL; - } - - const uchar* base; - Py_ssize_t baselen; - PyObject_AsReadBuffer(pybuf, (const void**)&base, &baselen); - - PyObject* tmpargs = PyTuple_New(1); - - const uchar* data = TSI_first(&self->istream); - const uchar const* dend = TSI_end(&self->istream); - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - while (data < dend){ - data = next_delta_info(data, &dc); - DC_apply(&dc, base, writeproc, tmpargs); - } - - Py_DECREF(tmpargs); - Py_RETURN_NONE; -} - -PyMethodDef DCL_methods[] = { - {"apply", (PyCFunction)DCL_apply, METH_VARARGS, "Apply the given iterable of delta streams" }, - {"rbound", (PyCFunction)DCL_py_rbound, METH_NOARGS, NULL}, - {NULL} /* Sentinel */ -}; - -PyTypeObject DeltaChunkListType = { - PyObject_HEAD_INIT(NULL) - 0, /*ob_size*/ - "DeltaChunkList", /*tp_name*/ - sizeof(DeltaChunkList), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - (destructor)DCL_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - "Minimal Delta Chunk List",/* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - DCL_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)DCL_init, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ -}; - - -// Makes a new copy of the DeltaChunkList - you have to do everything yourselve -// in C ... want C++ !! 
-DeltaChunkList* DCL_new_instance(void) -{ - DeltaChunkList* dcl = (DeltaChunkList*) PyType_GenericNew(&DeltaChunkListType, 0, 0); - assert(dcl); - - DCL_init(dcl, 0, 0); - return dcl; -} - -// Read the next delta chunk from the given stream and advance it -// dc will contain the parsed information, its offset must be set by -// the previous call of next_delta_info, which implies it should remain the -// same instance between the calls. -// Return the altered uchar pointer, reassign it to the input data -inline -const uchar* next_delta_info(const uchar* data, DeltaChunk* dc) -{ - const char cmd = *data++; - - if (cmd & 0x80) - { - uint cp_off = 0, cp_size = 0; - if (cmd & 0x01) cp_off = *data++; - if (cmd & 0x02) cp_off |= (*data++ << 8); - if (cmd & 0x04) cp_off |= (*data++ << 16); - if (cmd & 0x08) cp_off |= ((unsigned) *data++ << 24); - if (cmd & 0x10) cp_size = *data++; - if (cmd & 0x20) cp_size |= (*data++ << 8); - if (cmd & 0x40) cp_size |= (*data++ << 16); // this should never get hit with current deltas ... - if (cp_size == 0) cp_size = 0x10000; - - dc->to += dc->ts; - dc->data = NULL; - dc->so = cp_off; - dc->ts = cp_size; - - } else if (cmd) { - // Just share the data - dc->to += dc->ts; - dc->data = data; - dc->ts = cmd; - dc->so = 0; - - data += cmd; - } else { - PyErr_SetString(PyExc_RuntimeError, "Encountered an unsupported delta cmd: 0"); - assert(0); - return NULL; - } - - return data; -} - -// Return amount of chunks encoded in the given delta stream -// If read_header is True, then the header msb chunks will be read first. -// Otherwise, the stream is assumed to be scrubbed one past the header -uint compute_chunk_count(const uchar* data, const uchar* dend, bool read_header) -{ - // read header - if (read_header){ - msb_size(&data, dend); - msb_size(&data, dend); - } - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - uint num_chunks = 0; - - while (data < dend) - { - data = next_delta_info(data, &dc); - num_chunks += 1; - }// END handle command opcodes - - return num_chunks; -} - -PyObject* connect_deltas(PyObject *self, PyObject *dstreams) -{ - // obtain iterator - PyObject* stream_iter = 0; - if (!PyIter_Check(dstreams)){ - stream_iter = PyObject_GetIter(dstreams); - if (!stream_iter){ - PyErr_SetString(PyExc_RuntimeError, "Couldn't obtain iterator for streams"); - return NULL; - } - } else { - stream_iter = dstreams; - } - - DeltaInfoVector div; - ToplevelStreamInfo tdsinfo; - TSI_init(&tdsinfo); - DIV_init(&div, 0); - - - // GET TOPLEVEL DELTA STREAM - int error = 0; - PyObject* ds = 0; - unsigned int dsi = 0; // delta stream index we process - ds = PyIter_Next(stream_iter); - if (!ds){ - error = 1; - goto _error; - } - - dsi += 1; - tdsinfo.parent_object = PyObject_CallMethod(ds, "read", 0); - if (!PyObject_CheckReadBuffer(tdsinfo.parent_object)){ - Py_DECREF(ds); - error = 1; - goto _error; - } - - PyObject_AsReadBuffer(tdsinfo.parent_object, (const void**)&tdsinfo.tds, &tdsinfo.tdslen); - if (tdsinfo.tdslen > pow(2, 32)){ - // parent object is deallocated by info structure - Py_DECREF(ds); - PyErr_SetString(PyExc_RuntimeError, "Cannot handle deltas larger than 4GB"); - tdsinfo.parent_object = 0; - - error = 1; - goto _error; - } - Py_DECREF(ds); - - // let it officially know, and initialize its internal state - TSI_set_stream(&tdsinfo, tdsinfo.tds); - - // INTEGRATE ANCESTOR DELTA STREAMS - for (ds = PyIter_Next(stream_iter); ds != NULL; ds = PyIter_Next(stream_iter), ++dsi) - { - // Its important to initialize this before the next block which can jump - // to code 
who needs this to exist ! - PyObject* db = 0; - - // When processing the first delta, we know we will have to alter the tds - // Hence we copy it and deallocate the parent object - if (dsi == 1) { - if (!TSI_copy_stream_from_object(&tdsinfo)){ - PyErr_SetString(PyExc_RuntimeError, "Could not allocate memory to copy toplevel buffer"); - // info structure takes care of the parent_object - error = 1; - goto loop_end; - } - - tdsinfo.num_chunks = compute_chunk_count(tdsinfo.cstart, TSI_end(&tdsinfo), 0); - } - - db = PyObject_CallMethod(ds, "read", 0); - if (!PyObject_CheckReadBuffer(db)){ - error = 1; - PyErr_SetString(PyExc_RuntimeError, "Returned buffer didn't support the buffer protocol"); - goto loop_end; - } - - // Fill the stream info structure - const uchar* data; - Py_ssize_t dlen; - PyObject_AsReadBuffer(db, (const void**)&data, &dlen); - const uchar const* dstart = data; - const uchar const* dend = data + dlen; - div.dstream = dstart; - - if (dlen > pow(2, 32)){ - error = 1; - PyErr_SetString(PyExc_RuntimeError, "Cannot currently handle deltas larger than 4GB"); - goto loop_end; - } - - // READ HEADER - msb_size(&data, dend); - const ull target_size = msb_size(&data, dend); - - DIV_reserve_memory(&div, compute_chunk_count(data, dend, 0)); - - // parse command stream - DeltaInfo* di = 0; // temporary pointer - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - assert(data < dend); - while (data < dend) - { - di = DIV_append(&div); - di->dso = data - dstart; - if ((data = next_delta_info(data, &dc))){ - di->to = dc.to; - } else { - error = 1; - goto loop_end; - } - }// END handle command opcodes - - // finalize information - div.di_last_size = dc.ts; - - if (DC_rbound(&dc) != target_size){ - PyErr_SetString(PyExc_RuntimeError, "Failed to parse delta stream"); - error = 1; - } - - #ifdef DEBUG - fprintf(stderr, "------------ Stream %i --------\n ", (int)dsi); - fprintf(stderr, "Before Connect: tdsinfo: num_chunks = %i, bytelen = %i KiB, target_size = %i KiB\n", (int)tdsinfo.num_chunks, (int)tdsinfo.tdslen/1000, (int)tdsinfo.target_size/1000); - fprintf(stderr, "div->num_chunks = %i, div->reserved_size = %i, div->bytelen=%i KiB\n", (int)div.size, (int)div.reserved_size, (int)dlen/1000); - #endif - - if (!DIV_connect_with_base(&tdsinfo, &div)){ - error = 1; - } - - #ifdef DEBUG - fprintf(stderr, "after connect: tdsinfo->num_chunks = %i, tdsinfo->bytelen = %i KiB\n", (int)tdsinfo.num_chunks, (int)tdsinfo.tdslen/1000); - #endif - - // destroy members, but keep memory - DIV_reset(&div); - -loop_end: - // perform cleanup - Py_DECREF(ds); - Py_DECREF(db); - - if (error){ - break; - } - }// END for each stream object - - if (dsi == 0){ - PyErr_SetString(PyExc_ValueError, "No streams provided"); - } - - -_error: - - if (stream_iter != dstreams){ - Py_DECREF(stream_iter); - } - - - DIV_destroy(&div); - - // Return the actual python object - its just a container - DeltaChunkList* dcl = DCL_new_instance(); - if (!dcl){ - PyErr_SetString(PyExc_RuntimeError, "Couldn't allocate list"); - // Otherwise tdsinfo would be deallocated by the chunk list - TSI_destroy(&tdsinfo); - error = 1; - } else { - // Plain copy, transfer ownership to dcl - dcl->istream = tdsinfo; - } - - if (error){ - // Will dealloc tdcv - Py_XDECREF(dcl); - return NULL; - } - - return (PyObject*)dcl; -} - - -// Write using a write function, taking remaining bytes from a base buffer -// replaces the corresponding method in python -PyObject* apply_delta(PyObject* self, PyObject* args) -{ - PyObject* pybbuf = 0; - PyObject* pydbuf = 0; 
- PyObject* pytbuf = 0; - if (!PyArg_ParseTuple(args, "OOO", &pybbuf, &pydbuf, &pytbuf)){ - PyErr_BadArgument(); - return NULL; - } - - PyObject* objects[] = { pybbuf, pydbuf, pytbuf }; - assert(sizeof(objects) / sizeof(PyObject*) == 3); - - uint i; - for(i = 0; i < 3; i++){ - if (!PyObject_CheckReadBuffer(objects[i])){ - PyErr_SetString(PyExc_ValueError, "Argument must be a buffer-compatible object, like a string, or a memory map"); - return NULL; - } - } - - Py_ssize_t lbbuf; Py_ssize_t ldbuf; Py_ssize_t ltbuf; - const uchar* bbuf; const uchar* dbuf; - uchar* tbuf; - PyObject_AsReadBuffer(pybbuf, (const void**)(&bbuf), &lbbuf); - PyObject_AsReadBuffer(pydbuf, (const void**)(&dbuf), &ldbuf); - - if (PyObject_AsWriteBuffer(pytbuf, (void**)(&tbuf), <buf)){ - PyErr_SetString(PyExc_ValueError, "Argument 3 must be a writable buffer"); - return NULL; - } - - const uchar* data = dbuf; - const uchar* dend = dbuf + ldbuf; - - while (data < dend) - { - const char cmd = *data++; - - if (cmd & 0x80) - { - unsigned long cp_off = 0, cp_size = 0; - if (cmd & 0x01) cp_off = *data++; - if (cmd & 0x02) cp_off |= (*data++ << 8); - if (cmd & 0x04) cp_off |= (*data++ << 16); - if (cmd & 0x08) cp_off |= ((unsigned) *data++ << 24); - if (cmd & 0x10) cp_size = *data++; - if (cmd & 0x20) cp_size |= (*data++ << 8); - if (cmd & 0x40) cp_size |= (*data++ << 16); - if (cp_size == 0) cp_size = 0x10000; - - memcpy(tbuf, bbuf + cp_off, cp_size); - tbuf += cp_size; - - } else if (cmd) { - memcpy(tbuf, data, cmd); - tbuf += cmd; - data += cmd; - } else { - PyErr_SetString(PyExc_RuntimeError, "Encountered an unsupported delta cmd: 0"); - return NULL; - } - }// END handle command opcodes - - Py_RETURN_NONE; -} diff --git a/gitdb/_delta_apply.h b/gitdb/_delta_apply.h deleted file mode 100644 index 1fcd538..0000000 --- a/gitdb/_delta_apply.h +++ /dev/null @@ -1,6 +0,0 @@ -#include - -extern PyObject* connect_deltas(PyObject *self, PyObject *dstreams); -extern PyObject* apply_delta(PyObject* self, PyObject* args); - -extern PyTypeObject DeltaChunkListType; diff --git a/gitdb/_fun.c b/gitdb/_fun.c deleted file mode 100644 index 4997038..0000000 --- a/gitdb/_fun.c +++ /dev/null @@ -1,107 +0,0 @@ -#include -#include "_delta_apply.h" - -static PyObject *PackIndexFile_sha_to_index(PyObject *self, PyObject *args) -{ - const unsigned char *sha; - const unsigned int sha_len; - - // Note: self is only set if we are a c type. 
We emulate an instance method, - // hence we have to get the instance as 'first' argument - - // get instance and sha - PyObject* inst = 0; - if (!PyArg_ParseTuple(args, "Os#", &inst, &sha, &sha_len)) - return NULL; - - if (sha_len != 20) { - PyErr_SetString(PyExc_ValueError, "Sha is not 20 bytes long"); - return NULL; - } - - if( !inst){ - PyErr_SetString(PyExc_ValueError, "Cannot be called without self"); - return NULL; - } - - // read lo and hi bounds - PyObject* fanout_table = PyObject_GetAttrString(inst, "_fanout_table"); - if (!fanout_table){ - PyErr_SetString(PyExc_ValueError, "Couldn't obtain fanout table"); - return NULL; - } - - unsigned int lo = 0, hi = 0; - if (sha[0]){ - PyObject* item = PySequence_GetItem(fanout_table, (const Py_ssize_t)(sha[0]-1)); - lo = PyInt_AS_LONG(item); - Py_DECREF(item); - } - PyObject* item = PySequence_GetItem(fanout_table, (const Py_ssize_t)sha[0]); - hi = PyInt_AS_LONG(item); - Py_DECREF(item); - item = 0; - - Py_DECREF(fanout_table); - - // get sha query function - PyObject* get_sha = PyObject_GetAttrString(inst, "sha"); - if (!get_sha){ - PyErr_SetString(PyExc_ValueError, "Couldn't obtain sha method"); - return NULL; - } - - PyObject *sha_str = 0; - while (lo < hi) { - const int mid = (lo + hi)/2; - sha_str = PyObject_CallFunction(get_sha, "i", mid); - if (!sha_str) { - return NULL; - } - - // we really trust that string ... for speed - const int cmp = memcmp(PyString_AS_STRING(sha_str), sha, 20); - Py_DECREF(sha_str); - sha_str = 0; - - if (cmp < 0){ - lo = mid + 1; - } - else if (cmp > 0) { - hi = mid; - } - else { - Py_DECREF(get_sha); - return PyInt_FromLong(mid); - }// END handle comparison - }// END while lo < hi - - // nothing found, cleanup - Py_DECREF(get_sha); - Py_RETURN_NONE; -} - -static PyMethodDef py_fun[] = { - { "PackIndexFile_sha_to_index", (PyCFunction)PackIndexFile_sha_to_index, METH_VARARGS, "TODO" }, - { "connect_deltas", (PyCFunction)connect_deltas, METH_O, "See python implementation" }, - { "apply_delta", (PyCFunction)apply_delta, METH_VARARGS, "See python implementation" }, - { NULL, NULL, 0, NULL } -}; - -#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ -#define PyMODINIT_FUNC void -#endif -PyMODINIT_FUNC init_perf(void) -{ - PyObject *m; - - if (PyType_Ready(&DeltaChunkListType) < 0) - return; - - m = Py_InitModule3("_perf", py_fun, NULL); - if (m == NULL) - return; - - Py_INCREF(&DeltaChunkListType); - PyModule_AddObject(m, "DeltaChunkList", (PyObject *)&DeltaChunkListType); -} diff --git a/gitdb/base.py b/gitdb/base.py index a33fb67..9a23a4f 100644 --- a/gitdb/base.py +++ b/gitdb/base.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Module with basic data structures - they are designed to be lightweight and fast""" from gitdb.util import bin_to_hex @@ -11,13 +11,15 @@ ) __all__ = ('OInfo', 'OPackInfo', 'ODeltaPackInfo', - 'OStream', 'OPackStream', 'ODeltaPackStream', - 'IStream', 'InvalidOInfo', 'InvalidOStream' ) + 'OStream', 'OPackStream', 'ODeltaPackStream', + 'IStream', 'InvalidOInfo', 'InvalidOStream') #{ ODB Bases + class OInfo(tuple): - """Carries information about an object in an ODB, provding information + + """Carries information about an object in an ODB, providing information about the binary sha of the object, the type_string as well as the 
uncompressed size in bytes. @@ -27,7 +29,7 @@ class OInfo(tuple): assert dbi[1] == dbi.type assert dbi[2] == dbi.size - The type is designed to be as lighteight as possible.""" + The type is designed to be as lightweight as possible.""" __slots__ = tuple() def __new__(cls, sha, type, size): @@ -62,16 +64,17 @@ def size(self): class OPackInfo(tuple): + """As OInfo, but provides a type_id property to retrieve the numerical type id, and does not include a sha. Additionally, the pack_offset is the absolute offset into the packfile at which - all object information is located. The data_offset property points to the abosolute + all object information is located. The data_offset property points to the absolute location in the pack at which that actual data stream can be found.""" __slots__ = tuple() def __new__(cls, packoffset, type, size): - return tuple.__new__(cls, (packoffset,type, size)) + return tuple.__new__(cls, (packoffset, type, size)) def __init__(self, *args): tuple.__init__(self) @@ -98,6 +101,7 @@ def size(self): class ODeltaPackInfo(OPackInfo): + """Adds delta specific information, Either the 20 byte sha which points to some object in the database, or the negative offset from the pack_offset, so that pack_offset - delta_info yields @@ -115,6 +119,7 @@ def delta_info(self): class OStream(OInfo): + """Base for object streams retrieved from the database, providing additional information about the stream. Generally, ODB streams are read-only as objects are immutable""" @@ -124,7 +129,6 @@ def __new__(cls, sha, type, size, stream, *args, **kwargs): """Helps with the initialization of subclasses""" return tuple.__new__(cls, (sha, type, size, stream)) - def __init__(self, *args, **kwargs): tuple.__init__(self) @@ -141,6 +145,7 @@ def stream(self): class ODeltaStream(OStream): + """Uses size info of its stream, delaying reads""" def __new__(cls, sha, type, size, stream, *args, **kwargs): @@ -157,6 +162,7 @@ def size(self): class OPackStream(OPackInfo): + """Next to pack object information, a stream outputting an undeltified base object is provided""" __slots__ = tuple() @@ -176,13 +182,13 @@ def stream(self): class ODeltaPackStream(ODeltaPackInfo): + """Provides a stream outputting the uncompressed offset delta information""" __slots__ = tuple() def __new__(cls, packoffset, type, size, delta_info, stream): return tuple.__new__(cls, (packoffset, type, size, delta_info, stream)) - #{ Stream Reader Interface def read(self, size=-1): return self[4].read(size) @@ -194,6 +200,7 @@ def stream(self): class IStream(list): + """Represents an input content stream to be fed into the ODB. It is mutable to allow the ODB to record information about the operations outcome right in this instance. @@ -246,7 +253,6 @@ def _binsha(self): binsha = property(_binsha, _set_binsha) - def _type(self): return self[1] @@ -275,6 +281,7 @@ def _set_stream(self, stream): class InvalidOInfo(tuple): + """Carries information about a sha identifying an object which is invalid in the queried database. 
The exception attribute provides more information about the cause of the issue""" @@ -301,6 +308,7 @@ def error(self): class InvalidOStream(InvalidOInfo): + """Carries information about an invalid ODB stream""" __slots__ = tuple() diff --git a/gitdb/db/__init__.py b/gitdb/db/__init__.py index 0a2a46a..20fd228 100644 --- a/gitdb/db/__init__.py +++ b/gitdb/db/__init__.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ from gitdb.db.base import * from gitdb.db.loose import * diff --git a/gitdb/db/base.py b/gitdb/db/base.py index a670eea..7312fe0 100644 --- a/gitdb/db/base.py +++ b/gitdb/db/base.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Contains implementations of database retrieveing objects""" from gitdb.util import ( join, @@ -19,11 +19,11 @@ from functools import reduce - __all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'CompoundDB', 'CachingDB') -class ObjectDBR(object): +class ObjectDBR: + """Defines an interface for object database lookup. Objects are identified either by their 20 byte bin sha""" @@ -33,6 +33,9 @@ def __contains__(self, sha): #{ Query Interface def has_object(self, sha): """ + Whether the object identified by the given 20 bytes + binary sha is contained in the database + :return: True if the object identified by the given 20 bytes binary sha is contained in the database""" raise NotImplementedError("To be implemented in subclass") @@ -60,7 +63,8 @@ def sha_iter(self): #} END query interface -class ObjectDBW(object): +class ObjectDBW: + """Defines an interface to create objects in the database""" def __init__(self, *args, **kwargs): @@ -81,6 +85,8 @@ def set_ostream(self, stream): def ostream(self): """ + Return the output stream + :return: overridden output stream this instance will write to, or None if it will write to the default stream""" return self._ostream @@ -99,7 +105,8 @@ def store(self, istream): #} END edit interface -class FileDBBase(object): +class FileDBBase: + """Provides basic facilities to retrieve files of interest, including caching facilities to help mapping hexsha's to objects""" @@ -110,10 +117,9 @@ def __init__(self, root_path): **Note:** The base will not perform any accessablity checking as the base might not yet be accessible, but become accessible before the first access.""" - super(FileDBBase, self).__init__() + super().__init__() self._root_path = root_path - #{ Interface def root_path(self): """:return: path at which this db operates""" @@ -127,7 +133,8 @@ def db_path(self, rela_path): #} END interface -class CachingDB(object): +class CachingDB: + """A database which uses caches to speed-up access""" #{ Interface @@ -143,8 +150,6 @@ def update_cache(self, force=False): # END interface - - def _databases_recursive(database, output): """Fill output list with database from db, in order. Deals with Loose, Packed and compound databases.""" @@ -159,23 +164,25 @@ def _databases_recursive(database, output): class CompoundDB(ObjectDBR, LazyMixin, CachingDB): + """A database which delegates calls to sub-databases. 
Databases are stored in the lazy-loaded _dbs attribute. Define _set_cache_ to update it with your databases""" + def _set_cache_(self, attr): if attr == '_dbs': self._dbs = list() elif attr == '_db_cache': self._db_cache = dict() else: - super(CompoundDB, self)._set_cache_(attr) + super()._set_cache_(attr) def _db_query(self, sha): """:return: database containing the given 20 byte sha :raise BadObject:""" # most databases use binary representations, prevent converting - # it everytime a database is being queried + # it every time a database is being queried try: return self._db_cache[sha] except KeyError: @@ -207,7 +214,7 @@ def stream(self, sha): def size(self): """:return: total size of all contained databases""" - return reduce(lambda x,y: x+y, (db.size() for db in self._dbs), 0) + return reduce(lambda x, y: x + y, (db.size() for db in self._dbs), 0) def sha_iter(self): return chain(*(db.sha_iter() for db in self._dbs)) diff --git a/gitdb/db/git.py b/gitdb/db/git.py index d22e3f1..a1ed142 100644 --- a/gitdb/db/git.py +++ b/gitdb/db/git.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ from gitdb.db.base import ( CompoundDB, ObjectDBW, @@ -20,8 +20,13 @@ class GitDB(FileDBBase, ObjectDBW, CompoundDB): + """A git-style object database, which contains all objects in the 'objects' - subdirectory""" + subdirectory + + ``IMPORTANT``: The usage of this implementation is highly discouraged as it fails to release file-handles. + This can be a problem with long-running processes and/or big repositories. + """ # Configuration PackDBCls = PackedDB LooseDBCls = LooseObjectDB @@ -34,15 +39,15 @@ class GitDB(FileDBBase, ObjectDBW, CompoundDB): def __init__(self, root_path): """Initialize ourselves on a git objects directory""" - super(GitDB, self).__init__(root_path) + super().__init__(root_path) def _set_cache_(self, attr): if attr == '_dbs' or attr == '_loose_db': self._dbs = list() loose_db = None for subpath, dbcls in ((self.packs_dir, self.PackDBCls), - (self.loose_dir, self.LooseDBCls), - (self.alternates_dir, self.ReferenceDBCls)): + (self.loose_dir, self.LooseDBCls), + (self.alternates_dir, self.ReferenceDBCls)): path = self.db_path(subpath) if os.path.exists(path): self._dbs.append(dbcls(path)) @@ -63,7 +68,7 @@ def _set_cache_(self, attr): # finally set the value self._loose_db = loose_db else: - super(GitDB, self)._set_cache_(attr) + super()._set_cache_(attr) # END handle attrs #{ ObjectDBW interface diff --git a/gitdb/db/loose.py b/gitdb/db/loose.py index 3743026..e6765cd 100644 --- a/gitdb/db/loose.py +++ b/gitdb/db/loose.py @@ -1,14 +1,15 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ +from contextlib import suppress + from gitdb.db.base import ( FileDBBase, ObjectDBR, ObjectDBW ) - from gitdb.exc import ( BadObject, AmbiguousObjectName @@ -33,10 +34,8 @@ bin_to_hex, exists, chmod, - isdir, isfile, remove, - mkdir, rename, dirname, basename, @@ -50,17 +49,19 @@ stream_copy ) -from gitdb.utils.compat import MAXSIZE from gitdb.utils.encoding import force_bytes import tempfile import os +import sys +import time 
-__all__ = ( 'LooseObjectDB', ) +__all__ = ('LooseObjectDB', ) class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): + """A database which operates on loose object files""" # CONFIGURATION @@ -73,9 +74,8 @@ class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): if os.name == 'nt': new_objects_mode = int("644", 8) - def __init__(self, root_path): - super(LooseObjectDB, self).__init__(root_path) + super().__init__(root_path) self._hexsha_to_file = dict() # Additional Flags - might be set to 0 after the first failure # Depending on the root, this might work for some mounts, for others not, which @@ -93,10 +93,8 @@ def readable_db_object_path(self, hexsha): """ :return: readable object path to the object identified by hexsha :raise BadObject: If the object file does not exist""" - try: + with suppress(KeyError): return self._hexsha_to_file[hexsha] - except KeyError: - pass # END ignore cache misses # try filesystem @@ -138,12 +136,12 @@ def _map_loose_object(self, sha): # try again without noatime try: return file_contents_ro_filepath(db_path) - except OSError: - raise BadObject(sha) + except OSError as new_e: + raise BadObject(sha) from new_e # didn't work because of our flag, don't try it again self._fd_open_flags = 0 else: - raise BadObject(sha) + raise BadObject(sha) from e # END handle error # END exception handling @@ -151,7 +149,7 @@ def set_ostream(self, stream): """:raise TypeError: if the stream does not support the Sha1Writer interface""" if stream is not None and not isinstance(stream, Sha1Writer): raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) - return super(LooseObjectDB, self).set_ostream(stream) + return super().set_ostream(stream) def info(self, sha): m = self._map_loose_object(sha) @@ -159,12 +157,13 @@ def info(self, sha): typ, size = loose_object_header_info(m) return OInfo(sha, typ, size) finally: - m.close() + if hasattr(m, 'close'): + m.close() # END assure release of system resources def stream(self, sha): m = self._map_loose_object(sha) - type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True) + type, size, stream = DecompressMemMapReader.new(m, close_on_deletion=True) return OStream(sha, type, size, stream) def has_object(self, sha): @@ -173,7 +172,7 @@ def has_object(self, sha): return True except BadObject: return False - # END check existance + # END check existence def store(self, istream): """note: The sha we produce will be hex by nature""" @@ -195,11 +194,11 @@ def store(self, istream): if istream.binsha is not None: # copy as much as possible, the actual uncompressed item size might # be smaller than the compressed version - stream_copy(istream.read, writer.write, MAXSIZE, self.stream_chunk_size) + stream_copy(istream.read, writer.write, sys.maxsize, self.stream_chunk_size) else: # write object with header, we have to make a new one write_object(istream.type, istream.size, istream.read, writer.write, - chunk_size=self.stream_chunk_size) + chunk_size=self.stream_chunk_size) # END handle direct stream copies finally: if tmp_path: @@ -207,7 +206,7 @@ def store(self, istream): # END assure target stream is closed except: if tmp_path: - os.remove(tmp_path) + remove(tmp_path) raise # END assure tmpfile removal on error @@ -221,18 +220,31 @@ def store(self, istream): if tmp_path: obj_path = self.db_path(self.object_path(hexsha)) obj_dir = dirname(obj_path) - if not isdir(obj_dir): - mkdir(obj_dir) + os.makedirs(obj_dir, exist_ok=True) # END handle destination directory - # rename onto existing doesn't work 
on windows - if os.name == 'nt' and isfile(obj_path): - remove(obj_path) - # END handle win322 - rename(tmp_path, obj_path) - - # make sure its readable for all ! It started out as rw-- tmp file - # but needs to be rwrr - chmod(obj_path, self.new_objects_mode) + # rename onto existing doesn't work on NTFS + if isfile(obj_path): + remove(tmp_path) + else: + rename(tmp_path, obj_path) + # end rename only if needed + + # Ensure rename is actually done and file is stable + # Retry up to 14 times - quadratic wait & retry in ms. + # The total maximum wait time is 1000ms, which should be vastly enough for the + # OS to return and commit the file to disk. + for backoff_ms in [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 181]: + with suppress(PermissionError): + # make sure its readable for all ! It started out as rw-- tmp file + # but needs to be rwrr + chmod(obj_path, self.new_objects_mode) + break + time.sleep(backoff_ms / 1000.0) + else: + raise PermissionError( + "Impossible to apply `chmod` to file {}".format(obj_path) + ) + # END handle dry_run istream.binsha = hex_to_bin(hexsha) diff --git a/gitdb/db/mem.py b/gitdb/db/mem.py index 1aa0d51..d4772fd 100644 --- a/gitdb/db/mem.py +++ b/gitdb/db/mem.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Contains the MemoryDatabase implementation""" from gitdb.db.loose import LooseObjectDB from gitdb.db.base import ( @@ -28,14 +28,16 @@ __all__ = ("MemoryDB", ) + class MemoryDB(ObjectDBR, ObjectDBW): + """A memory database stores everything to memory, providing fast IO and object retrieval. It should be used to buffer results and obtain SHAs before writing it to the actual physical storage, as it allows to query whether object already exists in the target storage before introducing actual IO""" def __init__(self): - super(MemoryDB, self).__init__() + super().__init__() self._db = LooseObjectDB("path/doesnt/matter") # maps 20 byte shas to their OStream objects @@ -72,32 +74,29 @@ def stream(self, sha): # rewind stream for the next one to read ostream.stream.seek(0) return ostream - except KeyError: - raise BadObject(sha) + except KeyError as e: + raise BadObject(sha) from e # END exception handling def size(self): return len(self._cache) def sha_iter(self): - try: - return self._cache.iterkeys() - except AttributeError: - return self._cache.keys() - + return self._cache.keys() #{ Interface def stream_copy(self, sha_iter, odb): """Copy the streams as identified by sha's yielded by sha_iter into the given odb The streams will be copied directly **Note:** the object will only be written if it did not exist in the target db + :return: amount of streams actually copied into odb. 
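The rewritten ``store()`` above no longer chmods once and hopes for the best: on Windows the just-renamed object file can still be locked for a moment, so the permission change is retried with quadratically growing sleeps (the patch's fourteen steps sum to the advertised 1000 ms). A generic sketch of that wait-and-retry shape (``chmod_with_backoff`` is an illustrative helper, not gitdb API)::

    import time
    from contextlib import suppress

    def chmod_with_backoff(apply_chmod, backoffs_ms=(1, 4, 9, 16, 25, 36, 49, 64, 81, 100)):
        """Call ``apply_chmod()`` until it stops raising PermissionError.

        ``apply_chmod`` is any zero-argument callable, e.g.
        ``lambda: chmod(obj_path, mode)``; the sleeps grow quadratically like
        the loop in LooseObjectDB.store(). Illustrative helper only.
        """
        for backoff_ms in backoffs_ms:
            with suppress(PermissionError):
                apply_chmod()
                return
            time.sleep(backoff_ms / 1000.0)
        raise PermissionError("chmod kept failing after %d attempts" % len(backoffs_ms))

With the default steps above the total wait stays under 400 ms; the values in the patch are chosen so the worst case adds up to about one second.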
If smaller than the amount of input shas, one or more objects did already exist in odb""" count = 0 for sha in sha_iter: if odb.has_object(sha): continue - # END check object existance + # END check object existence ostream = self.stream(sha) # compressed data including header diff --git a/gitdb/db/pack.py b/gitdb/db/pack.py index eaf431a..274ea59 100644 --- a/gitdb/db/pack.py +++ b/gitdb/db/pack.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Module containing a database to deal with packs""" from gitdb.db.base import ( FileDBBase, @@ -18,7 +18,6 @@ ) from gitdb.pack import PackEntity -from gitdb.utils.compat import xrange from functools import reduce @@ -31,6 +30,7 @@ class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin): + """A database operating on a set of object packs""" # sort the priority list every N queries @@ -39,7 +39,7 @@ class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin): _sort_interval = 500 def __init__(self, root_path): - super(PackedDB, self).__init__(root_path) + super().__init__(root_path) # list of lists with three items: # * hits - number of times the pack was hit with a request # * entity - Pack entity instance @@ -106,14 +106,14 @@ def sha_iter(self): for entity in self.entities(): index = entity.index() sha_by_index = index.sha - for index in xrange(index.size()): + for index in range(index.size()): yield sha_by_index(index) # END for each index # END for each entity def size(self): sizes = [item[1].index().size() for item in self._entities] - return reduce(lambda x,y: x+y, sizes, 0) + return reduce(lambda x, y: x + y, sizes, 0) #} END object db read @@ -127,12 +127,11 @@ def store(self, istream): #} END object db write - #{ Interface def update_cache(self, force=False): """ - Update our cache with the acutally existing packs on disk. Add new ones, + Update our cache with the actually existing packs on disk. Add new ones, and remove deleted ones. 
We keep the unchanged ones :param force: If True, the cache will be updated even though the directory @@ -148,7 +147,7 @@ def update_cache(self, force=False): # packs are supposed to be prefixed with pack- by git-convention # get all pack files, figure out what changed pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack"))) - our_pack_files = set(item[1].pack().path() for item in self._entities) + our_pack_files = {item[1].pack().path() for item in self._entities} # new packs for pack_file in (pack_files - our_pack_files): @@ -177,7 +176,7 @@ def update_cache(self, force=False): def entities(self): """:return: list of pack entities operated upon by this database""" - return [ item[1] for item in self._entities ] + return [item[1] for item in self._entities] def partial_to_complete_sha(self, partial_binsha, canonical_length): """:return: 20 byte sha as inferred by the given partial binary sha diff --git a/gitdb/db/ref.py b/gitdb/db/ref.py index d989126..bd30156 100644 --- a/gitdb/db/ref.py +++ b/gitdb/db/ref.py @@ -1,14 +1,17 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ +import codecs from gitdb.db.base import ( CompoundDB, ) __all__ = ('ReferenceDB', ) + class ReferenceDB(CompoundDB): + """A database consisting of database referred to in a file""" # Configuration @@ -17,7 +20,7 @@ class ReferenceDB(CompoundDB): ObjectDBCls = None def __init__(self, ref_file): - super(ReferenceDB, self).__init__() + super().__init__() self._ref_file = ref_file def _set_cache_(self, attr): @@ -25,7 +28,7 @@ def _set_cache_(self, attr): self._dbs = list() self._update_dbs_from_ref_file() else: - super(ReferenceDB, self)._set_cache_(attr) + super()._set_cache_(attr) # END handle attrs def _update_dbs_from_ref_file(self): @@ -39,13 +42,14 @@ def _update_dbs_from_ref_file(self): # try to get as many as possible, don't fail if some are unavailable ref_paths = list() try: - ref_paths = [l.strip() for l in open(self._ref_file, 'r').readlines()] - except (OSError, IOError): + with codecs.open(self._ref_file, 'r', encoding="utf-8") as f: + ref_paths = [l.strip() for l in f] + except OSError: pass # END handle alternates ref_paths_set = set(ref_paths) - cur_ref_paths_set = set(db.root_path() for db in self._dbs) + cur_ref_paths_set = {db.root_path() for db in self._dbs} # remove existing for path in (cur_ref_paths_set - ref_paths_set): @@ -75,4 +79,4 @@ def _update_dbs_from_ref_file(self): def update_cache(self, force=False): # re-read alternates and update databases self._update_dbs_from_ref_file() - return super(ReferenceDB, self).update_cache(force) + return super().update_cache(force) diff --git a/gitdb/exc.py b/gitdb/exc.py index 73f84d2..752dafd 100644 --- a/gitdb/exc.py +++ b/gitdb/exc.py @@ -1,16 +1,30 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Module with common exceptions""" from gitdb.util import to_hex_sha +__all__ = [ + 'AmbiguousObjectName', + 'BadName', + 'BadObject', + 'BadObjectType', + 'InvalidDBRoot', + 'ODBError', + 'ParseError', + 'UnsupportedOperation', + 'to_hex_sha', +] + class ODBError(Exception): 
"""All errors thrown by the object database""" + class InvalidDBRoot(ODBError): """Thrown if an object database cannot be initialized at the given path""" + class BadObject(ODBError): """The object with the given SHA does not exist. Instantiate with the failed sha""" @@ -18,15 +32,26 @@ class BadObject(ODBError): def __str__(self): return "BadObject: %s" % to_hex_sha(self.args[0]) + +class BadName(ODBError): + """A name provided to rev_parse wasn't understood""" + + def __str__(self): + return "Ref '%s' did not resolve to an object" % self.args[0] + + class ParseError(ODBError): """Thrown if the parsing of a file failed due to an invalid format""" + class AmbiguousObjectName(ODBError): """Thrown if a possibly shortened name does not uniquely represent a single object in the database""" + class BadObjectType(ODBError): """The object had an unsupported type""" + class UnsupportedOperation(ODBError): """Thrown if the given operation cannot be supported by the object database""" diff --git a/gitdb/ext/smmap b/gitdb/ext/smmap index 28fd45e..8f82e6c 160000 --- a/gitdb/ext/smmap +++ b/gitdb/ext/smmap @@ -1 +1 @@ -Subproject commit 28fd45e0a7018f166820a5e00fce2ccb05ebdb61 +Subproject commit 8f82e6c19661f9b735cc55cc89031a189e408894 diff --git a/gitdb/fun.py b/gitdb/fun.py index b7662b4..a272e5c 100644 --- a/gitdb/fun.py +++ b/gitdb/fun.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Contains basic c-functions which usually contain performance critical code Keeping this code separate from the beginning makes it easier to out-source it into c later, if required""" @@ -16,7 +16,6 @@ from gitdb.const import NULL_BYTE, BYTE_SPACE from gitdb.utils.encoding import force_text -from gitdb.utils.compat import izip, buffer, xrange, PY3 from gitdb.typ import ( str_blob_type, str_commit_type, @@ -31,15 +30,15 @@ REF_DELTA = 7 delta_types = (OFS_DELTA, REF_DELTA) -type_id_to_type_map = { - 0 : b'', # EXT 1 - 1 : str_commit_type, - 2 : str_tree_type, - 3 : str_blob_type, - 4 : str_tag_type, - 5 : b'', # EXT 2 - OFS_DELTA : "OFS_DELTA", # OFFSET DELTA - REF_DELTA : "REF_DELTA" # REFERENCE DELTA +type_id_to_type_map = { + 0: b'', # EXT 1 + 1: str_commit_type, + 2: str_tree_type, + 3: str_blob_type, + 4: str_tag_type, + 5: b'', # EXT 2 + OFS_DELTA: "OFS_DELTA", # OFFSET DELTA + REF_DELTA: "REF_DELTA" # REFERENCE DELTA } type_to_type_id_map = { @@ -55,8 +54,8 @@ chunk_size = 1000 * mmap.PAGESIZE __all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info', - 'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data', - 'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header') + 'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data', + 'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header') #{ Structures @@ -72,6 +71,7 @@ def _set_delta_rbound(d, size): # MUST NOT DO THIS HERE return d + def _move_delta_lbound(d, bytes): """Move the delta by the given amount of bytes, reducing its size so that its right bound stays static @@ -89,19 +89,21 @@ def _move_delta_lbound(d, bytes): return d + def delta_duplicate(src): return DeltaChunk(src.to, src.ts, src.so, src.data) + def delta_chunk_apply(dc, bbuf, write): """Apply own data to the target 
buffer :param bbuf: buffer providing source bytes for copy operations :param write: write method to call with data to write""" if dc.data is None: # COPY DATA FROM SOURCE - write(buffer(bbuf, dc.so, dc.ts)) + write(bbuf[dc.so:dc.so + dc.ts]) else: # APPEND DATA - # whats faster: if + 4 function calls or just a write with a slice ? + # what's faster: if + 4 function calls or just a write with a slice ? # Considering data can be larger than 127 bytes now, it should be worth it if dc.ts < len(dc.data): write(dc.data[:dc.ts]) @@ -111,16 +113,17 @@ def delta_chunk_apply(dc, bbuf, write): # END handle chunk mode -class DeltaChunk(object): +class DeltaChunk: + """Represents a piece of a delta, it can either add new data, or copy existing one from a source buffer""" __slots__ = ( - 'to', # start offset in the target buffer in bytes + 'to', # start offset in the target buffer in bytes 'ts', # size of this chunk in the target buffer in bytes 'so', # start offset in the source buffer in bytes or None 'data', # chunk of bytes to be added to the target buffer, # DeltaChunkList to use as base, or None - ) + ) def __init__(self, to, ts, so, data): self.to = to @@ -142,6 +145,7 @@ def has_data(self): #} END interface + def _closest_index(dcl, absofs): """:return: index at which the given absofs should be inserted. The index points to the DeltaChunk with a target buffer absofs that equals or is greater than @@ -160,7 +164,8 @@ def _closest_index(dcl, absofs): lo = mid + 1 # END handle bound # END for each delta absofs - return len(dcl)-1 + return len(dcl) - 1 + def delta_list_apply(dcl, bbuf, write): """Apply the chain's changes and write the final result using the passed @@ -173,6 +178,7 @@ def delta_list_apply(dcl, bbuf, write): delta_chunk_apply(dc, bbuf, write) # END for each dc + def delta_list_slice(dcl, absofs, size, ndcl): """:return: Subsection of this list at the given absolute offset, with the given size in bytes. @@ -209,6 +215,7 @@ def delta_list_slice(dcl, absofs, size, ndcl): class DeltaChunkList(list): + """List with special functionality to deal with DeltaChunks. There are two types of lists we represent. 
The one was created bottom-up, working towards the latest delta, the other kind was created top-down, working from the @@ -252,16 +259,16 @@ def compress(self): dc = self[i] i += 1 if dc.data is None: - if first_data_index is not None and i-2-first_data_index > 1: - #if first_data_index is not None: + if first_data_index is not None and i - 2 - first_data_index > 1: + # if first_data_index is not None: nd = StringIO() # new data so = self[first_data_index].to # start offset in target buffer - for x in xrange(first_data_index, i-1): + for x in range(first_data_index, i - 1): xdc = self[x] nd.write(xdc.data[:xdc.ts]) # END collect data - del(self[first_data_index:i-1]) + del(self[first_data_index:i - 1]) buf = nd.getvalue() self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf)) @@ -274,10 +281,10 @@ def compress(self): # END skip non-data chunks if first_data_index is None: - first_data_index = i-1 + first_data_index = i - 1 # END iterate list - #if slen_orig != len(self): + # if slen_orig != len(self): # print "INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100) return self @@ -285,10 +292,10 @@ def check_integrity(self, target_size=-1): """Verify the list has non-overlapping chunks only, and the total size matches target_size :param target_size: if not -1, the total size of the chain must be target_size - :raise AssertionError: if the size doen't match""" + :raise AssertionError: if the size doesn't match""" if target_size > -1: assert self[-1].rbound() == target_size - assert reduce(lambda x,y: x+y, (d.ts for d in self), 0) == target_size + assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size # END target size verification if len(self) < 2: @@ -301,18 +308,19 @@ def check_integrity(self, target_size=-1): assert len(dc.data) >= dc.ts # END for each dc - left = islice(self, 0, len(self)-1) + left = islice(self, 0, len(self) - 1) right = iter(self) right.next() # this is very pythonic - we might have just use index based access here, # but this could actually be faster - for lft,rgt in izip(left, right): + for lft, rgt in zip(left, right): assert lft.rbound() == rgt.to assert lft.to + lft.ts == rgt.to # END for each pair class TopdownDeltaChunkList(DeltaChunkList): + """Represents a list which is generated by feeding its ancestor streams one by one""" __slots__ = tuple() @@ -323,7 +331,7 @@ def connect_with_next_base(self, bdcl): cannot be changed by any of the upcoming bases anymore. Once all our chunks are marked like that, we can stop all processing :param bdcl: data chunk list being one of our bases. They must be fed in - consequtively and in order, towards the earliest ancestor delta + consecutively and in order, towards the earliest ancestor delta :return: True if processing was done. Use it to abort processing of remaining streams if False is returned""" nfc = 0 # number of frozen chunks @@ -356,19 +364,19 @@ def connect_with_next_base(self, bdcl): # END update target bounds if len(ccl) == 1: - self[dci-1] = ccl[0] + self[dci - 1] = ccl[0] else: # maybe try to compute the expenses here, and pick the right algorithm # It would normally be faster than copying everything physically though # TODO: Use a deque here, and decide by the index whether to extend # or extend left ! 
post_dci = self[dci:] - del(self[dci-1:]) # include deletion of dc + del(self[dci - 1:]) # include deletion of dc self.extend(ccl) self.extend(post_dci) slen = len(self) - dci += len(ccl)-1 # deleted dc, added rest + dci += len(ccl) - 1 # deleted dc, added rest # END handle chunk replacement # END for each chunk @@ -391,6 +399,7 @@ def is_loose_object(m): word = (b0 << 8) + b1 return b0 == 0x78 and (word % 31) == 0 + def loose_object_header_info(m): """ :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the @@ -402,6 +411,7 @@ def loose_object_header_info(m): return type_name, int(size) + def pack_object_header_info(data): """ :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset) @@ -413,23 +423,16 @@ def pack_object_header_info(data): type_id = (c >> 4) & 7 # numeric type size = c & 15 # starting size s = 4 # starting bit-shift size - if PY3: - while c & 0x80: - c = data[i] - i += 1 - size += (c & 0x7f) << s - s += 7 - # END character loop - else: - while c & 0x80: - c = ord(data[i]) - i += 1 - size += (c & 0x7f) << s - s += 7 - # END character loop + while c & 0x80: + c = byte_ord(data[i]) + i += 1 + size += (c & 0x7f) << s + s += 7 + # END character loop # end performance at expense of maintenance ... return (type_id, size, i) + def create_pack_object_header(obj_type, obj_size): """ :return: string defining the pack header comprised of the object type @@ -438,31 +441,20 @@ def create_pack_object_header(obj_type, obj_size): :param obj_type: pack type_id of the object :param obj_size: uncompressed size in bytes of the following object stream""" c = 0 # 1 byte - if PY3: - hdr = bytearray() # output string - - c = (obj_type << 4) | (obj_size & 0xf) - obj_size >>= 4 - while obj_size: - hdr.append(c | 0x80) - c = obj_size & 0x7f - obj_size >>= 7 - #END until size is consumed - hdr.append(c) - else: - hdr = bytes() # output string - - c = (obj_type << 4) | (obj_size & 0xf) - obj_size >>= 4 - while obj_size: - hdr += chr(c | 0x80) - c = obj_size & 0x7f - obj_size >>= 7 - #END until size is consumed - hdr += chr(c) + hdr = bytearray() # output string + + c = (obj_type << 4) | (obj_size & 0xf) + obj_size >>= 4 + while obj_size: + hdr.append(c | 0x80) + c = obj_size & 0x7f + obj_size >>= 7 + # END until size is consumed + hdr.append(c) # end handle interpreter return hdr + def msb_size(data, offset=0): """ :return: tuple(read_bytes, size) read the msb size from the given random @@ -471,30 +463,20 @@ def msb_size(data, offset=0): i = 0 l = len(data) hit_msb = False - if PY3: - while i < l: - c = data[i+offset] - size |= (c & 0x7f) << i*7 - i += 1 - if not c & 0x80: - hit_msb = True - break - # END check msb bit - # END while in range - else: - while i < l: - c = ord(data[i+offset]) - size |= (c & 0x7f) << i*7 - i += 1 - if not c & 0x80: - hit_msb = True - break - # END check msb bit - # END while in range + while i < l: + c = data[i + offset] + size |= (c & 0x7f) << i * 7 + i += 1 + if not c & 0x80: + hit_msb = True + break + # END check msb bit + # END while in range # end performance ... 
if not hit_msb: raise AssertionError("Could not find terminating MSB byte in data stream") - return i+offset, size + return i + offset, size + def loose_object_header(type, size): """ @@ -502,6 +484,7 @@ def loose_object_header(type, size): followed by the content stream of size 'size'""" return ('%s %i\0' % (force_text(type), size)).encode('ascii') + def write_object(type, size, read, write, chunk_size=chunk_size): """ Write the object as identified by type, size and source_stream into the @@ -522,6 +505,7 @@ def write_object(type, size, read, write, chunk_size=chunk_size): return tbw + def stream_copy(read, write, size, chunk_size): """ Copy a stream up to size bytes using the provided read and write methods, @@ -532,7 +516,7 @@ def stream_copy(read, write, size, chunk_size): # WRITE ALL DATA UP TO SIZE while True: - cs = min(chunk_size, size-dbw) + cs = min(chunk_size, size - dbw) # NOTE: not all write methods return the amount of written bytes, like # mmap.write. Its bad, but we just deal with it ... perhaps its not # even less efficient @@ -548,6 +532,7 @@ def stream_copy(read, write, size, chunk_size): # END duplicate data return dbw + def connect_deltas(dstreams): """ Read the condensed delta chunk information from dstream and merge its information @@ -602,7 +587,7 @@ def connect_deltas(dstreams): rbound = cp_off + cp_size if (rbound < cp_size or - rbound > base_size): + rbound > base_size): break dcl.append(DeltaChunk(tbw, cp_size, cp_off, None)) @@ -610,7 +595,7 @@ def connect_deltas(dstreams): elif c: # NOTE: in C, the data chunks should probably be concatenated here. # In python, we do it as a post-process - dcl.append(DeltaChunk(tbw, c, 0, db[i:i+c])) + dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c])) i += c tbw += c else: @@ -632,106 +617,62 @@ def connect_deltas(dstreams): return tdcl + def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write): """ Apply data from a delta buffer using a source buffer to the target file :param src_buf: random access data from which the delta was created :param src_buf_size: size of the source buffer in bytes - :param delta_buf_size: size fo the delta buffer in bytes + :param delta_buf_size: size for the delta buffer in bytes :param delta_buf: random access delta data :param write: write method taking a chunk of bytes **Note:** transcribed to python from the similar routine in patch-delta.c""" i = 0 db = delta_buf - if PY3: - while i < delta_buf_size: - c = db[i] - i += 1 - if c & 0x80: - cp_off, cp_size = 0, 0 - if (c & 0x01): - cp_off = db[i] - i += 1 - if (c & 0x02): - cp_off |= (db[i] << 8) - i += 1 - if (c & 0x04): - cp_off |= (db[i] << 16) - i += 1 - if (c & 0x08): - cp_off |= (db[i] << 24) - i += 1 - if (c & 0x10): - cp_size = db[i] - i += 1 - if (c & 0x20): - cp_size |= (db[i] << 8) - i += 1 - if (c & 0x40): - cp_size |= (db[i] << 16) - i += 1 - - if not cp_size: - cp_size = 0x10000 - - rbound = cp_off + cp_size - if (rbound < cp_size or + while i < delta_buf_size: + c = db[i] + i += 1 + if c & 0x80: + cp_off, cp_size = 0, 0 + if (c & 0x01): + cp_off = db[i] + i += 1 + if (c & 0x02): + cp_off |= (db[i] << 8) + i += 1 + if (c & 0x04): + cp_off |= (db[i] << 16) + i += 1 + if (c & 0x08): + cp_off |= (db[i] << 24) + i += 1 + if (c & 0x10): + cp_size = db[i] + i += 1 + if (c & 0x20): + cp_size |= (db[i] << 8) + i += 1 + if (c & 0x40): + cp_size |= (db[i] << 16) + i += 1 + + if not cp_size: + cp_size = 0x10000 + + rbound = cp_off + cp_size + if (rbound < cp_size or rbound > src_buf_size): - break - write(buffer(src_buf, 
cp_off, cp_size)) - elif c: - write(db[i:i+c]) - i += c - else: - raise ValueError("unexpected delta opcode 0") - # END handle command byte - # END while processing delta data - else: - while i < delta_buf_size: - c = ord(db[i]) - i += 1 - if c & 0x80: - cp_off, cp_size = 0, 0 - if (c & 0x01): - cp_off = ord(db[i]) - i += 1 - if (c & 0x02): - cp_off |= (ord(db[i]) << 8) - i += 1 - if (c & 0x04): - cp_off |= (ord(db[i]) << 16) - i += 1 - if (c & 0x08): - cp_off |= (ord(db[i]) << 24) - i += 1 - if (c & 0x10): - cp_size = ord(db[i]) - i += 1 - if (c & 0x20): - cp_size |= (ord(db[i]) << 8) - i += 1 - if (c & 0x40): - cp_size |= (ord(db[i]) << 16) - i += 1 - - if not cp_size: - cp_size = 0x10000 - - rbound = cp_off + cp_size - if (rbound < cp_size or - rbound > src_buf_size): - break - write(buffer(src_buf, cp_off, cp_size)) - elif c: - write(db[i:i+c]) - i += c - else: - raise ValueError("unexpected delta opcode 0") - # END handle command byte - # END while processing delta data - # end save byte_ord call and prevent performance regression in py2 + break + write(src_buf[cp_off:cp_off + cp_size]) + elif c: + write(db[i:i + c]) + i += c + else: + raise ValueError("unexpected delta opcode 0") + # END handle command byte + # END while processing delta data # yes, lets use the exact same error message that git uses :) assert i == delta_buf_size, "delta replay has gone wild" @@ -749,7 +690,7 @@ def is_equal_canonical_sha(canonical_length, match, sha1): return False if canonical_length - binary_length and \ - (byte_ord(match[-1]) ^ byte_ord(sha1[len(match)-1])) & 0xf0: + (byte_ord(match[-1]) ^ byte_ord(sha1[len(match) - 1])) & 0xf0: return False # END handle uneven canonnical length return True @@ -758,6 +699,6 @@ def is_equal_canonical_sha(canonical_length, match, sha1): try: - from _perf import connect_deltas + from gitdb_speedups._perf import connect_deltas except ImportError: pass diff --git a/gitdb/pack.py b/gitdb/pack.py index 375cc59..e559e11 100644 --- a/gitdb/pack.py +++ b/gitdb/pack.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Contains PackIndexFile and PackFile implementations""" import zlib @@ -35,7 +35,7 @@ ) try: - from _perf import PackIndexFile_sha_to_index + from gitdb_speedups._perf import PackIndexFile_sha_to_index except ImportError: pass # END try c module @@ -62,7 +62,6 @@ from binascii import crc32 from gitdb.const import NULL_BYTE -from gitdb.utils.compat import izip, buffer, xrange import tempfile import array @@ -72,8 +71,6 @@ __all__ = ('PackIndexFile', 'PackFile', 'PackEntity') - - #{ Utilities def pack_object_at(cursor, offset, as_stream): @@ -82,7 +79,7 @@ def pack_object_at(cursor, offset, as_stream): an object of the correct type according to the type_id of the object. If as_stream is True, the object will contain a stream, allowing the data to be read decompressed. 
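``msb_size()`` and ``pack_object_header_info()`` above both read sizes as little-endian base-128 varints: each byte contributes its low seven bits and the high bit says whether another byte follows (the pack object header additionally squeezes the type id and the first four size bits into its first byte). A worked decode of the plain variant, as a standalone example rather than the gitdb functions themselves::

    def decode_varint(data, offset=0):
        """Decode the 7-bits-per-byte size prefix used by delta streams.

        Returns (next_offset, size); standalone re-implementation for
        illustration, equivalent in spirit to msb_size().
        """
        size = 0
        shift = 0
        i = offset
        while True:
            c = data[i]
            i += 1
            size |= (c & 0x7F) << shift
            shift += 7
            if not c & 0x80:
                return i, size

    # 0x91 -> low 7 bits 0x11, continuation bit set; 0x02 -> 0x02, stop.
    # size = 0x11 | (0x02 << 7) = 17 + 256 = 273
    assert decode_varint(bytes([0x91, 0x02])) == (2, 273)

A delta stream starts with two such varints, source size then target size, which is exactly what ``msb_size`` is used for later in this patch.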
- :param data: random accessable data containing all required information + :param data: random accessible data containing all required information :parma offset: offset in to the data at which the object information is located :param as_stream: if True, a stream object will be returned that can read the data, otherwise you receive an info object only""" @@ -107,7 +104,7 @@ def pack_object_at(cursor, offset, as_stream): total_rela_offset = i # REF DELTA elif type_id == REF_DELTA: - total_rela_offset = data_rela_offset+20 + total_rela_offset = data_rela_offset + 20 delta_info = data[data_rela_offset:total_rela_offset] # BASE OBJECT else: @@ -116,7 +113,7 @@ def pack_object_at(cursor, offset, as_stream): # END handle type id abs_data_offset = offset + total_rela_offset if as_stream: - stream = DecompressMemMapReader(buffer(data, total_rela_offset), False, uncomp_size) + stream = DecompressMemMapReader(data[total_rela_offset:], False, uncomp_size) if delta_info is None: return abs_data_offset, OPackStream(offset, type_id, uncomp_size, stream) else: @@ -129,6 +126,7 @@ def pack_object_at(cursor, offset, as_stream): # END handle info # END handle stream + def write_stream_to_pack(read, write, zstream, base_crc=None): """Copy a stream as read from read function, zip it, and write the result. Count the number of written bytes and return it @@ -142,7 +140,7 @@ def write_stream_to_pack(read, write, zstream, base_crc=None): crc = 0 if want_crc: crc = base_crc - #END initialize crc + # END initialize crc while True: chunk = read(chunk_size) @@ -153,18 +151,18 @@ def write_stream_to_pack(read, write, zstream, base_crc=None): if want_crc: crc = crc32(compressed, crc) - #END handle crc + # END handle crc if len(chunk) != chunk_size: break - #END copy loop + # END copy loop compressed = zstream.flush() bw += len(compressed) write(compressed) if want_crc: crc = crc32(compressed, crc) - #END handle crc + # END handle crc return (br, bw, crc) @@ -172,7 +170,8 @@ def write_stream_to_pack(read, write, zstream, base_crc=None): #} END utilities -class IndexWriter(object): +class IndexWriter: + """Utility to cache index information, allowing to write all information later in one go to the given stream **Note:** currently only writes v2 indices""" @@ -198,15 +197,15 @@ def write(self, pack_sha, write): sha_write(pack(">L", PackIndexFile.index_version_default)) # fanout - tmplist = list((0,)*256) # fanout or list with 64 bit offsets + tmplist = list((0,) * 256) # fanout or list with 64 bit offsets for t in self._objs: tmplist[byte_ord(t[0][0])] += 1 - #END prepare fanout - for i in xrange(255): + # END prepare fanout + for i in range(255): v = tmplist[i] sha_write(pack('>L', v)) - tmplist[i+1] += v - #END write each fanout entry + tmplist[i + 1] += v + # END write each fanout entry sha_write(pack('>L', tmplist[255])) # sha1 ordered @@ -215,8 +214,8 @@ def write(self, pack_sha, write): # crc32 for t in self._objs: - sha_write(pack('>L', t[1]&0xffffffff)) - #END for each crc + sha_write(pack('>L', t[1] & 0xffffffff)) + # END for each crc tmplist = list() # offset 32 @@ -224,15 +223,15 @@ def write(self, pack_sha, write): ofs = t[2] if ofs > 0x7fffffff: tmplist.append(ofs) - ofs = 0x80000000 + len(tmplist)-1 - #END hande 64 bit offsets - sha_write(pack('>L', ofs&0xffffffff)) - #END for each offset + ofs = 0x80000000 + len(tmplist) - 1 + # END handle 64 bit offsets + sha_write(pack('>L', ofs & 0xffffffff)) + # END for each offset # offset 64 for ofs in tmplist: sha_write(pack(">Q", ofs)) - #END for each offset + # END 
for each offset # trailer assert(len(pack_sha) == 20) @@ -242,8 +241,8 @@ def write(self, pack_sha, write): return sha - class PackIndexFile(LazyMixin): + """A pack index provides offsets into the corresponding pack, allowing to find locations for offsets faster.""" @@ -258,9 +257,13 @@ class PackIndexFile(LazyMixin): index_version_default = 2 def __init__(self, indexpath): - super(PackIndexFile, self).__init__() + super().__init__() self._indexpath = indexpath + def close(self): + mman.force_map_handle_removal_win(self._indexpath) + self._cursor = None + def _set_cache_(self, attr): if attr == "_packfile_checksum": self._packfile_checksum = self._cursor.map()[-40:-20] @@ -273,8 +276,9 @@ def _set_cache_(self, attr): self._cursor = mman.make_cursor(self._indexpath).use_region() # We will assume that the index will always fully fit into memory ! if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size(): - raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % (self._indexpath, self._cursor.file_size(), mman.window_size())) - #END assert window size + raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % ( + self._indexpath, self._cursor.file_size(), mman.window_size())) + # END assert window size else: # now its time to initialize everything - if we are here, someone wants # to access the fanout table or related properties @@ -293,27 +297,25 @@ def _set_cache_(self, attr): setattr(self, fname, getattr(self, "_%s_v%i" % (fname, self._version))) # END for each function to initialize - # INITIALIZE DATA # byte offset is 8 if version is 2, 0 otherwise self._initialize() # END handle attributes - #{ Access V1 def _entry_v1(self, i): """:return: tuple(offset, binsha, 0)""" - return unpack_from(">L20s", self._cursor.map(), 1024 + i*24) + (0, ) + return unpack_from(">L20s", self._cursor.map(), 1024 + i * 24) + (0, ) def _offset_v1(self, i): """see ``_offset_v2``""" - return unpack_from(">L", self._cursor.map(), 1024 + i*24)[0] + return unpack_from(">L", self._cursor.map(), 1024 + i * 24)[0] def _sha_v1(self, i): """see ``_sha_v2``""" - base = 1024 + (i*24)+4 - return self._cursor.map()[base:base+20] + base = 1024 + (i * 24) + 4 + return self._cursor.map()[base:base + 20] def _crc_v1(self, i): """unsupported""" @@ -343,7 +345,7 @@ def _offset_v2(self, i): def _sha_v2(self, i): """:return: sha at the given index of this file index instance""" base = self._sha_list_offset + i * 20 - return self._cursor.map()[base:base+20] + return self._cursor.map()[base:base + 20] def _crc_v2(self, i): """:return: 4 bytes crc for the object at index i""" @@ -368,8 +370,8 @@ def _read_fanout(self, byte_offset): d = self._cursor.map() out = list() append = out.append - for i in xrange(256): - append(unpack_from('>L', d, byte_offset + i*4)[0]) + for i in range(256): + append(unpack_from('>L', d, byte_offset + i * 4)[0]) # END for each entry return out @@ -402,14 +404,14 @@ def offsets(self): if self._version == 2: # read stream to array, convert to tuple a = array.array('I') # 4 byte unsigned int, long are 8 byte on 64 bit it appears - a.fromstring(buffer(self._cursor.map(), self._pack_offset, self._pack_64_offset - self._pack_offset)) + a.frombytes(self._cursor.map()[self._pack_offset:self._pack_64_offset]) # networkbyteorder to something array likes more if sys.byteorder == 'little': a.byteswap() return a else: - return 
tuple(self.offset(index) for index in xrange(self.size())) + return tuple(self.offset(index) for index in range(self.size())) # END handle version def sha_to_index(self, sha): @@ -421,7 +423,7 @@ def sha_to_index(self, sha): get_sha = self.sha lo = 0 # lower index, the left bound of the bisection if first_byte != 0: - lo = self._fanout_table[first_byte-1] + lo = self._fanout_table[first_byte - 1] hi = self._fanout_table[first_byte] # the upper, right bound of the bisection # bisect until we have the sha @@ -443,7 +445,7 @@ def partial_sha_to_index(self, partial_bin_sha, canonical_length): :return: index as in `sha_to_index` or None if the sha was not found in this index file :param partial_bin_sha: an at least two bytes of a partial binary sha as bytes - :param canonical_length: lenght of the original hexadecimal representation of the + :param canonical_length: length of the original hexadecimal representation of the given partial binary sha :raise AmbiguousObjectName:""" if len(partial_bin_sha) < 2: @@ -455,7 +457,7 @@ def partial_sha_to_index(self, partial_bin_sha, canonical_length): get_sha = self.sha lo = 0 # lower index, the left bound of the bisection if first_byte != 0: - lo = self._fanout_table[first_byte-1] + lo = self._fanout_table[first_byte - 1] hi = self._fanout_table[first_byte] # the upper, right bound of the bisection # fill the partial to full 20 bytes @@ -481,7 +483,7 @@ def partial_sha_to_index(self, partial_bin_sha, canonical_length): if is_equal_canonical_sha(canonical_length, partial_bin_sha, cur_sha): next_sha = None if lo + 1 < self.size(): - next_sha = get_sha(lo+1) + next_sha = get_sha(lo + 1) if next_sha and next_sha == cur_sha: raise AmbiguousObjectName(partial_bin_sha) return lo @@ -500,10 +502,11 @@ def sha_to_index(self, sha): class PackFile(LazyMixin): + """A pack is a file written according to the Version 2 for git packs As we currently use memory maps, it could be assumed that the maximum size of - packs therefor is 32 bit on 32 bit systems. On 64 bit systems, this should be + packs therefore is 32 bit on 32 bit systems. On 64 bit systems, this should be fine though. 
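``sha_to_index()`` above leans on the index's 256-entry fanout table: entry ``b`` holds how many shas start with a byte ``<= b``, so the pair ``fanout[b-1]``/``fanout[b]`` brackets the bucket that has to be bisected. The same two-step lookup over plain in-memory lists, with illustrative names and no mapped .idx file::

    def find_sha_index(sorted_shas, fanout, sha):
        """Locate ``sha`` in ``sorted_shas`` (sorted 20-byte shas) using a
        git-style fanout table; returns the index or None. Sketch only."""
        first_byte = sha[0]
        lo = fanout[first_byte - 1] if first_byte else 0
        hi = fanout[first_byte]          # exclusive upper bound of the bucket
        while lo < hi:
            mid = (lo + hi) // 2
            cur = sorted_shas[mid]
            if cur == sha:
                return mid
            if cur < sha:
                lo = mid + 1
            else:
                hi = mid
        return None

    shas = sorted(bytes([b]) + bytes(19) for b in (3, 7, 9, 200))
    fanout = [sum(1 for s in shas if s[0] <= b) for b in range(256)]
    assert find_sha_index(shas, fanout, shas[2]) == 2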
**Note:** at some point, this might be implemented using streams as well, or @@ -516,12 +519,16 @@ class PackFile(LazyMixin): pack_version_default = 2 # offset into our data at which the first object starts - first_object_offset = 3*4 # header bytes + first_object_offset = 3 * 4 # header bytes footer_size = 20 # final sha def __init__(self, packpath): self._packpath = packpath + def close(self): + mman.force_map_handle_removal_win(self._packpath) + self._cursor = None + def _set_cache_(self, attr): # we fill the whole cache, whichever attribute gets queried first self._cursor = mman.make_cursor(self._packpath).use_region() @@ -547,9 +554,9 @@ def _iter_objects(self, start_offset, as_stream=True): # the amount of compressed bytes we need to get to the next offset stream_copy(ostream.read, null.write, ostream.size, chunk_size) + assert ostream.stream._br == ostream.size cur_offset += (data_offset - ostream.pack_offset) + ostream.stream.compressed_bytes_read() - # if a stream is requested, reset it beforehand # Otherwise return the Stream object directly, its derived from the # info object @@ -578,7 +585,7 @@ def data(self): def checksum(self): """:return: 20 byte sha1 hash on all object sha's contained in this file""" - return self._cursor.use_region(self._cursor.file_size()-20).buffer()[:] + return self._cursor.use_region(self._cursor.file_size() - 20).buffer()[:] def path(self): """:return: path to the packfile""" @@ -645,13 +652,14 @@ def stream_iter(self, start_offset=0): class PackEntity(LazyMixin): + """Combines the PackIndexFile and the PackFile into one, allowing the actual objects to be resolved and iterated""" - __slots__ = ( '_index', # our index file - '_pack', # our pack file - '_offset_map' # on demand dict mapping one offset to the next consecutive one - ) + __slots__ = ('_index', # our index file + '_pack', # our pack file + '_offset_map' # on demand dict mapping one offset to the next consecutive one + ) IndexFileCls = PackIndexFile PackFileCls = PackFile @@ -662,6 +670,10 @@ def __init__(self, pack_or_index_path): self._index = self.IndexFileCls("%s.idx" % basename) # PackIndexFile instance self._pack = self.PackFileCls("%s.pack" % basename) # corresponding PackFile instance + def close(self): + self._index.close() + self._pack.close() + def _set_cache_(self, attr): # currently this can only be _offset_map # TODO: make this a simple sorted offset array which can be bisected @@ -673,12 +685,12 @@ def _set_cache_(self, attr): offset_map = None if len(offsets_sorted) == 1: - offset_map = { offsets_sorted[0] : last_offset } + offset_map = {offsets_sorted[0]: last_offset} else: iter_offsets = iter(offsets_sorted) iter_offsets_plus_one = iter(offsets_sorted) next(iter_offsets_plus_one) - consecutive = izip(iter_offsets, iter_offsets_plus_one) + consecutive = zip(iter_offsets, iter_offsets_plus_one) offset_map = dict(consecutive) @@ -698,7 +710,7 @@ def _iter_objects(self, as_stream): """Iterate over all objects in our index and yield their OInfo or OStream instences""" _sha = self._index.sha _object = self._object - for index in xrange(self._index.size()): + for index in range(self._index.size()): yield _object(_sha(index), as_stream, index) # END for each index @@ -820,7 +832,7 @@ def is_valid_stream(self, sha, use_crc=False): while cur_pos < next_offset: rbound = min(cur_pos + chunk_size, next_offset) size = rbound - cur_pos - this_crc_value = crc_update(buffer(pack_data, cur_pos, size), this_crc_value) + this_crc_value = crc_update(pack_data[cur_pos:cur_pos + size], 
this_crc_value) cur_pos += size # END window size loop @@ -864,7 +876,11 @@ def collect_streams_at_offset(self, offset): stream = streams[-1] while stream.type_id in delta_types: if stream.type_id == REF_DELTA: - sindex = self._index.sha_to_index(stream.delta_info) + # smmap can return memory view objects, which can't be compared as buffers/bytes can ... + if isinstance(stream.delta_info, memoryview): + sindex = self._index.sha_to_index(stream.delta_info.tobytes()) + else: + sindex = self._index.sha_to_index(stream.delta_info) if sindex is None: break stream = self._pack.stream(self._index.offset(sindex)) @@ -895,10 +911,9 @@ def collect_streams(self, sha): :raise BadObject:""" return self.collect_streams_at_offset(self._index.offset(self._sha_to_index(sha))) - @classmethod def write_pack(cls, object_iter, pack_write, index_write=None, - object_count = None, zlib_compression = zlib.Z_BEST_SPEED): + object_count=None, zlib_compression=zlib.Z_BEST_SPEED): """ Create a new pack by putting all objects obtained by the object_iterator into a pack which is written using the pack_write method. @@ -923,9 +938,9 @@ def write_pack(cls, object_iter, pack_write, index_write=None, if not object_count: if not isinstance(object_iter, (tuple, list)): objs = list(object_iter) - #END handle list type + # END handle list type object_count = len(objs) - #END handle object + # END handle object pack_writer = FlexibleSha1Writer(pack_write) pwrite = pack_writer.write @@ -939,7 +954,7 @@ def write_pack(cls, object_iter, pack_write, index_write=None, if wants_index: index = IndexWriter() - #END handle index header + # END handle index header actual_count = 0 for obj in objs: @@ -952,30 +967,31 @@ def write_pack(cls, object_iter, pack_write, index_write=None, crc = crc32(hdr) else: crc = None - #END handle crc + # END handle crc pwrite(hdr) # data stream zstream = zlib.compressobj(zlib_compression) ostream = obj.stream - br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, base_crc = crc) + br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, base_crc=crc) assert(br == obj.size) if wants_index: index.append(obj.binsha, crc, ofs) - #END handle index + # END handle index ofs += len(hdr) + bw if actual_count == object_count: break - #END abort once we are done - #END for each object + # END abort once we are done + # END for each object if actual_count != object_count: - raise ValueError("Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count)) - #END count assertion + raise ValueError( + "Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count)) + # END count assertion # write footer - pack_sha = pack_writer.sha(as_hex = False) + pack_sha = pack_writer.sha(as_hex=False) assert len(pack_sha) == 20 pack_write(pack_sha) ofs += len(pack_sha) # just for completeness ;) @@ -983,12 +999,12 @@ def write_pack(cls, object_iter, pack_write, index_write=None, index_sha = None if wants_index: index_sha = index.write(pack_sha, index_write) - #END handle index + # END handle index return pack_sha, index_sha @classmethod - def create(cls, object_iter, base_dir, object_count = None, zlib_compression = zlib.Z_BEST_SPEED): + def create(cls, object_iter, base_dir, object_count=None, zlib_compression=zlib.Z_BEST_SPEED): """Create a new on-disk entity comprised of a properly named pack file and a properly named and corresponding index file. The pack contains all OStream objects contained in object iter. 
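``write_pack()`` above frames a pack as: a fixed 12-byte header, then per object a varint-style header followed by a raw zlib stream, with everything fed through a SHA-1 whose digest becomes the 20-byte trailer. A toy version of just that framing for undeltified blobs, under the assumption that the standard ``PACK``/version/count header is what the elided part of the hunk writes (``write_toy_pack`` and ``encode_obj_header`` are illustrative, and CRC bookkeeping plus index generation are skipped)::

    import hashlib
    import struct
    import zlib

    def encode_obj_header(type_id, size):
        """Pack object header: type in bits 4-6 of the first byte, four size
        bits, then 7-bit continuation bytes (same scheme as
        create_pack_object_header earlier in this patch)."""
        out = bytearray()
        c = (type_id << 4) | (size & 0x0F)
        size >>= 4
        while size:
            out.append(c | 0x80)
            c = size & 0x7F
            size >>= 7
        out.append(c)
        return bytes(out)

    def write_toy_pack(blobs, write):
        """Write a minimal version-2 pack holding ``blobs`` as plain blob
        objects (type id 3). Toy framing only -- no CRC32s, no .idx file,
        no delta objects."""
        sha = hashlib.sha1()
        def out(data):
            sha.update(data)
            write(data)
        out(struct.pack(">4sLL", b"PACK", 2, len(blobs)))  # assumed 12 byte header
        for blob in blobs:
            out(encode_obj_header(3, len(blob)))           # 3 == blob type id
            out(zlib.compress(blob))                       # compressed payload
        write(sha.digest())                                # 20 byte trailer

    # e.g. with open("toy.pack", "wb") as fp: write_toy_pack([b"hello\n"], fp.write)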
:param base_dir: directory which is to contain the files @@ -1012,5 +1028,4 @@ def create(cls, object_iter, base_dir, object_count = None, zlib_compression = z return cls(new_pack_path) - #} END interface diff --git a/gitdb/stream.py b/gitdb/stream.py index edd6dd2..1e0be84 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ from io import BytesIO @@ -27,25 +27,24 @@ ) from gitdb.const import NULL_BYTE, BYTE_SPACE -from gitdb.utils.compat import buffer from gitdb.utils.encoding import force_bytes has_perf_mod = False -PY26 = sys.version_info[:2] < (2, 7) try: - from _perf import apply_delta as c_apply_delta + from gitdb_speedups._perf import apply_delta as c_apply_delta has_perf_mod = True except ImportError: pass -__all__ = ( 'DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader', - 'Sha1Writer', 'FlexibleSha1Writer', 'ZippedStoreShaWriter', 'FDCompressedSha1Writer', - 'FDStream', 'NullStream') +__all__ = ('DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader', + 'Sha1Writer', 'FlexibleSha1Writer', 'ZippedStoreShaWriter', 'FDCompressedSha1Writer', + 'FDStream', 'NullStream') #{ RO Streams class DecompressMemMapReader(LazyMixin): + """Reads data in chunks from a memory map and decompresses it. The client sees only the uncompressed data, respective file-like read calls are handling on-demand buffered decompression accordingly @@ -61,11 +60,11 @@ class DecompressMemMapReader(LazyMixin): hence we try to find a good tradeoff between allocation time and number of times we actually allocate. An own zlib implementation would be good here to better support streamed reading - it would only need to keep the mmap - and decompress it into chunks, thats all ... """ + and decompress it into chunks, that's all ... """ __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close', - '_cbr', '_phi') + '_cbr', '_phi') - max_read_size = 512*1024 # currently unused + max_read_size = 512 * 1024 # currently unused def __init__(self, m, close_on_deletion, size=None): """Initialize with mmap for stream reading @@ -90,9 +89,7 @@ def _set_cache_(self, attr): self._parse_header_info() def __del__(self): - if self._close: - self._m.close() - # END handle resource freeing + self.close() def _parse_header_info(self): """If this stream contains object data, parse the header info and skip the @@ -100,7 +97,9 @@ def _parse_header_info(self): :return: parsed type_string, size""" # read header - maxb = 512 # should really be enough, cgit uses 8192 I believe + # should really be enough, cgit uses 8192 I believe + # And for good reason !! This needs to be that high for the header to be read correctly in all cases + maxb = 8192 self._s = maxb hdr = self.read(maxb) hdrend = hdr.find(NULL_BYTE) @@ -127,7 +126,7 @@ def new(self, m, close_on_deletion=False): This method parses the object header from m and returns the parsed type and size, as well as the created stream instance. - :param m: memory map on which to oparate. It must be object data ( header + contents ) + :param m: memory map on which to operate. 
It must be object data ( header + contents ) :param close_on_deletion: if True, the memory map will be closed once we are being deleted""" inst = DecompressMemMapReader(m, close_on_deletion, 0) @@ -138,6 +137,17 @@ def data(self): """:return: random access compatible data we are working on""" return self._m + def close(self): + """Close our underlying stream of compressed bytes if this was allowed during initialization + :return: True if we closed the underlying stream + :note: can be called safely + """ + if self._close: + if hasattr(self._m, 'close'): + self._m.close() + self._close = False + # END handle resource freeing + def compressed_bytes_read(self): """ :return: number of compressed bytes read. This includes the bytes it @@ -163,7 +173,7 @@ def compressed_bytes_read(self): # Only scrub the stream forward if we are officially done with the # bytes we were to have. if self._br == self._s and not self._zip.unused_data: - # manipulate the bytes-read to allow our own read method to coninute + # manipulate the bytes-read to allow our own read method to continue # but keep the window at its current position self._br = 0 if hasattr(self._zip, 'status'): @@ -209,14 +219,13 @@ def read(self, size=-1): # END clamp size if size == 0: - return bytes() + return b'' # END handle depletion - # deplete the buffer, then just continue using the decompress object # which has an own buffer. We just need this to transparently parse the # header from the zlib stream - dat = bytes() + dat = b'' if self._buf: if self._buflen >= size: # have enough data @@ -243,7 +252,7 @@ def read(self, size=-1): # moving the window into the memory map along as we decompress, which keeps # the tail smaller than our chunk-size. This causes 'only' the chunk to be # copied once, and another copy of a part of it when it creates the unconsumed - # tail. We have to use it to hand in the appropriate amount of bytes durin g + # tail. We have to use it to hand in the appropriate amount of bytes during # the next read. tail = self._zip.unconsumed_tail if tail: @@ -261,14 +270,13 @@ def read(self, size=-1): self._cwe = cws + size # END handle tail - # if window is too small, make it larger so zip can decompress something if self._cwe - self._cws < 8: self._cwe = self._cws + 8 # END adjust winsize # takes a slice, but doesn't copy the data, it says ... - indata = buffer(self._m, self._cws, self._cwe - self._cws) + indata = self._m[self._cws:self._cwe] # get the actual window end to be sure we don't use it for computations self._cwe = self._cws + len(indata) @@ -279,11 +287,19 @@ def read(self, size=-1): # if we hit the end of the stream # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly. # They are thorough, and I assume it is truly working. - if PY26: + # Why is this logic as convoluted as it is ? Please look at the table in + # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results. + # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch + # will be the one that works. + # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the + # table in the github issue. This is it ... it was the only way I could make this work everywhere. + # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... . 
+ if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin': unused_datalen = len(self._zip.unconsumed_tail) else: unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data) - # end handle very special case ... + # # end handle very special case ... + self._cbr += len(indata) - unused_datalen self._br += len(dcompdat) @@ -298,12 +314,13 @@ def read(self, size=-1): # to read, if we are called by compressed_bytes_read - it manipulates # us to empty the stream if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s: - dcompdat += self.read(size-len(dcompdat)) + dcompdat += self.read(size - len(dcompdat)) # END handle special case return dcompdat class DeltaApplyReader(LazyMixin): + """A reader which dynamically applies pack deltas to a base object, keeping the memory demands to a minimum. @@ -329,15 +346,15 @@ class DeltaApplyReader(LazyMixin): * cmd == 0 - invalid operation ( or error in delta stream ) """ __slots__ = ( - "_bstream", # base stream to which to apply the deltas - "_dstreams", # tuple of delta stream readers - "_mm_target", # memory map of the delta-applied data - "_size", # actual number of bytes in _mm_target - "_br" # number of bytes read - ) + "_bstream", # base stream to which to apply the deltas + "_dstreams", # tuple of delta stream readers + "_mm_target", # memory map of the delta-applied data + "_size", # actual number of bytes in _mm_target + "_br" # number of bytes read + ) #{ Configuration - k_max_memory_move = 250*1000*1000 + k_max_memory_move = 250 * 1000 * 1000 #} END configuration def __init__(self, stream_list): @@ -362,7 +379,6 @@ def _set_cache_too_slow_without_c(self, attr): # Aggregate all deltas into one delta in reverse order. Hence we take # the last delta, and reverse-merge its ancestor delta, until we receive # the final delta data stream. - # print "Handling %i delta streams, sizes: %s" % (len(self._dstreams), [ds.size for ds in self._dstreams]) dcl = connect_deltas(self._dstreams) # call len directly, as the (optional) c version doesn't implement the sequence @@ -396,7 +412,7 @@ def _set_cache_brute_(self, attr): buf = dstream.read(512) # read the header information + X offset, src_size = msb_size(buf) offset, target_size = msb_size(buf, offset) - buffer_info_list.append((buffer(buf, offset), offset, src_size, target_size)) + buffer_info_list.append((buf[offset:], offset, src_size, target_size)) max_target_size = max(max_target_size, target_size) # END for each delta stream @@ -411,7 +427,6 @@ def _set_cache_brute_(self, attr): base_size = target_size = max(base_size, max_target_size) # END adjust buffer sizes - # Allocate private memory map big enough to hold the first base buffer # We need random access to it bbuf = allocate_memory(base_size) @@ -437,11 +452,11 @@ def _set_cache_brute_(self, attr): ddata = allocate_memory(dstream.size - offset) ddata.write(dbuf) # read the rest from the stream. 
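The window juggling in ``DecompressMemMapReader.read()`` above boils down to one zlib feature: ``decompressobj().decompress(data, max_length)`` stops after ``max_length`` output bytes and parks whatever input it did not touch in ``unconsumed_tail``, so the reader can keep both the input window and the output chunk small. A compact sketch of that mechanism on an ordinary bytes object (``stream_decompress`` is an illustrative generator, not gitdb code)::

    import zlib

    def stream_decompress(compressed, chunk_in=256, chunk_out=1024):
        """Yield decompressed chunks without inflating everything at once."""
        zobj = zlib.decompressobj()
        pos = 0
        pending = b""
        while True:
            if not pending:                       # window is used up, slide it
                if pos >= len(compressed):
                    break
                pending = compressed[pos:pos + chunk_in]
                pos += chunk_in
            out = zobj.decompress(pending, chunk_out)
            pending = zobj.unconsumed_tail        # input zlib has not touched yet
            if out:
                yield out
            if zobj.eof:
                break

    payload = b"x" * 10000
    assert b"".join(stream_decompress(zlib.compress(payload))) == payload

The real reader additionally looks at ``unused_data`` to report how many compressed bytes belong to the current object, which is where the zlib-version special case above comes from.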
The size we give is larger than necessary - stream_copy(dstream.read, ddata.write, dstream.size, 256*mmap.PAGESIZE) + stream_copy(dstream.read, ddata.write, dstream.size, 256 * mmap.PAGESIZE) ####################################################################### if 'c_apply_delta' in globals(): - c_apply_delta(bbuf, ddata, tbuf); + c_apply_delta(bbuf, ddata, tbuf) else: apply_delta_data(bbuf, src_size, ddata, len(ddata), tbuf.write) ####################################################################### @@ -460,7 +475,6 @@ def _set_cache_brute_(self, attr): self._mm_target = bbuf self._size = final_target_size - #{ Configuration if not has_perf_mod: _set_cache_ = _set_cache_brute_ @@ -509,13 +523,13 @@ def new(cls, stream_list): # END single object special handling if stream_list[-1].type_id in delta_types: - raise ValueError("Cannot resolve deltas if there is no base object stream, last one was type: %s" % stream_list[-1].type) + raise ValueError( + "Cannot resolve deltas if there is no base object stream, last one was type: %s" % stream_list[-1].type) # END check stream return cls(stream_list) #} END interface - #{ OInfo like Interface @property @@ -539,7 +553,8 @@ def size(self): #{ W Streams -class Sha1Writer(object): +class Sha1Writer: + """Simple stream writer which produces a sha whenever you like as it degests everything it is supposed to write""" __slots__ = "sha1" @@ -562,7 +577,7 @@ def write(self, data): #{ Interface - def sha(self, as_hex = False): + def sha(self, as_hex=False): """:return: sha so far :param as_hex: if True, sha will be hex-encoded, binary otherwise""" if as_hex: @@ -573,6 +588,7 @@ def sha(self, as_hex = False): class FlexibleSha1Writer(Sha1Writer): + """Writer producing a sha1 while passing on the written bytes to the given write function""" __slots__ = 'writer' @@ -587,8 +603,10 @@ def write(self, data): class ZippedStoreShaWriter(Sha1Writer): + """Remembers everything someone writes to it and generates a sha""" __slots__ = ('buf', 'zip') + def __init__(self): Sha1Writer.__init__(self) self.buf = BytesIO() @@ -620,6 +638,7 @@ def getvalue(self): class FDCompressedSha1Writer(Sha1Writer): + """Digests data written to it, making the sha available, then compress the data and write it to the file descriptor @@ -631,7 +650,7 @@ class FDCompressedSha1Writer(Sha1Writer): exc = IOError("Failed to write all bytes to filedescriptor") def __init__(self, fd): - super(FDCompressedSha1Writer, self).__init__() + super().__init__() self.fd = fd self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) @@ -639,7 +658,7 @@ def __init__(self, fd): def write(self, data): """:raise IOError: If not all bytes could be written - :return: lenght of incoming data""" + :return: length of incoming data""" self.sha1.update(data) cdata = self.zip.compress(data) bytes_written = write(self.fd, cdata) @@ -658,11 +677,13 @@ def close(self): #} END stream interface -class FDStream(object): +class FDStream: + """A simple wrapper providing the most basic functions on a file descriptor with the fileobject interface. Cannot use os.fdopen as the resulting stream takes ownership""" __slots__ = ("_fd", '_pos') + def __init__(self, fd): self._fd = fd self._pos = 0 @@ -690,7 +711,8 @@ def close(self): close(self._fd) -class NullStream(object): +class NullStream: + """A stream that does nothing but providing a stream interface. 
Use it like /dev/null""" __slots__ = tuple() diff --git a/gitdb/test/__init__.py b/gitdb/test/__init__.py index 8a681e4..03bd406 100644 --- a/gitdb/test/__init__.py +++ b/gitdb/test/__init__.py @@ -1,4 +1,4 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ diff --git a/gitdb/test/db/__init__.py b/gitdb/test/db/__init__.py index 8a681e4..03bd406 100644 --- a/gitdb/test/db/__init__.py +++ b/gitdb/test/db/__init__.py @@ -1,4 +1,4 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ diff --git a/gitdb/test/db/lib.py b/gitdb/test/db/lib.py index af6d9e0..408dd8c 100644 --- a/gitdb/test/db/lib.py +++ b/gitdb/test/db/lib.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Base classes for object db testing""" from gitdb.test.lib import ( with_rw_directory, @@ -23,7 +23,6 @@ from gitdb.exc import BadObject from gitdb.typ import str_blob_type -from gitdb.utils.compat import xrange from io import BytesIO @@ -32,7 +31,9 @@ __all__ = ('TestDBBase', 'with_rw_directory', 'with_packs_rw', 'fixture_path') + class TestDBBase(TestBase): + """Base class providing testing routines on databases""" # data @@ -43,7 +44,7 @@ def _assert_object_writing_simple(self, db): # write a bunch of objects and query their streams and info null_objs = db.size() ni = 250 - for i in xrange(ni): + for i in range(ni): data = pack(">L", i) istream = IStream(str_blob_type, len(data), BytesIO(data)) new_istream = db.store(istream) @@ -65,7 +66,6 @@ def _assert_object_writing_simple(self, db): assert len(shas) == db.size() assert len(shas[0]) == 20 - def _assert_object_writing(self, db): """General tests to verify object writing, compatible to ObjectDBW **Note:** requires write access to the database""" @@ -102,12 +102,12 @@ def _assert_object_writing(self, db): assert ostream.type == str_blob_type assert ostream.size == len(data) else: - self.failUnlessRaises(BadObject, db.info, sha) - self.failUnlessRaises(BadObject, db.stream, sha) + self.assertRaises(BadObject, db.info, sha) + self.assertRaises(BadObject, db.stream, sha) # DIRECT STREAM COPY # our data hase been written in object format to the StringIO - # we pasesd as output stream. No physical database representation + # we passed as output stream. No physical database representation # was created. 
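For orientation, the object write/read cycle these helpers assert on boils down to the following sketch (the temporary directory and literal data are made up for the example):

    import tempfile
    from io import BytesIO
    from gitdb import IStream
    from gitdb.db import LooseObjectDB
    from gitdb.typ import str_blob_type

    ldb = LooseObjectDB(tempfile.mkdtemp())           # any writable objects directory
    data = b'my data'
    istream = ldb.store(IStream(str_blob_type, len(data), BytesIO(data)))
    assert ldb.has_object(istream.binsha)             # store() filled in the binary sha
    assert ldb.stream(istream.binsha).read() == data  # round trip of the raw object data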
# Test direct stream copy of object streams, the result must be # identical to what we fed in @@ -126,4 +126,3 @@ def _assert_object_writing(self, db): assert ostream.getvalue() == new_ostream.getvalue() # END for each data set # END for each dry_run mode - diff --git a/gitdb/test/db/test_git.py b/gitdb/test/db/test_git.py index e141c2b..73ac1a0 100644 --- a/gitdb/test/db/test_git.py +++ b/gitdb/test/db/test_git.py @@ -1,48 +1,49 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ +import os from gitdb.test.db.lib import ( - TestDBBase, - fixture_path, + TestDBBase, with_rw_directory ) from gitdb.exc import BadObject from gitdb.db import GitDB from gitdb.base import OStream, OInfo -from gitdb.util import hex_to_bin, bin_to_hex +from gitdb.util import bin_to_hex + class TestGitDB(TestDBBase): def test_reading(self): - gdb = GitDB(fixture_path('../../../.git/objects')) + gdb = GitDB(os.path.join(self.gitrepopath, 'objects')) # we have packs and loose objects, alternates doesn't necessarily exist assert 1 < len(gdb.databases()) < 4 # access should be possible - gitdb_sha = hex_to_bin("5690fd0d3304f378754b23b098bd7cb5f4aa1976") + gitdb_sha = next(gdb.sha_iter()) assert isinstance(gdb.info(gitdb_sha), OInfo) assert isinstance(gdb.stream(gitdb_sha), OStream) ni = 50 assert gdb.size() >= ni sha_list = list(gdb.sha_iter()) assert len(sha_list) == gdb.size() - sha_list = sha_list[:ni] # speed up tests ... - + sha_list = sha_list[:ni] # speed up tests ... # This is actually a test for compound functionality, but it doesn't # have a separate test module # test partial shas # this one as uneven and quite short - assert gdb.partial_to_complete_sha_hex('155b6') == hex_to_bin("155b62a9af0aa7677078331e111d0f7aa6eb4afc") + gitdb_sha_hex = bin_to_hex(gitdb_sha) + assert gdb.partial_to_complete_sha_hex(gitdb_sha_hex[:5]) == gitdb_sha # mix even/uneven hexshas for i, binsha in enumerate(sha_list): - assert gdb.partial_to_complete_sha_hex(bin_to_hex(binsha)[:8-(i%2)]) == binsha + assert gdb.partial_to_complete_sha_hex(bin_to_hex(binsha)[:8 - (i % 2)]) == binsha # END for each sha - self.failUnlessRaises(BadObject, gdb.partial_to_complete_sha_hex, "0000") + self.assertRaises(BadObject, gdb.partial_to_complete_sha_hex, "0000") @with_rw_directory def test_writing(self, path): diff --git a/gitdb/test/db/test_loose.py b/gitdb/test/db/test_loose.py index 1d6af9c..295e2ee 100644 --- a/gitdb/test/db/test_loose.py +++ b/gitdb/test/db/test_loose.py @@ -1,15 +1,16 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ from gitdb.test.db.lib import ( - TestDBBase, + TestDBBase, with_rw_directory ) from gitdb.db import LooseObjectDB from gitdb.exc import BadObject from gitdb.util import bin_to_hex + class TestLooseDB(TestDBBase): @with_rw_directory @@ -31,5 +32,5 @@ def test_basics(self, path): assert bin_to_hex(ldb.partial_to_complete_sha_hex(short_sha)) == long_sha # END for each sha - self.failUnlessRaises(BadObject, ldb.partial_to_complete_sha_hex, '0000') - # raises if no object could be foudn + self.assertRaises(BadObject, ldb.partial_to_complete_sha_hex, '0000') + # 
raises if no object could be found diff --git a/gitdb/test/db/test_mem.py b/gitdb/test/db/test_mem.py index 97f7217..882e54f 100644 --- a/gitdb/test/db/test_mem.py +++ b/gitdb/test/db/test_mem.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ from gitdb.test.db.lib import ( TestDBBase, with_rw_directory @@ -11,6 +11,7 @@ LooseObjectDB ) + class TestMemoryDB(TestDBBase): @with_rw_directory diff --git a/gitdb/test/db/test_pack.py b/gitdb/test/db/test_pack.py index 963a71a..bd07906 100644 --- a/gitdb/test/db/test_pack.py +++ b/gitdb/test/db/test_pack.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ from gitdb.test.db.lib import ( TestDBBase, with_rw_directory, @@ -10,15 +10,22 @@ from gitdb.db import PackedDB from gitdb.exc import BadObject, AmbiguousObjectName +from gitdb.util import mman import os import random +import sys + +import pytest class TestPackDB(TestDBBase): @with_rw_directory @with_packs_rw def test_writing(self, path): + if sys.platform == "win32": + pytest.skip("FIXME: Currently fail on windows") + pdb = PackedDB(path) # on demand, we init our pack cache @@ -29,6 +36,11 @@ def test_writing(self, path): # packs removed - rename a file, should affect the glob pack_path = pdb.entities()[0].pack().path() new_pack_path = pack_path + "renamed" + if sys.platform == "win32": + # While using this function, we are not allowed to have any handle + # to this path, which is currently not the case. The pack caching + # does still have a handle :-( + mman.force_map_handle_removal_win(pack_path) os.rename(pack_path, new_pack_path) pdb.update_cache(force=True) @@ -53,7 +65,6 @@ def test_writing(self, path): pdb.stream(sha) # END for each sha to query - # test short finding - be a bit more brutal here max_bytes = 19 min_bytes = 2 @@ -61,16 +72,16 @@ def test_writing(self, path): for i, sha in enumerate(sha_list): short_sha = sha[:max((i % max_bytes), min_bytes)] try: - assert pdb.partial_to_complete_sha(short_sha, len(short_sha)*2) == sha + assert pdb.partial_to_complete_sha(short_sha, len(short_sha) * 2) == sha except AmbiguousObjectName: num_ambiguous += 1 - pass # valid, we can have short objects + pass # valid, we can have short objects # END exception handling # END for each sha to find # we should have at least one ambiguous, considering the small sizes - # but in our pack, there is no ambigious ... + # but in our pack, there is no ambiguous ... 
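The short-sha lookups brute-forced in these tests mirror what callers do against a database; a hedged sketch (the repository path is a placeholder, and very short prefixes may legitimately raise AmbiguousObjectName):

    from gitdb.db import GitDB
    from gitdb.exc import BadObject, AmbiguousObjectName
    from gitdb.util import bin_to_hex

    gdb = GitDB('/path/to/some/repo/.git/objects')    # placeholder path
    binsha = next(gdb.sha_iter())
    hexsha = bin_to_hex(binsha)
    assert gdb.partial_to_complete_sha_hex(hexsha[:10]) == binsha
    try:
        gdb.partial_to_complete_sha_hex("0000")       # unknown prefix
    except (BadObject, AmbiguousObjectName):
        pass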
# assert num_ambiguous # non-existing - self.failUnlessRaises(BadObject, pdb.partial_to_complete_sha, b'\0\0', 4) + self.assertRaises(BadObject, pdb.partial_to_complete_sha, b'\0\0', 4) diff --git a/gitdb/test/db/test_ref.py b/gitdb/test/db/test_ref.py index db93082..0816e64 100644 --- a/gitdb/test/db/test_ref.py +++ b/gitdb/test/db/test_ref.py @@ -1,11 +1,10 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ from gitdb.test.db.lib import ( - TestDBBase, - with_rw_directory, - fixture_path + TestDBBase, + with_rw_directory, ) from gitdb.db import ReferenceDB @@ -16,15 +15,15 @@ import os + class TestReferenceDB(TestDBBase): def make_alt_file(self, alt_path, alt_list): """Create an alternates file which contains the given alternates. The list can be empty""" - alt_file = open(alt_path, "wb") - for alt in alt_list: - alt_file.write(alt.encode("utf-8") + "\n".encode("ascii")) - alt_file.close() + with open(alt_path, "wb") as alt_file: + for alt in alt_list: + alt_file.write(alt.encode("utf-8") + b"\n") @with_rw_directory def test_writing(self, path): @@ -39,13 +38,13 @@ def test_writing(self, path): # setup alternate file # add two, one is invalid - own_repo_path = fixture_path('../../../.git/objects') # use own repo + own_repo_path = os.path.join(self.gitrepopath, 'objects') # use own repo self.make_alt_file(alt_path, [own_repo_path, "invalid/path"]) rdb.update_cache() assert len(rdb.databases()) == 1 # we should now find a default revision of ours - gitdb_sha = hex_to_bin("5690fd0d3304f378754b23b098bd7cb5f4aa1976") + gitdb_sha = next(rdb.sha_iter()) assert rdb.has_object(gitdb_sha) # remove valid diff --git a/gitdb/test/fixtures/objects/88/8401851f15db0eed60eb1bc29dec5ddcace911 b/gitdb/test/fixtures/objects/88/8401851f15db0eed60eb1bc29dec5ddcace911 new file mode 100644 index 0000000..d60aeef Binary files /dev/null and b/gitdb/test/fixtures/objects/88/8401851f15db0eed60eb1bc29dec5ddcace911 differ diff --git a/gitdb/test/lib.py b/gitdb/test/lib.py index d09b1cb..8e60234 100644 --- a/gitdb/test/lib.py +++ b/gitdb/test/lib.py @@ -1,10 +1,9 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Utilities used in ODB testing""" from gitdb import OStream -from gitdb.utils.compat import xrange import sys import random @@ -18,37 +17,51 @@ import shutil import os import gc +import logging from functools import wraps #{ Bases class TestBase(unittest.TestCase): - """Base class for all tests""" + """Base class for all tests + TestCase providing access to readonly repositories using the following member variables. -#} END bases + * gitrepopath -#{ Decorators + * read-only base path of the git source repository, i.e. .../git/.git + """ -def skip_on_travis_ci(func): - """All tests decorated with this one will raise SkipTest when run on travis ci. 
- Use it to workaround difficult to solve issues - NOTE: copied from bcore (https://github.com/Byron/bcore)""" - @wraps(func) - def wrapper(self, *args, **kwargs): - if 'TRAVIS' in os.environ: - import nose - raise nose.SkipTest("Cannot run on travis-ci") - # end check for travis ci - return func(self, *args, **kwargs) - # end wrapper - return wrapper + #{ Invvariants + k_env_git_repo = "GITDB_TEST_GIT_REPO_BASE" + #} END invariants + + @classmethod + def setUpClass(cls): + try: + super().setUpClass() + except AttributeError: + pass + cls.gitrepopath = os.environ.get(cls.k_env_git_repo) + if not cls.gitrepopath: + logging.info( + "You can set the %s environment variable to a .git repository of your choice - defaulting to the gitdb repository", cls.k_env_git_repo) + ospd = os.path.dirname + cls.gitrepopath = os.path.join(ospd(ospd(ospd(__file__))), '.git') + # end assure gitrepo is set + assert cls.gitrepopath.endswith('.git') + + +#} END bases + +#{ Decorators def with_rw_directory(func): """Create a temporary directory which can be written to, remove it if the - test suceeds, but leave it otherwise to aid additional debugging""" + test succeeds, but leave it otherwise to aid additional debugging""" + def wrapper(self): path = tempfile.mktemp(prefix=func.__name__) os.mkdir(path) @@ -57,7 +70,7 @@ def wrapper(self): try: return func(self, path) except Exception: - sys.stderr.write("Test %s.%s failed, output is at %r\n" % (type(self).__name__, func.__name__, path)) + sys.stderr.write(f"Test {type(self).__name__}.{func.__name__} failed, output is at {path!r}\n") keep = True raise finally: @@ -78,6 +91,7 @@ def wrapper(self): def with_packs_rw(func): """Function that provides a path into which the packs for testing should be copied. Will pass on the path to the actual function afterwards""" + def wrapper(self, path): src_pack_glob = fixture_path('packs/*') copy_files_globbed(src_pack_glob, path, hard_link_ok=True) @@ -91,12 +105,14 @@ def wrapper(self, path): #{ Routines + def fixture_path(relapath=''): """:return: absolute path into the fixture directory :param relapath: relative path into the fixtures directory, or '' to obtain the fixture directory itself""" return os.path.join(os.path.dirname(__file__), 'fixtures', relapath) + def copy_files_globbed(source_glob, target_dir, hard_link_ok=False): """Copy all files found according to the given source glob into the target directory :param hard_link_ok: if True, hard links will be created if possible. 
Otherwise @@ -119,19 +135,21 @@ def make_bytes(size_in_bytes, randomize=False): """:return: string with given size in bytes :param randomize: try to produce a very random stream""" actual_size = size_in_bytes // 4 - producer = xrange(actual_size) + producer = range(actual_size) if randomize: producer = list(producer) random.shuffle(producer) # END randomize a = array('i', producer) - return a.tostring() + return a.tobytes() + def make_object(type, data): """:return: bytes resembling an uncompressed object""" odata = "blob %i\0" % len(data) return odata.encode("ascii") + data + def make_memory_file(size_in_bytes, randomize=False): """:return: tuple(size_of_stream, stream) :param randomize: try to produce a very random stream""" @@ -142,24 +160,27 @@ def make_memory_file(size_in_bytes, randomize=False): #{ Stream Utilities -class DummyStream(object): - def __init__(self): - self.was_read = False - self.bytes = 0 - self.closed = False - def read(self, size): - self.was_read = True - self.bytes = size +class DummyStream: + + def __init__(self): + self.was_read = False + self.bytes = 0 + self.closed = False - def close(self): - self.closed = True + def read(self, size): + self.was_read = True + self.bytes = size - def _assert(self): - assert self.was_read + def close(self): + self.closed = True + + def _assert(self): + assert self.was_read class DeriveTest(OStream): + def __init__(self, sha, type, size, stream, *args, **kwargs): self.myarg = kwargs.pop('myarg') self.args = args diff --git a/gitdb/test/performance/__init__.py b/gitdb/test/performance/__init__.py index 8b13789..e69de29 100644 --- a/gitdb/test/performance/__init__.py +++ b/gitdb/test/performance/__init__.py @@ -1 +0,0 @@ - diff --git a/gitdb/test/performance/lib.py b/gitdb/test/performance/lib.py index ec45cf3..36916ed 100644 --- a/gitdb/test/performance/lib.py +++ b/gitdb/test/performance/lib.py @@ -1,47 +1,17 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Contains library functions""" -import os -import logging from gitdb.test.lib import TestBase -#{ Invvariants -k_env_git_repo = "GITDB_TEST_GIT_REPO_BASE" -#} END invariants - - -#{ Base Classes +#{ Base Classes class TestBigRepoR(TestBase): - """TestCase providing access to readonly 'big' repositories using the following - member variables: - - * gitrepopath - - * read-only base path of the git source repository, i.e. 
.../git/.git""" - - #{ Invariants - head_sha_2k = '235d521da60e4699e5bd59ac658b5b48bd76ddca' - head_sha_50 = '32347c375250fd470973a5d76185cac718955fd5' - #} END invariants - - def setUp(self): - try: - super(TestBigRepoR, self).setUp() - except AttributeError: - pass + """A placeholder in case we want to add additional functionality to all performance test-cases + """ - self.gitrepopath = os.environ.get(k_env_git_repo) - if not self.gitrepopath: - logging.info("You can set the %s environment variable to a .git repository of your choice - defaulting to the gitdb repository") - ospd = os.path.dirname - self.gitrepopath = os.path.join(ospd(ospd(ospd(ospd(__file__)))), '.git') - # end assure gitrepo is set - assert self.gitrepopath.endswith('.git') - #} END base classes diff --git a/gitdb/test/performance/test_pack.py b/gitdb/test/performance/test_pack.py index db3b48d..fc3d334 100644 --- a/gitdb/test/performance/test_pack.py +++ b/gitdb/test/performance/test_pack.py @@ -1,36 +1,39 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Performance tests for object store""" -from __future__ import print_function from gitdb.test.performance.lib import ( - TestBigRepoR + TestBigRepoR ) +from gitdb import ( + MemoryDB, + GitDB, + IStream, +) +from gitdb.typ import str_blob_type from gitdb.exc import UnsupportedOperation from gitdb.db.pack import PackedDB -from gitdb.utils.compat import xrange -from gitdb.test.lib import skip_on_travis_ci import sys import os from time import time + class TestPackedDBPerformance(TestBigRepoR): - @skip_on_travis_ci def test_pack_random_access(self): pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) - + # sha lookup st = time() sha_list = list(pdb.sha_iter()) elapsed = time() - st ns = len(sha_list) - print("PDB: looked up %i shas by index in %f s ( %f shas/s )" % (ns, elapsed, ns / elapsed), file=sys.stderr) - + print("PDB: looked up %i shas by index in %f s ( %f shas/s )" % (ns, elapsed, ns / (elapsed or 1)), file=sys.stderr) + # sha lookup: best-case and worst case access pdb_pack_info = pdb._pack_info # END shuffle shas @@ -39,13 +42,14 @@ def test_pack_random_access(self): pdb_pack_info(sha) # END for each sha to look up elapsed = time() - st - + # discard cache del(pdb._entities) pdb.entities() - print("PDB: looked up %i sha in %i packs in %f s ( %f shas/s )" % (ns, len(pdb.entities()), elapsed, ns / elapsed), file=sys.stderr) + print("PDB: looked up %i sha in %i packs in %f s ( %f shas/s )" % + (ns, len(pdb.entities()), elapsed, ns / (elapsed or 1)), file=sys.stderr) # END for each random mode - + # query info and streams only max_items = 10000 # can wait longer when testing memory for pdb_fun in (pdb.info, pdb.stream): @@ -53,9 +57,10 @@ def test_pack_random_access(self): for sha in sha_list[:max_items]: pdb_fun(sha) elapsed = time() - st - print("PDB: Obtained %i object %s by sha in %f s ( %f items/s )" % (max_items, pdb_fun.__name__.upper(), elapsed, max_items / elapsed), file=sys.stderr) + print("PDB: Obtained %i object %s by sha in %f s ( %f items/s )" % + (max_items, pdb_fun.__name__.upper(), elapsed, max_items / (elapsed or 1)), file=sys.stderr) # END for each function - + # retrieve stream and read all max_items = 5000 pdb_stream = pdb.stream @@ -63,13 +68,40 @@ def test_pack_random_access(self): st = time() for 
sha in sha_list[:max_items]: stream = pdb_stream(sha) - stream.read() + read_len = len(stream.read()) + assert read_len == stream.size total_size += stream.size elapsed = time() - st total_kib = total_size / 1000 - print("PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (max_items, total_kib, total_kib/elapsed , elapsed, max_items / elapsed), file=sys.stderr) - - @skip_on_travis_ci + print("PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % + (max_items, total_kib, total_kib / (elapsed or 1), elapsed, max_items / (elapsed or 1)), file=sys.stderr) + + def test_loose_correctness(self): + """based on the pack(s) of our packed object DB, we will just copy and verify all objects in the back + into the loose object db (memory). + This should help finding dormant issues like this one https://github.com/gitpython-developers/GitPython/issues/220 + faster + :note: It doesn't seem this test can find the issue unless the given pack contains highly compressed + data files, like archives.""" + from gitdb.util import bin_to_hex + pdb = GitDB(os.path.join(self.gitrepopath, 'objects')) + mdb = MemoryDB() + for c, sha in enumerate(pdb.sha_iter()): + ostream = pdb.stream(sha) + # the issue only showed on larger files which are hardly compressible ... + if ostream.type != str_blob_type: + continue + istream = IStream(ostream.type, ostream.size, ostream.stream) + mdb.store(istream) + assert istream.binsha == sha, "Failed on object %s" % bin_to_hex(sha).decode('ascii') + # this can fail ... sometimes, so the packs dataset should be huge + assert len(mdb.stream(sha).read()) == ostream.size + + if c and c % 1000 == 0: + print("Verified %i loose object compression/decompression cycles" % c, file=sys.stderr) + mdb._cache.clear() + # end for each sha to copy + def test_correctness(self): pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) # disabled for now as it used to work perfectly, checking big repositories takes a long time @@ -80,7 +112,7 @@ def test_correctness(self): for entity in pdb.entities(): pack_verify = entity.is_valid_stream sha_by_index = entity.index().sha - for index in xrange(entity.index().size()): + for index in range(entity.index().size()): try: assert pack_verify(sha_by_index(index), use_crc=crc) count += 1 @@ -90,6 +122,6 @@ def test_correctness(self): # END for each index # END for each entity elapsed = time() - st - print("PDB: verified %i objects (crc=%i) in %f s ( %f objects/s )" % (count, crc, elapsed, count / elapsed), file=sys.stderr) + print("PDB: verified %i objects (crc=%i) in %f s ( %f objects/s )" % + (count, crc, elapsed, count / (elapsed or 1)), file=sys.stderr) # END for each verify mode - diff --git a/gitdb/test/performance/test_pack_streaming.py b/gitdb/test/performance/test_pack_streaming.py index fe160ea..80c798b 100644 --- a/gitdb/test/performance/test_pack_streaming.py +++ b/gitdb/test/performance/test_pack_streaming.py @@ -1,44 +1,43 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Specific test for pack streams only""" -from __future__ import print_function from gitdb.test.performance.lib import ( - TestBigRepoR + TestBigRepoR ) from gitdb.db.pack import PackedDB from gitdb.stream import NullStream from 
gitdb.pack import PackEntity -from gitdb.test.lib import skip_on_travis_ci import os import sys from time import time + class CountedNullStream(NullStream): __slots__ = '_bw' + def __init__(self): self._bw = 0 - + def bytes_written(self): return self._bw - + def write(self, d): self._bw += NullStream.write(self, d) - + class TestPackStreamingPerformance(TestBigRepoR): - - @skip_on_travis_ci + def test_pack_writing(self): # see how fast we can write a pack from object streams. # This will not be fast, as we take time for decompressing the streams as well ostream = CountedNullStream() pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) - + ni = 1000 count = 0 st = time() @@ -47,22 +46,22 @@ def test_pack_writing(self): pdb.stream(sha) if count == ni: break - #END gather objects for pack-writing + # END gather objects for pack-writing elapsed = time() - st - print("PDB Streaming: Got %i streams by sha in in %f s ( %f streams/s )" % (ni, elapsed, ni / elapsed), file=sys.stderr) - + print("PDB Streaming: Got %i streams by sha in in %f s ( %f streams/s )" % + (ni, elapsed, ni / (elapsed or 1)), file=sys.stderr) + st = time() PackEntity.write_pack((pdb.stream(sha) for sha in pdb.sha_iter()), ostream.write, object_count=ni) elapsed = time() - st total_kb = ostream.bytes_written() / 1000 - print(sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" % (total_kb, elapsed, total_kb/elapsed), sys.stderr) - - - @skip_on_travis_ci + print(sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" % + (total_kb, elapsed, total_kb / (elapsed or 1)), sys.stderr) + def test_stream_reading(self): # raise SkipTest() pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack")) - + # streaming only, meant for --with-profile runs ni = 5000 count = 0 @@ -78,5 +77,5 @@ def test_stream_reading(self): count += 1 elapsed = time() - st total_kib = total_size / 1000 - print(sys.stderr, "PDB Streaming: Got %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (ni, total_kib, total_kib/elapsed , elapsed, ni / elapsed), sys.stderr) - + print(sys.stderr, "PDB Streaming: Got %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % + (ni, total_kib, total_kib / (elapsed or 1), elapsed, ni / (elapsed or 1)), sys.stderr) diff --git a/gitdb/test/performance/test_stream.py b/gitdb/test/performance/test_stream.py index 84c9dea..fb10871 100644 --- a/gitdb/test/performance/test_stream.py +++ b/gitdb/test/performance/test_stream.py @@ -1,15 +1,14 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Performance data streaming performance""" -from __future__ import print_function from gitdb.test.performance.lib import TestBigRepoR from gitdb.db import LooseObjectDB from gitdb import IStream -from gitdb.util import bin_to_hex +from gitdb.util import bin_to_hex, remove from gitdb.fun import chunk_size from time import time @@ -20,7 +19,6 @@ from gitdb.test.lib import ( make_memory_file, with_rw_directory, - skip_on_travis_ci ) @@ -35,22 +33,21 @@ def read_chunked_stream(stream): # END read stream loop assert total == stream.size return stream - - + + #} END utilities class TestObjDBPerformance(TestBigRepoR): - - large_data_size_bytes = 1000*1000*50 # some MiB should do it - 
moderate_data_size_bytes = 1000*1000*1 # just 1 MiB - - @skip_on_travis_ci + + large_data_size_bytes = 1000 * 1000 * 50 # some MiB should do it + moderate_data_size_bytes = 1000 * 1000 * 1 # just 1 MiB + @with_rw_directory def test_large_data_streaming(self, path): ldb = LooseObjectDB(path) string_ios = list() # list of streams we previously created - - # serial mode + + # serial mode for randomize in range(2): desc = (randomize and 'random ') or '' print("Creating %s data ..." % desc, file=sys.stderr) @@ -59,32 +56,32 @@ def test_large_data_streaming(self, path): elapsed = time() - st print("Done (in %f s)" % elapsed, file=sys.stderr) string_ios.append(stream) - - # writing - due to the compression it will seem faster than it is + + # writing - due to the compression it will seem faster than it is st = time() sha = ldb.store(IStream('blob', size, stream)).binsha elapsed_add = time() - st assert ldb.has_object(sha) db_file = ldb.readable_db_object_path(bin_to_hex(sha)) fsize_kib = os.path.getsize(db_file) / 1000 - - + size_kib = size / 1000 - print("Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add), file=sys.stderr) - + print("Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % + (size_kib, fsize_kib, desc, elapsed_add, size_kib / (elapsed_add or 1)), file=sys.stderr) + # reading all at once st = time() ostream = ldb.stream(sha) shadata = ostream.read() elapsed_readall = time() - st - + stream.seek(0) assert shadata == stream.getvalue() - print("Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed_readall, size_kib / elapsed_readall), file=sys.stderr) - - + print("Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % + (size_kib, desc, elapsed_readall, size_kib / (elapsed_readall or 1)), file=sys.stderr) + # reading in chunks of 1 MiB - cs = 512*1000 + cs = 512 * 1000 chunks = list() st = time() ostream = ldb.stream(sha) @@ -95,13 +92,15 @@ def test_large_data_streaming(self, path): break # END read in chunks elapsed_readchunks = time() - st - + stream.seek(0) assert b''.join(chunks) == stream.getvalue() - + cs_kib = cs / 1000 - print("Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks), file=sys.stderr) - + print("Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % + (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / (elapsed_readchunks or 1)), file=sys.stderr) + # del db file so we keep something to do - os.remove(db_file) + ostream = None # To release the file handle (win) + remove(db_file) # END for each randomization factor diff --git a/gitdb/test/test_base.py b/gitdb/test/test_base.py index 578c29f..17906c9 100644 --- a/gitdb/test/test_base.py +++ b/gitdb/test/test_base.py @@ -1,13 +1,13 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Test for object db""" from gitdb.test.lib import ( - TestBase, - DummyStream, - DeriveTest, - ) + TestBase, + DummyStream, + DeriveTest, +) from gitdb import ( OInfo, @@ -20,11 +20,11 @@ ) from gitdb.util import ( NULL_BIN_SHA - ) +) from gitdb.typ import ( str_blob_type 
- ) +) class TestBaseTypes(TestBase): @@ -54,7 +54,6 @@ def test_streams(self): assert dpinfo.delta_info == sha assert dpinfo.pack_offset == 0 - # test ostream stream = DummyStream() ostream = OStream(*(info + (stream, ))) @@ -74,13 +73,13 @@ def test_streams(self): # test deltapackstream dpostream = ODeltaPackStream(*(dpinfo + (stream, ))) - dpostream.stream is stream + assert dpostream.stream is stream dpostream.read(5) stream._assert() assert stream.bytes == 5 # derive with own args - DeriveTest(sha, str_blob_type, s, stream, 'mine',myarg = 3)._assert() + DeriveTest(sha, str_blob_type, s, stream, 'mine', myarg=3)._assert() # test istream istream = IStream(str_blob_type, s, stream) @@ -93,7 +92,7 @@ def test_streams(self): assert istream.size == s istream.size = s * 2 - istream.size == s * 2 + assert istream.size == s * 2 assert istream.type == str_blob_type istream.type = "something" assert istream.type == "something" diff --git a/gitdb/test/test_example.py b/gitdb/test/test_example.py index aa43a09..3b4c908 100644 --- a/gitdb/test/test_example.py +++ b/gitdb/test/test_example.py @@ -1,21 +1,20 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Module with examples from the tutorial section of the docs""" -from gitdb.test.lib import ( - TestBase, - fixture_path -) +import os +from gitdb.test.lib import TestBase from gitdb import IStream from gitdb.db import LooseObjectDB from io import BytesIO + class TestExamples(TestBase): def test_base(self): - ldb = LooseObjectDB(fixture_path("../../../.git/objects")) + ldb = LooseObjectDB(os.path.join(self.gitrepopath, 'objects')) for sha1 in ldb.sha_iter(): oinfo = ldb.info(sha1) @@ -33,7 +32,7 @@ def test_base(self): pass # END ignore exception if there are no loose objects - data = "my data".encode("ascii") + data = b"my data" istream = IStream("blob", len(data), BytesIO(data)) # the object does not yet have a sha diff --git a/gitdb/test/test_pack.py b/gitdb/test/test_pack.py index 3ab2fec..e723482 100644 --- a/gitdb/test/test_pack.py +++ b/gitdb/test/test_pack.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Test everything about packs reading and writing""" from gitdb.test.lib import ( TestBase, @@ -25,14 +25,8 @@ from gitdb.fun import delta_types from gitdb.exc import UnsupportedOperation from gitdb.util import to_bin_sha -from gitdb.utils.compat import xrange -try: - from itertools import izip -except ImportError: - izip = zip - -from nose import SkipTest +import pytest import os import tempfile @@ -43,6 +37,7 @@ def bin_sha_from_filename(filename): return to_bin_sha(os.path.splitext(os.path.basename(filename))[0][5:]) #} END utilities + class TestPack(TestBase): packindexfile_v1 = (fixture_path('packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx'), 1, 67) @@ -50,8 +45,8 @@ class TestPack(TestBase): packindexfile_v2_3_ascii = (fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx'), 2, 42) packfile_v2_1 = (fixture_path('packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack'), 2, packindexfile_v1[2]) packfile_v2_2 = 
(fixture_path('packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack'), 2, packindexfile_v2[2]) - packfile_v2_3_ascii = (fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack'), 2, packindexfile_v2_3_ascii[2]) - + packfile_v2_3_ascii = ( + fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack'), 2, packindexfile_v2_3_ascii[2]) def _assert_index_file(self, index, version, size): assert index.packfile_checksum() != index.indexfile_checksum() @@ -62,7 +57,7 @@ def _assert_index_file(self, index, version, size): assert len(index.offsets()) == size # get all data of all objects - for oidx in xrange(index.size()): + for oidx in range(index.size()): sha = index.sha(oidx) assert oidx == index.sha_to_index(sha) @@ -74,12 +69,11 @@ def _assert_index_file(self, index, version, size): assert entry[2] == index.crc(oidx) # verify partial sha - for l in (4,8,11,17,20): - assert index.partial_sha_to_index(sha[:l], l*2) == oidx + for l in (4, 8, 11, 17, 20): + assert index.partial_sha_to_index(sha[:l], l * 2) == oidx # END for each object index in indexfile - self.failUnlessRaises(ValueError, index.partial_sha_to_index, "\0", 2) - + self.assertRaises(ValueError, index.partial_sha_to_index, "\0", 2) def _assert_pack_file(self, pack, version, size): assert pack.version() == 2 @@ -107,7 +101,7 @@ def _assert_pack_file(self, pack, version, size): dstream = DeltaApplyReader.new(streams) except ValueError: # ignore these, old git versions use only ref deltas, - # which we havent resolved ( as we are without an index ) + # which we haven't resolved ( as we are without an index ) # Also ignore non-delta streams continue # END get deltastream @@ -120,7 +114,6 @@ def _assert_pack_file(self, pack, version, size): dstream.seek(0) assert dstream.read() == data - # read chunks # NOTE: the current implementation is safe, it basically transfers # all calls to the underlying memory map @@ -128,7 +121,6 @@ def _assert_pack_file(self, pack, version, size): # END for each object assert num_obj == size - def test_pack_index(self): # check version 1 and 2 for indexfile, version, size in (self.packindexfile_v1, self.packindexfile_v2): @@ -146,9 +138,9 @@ def test_pack(self): @with_rw_directory def test_pack_entity(self, rw_dir): pack_objs = list() - for packinfo, indexinfo in ( (self.packfile_v2_1, self.packindexfile_v1), - (self.packfile_v2_2, self.packindexfile_v2), - (self.packfile_v2_3_ascii, self.packindexfile_v2_3_ascii)): + for packinfo, indexinfo in ((self.packfile_v2_1, self.packindexfile_v1), + (self.packfile_v2_2, self.packindexfile_v2), + (self.packfile_v2_3_ascii, self.packindexfile_v2_3_ascii)): packfile, version, size = packinfo indexfile, version, size = indexinfo entity = PackEntity(packfile) @@ -157,7 +149,7 @@ def test_pack_entity(self, rw_dir): pack_objs.extend(entity.stream_iter()) count = 0 - for info, stream in izip(entity.info_iter(), entity.stream_iter()): + for info, stream in zip(entity.info_iter(), entity.stream_iter()): count += 1 assert info.binsha == stream.binsha assert len(info.binsha) == 20 @@ -190,29 +182,32 @@ def test_pack_entity(self, rw_dir): # pack writing - write all packs into one # index path can be None - pack_path = tempfile.mktemp('', "pack", rw_dir) + pack_path1 = tempfile.mktemp('', "pack1", rw_dir) + pack_path2 = tempfile.mktemp('', "pack2", rw_dir) index_path = tempfile.mktemp('', 'index', rw_dir) iteration = 0 + def rewind_streams(): for obj in pack_objs: obj.stream.seek(0) - #END utility - for ppath, ipath, num_obj in zip((pack_path, )*2, 
(index_path, None), (len(pack_objs), None)): - pfile = open(ppath, 'wb') + # END utility + for ppath, ipath, num_obj in zip((pack_path1, pack_path2), + (index_path, None), + (len(pack_objs), None)): iwrite = None if ipath: ifile = open(ipath, 'wb') iwrite = ifile.write - #END handle ip + # END handle ip # make sure we rewind the streams ... we work on the same objects over and over again if iteration > 0: rewind_streams() - #END rewind streams + # END rewind streams iteration += 1 - pack_sha, index_sha = PackEntity.write_pack(pack_objs, pfile.write, iwrite, object_count=num_obj) - pfile.close() + with open(ppath, 'wb') as pfile: + pack_sha, index_sha = PackEntity.write_pack(pack_objs, pfile.write, iwrite, object_count=num_obj) assert os.path.getsize(ppath) > 100 # verify pack @@ -220,6 +215,7 @@ def rewind_streams(): assert pf.size() == len(pack_objs) assert pf.version() == PackFile.pack_version_default assert pf.checksum() == pack_sha + pf.close() # verify index if ipath is not None: @@ -230,10 +226,11 @@ def rewind_streams(): assert idx.packfile_checksum() == pack_sha assert idx.indexfile_checksum() == index_sha assert idx.size() == len(pack_objs) - #END verify files exist - #END for each packpath, indexpath pair + idx.close() + # END verify files exist + # END for each packpath, indexpath pair - # verify the packs throughly + # verify the packs thoroughly rewind_streams() entity = PackEntity.create(pack_objs, rw_dir) count = 0 @@ -242,11 +239,11 @@ def rewind_streams(): for use_crc in range(2): assert entity.is_valid_stream(info.binsha, use_crc) # END for each crc mode - #END for each info + # END for each info assert count == len(pack_objs) - + entity.close() def test_pack_64(self): # TODO: hex-edit a pack helping us to verify that we can handle 64 byte offsets # of course without really needing such a huge pack - raise SkipTest() + pytest.skip('not implemented') diff --git a/gitdb/test/test_stream.py b/gitdb/test/test_stream.py index 50db44b..1e7e941 100644 --- a/gitdb/test/test_stream.py +++ b/gitdb/test/test_stream.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Test for object db""" from gitdb.test.lib import ( @@ -16,7 +16,9 @@ DecompressMemMapReader, FDCompressedSha1Writer, LooseObjectDB, - Sha1Writer + Sha1Writer, + MemoryDB, + IStream, ) from gitdb.util import hex_to_bin @@ -27,11 +29,14 @@ import tempfile import os +from io import BytesIO + class TestStream(TestBase): + """Test stream classes""" - data_sizes = (15, 10000, 1000*1024+512) + data_sizes = (15, 10000, 1000 * 1024 + 512) def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None): """Make stream tests - the orig_stream is seekable, allowing it to be @@ -40,13 +45,13 @@ def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None): :param rewind_stream: function called to rewind the stream to make it ready for reuse""" ns = 10 - assert len(cdata) > ns-1, "Data must be larger than %i, was %i" % (ns, len(cdata)) + assert len(cdata) > ns - 1, "Data must be larger than %i, was %i" % (ns, len(cdata)) # read in small steps ss = len(cdata) // ns for i in range(ns): data = stream.read(ss) - chunk = cdata[i*ss:(i+1)*ss] + chunk = cdata[i * ss:(i + 1) * ss] assert data == chunk # END for each step rest = stream.read() @@ -110,13 +115,13 @@ def 
test_decompress_reader(self): def test_sha_writer(self): writer = Sha1Writer() - assert 2 == writer.write("hi".encode("ascii")) + assert 2 == writer.write(b"hi") assert len(writer.sha(as_hex=1)) == 40 assert len(writer.sha(as_hex=0)) == 20 # make sure it does something ;) prev_sha = writer.sha() - writer.write("hi again".encode("ascii")) + writer.write(b"hi again") assert writer.sha() != prev_sha def test_compressed_writer(self): @@ -130,10 +135,10 @@ def test_compressed_writer(self): ostream.close() # its closed already - self.failUnlessRaises(OSError, os.close, fd) + self.assertRaises(OSError, os.close, fd) # read everything back, compare to data we zip - fd = os.open(path, os.O_RDONLY|getattr(os, 'O_BINARY', 0)) + fd = os.open(path, os.O_RDONLY | getattr(os, 'O_BINARY', 0)) written_data = os.read(fd, os.path.getsize(path)) assert len(written_data) == os.path.getsize(path) os.close(fd) @@ -144,8 +149,16 @@ def test_compressed_writer(self): def test_decompress_reader_special_case(self): odb = LooseObjectDB(fixture_path('objects')) - ostream = odb.stream(hex_to_bin('7bb839852ed5e3a069966281bb08d50012fb309b')) - - # if there is a bug, we will be missing one byte exactly ! - data = ostream.read() - assert len(data) == ostream.size + mdb = MemoryDB() + for sha in (b'888401851f15db0eed60eb1bc29dec5ddcace911', + b'7bb839852ed5e3a069966281bb08d50012fb309b',): + ostream = odb.stream(hex_to_bin(sha)) + + # if there is a bug, we will be missing one byte exactly ! + data = ostream.read() + assert len(data) == ostream.size + + # Putting it back in should yield nothing new - after all, we have + dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data))) + assert dump.hexsha == sha + # end for each loose object sha to test diff --git a/gitdb/test/test_util.py b/gitdb/test/test_util.py index e79355a..166b33c 100644 --- a/gitdb/test/test_util.py +++ b/gitdb/test/test_util.py @@ -1,7 +1,7 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Test for object db""" import tempfile import os @@ -16,6 +16,7 @@ class TestUtils(TestBase): + def test_basics(self): assert to_hex_sha(NULL_HEX_SHA) == NULL_HEX_SHA assert len(to_bin_sha(NULL_HEX_SHA)) == 20 @@ -24,27 +25,23 @@ def test_basics(self): def _cmp_contents(self, file_path, data): # raise if data from file at file_path # does not match data string - fp = open(file_path, "rb") - try: + with open(file_path, "rb") as fp: assert fp.read() == data.encode("ascii") - finally: - fp.close() def test_lockedfd(self): my_file = tempfile.mktemp() orig_data = "hello" new_data = "world" - my_file_fp = open(my_file, "wb") - my_file_fp.write(orig_data.encode("ascii")) - my_file_fp.close() + with open(my_file, "wb") as my_file_fp: + my_file_fp.write(orig_data.encode("ascii")) try: lfd = LockedFD(my_file) lockfilepath = lfd._lockfilepath() # cannot end before it was started - self.failUnlessRaises(AssertionError, lfd.rollback) - self.failUnlessRaises(AssertionError, lfd.commit) + self.assertRaises(AssertionError, lfd.rollback) + self.assertRaises(AssertionError, lfd.commit) # open for writing assert not os.path.isfile(lockfilepath) @@ -59,7 +56,7 @@ def test_lockedfd(self): self._cmp_contents(my_file, orig_data) assert not os.path.isfile(lockfilepath) - # additional call doesnt fail + # additional call doesn't fail lfd.commit() 
lfd.rollback() @@ -73,7 +70,6 @@ def test_lockedfd(self): del(lfd) assert not os.path.isfile(lockfilepath) - # write data - concurrently lfd = LockedFD(my_file) olfd = LockedFD(my_file) @@ -81,7 +77,7 @@ def test_lockedfd(self): wfdstream = lfd.open(write=True, stream=True) # this time as stream assert os.path.isfile(lockfilepath) # another one fails - self.failUnlessRaises(IOError, olfd.open) + self.assertRaises(IOError, olfd.open) wfdstream.write(new_data.encode("ascii")) lfd.commit() diff --git a/gitdb/typ.py b/gitdb/typ.py index bc7ba58..314db50 100644 --- a/gitdb/typ.py +++ b/gitdb/typ.py @@ -1,10 +1,10 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ """Module containing information about types known to the database""" -str_blob_type = b'blob' +str_blob_type = b'blob' str_commit_type = b'commit' -str_tree_type = b'tree' -str_tag_type = b'tag' +str_tree_type = b'tree' +str_tag_type = b'tag' diff --git a/gitdb/util.py b/gitdb/util.py index 93ba7f0..bb6d879 100644 --- a/gitdb/util.py +++ b/gitdb/util.py @@ -1,28 +1,26 @@ # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under -# the New BSD License: http://www.opensource.org/licenses/bsd-license.php +# the New BSD License: https://opensource.org/license/bsd-3-clause/ import binascii import os import mmap import sys +import time import errno -from io import StringIO +from io import BytesIO from smmap import ( - StaticWindowMapManager, - SlidingWindowMapManager, - SlidingWindowMapBuffer - ) + StaticWindowMapManager, + SlidingWindowMapManager, + SlidingWindowMapBuffer +) # initialize our global memory manager instance # Use it to free cached (and unused) resources. -if sys.version_info[1] < 6: - mman = StaticWindowMapManager() -else: - mman = SlidingWindowMapManager() -#END handle mman +mman = SlidingWindowMapManager() +# END handle mman import hashlib @@ -31,6 +29,7 @@ except ImportError: from struct import unpack, calcsize __calcsize_cache = dict() + def unpack_from(fmt, data, offset=0): try: size = __calcsize_cache[fmt] @@ -38,7 +37,7 @@ def unpack_from(fmt, data, offset=0): size = calcsize(fmt) __calcsize_cache[fmt] = size # END exception handling - return unpack(fmt, data[offset : offset + size]) + return unpack(fmt, data[offset: offset + size]) # END own unpack_from implementation @@ -57,7 +56,6 @@ def unpack_from(fmt, data, offset=0): isdir = os.path.isdir isfile = os.path.isfile rename = os.rename -remove = os.remove dirname = os.path.dirname basename = os.path.basename join = os.path.join @@ -66,9 +64,28 @@ def unpack_from(fmt, data, offset=0): close = os.close fsync = os.fsync + +def _retry(func, *args, **kwargs): + # Wrapper around functions, that are problematic on "Windows". 
Sometimes + # the OS or someone else has still a handle to the file + if sys.platform == "win32": + for _ in range(10): + try: + return func(*args, **kwargs) + except Exception: + time.sleep(0.1) + return func(*args, **kwargs) + else: + return func(*args, **kwargs) + + +def remove(*args, **kwargs): + return _retry(os.remove, *args, **kwargs) + + # Backwards compatibility imports -from gitdb.const import ( - NULL_BIN_SHA, +from gitdb.const import ( + NULL_BIN_SHA, NULL_HEX_SHA ) @@ -76,13 +93,15 @@ def unpack_from(fmt, data, offset=0): #{ compatibility stuff ... -class _RandomAccessStringIO(object): + +class _RandomAccessBytesIO: + """Wrapper to provide required functionality in case memory maps cannot or may not be used. This is only really required in python 2.4""" __slots__ = '_sio' def __init__(self, buf=''): - self._sio = StringIO(buf) + self._sio = BytesIO(buf) def __getattr__(self, attr): return getattr(self._sio, attr) @@ -96,6 +115,7 @@ def __getitem__(self, i): def __getslice__(self, start, end): return self.getvalue()[start:end] + def byte_ord(b): """ Return the integer representation of the byte string. This supports Python @@ -110,7 +130,8 @@ def byte_ord(b): #{ Routines -def make_sha(source=''.encode("ascii")): + +def make_sha(source=b''): """A python2.4 workaround for the sha/hashlib module fiasco **Note** From the dulwich project """ @@ -121,20 +142,21 @@ def make_sha(source=''.encode("ascii")): sha1 = sha.sha(source) return sha1 + def allocate_memory(size): """:return: a file-protocol accessible memory block of the given size""" if size == 0: - return _RandomAccessStringIO('') + return _RandomAccessBytesIO(b'') # END handle empty chunks gracefully try: return mmap.mmap(-1, size) # read-write by default - except EnvironmentError: + except OSError: # setup real memory instead # this of course may fail if the amount of memory is not available in # one chunk - would only be the case in python 2.4, being more likely on # 32 bit systems. - return _RandomAccessStringIO("\0"*size) + return _RandomAccessBytesIO(b"\0" * size) # END handle memory allocation @@ -152,7 +174,7 @@ def file_contents_ro(fd, stream=False, allow_mmap=True): # supports stream and random access try: return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) - except EnvironmentError: + except OSError: # python 2.4 issue, 0 wants to be the actual size return mmap.mmap(fd, os.fstat(fd).st_size, access=mmap.ACCESS_READ) # END handle python 2.4 @@ -160,12 +182,13 @@ def file_contents_ro(fd, stream=False, allow_mmap=True): pass # END exception handling - # read manully + # read manually contents = os.read(fd, os.fstat(fd).st_size) if stream: - return _RandomAccessStringIO(contents) + return _RandomAccessBytesIO(contents) return contents + def file_contents_ro_filepath(filepath, stream=False, allow_mmap=True, flags=0): """Get the file contents at filepath as fast as possible @@ -178,25 +201,28 @@ def file_contents_ro_filepath(filepath, stream=False, allow_mmap=True, flags=0): **Note** for now we don't try to use O_NOATIME directly as the right value needs to be shared per database in fact. 
It only makes a real difference for loose object databases anyway, and they use it with the help of the ``flags`` parameter""" - fd = os.open(filepath, os.O_RDONLY|getattr(os, 'O_BINARY', 0)|flags) + fd = os.open(filepath, os.O_RDONLY | getattr(os, 'O_BINARY', 0) | flags) try: return file_contents_ro(fd, stream, allow_mmap) finally: close(fd) # END assure file is closed + def sliding_ro_buffer(filepath, flags=0): """ :return: a buffer compatible object which uses our mapped memory manager internally ready to read the whole given filepath""" return SlidingWindowMapBuffer(mman.make_cursor(filepath), flags=flags) + def to_hex_sha(sha): """:return: hexified version of sha""" if len(sha) == 40: return sha return bin_to_hex(sha) + def to_bin_sha(sha): if len(sha) == 20: return sha @@ -208,7 +234,8 @@ def to_bin_sha(sha): #{ Utilities -class LazyMixin(object): +class LazyMixin: + """ Base class providing an interface to lazily retrieve attribute values upon first access. If slots are used, memory will only be reserved once the attribute @@ -221,7 +248,7 @@ class LazyMixin(object): def __getattr__(self, attr): """ Whenever an attribute is requested that we do not know, we allow it - to be created and set. Next time the same attribute is reqeusted, it is simply + to be created and set. Next time the same attribute is requested, it is simply returned from our dict/slots. """ self._set_cache_(attr) # will raise in case the cache was not created @@ -239,7 +266,8 @@ def _set_cache_(self, attr): pass -class LockedFD(object): +class LockedFD: + """ This class facilitates a safe read and write operation to a file on disk. If we write to 'file', we obtain a lock file at 'file.lock' and write to @@ -290,7 +318,7 @@ def open(self, write=False, stream=False): # try to open the lock file binary = getattr(os, 'O_BINARY', 0) - lockmode = os.O_WRONLY | os.O_CREAT | os.O_EXCL | binary + lockmode = os.O_WRONLY | os.O_CREAT | os.O_EXCL | binary try: fd = os.open(self._lockfilepath(), lockmode, int("600", 8)) if not write: @@ -298,18 +326,18 @@ def open(self, write=False, stream=False): else: self._fd = fd # END handle file descriptor - except OSError: - raise IOError("Lock at %r could not be obtained" % self._lockfilepath()) + except OSError as e: + raise OSError("Lock at %r could not be obtained" % self._lockfilepath()) from e # END handle lock retrieval # open actual file if required if self._fd is None: - # we could specify exlusive here, as we obtained the lock anyway + # we could specify exclusive here, as we obtained the lock anyway try: self._fd = os.open(self._filepath, os.O_RDONLY | binary) except: # assure we release our lockfile - os.remove(self._lockfilepath()) + remove(self._lockfilepath()) raise # END handle lockfile # END open descriptor for reading @@ -353,7 +381,7 @@ def _end_writing(self, successful=True): # on windows, rename does not silently overwrite the existing one if sys.platform == "win32": if isfile(self._filepath): - os.remove(self._filepath) + remove(self._filepath) # END remove if exists # END win32 special handling os.rename(lockfile, self._filepath) @@ -364,7 +392,7 @@ def _end_writing(self, successful=True): chmod(self._filepath, int("644", 8)) else: # just delete the file so far, we failed - os.remove(lockfile) + remove(lockfile) # END successful handling #} END utilities diff --git a/gitdb/utils/compat.py b/gitdb/utils/compat.py deleted file mode 100644 index a2640fd..0000000 --- a/gitdb/utils/compat.py +++ /dev/null @@ -1,35 +0,0 @@ -import sys - -PY3 = sys.version_info[0] == 3 - 
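The LockedFD changes above keep the established usage pattern; a minimal write-and-commit round trip looks roughly like this (the temporary file is made up for the example):

    import tempfile
    from gitdb.util import LockedFD

    path = tempfile.mktemp()
    with open(path, 'wb') as fp:
        fp.write(b'old')

    lfd = LockedFD(path)
    stream = lfd.open(write=True, stream=True)   # acquires path + '.lock' exclusively
    stream.write(b'new')
    lfd.commit()                                 # lock file atomically replaces the original

    with open(path, 'rb') as fp:
        assert fp.read() == b'new'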
-try: - from itertools import izip - xrange = xrange -except ImportError: - # py3 - izip = zip - xrange = range -# end handle python version - -try: - # Python 2 - buffer = buffer - memoryview = buffer -except NameError: - # Python 3 has no `buffer`; only `memoryview` - # However, it's faster to just slice the object directly, maybe it keeps a view internally - def buffer(obj, offset, size=None): - if size is None: - # return memoryview(obj)[offset:] - return obj[offset:] - else: - # return memoryview(obj)[offset:offset+size] - return obj[offset:offset+size] - # end buffer reimplementation - - memoryview = memoryview - -try: - MAXSIZE = sys.maxint -except AttributeError: - MAXSIZE = sys.maxsize diff --git a/gitdb/utils/encoding.py b/gitdb/utils/encoding.py index 2d03ad3..b534ef7 100644 --- a/gitdb/utils/encoding.py +++ b/gitdb/utils/encoding.py @@ -1,29 +1,18 @@ -from gitdb.utils import compat - -if compat.PY3: - string_types = (str, ) - text_type = str -else: - string_types = (basestring, ) - text_type = unicode - -def force_bytes(data, encoding="ascii"): +def force_bytes(data, encoding="utf-8"): if isinstance(data, bytes): return data - if isinstance(data, string_types): + if isinstance(data, str): return data.encode(encoding) return data + def force_text(data, encoding="utf-8"): - if isinstance(data, text_type): + if isinstance(data, str): return data - if isinstance(data, string_types): + if isinstance(data, bytes): return data.decode(encoding) - if compat.PY3: - return text_type(data, encoding) - else: - return text_type(data) + return str(data, encoding) diff --git a/requirements.txt b/requirements.txt index 8a4cd39..1b2e11d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ -gitdb -smmap>=0.8.3 \ No newline at end of file +smmap>=3.0.1,<6 diff --git a/setup.py b/setup.py index dc142c5..3a91543 100755 --- a/setup.py +++ b/setup.py @@ -1,107 +1,31 @@ -#!/usr/bin/env python -from distutils.core import setup, Extension -from distutils.command.build_py import build_py -from distutils.command.build_ext import build_ext +from setuptools import setup -import os, sys +# NOTE: This is currently duplicated from the gitdb.__init__ module, because +# that's just how you write a setup.py (nobody reads this stuff out of the +# module) -# wow, this is a mixed bag ... I am pretty upset about all of this ... -setuptools_build_py_module = None -try: - # don't pull it in if we don't have to - if 'setuptools' in sys.modules: - import setuptools.command.build_py as setuptools_build_py_module - from setuptools.command.build_ext import build_ext -except ImportError: - pass - -class build_ext_nofail(build_ext): - """Doesn't fail when build our optional extensions""" - def run(self): - try: - build_ext.run(self) - except Exception: - print("Ignored failure when building extensions, pure python modules will be used instead") - # END ignore errors - - -def get_data_files(self): - """Can you feel the pain ? So, in python2.5 and python2.4 coming with maya, - the line dealing with the ``plen`` has a bug which causes it to truncate too much. - It is fixed in the system interpreters as they receive patches, and shows how - bad it is if something doesn't have proper unittests. - The code here is a plain copy of the python2.6 version which works for all. - - Generate list of '(package,src_dir,build_dir,filenames)' tuples""" - data = [] - if not self.packages: - return data - - # this one is just for the setup tools ! 
They don't iniitlialize this variable - # when they should, but do it on demand using this method.Its crazy - if hasattr(self, 'analyze_manifest'): - self.analyze_manifest() - # END handle setuptools ... - - for package in self.packages: - # Locate package source directory - src_dir = self.get_package_dir(package) - - # Compute package build directory - build_dir = os.path.join(*([self.build_lib] + package.split('.'))) - - # Length of path to strip from found files - plen = 0 - if src_dir: - plen = len(src_dir)+1 - - # Strip directory from globbed filenames - filenames = [ - file[plen:] for file in self.find_data_files(package, src_dir) - ] - data.append((package, src_dir, build_dir, filenames)) - return data - -build_py.get_data_files = get_data_files -if setuptools_build_py_module: - setuptools_build_py_module.build_py._get_data_files = get_data_files -# END apply setuptools patch too - -# NOTE: This is currently duplicated from the gitdb.__init__ module, as we cannot -# satisfy the dependencies at installation time, unfortunately, due to inherent limitations -# of distutils, which cannot install the prerequesites of a package before the acutal package. __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/gitpython-developers/gitdb" -version_info = (0, 6, 0) +version_info = (4, 0, 12) __version__ = '.'.join(str(i) for i in version_info) -setup(cmdclass={'build_ext':build_ext_nofail}, - name = "gitdb", - version = __version__, - description = "Git Object Database", - author = __author__, - author_email = __contact__, - url = __homepage__, - packages = ('gitdb', 'gitdb.db', 'gitdb.utils'), - package_dir = {'gitdb':'gitdb'}, - ext_modules=[Extension('gitdb._perf', ['gitdb/_fun.c', 'gitdb/_delta_apply.c'], include_dirs=['gitdb'])], - license = "BSD License", - zip_safe=False, - requires=('smmap (>=0.8.3)', ), - install_requires=('smmap >= 0.8.3'), - long_description = """GitDB is a pure-Python git object database""", - # See https://pypi.python.org/pypi?%3Aaction=list_classifiers - classifiers=[ - # Picked from - # http://pypi.python.org/pypi?:action=list_classifiers - #"Development Status :: 1 - Planning", - #"Development Status :: 2 - Pre-Alpha", - #"Development Status :: 3 - Alpha", - # "Development Status :: 4 - Beta", +setup( + name="gitdb", + version=__version__, + description="Git Object Database", + author=__author__, + author_email=__contact__, + url=__homepage__, + packages=('gitdb', 'gitdb.db', 'gitdb.utils', 'gitdb.test'), + license="BSD License", + zip_safe=False, + install_requires=['smmap>=3.0.1,<6'], + long_description="""GitDB is a pure-Python git object database""", + python_requires='>=3.7', + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ "Development Status :: 5 - Production/Stable", - #"Development Status :: 6 - Mature", - #"Development Status :: 7 - Inactive", "Environment :: Console", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", @@ -110,10 +34,14 @@ def get_data_files(self): "Operating System :: Microsoft :: Windows", "Operating System :: MacOS :: MacOS X", "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - ],) + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: 
Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3 :: Only", + ] +)