diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..2c7d17083 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +version: 2 +updates: + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 000000000..4f6360b71 --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,67 @@ +--- +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: CodeQL +on: + push: + branches: [master] + pull_request: + # The branches below must be a subset of the branches above + branches: [master] + schedule: + - cron: 19 10 * * 6 +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + strategy: + fail-fast: false + matrix: + language: [python] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] + # Learn more: + # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + # â„šī¸ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # âœī¸ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 000000000..df790120a --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,92 @@ +# Derived from https://github.com/actions/starter-workflows/blob/main/ci/python-package.yml +# +name: Python Package + +on: + push: + branches: ["master"] + pull_request: + branches: ["master"] + +env: + FORCE_COLOR: "1" # Make tools pretty. 
+ PIP_DISABLE_PIP_VERSION_CHECK: "1" + PIP_NO_PYTHON_VERSION_WARNING: "1" + +jobs: + build: + + runs-on: ubuntu-latest + name: "Test: python ${{ matrix.python }} / kafka ${{ matrix.kafka }}" + continue-on-error: ${{ matrix.experimental || false }} + strategy: + fail-fast: false + matrix: + kafka: + - "0.8.2.2" + - "0.9.0.1" + - "0.10.2.2" + - "0.11.0.3" + - "1.1.1" + - "2.4.0" + - "2.8.2" + - "3.0.2" + - "3.5.2" + - "3.9.0" + - "4.0.0" + python: + - "3.13" + include: + #- python: "pypy3.9" + # kafka: "2.6.0" + # experimental: true + - python: "3.8" + kafka: "4.0.0" + - python: "3.9" + kafka: "4.0.0" + - python: "3.10" + kafka: "4.0.0" + - python: "3.11" + kafka: "4.0.0" + - python: "3.12" + kafka: "4.0.0" + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: | + requirements-dev.txt + - name: Install dependencies + run: | + sudo apt install -y libsnappy-dev libzstd-dev + python -m pip install --upgrade pip + pip install -r requirements-dev.txt + - name: Pylint + run: pylint --recursive=y --errors-only --exit-zero kafka test + - name: Setup java + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 23 + - name: Restore cached kafka releases + id: cache-servers-dist-restore + uses: actions/cache/restore@v4 + with: + path: servers/dist + key: servers-dist-${{ matrix.kafka }} + - name: Install Kafka release + run: make servers/${{ matrix.kafka }}/kafka-bin + - name: Update kafka release cache + id: cache-servers-dist-save + uses: actions/cache/save@v4 + with: + path: servers/dist + key: ${{ steps.cache-servers-dist-restore.outputs.cache-primary-key }} + - name: Pytest + run: make test + env: + KAFKA_VERSION: ${{ matrix.kafka }} diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 000000000..31dbf0d70 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/requirements.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 21e51f5ed..000000000 --- a/.travis.yml +++ /dev/null @@ -1,46 +0,0 @@ -language: python - -dist: xenial - -python: - - 2.7 - - 3.4 - - 3.7 - - 3.8 - - pypy2.7-6.0 - -env: - - KAFKA_VERSION=0.8.2.2 - - KAFKA_VERSION=0.9.0.1 - - KAFKA_VERSION=0.10.2.2 - - KAFKA_VERSION=0.11.0.3 - - KAFKA_VERSION=1.1.1 - - KAFKA_VERSION=2.4.0 - - KAFKA_VERSION=2.5.0 - - KAFKA_VERSION=2.6.0 - -addons: - apt: - packages: - - libsnappy-dev - - libzstd-dev - - openjdk-8-jdk - -cache: - 
directories: - - $HOME/.cache/pip - - servers/dist - -before_install: - - source travis_java_install.sh - - ./build_integration.sh - -install: - - pip install tox coveralls - - pip install . - -script: - - tox -e `if [ "$TRAVIS_PYTHON_VERSION" == "pypy2.7-6.0" ]; then echo pypy; else echo py${TRAVIS_PYTHON_VERSION/./}; fi` - -after_success: - - coveralls diff --git a/CHANGES.md b/CHANGES.md index 097c55db6..743f3f246 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,370 @@ +# 2.2.4 (May 3, 2025) + +Fixes +* Do not `reset_generation` after RebalanceInProgressError; improve CommitFailed error messages (#2614) +* Fix KafkaConsumer.poll() with zero timeout (#2613) +* Fix Fetch._reset_offsets_async() KeyError when fetching from multiple nodes (#2612) + +# 2.2.3 (May 1, 2025) + +Fixes +* Ignore leading SECURITY_PROTOCOL:// in bootstrap_servers (#2608) +* Only create fetch requests for ready nodes (#2607) + +# 2.2.2 (Apr 30, 2025) + +Fixes +* Fix lint errors + +# 2.2.1 (Apr 29, 2025) + +Fixes +* Always try ApiVersionsRequest v0, even on broker disconnect (#2603) +* Fix SubscriptionState AttributeError in KafkaConsumer (#2599) + +Documentation +* Add transactional examples to docs + +# 2.2.0 (Apr 28, 2025) + +KafkaProducer +* KIP-98: Add idempotent producer support (#2569) +* KIP-98: Transactional Producer (#2587) +* KIP-98: Add offsets support to transactional KafkaProducer (#2590) +* Prefix producer logs w/ client id and transactional id (#2591) +* KAFKA-5429: Ignore produce response if batch was previously aborted +* KIP-91: KafkaProducer `delivery_timeout_ms` +* Default retries -> infinite +* Expand KafkaProducer docstring w/ idempotent and transactional notes +* RecordAccumulator: Use helper method to get/set `_tp_locks`; get dq with lock in reenqueue() + +KafkaConsumer +* KIP-98: Add Consumer support for `READ_COMMITTED` (#2582) +* KIP-394: handle `MEMBER_ID_REQUIRED` error w/ second join group request (#2598) +* KAFKA-5078: Defer fetch record exception if iterator has already moved across a valid record +* KAFKA-5075: Defer consumer fetcher exception if fetch position has already increased +* KAFKA-4937: Batch offset fetches in the Consumer +* KAFKA-4547: Avoid resetting paused partitions to committed offsets +* KAFKA-6397: Consumer should not block setting positions of unavailable partitions (#2593) + +Potentially Breaking Changes (internal) +* Rename CorruptRecordException -> CorruptRecordError +* Rename Coordinator errors to generic not group (#2585) +* Rename `ClusterMetadata.add_group_coordinator` -> `add_coordinator` + support txn type +* Use SaslAuthenticationFailedError in kafka.conn connection failure; Drop unused AuthenticationFailedError +* Remove old/unused errors; reorder; KafkaTimeout -> retriable +* Drop `log_start_offset` from producer RecordMetadata + +Internal +* MemoryRecords iterator; MemoryRecordsBuilder records() helper +* Convert `DefaultRecordsBuilder.size_in_bytes` to classmethod + +Fixes +* Resolve datetime deprecation warnings (#2589) +* Avoid self refcount in log messages; test thread close on all pythons +* Fix client.wakeup() race from producer/sender close +* Fix ElectionNotNeededError handling in admin client + +Tests +* Move integration tests and fixtures to test/integration/; simplify unit fixtures (#2588) +* Expand Sender test coverage (#2586) +* py2 test fixups +* Drop unused KafkaClient import from `test_fetcher` + +# 2.1.6 (May 2, 2025) + +Fixes +* Only create fetch requests for ready nodes (#2607) + +# 2.1.5 (Apr 4, 2025) + +Fixes +* Fix python2.7 
errors (#2578) + +Improvements +* Move benchmark scripts to kafka.benchmarks module (#2584) +* Use __slots__ for metrics (#2583) +* Pass `metrics_enabled=False` to disable metrics (#2581) +* Drop unused kafka.producer.buffer / SimpleBufferPool (#2580) +* Raise UnsupportedVersionError from coordinator (#2579) + +# 2.1.4 (Mar 28, 2025) + +Fixes +* Dont block pending FetchRequests when Metadata update requested (#2576) +* Fix MetadataRequest for no topics (#2573) +* Send final error byte x01 on Sasl OAuth failure (#2572) +* Reset SASL state on disconnect (#2571) +* Try import new Sequence before old to avoid DeprecationWarning + +Improvements +* Update Makefile default to 4.0 broker; add make fixture +* Improve connection state logging (#2574) + +# 2.1.3 (Mar 25, 2025) + +Fixes +* Fix crash when switching to closest compatible api_version in KafkaClient (#2567) +* Fix maximum version to send an OffsetFetchRequest in KafkaAdminClient (#2563) +* Return empty set from consumer.partitions_for_topic when topic not found (#2556) + +Improvements +* KIP-511: Use ApiVersions v4 on initial connect w/ client_software_name + version (#2558) +* KIP-74: Manage assigned partition order in consumer (#2562) +* KIP-70: Auto-commit offsets on consumer.unsubscribe(), defer assignment changes to rejoin (#2560) +* Use SubscriptionType to track topics/pattern/user assignment (#2565) +* Add optional timeout_ms kwarg to consumer.close() (#2564) +* Move ensure_valid_topic_name to kafka.util; use in client and producer (#2561) + +Testing +* Support KRaft / 4.0 brokers in tests (#2559) +* Test older pythons against 4.0 broker + +Compatibility +* Add python 3.13 to compatibility list + +# 2.1.2 (Mar 17, 2025) + +Fixes +* Simplify consumer.poll send fetches logic +* Fix crc validation in consumer / fetcher +* Lazy `_unpack_records` in PartitionRecords to fix premature fetch offset advance in consumer.poll() (#2555) +* Debug log fetch records return; separate offsets update log +* Fix Fetcher retriable error handling (#2554) +* Use six.add_metaclass for py2/py3 compatible abc (#2551) + +Improvements +* Add FetchMetrics class; move topic_fetch_metrics inside aggregator +* DefaultRecordsBatchBuilder: support empty batch +* MemoryRecordsBuilder: support arbitrary offset, skipping offsets +* Add record.validate_crc() for v0/v1 crc checks +* Remove fetcher message_generator / iterator interface +* Add size_in_bytes to ABCRecordBatch and implement for Legacy and Default +* Add magic property to ABCRecord and implement for LegacyRecord + +# 2.1.1 (Mar 16, 2025) + +Fixes +* Fix packaging of 2.1.0 in Fedora: testing requires "pytest-timeout". 
(#2550) +* Improve connection error handling when try_api_versions_check fails all attempts (#2548) +* Add lock synchronization to Future success/failure (#2549) +* Fix StickyPartitionAssignor encode + +# 2.1.0 (Mar 15, 2025) + +Support Kafka Broker 2.1 API Baseline +* Add baseline leader_epoch support for ListOffsets v4 / FetchRequest v10 (#2511) +* Support OffsetFetch v5 / OffsetCommit v6 (2.1 baseline) (#2505) +* Support 2.1 baseline consumer group apis (#2503) +* Support FindCoordinatorRequest v2 in consumer and admin client (#2502) +* Support ListOffsets v3 in consumer (#2501) +* Support Fetch Request/Response v6 in consumer (#2500) +* Add support for Metadata Request/Response v7 (#2497) +* Implement Incremental Fetch Sessions / KIP-227 (#2508) +* Implement client-side connection throttling / KIP-219 (#2510) +* Add KafkaClient.api_version(operation) for best available from api_versions (#2495) + +Consumer +* Timeout coordinator poll / ensure_coordinator_ready / ensure_active_group (#2526) +* Add optional timeout_ms kwarg to remaining consumer/coordinator methods (#2544) +* Check for coordinator.poll failure in KafkaConsumer +* Only mark coordinator dead if connection_delay > 0 (#2530) +* Delay group coordinator until after bootstrap (#2539) +* KAFKA-4160: Ensure rebalance listener not called with coordinator lock (#1438) +* Call default_offset_commit_callback after `_maybe_auto_commit_offsets_async` (#2546) +* Remove legacy/v1 consumer message iterator (#2543) +* Log warning when attempting to list offsets for unknown topic/partition (#2540) +* Add heartbeat thread id to debug logs on start +* Add inner_timeout_ms handler to fetcher; add fallback (#2529) + +Producer +* KafkaProducer: Flush pending records before close() (#2537) +* Raise immediate error on producer.send after close (#2542) +* Limit producer close timeout to 1sec in __del__; use context managers to close in test_producer +* Use NullLogger in producer atexit cleanup +* Attempt to fix metadata race condition when partitioning in producer.send (#2523) +* Remove unused partial KIP-467 implementation (ProduceResponse batch error details) (#2524) + +AdminClient +* Implement perform leader election (#2536) +* Support delete_records (#2535) + +Networking +* Call ApiVersionsRequest during connection, prior to Sasl Handshake (#2493) +* Fake api_versions for old brokers, rename to ApiVersionsRequest, and handle error decoding (#2494) +* Debug log when skipping api_versions request with pre-configured api_version +* Only refresh metadata if connection fails all dns records (#2532) +* Support connections through SOCKS5 proxies (#2531) +* Fix OverflowError when connection_max_idle_ms is 0 or inf (#2538) +* socket.setblocking for eventlet/gevent compatibility +* Support custom per-request timeouts (#2498) +* Include request_timeout_ms in request debug log +* Support client.poll with future and timeout_ms +* mask unused afi var +* Debug log if check_version connection attempt fails + +SASL Modules +* Refactor Sasl authentication with SaslMechanism abstract base class; support SaslAuthenticate (#2515) +* Add SSPI (Kerberos for Windows) authentication mechanism (#2521) +* Support AWS_MSK_IAM authentication (#2519) +* Cleanup sasl mechanism configuration checks; fix gssapi bugs; add sasl_kerberos_name config (#2520) +* Move kafka.oauth.AbstractTokenProvider -> kafka.sasl.oauth.AbstractTokenProvider (#2525) + +Testing +* Bump default python to 3.13 in CI tests (#2541) +* Update pytest log_format: use logger instead of filename; add thread 
id +* Improve test_consumer_group::test_group logging before group stabilized (#2534) +* Limit test duration to 5mins w/ pytest-timeout +* Fix external kafka/zk fixtures for testing (#2533) +* Disable zookeeper admin server to avoid port conflicts +* Set default pytest log level to debug +* test_group: shorter timeout, more logging, more sleep +* Cache servers/dist in github actions workflow (#2527) +* Remove tox.ini; update testing docs +* Use thread-specific client_id in test_group +* Fix subprocess log warning; specify timeout_ms kwarg in consumer.poll tests +* Only set KAFKA_JVM_PERFORMANCE_OPTS in makefile if unset; add note re: 2.0-2.3 broker testing +* Add kafka command to test.fixtures; raise FileNotFoundError if version not installed + +Documentation +* Improve ClusterMetadata docs re: node_id/broker_id str/int types +* Document api_version_auto_timeout_ms default; override in group tests + +Fixes +* Signal close to metrics expire_loop +* Add kafka.util timeout_ms_fn +* fixup TopicAuthorizationFailedError construction +* Fix lint issues via ruff check (#2522) +* Make the "mock" dependency optional (only used in Python < 3.3). (#2518) + +# 2.0.6 (Mar 4, 2025) + +Networking +* Improve error handling in `client._maybe_connect` (#2504) +* Client connection / `maybe_refresh_metadata` changes (#2507) +* Improve too-large timeout handling in client poll +* Default `client.check_version` timeout to `api_version_auto_timeout_ms` (#2496) + +Fixes +* Decode and skip transactional control records in consumer (#2499) +* try / except in consumer coordinator `__del__` + +Testing +* test_conn fixup for py2 + +Project Maintenance +* Add 2.0 branch for backports + +# 2.0.5 (Feb 25, 2025) + +Networking +* Remove unused client bootstrap backoff code +* 200ms timeout for client.poll in ensure_active_group and admin client + +Fixes +* Admin client: check_version only if needed, use node_id kwarg for controller +* Check for -1 controller_id in admin client +* Only acquire coordinator lock in heartbeat thread close if not self thread + +Testing +* Also sleep when waiting for consumers in test_describe_consumer_group_exists +* Refactor sasl_integration test_client - wait for node ready; use send future +* Add timeout to test_kafka_consumer +* Add error str to assert_message_count checks +* Retry on error in test fixture create_topic_via_metadata +* Fixup variable interpolation in test fixture error + +Documentation +* Update compatibility docs +* Include client_id in BrokerConnection __str__ output + +Project Maintenance +* Add make targets `servers/*/api_versions` and `servers/*/messages` + +# 2.0.4 (Feb 21, 2025) + +Networking +* Check for wakeup socket errors on read and close and reinit to reset (#2482) +* Improve client networking backoff / retry (#2480) +* Check for socket and unresolved futures before creating selector in conn.check_version (#2477) +* Handle socket init errors, e.g., when IPv6 is disabled (#2476) + +Fixes +* Avoid self-join in heartbeat thread close (#2488) + +Error Handling +* Always log broker errors in producer.send (#2478) +* Retain unrecognized broker response error codes with dynamic error class (#2481) +* Update kafka.errors with latest types (#2485) + +Compatibility +* Do not validate snappy xerial header version and compat fields (for redpanda) (#2483) + +Documentation +* Added missing docstrings in admin/client.py (#2487) + +Testing +* Update kafka broker test matrix; test against 3.9.0 (#2486) +* Add default resources for new kafka server fixtures (#2484) +* Drop make 
test-local; add PYTESTS configuration var +* Fix pytest runs when KAFKA_VERSION is not set + +Project Maintenance +* Migrate to pyproject.toml / PEP-621 +* Remove old travis files; update compatibility tests link to gha + +# 2.0.3 (Feb 12, 2025) + +Improvements +* Add optional compression libs to extras_require (#2123, #2387) +* KafkaConsumer: Exit poll if consumer is closed (#2152) +* Support configuration of custom kafka client for Admin/Consumer/Producer (#2144) +* Core Protocol: Add support for flexible versions (#2151) +* (Internal) Allow disabling thread wakeup in _send_request_to_node (#2335) +* Change loglevel of cancelled errors to info (#2467) +* Strip trailing dot off hostname for SSL validation. (#2472) +* Log connection close(error) at ERROR level (#2473) +* Support DescribeLogDirs admin api (#2475) + +Compatibility +* Support for python 3.12 (#2379, #2382) +* Kafka 2.5 / 2.6 (#2162) +* Try collections.abc imports in vendored selectors34 (#2394) +* Catch OSError when checking for gssapi import for windows compatibility (#2407) +* Update vendored six to 1.16.0 (#2398) + +Documentation +* Update usage.rst (#2308, #2334) +* Fix typos (#2319, #2207, #2178) +* Fix links to the compatibility page (#2295, #2226) +* Cleanup install instructions for optional libs (#2139) +* Update license_file to license_files (#2462) +* Update some RST documentation syntax (#2463) +* Add .readthedocs.yaml; update copyright date (#2474) + +Fixes +* Use isinstance in builtin crc32 (#2329) +* Use six.viewitems instead of six.iteritems to avoid encoding problems in StickyPartitionAssignor (#2154) +* Fix array encoding TypeError: object of type 'dict_itemiterator' has no len() (#2167) +* Only try to update sensors fetch lag if the unpacked list contains elements (#2158) +* Avoid logging errors during test fixture cleanup (#2458) +* Release coordinator lock before calling maybe_leave_group (#2460) +* Dont raise RuntimeError for dead process in SpawnedService.wait_for() (#2461) +* Cast the size of a MemoryRecordsBuilder object (#2438) +* Fix DescribeConfigsResponse_v1 config_source (#2464) +* Fix base class of DescribeClientQuotasResponse_v0 (#2465) +* Update socketpair w/ CVE-2024-3219 fix (#2468) + +Testing +* Transition CI/CD to GitHub Workflows (#2378, #2392, #2381, #2406, #2419, #2418, #2417, #2456) +* Refactor Makefile (#2457) +* Use assert_called_with in client_async tests (#2375) +* Cover sticky assignor's metadata method with tests (#2161) +* Update fixtures.py to check "127.0.0.1" for auto port assignment (#2384) +* Use -Djava.security.manager=allow for Java 23 sasl tests (#2469) +* Test with Java 23 (#2470) +* Update kafka properties template; disable group rebalance delay (#2471) + # 2.0.2 (Sep 29, 2020) Consumer diff --git a/Makefile b/Makefile index b4dcbffc9..30da9cf91 100644 --- a/Makefile +++ b/Makefile @@ -1,35 +1,34 @@ -# Some simple testing tasks (sorry, UNIX only). +# Some simple testing tasks -FLAGS= -KAFKA_VERSION=0.11.0.2 -SCALA_VERSION=2.12 +SHELL = bash + +export KAFKA_VERSION ?= 4.0.0 +DIST_BASE_URL ?= https://archive.apache.org/dist/kafka/ + +# Required to support testing old kafka versions on newer java releases +# The performance opts defaults are set in each kafka brokers bin/kafka_run_class.sh file +# The values here are taken from the 2.4.0 release. 
+# Note that kafka versions 2.0-2.3 crash on newer java releases; openjdk@11 should work with with "-Djava.security.manager=allow" removed from performance opts +export KAFKA_JVM_PERFORMANCE_OPTS?=-server -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:+ExplicitGCInvokesConcurrent -Djava.awt.headless=true -Djava.security.manager=allow + +PYTESTS ?= 'test' setup: pip install -r requirements-dev.txt pip install -Ue . -servers/$(KAFKA_VERSION)/kafka-bin: - KAFKA_VERSION=$(KAFKA_VERSION) SCALA_VERSION=$(SCALA_VERSION) ./build_integration.sh - -build-integration: servers/$(KAFKA_VERSION)/kafka-bin - -# Test and produce coverage using tox. This is the same as is run on Travis -test37: build-integration - KAFKA_VERSION=$(KAFKA_VERSION) SCALA_VERSION=$(SCALA_VERSION) tox -e py37 -- $(FLAGS) +lint: + pylint --recursive=y --errors-only kafka test -test27: build-integration - KAFKA_VERSION=$(KAFKA_VERSION) SCALA_VERSION=$(SCALA_VERSION) tox -e py27 -- $(FLAGS) +test: build-integration + pytest $(PYTESTS) -# Test using py.test directly if you want to use local python. Useful for other -# platforms that require manual installation for C libraries, ie. Windows. -test-local: build-integration - KAFKA_VERSION=$(KAFKA_VERSION) SCALA_VERSION=$(SCALA_VERSION) py.test \ - --pylint --pylint-rcfile=pylint.rc --pylint-error-types=EF $(FLAGS) kafka test +fixture: build-integration + python -m test.integration.fixtures kafka cov-local: build-integration - KAFKA_VERSION=$(KAFKA_VERSION) SCALA_VERSION=$(SCALA_VERSION) py.test \ - --pylint --pylint-rcfile=pylint.rc --pylint-error-types=EF --cov=kafka \ - --cov-config=.covrc --cov-report html $(FLAGS) kafka test + pytest --pylint --pylint-rcfile=pylint.rc --pylint-error-types=EF --cov=kafka \ + --cov-config=.covrc --cov-report html $(TEST_FLAGS) kafka test @echo "open file://`pwd`/htmlcov/index.html" # Check the readme for syntax errors, which can lead to invalid formatting on @@ -56,4 +55,61 @@ doc: make -C docs html @echo "open file://`pwd`/docs/_build/html/index.html" -.PHONY: all test37 test27 test-local cov-local clean doc +.PHONY: all test test-local cov-local clean doc dist publish + +kafka_artifact_version=$(lastword $(subst -, ,$(1))) + +# Mappings for artifacts -> scala version; any unlisted will use default 2.12 +kafka_scala_0_8_0=2.8.0 +kafka_scala_0_8_1=2.10 +kafka_scala_0_8_1_1=2.10 +kafka_scala_0_8_2_0=2.11 +kafka_scala_0_8_2_1=2.11 +kafka_scala_0_8_2_2=2.11 +kafka_scala_0_9_0_0=2.11 +kafka_scala_0_9_0_1=2.11 +kafka_scala_0_10_0_0=2.11 +kafka_scala_0_10_0_1=2.11 +kafka_scala_0_10_1_0=2.11 +kafka_scala_4_0_0=2.13 +scala_version=$(if $(SCALA_VERSION),$(SCALA_VERSION),$(if $(kafka_scala_$(subst .,_,$(1))),$(kafka_scala_$(subst .,_,$(1))),2.12)) + +kafka_artifact_name=kafka_$(call scala_version,$(1))-$(1).$(if $(filter 0.8.0,$(1)),tar.gz,tgz) + +build-integration: servers/$(KAFKA_VERSION)/kafka-bin + +servers/dist: + mkdir -p servers/dist + +servers/dist/kafka_%.tgz servers/dist/kafka_%.tar.gz: + @echo "Downloading $(@F)" + wget -nv -P servers/dist/ -N $(DIST_BASE_URL)$(call kafka_artifact_version,$*)/$(@F) + +servers/dist/jakarta.xml.bind-api-2.3.3.jar: + wget -nv -P servers/dist/ -N https://repo1.maven.org/maven2/jakarta/xml/bind/jakarta.xml.bind-api/2.3.3/jakarta.xml.bind-api-2.3.3.jar + +# to allow us to derive the prerequisite artifact name from the target name +.SECONDEXPANSION: + +servers/%/kafka-bin: servers/dist/$$(call kafka_artifact_name,$$*) | servers/dist + @echo "Extracting kafka $* binaries from $<" + if [ -d 
"$@" ]; then rm -rf $@.bak; mv $@ $@.bak; fi + mkdir -p $@ + tar xzvf $< -C $@ --strip-components 1 + if [[ "$*" < "1" ]]; then make servers/patch-libs/$*; fi + +servers/%/api_versions: servers/$$*/kafka-bin + KAFKA_VERSION=$* python -m test.integration.fixtures get_api_versions >$@ + +servers/%/messages: servers/$$*/kafka-bin + cd servers/$*/ && jar xvf kafka-bin/libs/kafka-clients-$*.jar common/message/ + mv servers/$*/common/message/ servers/$*/messages/ + rmdir servers/$*/common + +servers/patch-libs/%: servers/dist/jakarta.xml.bind-api-2.3.3.jar | servers/$$*/kafka-bin + cp $< servers/$*/kafka-bin/libs/ + +servers/download/%: servers/dist/$$(call kafka_artifact_name,$$*) | servers/dist ; + +# Avoid removing any pattern match targets as intermediates (without this, .tgz artifacts are removed by make after extraction) +.SECONDARY: diff --git a/README.rst b/README.rst index 5f834442c..b820c34eb 100644 --- a/README.rst +++ b/README.rst @@ -1,16 +1,22 @@ Kafka Python client ------------------------ -.. image:: https://img.shields.io/badge/kafka-2.6%2C%202.5%2C%202.4%2C%202.3%2C%202.2%2C%202.1%2C%202.0%2C%201.1%2C%201.0%2C%200.11%2C%200.10%2C%200.9%2C%200.8-brightgreen.svg +.. image:: https://img.shields.io/badge/kafka-3.9--0.8-brightgreen.svg :target: https://kafka-python.readthedocs.io/en/master/compatibility.html .. image:: https://img.shields.io/pypi/pyversions/kafka-python.svg :target: https://pypi.python.org/pypi/kafka-python .. image:: https://coveralls.io/repos/dpkp/kafka-python/badge.svg?branch=master&service=github :target: https://coveralls.io/github/dpkp/kafka-python?branch=master -.. image:: https://travis-ci.org/dpkp/kafka-python.svg?branch=master - :target: https://travis-ci.org/dpkp/kafka-python .. image:: https://img.shields.io/badge/license-Apache%202-blue.svg :target: https://github.com/dpkp/kafka-python/blob/master/LICENSE +.. image:: https://img.shields.io/pypi/dw/kafka-python.svg + :target: https://pypistats.org/packages/kafka-python +.. image:: https://img.shields.io/pypi/v/kafka-python.svg + :target: https://pypi.org/project/kafka-python +.. image:: https://img.shields.io/pypi/implementation/kafka-python + :target: https://github.com/dpkp/kafka-python/blob/master/setup.py + + Python client for the Apache Kafka distributed stream processing system. kafka-python is designed to function much like the official java client, with a @@ -26,13 +32,15 @@ check code (perhaps using zookeeper or consul). For older brokers, you can achieve something similar by manually assigning different partitions to each consumer instance with config management tools like chef, ansible, etc. This approach will work fine, though it does not support rebalancing on failures. -See +See https://kafka-python.readthedocs.io/en/master/compatibility.html for more details. Please note that the master branch may contain unreleased features. For release documentation, please see readthedocs and/or python's inline help. ->>> pip install kafka-python +.. code-block:: bash + + $ pip install kafka-python KafkaConsumer @@ -42,42 +50,62 @@ KafkaConsumer is a high-level message consumer, intended to operate as similarly as possible to the official java client. Full support for coordinated consumer groups requires use of kafka brokers that support the Group APIs: kafka v0.9+. -See +See https://kafka-python.readthedocs.io/en/master/apidoc/KafkaConsumer.html for API and configuration details. 
The consumer iterator returns ConsumerRecords, which are simple namedtuples that expose basic message attributes: topic, partition, offset, key, and value: ->>> from kafka import KafkaConsumer ->>> consumer = KafkaConsumer('my_favorite_topic') ->>> for msg in consumer: -... print (msg) +.. code-block:: python + + from kafka import KafkaConsumer + consumer = KafkaConsumer('my_favorite_topic') + for msg in consumer: + print (msg) + +.. code-block:: python + + # join a consumer group for dynamic partition assignment and offset commits + from kafka import KafkaConsumer + consumer = KafkaConsumer('my_favorite_topic', group_id='my_favorite_group') + for msg in consumer: + print (msg) ->>> # join a consumer group for dynamic partition assignment and offset commits ->>> from kafka import KafkaConsumer ->>> consumer = KafkaConsumer('my_favorite_topic', group_id='my_favorite_group') ->>> for msg in consumer: -... print (msg) +.. code-block:: python ->>> # manually assign the partition list for the consumer ->>> from kafka import TopicPartition ->>> consumer = KafkaConsumer(bootstrap_servers='localhost:1234') ->>> consumer.assign([TopicPartition('foobar', 2)]) ->>> msg = next(consumer) + # manually assign the partition list for the consumer + from kafka import TopicPartition + consumer = KafkaConsumer(bootstrap_servers='localhost:1234') + consumer.assign([TopicPartition('foobar', 2)]) + msg = next(consumer) ->>> # Deserialize msgpack-encoded values ->>> consumer = KafkaConsumer(value_deserializer=msgpack.loads) ->>> consumer.subscribe(['msgpackfoo']) ->>> for msg in consumer: -... assert isinstance(msg.value, dict) +.. code-block:: python ->>> # Access record headers. The returned value is a list of tuples ->>> # with str, bytes for key and value ->>> for msg in consumer: -... print (msg.headers) + # Deserialize msgpack-encoded values + consumer = KafkaConsumer(value_deserializer=msgpack.loads) + consumer.subscribe(['msgpackfoo']) + for msg in consumer: + assert isinstance(msg.value, dict) ->>> # Get consumer metrics ->>> metrics = consumer.metrics() +.. code-block:: python + + # Access record headers. The returned value is a list of tuples + # with str, bytes for key and value + for msg in consumer: + print (msg.headers) + +.. code-block:: python + + # Read only committed messages from transactional topic + consumer = KafkaConsumer(isolation_level='read_committed') + consumer.subscribe(['txn_topic']) + for msg in consumer: + print(msg) + +.. code-block:: python + + # Get consumer metrics + metrics = consumer.metrics() KafkaProducer @@ -85,46 +113,79 @@ KafkaProducer KafkaProducer is a high-level, asynchronous message producer. The class is intended to operate as similarly as possible to the official java client. -See +See https://kafka-python.readthedocs.io/en/master/apidoc/KafkaProducer.html for more details. ->>> from kafka import KafkaProducer ->>> producer = KafkaProducer(bootstrap_servers='localhost:1234') ->>> for _ in range(100): -... producer.send('foobar', b'some_message_bytes') +.. code-block:: python + + from kafka import KafkaProducer + producer = KafkaProducer(bootstrap_servers='localhost:1234') + for _ in range(100): + producer.send('foobar', b'some_message_bytes') + +.. code-block:: python + + # Block until a single message is sent (or timeout) + future = producer.send('foobar', b'another_message') + result = future.get(timeout=60) + +.. code-block:: python + + # Block until all pending messages are at least put on the network + # NOTE: This does not guarantee delivery or success! 
It is really + # only useful if you configure internal batching using linger_ms + producer.flush() + +.. code-block:: python + + # Use a key for hashed-partitioning + producer.send('foobar', key=b'foo', value=b'bar') + +.. code-block:: python + + # Serialize json messages + import json + producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8')) + producer.send('fizzbuzz', {'foo': 'bar'}) + +.. code-block:: python + + # Serialize string keys + producer = KafkaProducer(key_serializer=str.encode) + producer.send('flipflap', key='ping', value=b'1234') + +.. code-block:: python ->>> # Block until a single message is sent (or timeout) ->>> future = producer.send('foobar', b'another_message') ->>> result = future.get(timeout=60) + # Compress messages + producer = KafkaProducer(compression_type='gzip') + for i in range(1000): + producer.send('foobar', b'msg %d' % i) ->>> # Block until all pending messages are at least put on the network ->>> # NOTE: This does not guarantee delivery or success! It is really ->>> # only useful if you configure internal batching using linger_ms ->>> producer.flush() +.. code-block:: python ->>> # Use a key for hashed-partitioning ->>> producer.send('foobar', key=b'foo', value=b'bar') + # Use transactions + producer = KafkaProducer(transactional_id='fizzbuzz') + producer.init_transactions() + producer.begin_transaction() + future = producer.send('txn_topic', value=b'yes') + future.get() # wait for successful produce + producer.commit_transaction() # commit the transaction ->>> # Serialize json messages ->>> import json ->>> producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8')) ->>> producer.send('fizzbuzz', {'foo': 'bar'}) + producer.begin_transaction() + future = producer.send('txn_topic', value=b'no') + future.get() # wait for successful produce + producer.abort_transaction() # abort the transaction ->>> # Serialize string keys ->>> producer = KafkaProducer(key_serializer=str.encode) ->>> producer.send('flipflap', key='ping', value=b'1234') +.. code-block:: python ->>> # Compress messages ->>> producer = KafkaProducer(compression_type='gzip') ->>> for i in range(1000): -... producer.send('foobar', b'msg %d' % i) + # Include record headers. The format is list of tuples with string key + # and bytes value. + producer.send('foobar', value=b'c29tZSB2YWx1ZQ==', headers=[('content-encoding', b'base64')]) ->>> # Include record headers. The format is list of tuples with string key ->>> # and bytes value. ->>> producer.send('foobar', value=b'c29tZSB2YWx1ZQ==', headers=[('content-encoding', b'base64')]) +.. code-block:: python ->>> # Get producer performance metrics ->>> metrics = producer.metrics() + # Get producer performance metrics + metrics = producer.metrics() Thread safety @@ -148,7 +209,7 @@ kafka-python supports the following compression formats: - Zstandard (zstd) gzip is supported natively, the others require installing additional libraries. -See for more information. +See https://kafka-python.readthedocs.io/en/master/install.html for more information. Optimized CRC32 Validation @@ -157,7 +218,7 @@ Optimized CRC32 Validation Kafka uses CRC32 checksums to validate messages. kafka-python includes a pure python implementation for compatibility. To improve performance for high-throughput applications, kafka-python will use `crc32c` for optimized native code if installed. -See for installation instructions. +See https://kafka-python.readthedocs.io/en/master/install.html for installation instructions. 
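A minimal sketch for checking whether the optional native library is importable in a given environment (per the paragraph above, kafka-python falls back to its pure-python CRC implementation when it is not):

.. code-block:: python

    # kafka-python uses the optional crc32c package for native CRC validation
    # when it can be imported; otherwise the pure-python fallback is used.
    try:
        import crc32c  # noqa: F401
        print('crc32c installed: optimized CRC validation available')
    except ImportError:
        print('crc32c not installed: using pure-python CRC fallback')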
See https://pypi.org/project/crc32c/ for details on the underlying crc32c lib. diff --git a/benchmarks/load_example.py b/benchmarks/load_example.py deleted file mode 100755 index eef113e9a..000000000 --- a/benchmarks/load_example.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -import threading, logging, time - -from kafka import KafkaConsumer, KafkaProducer - -msg_size = 524288 - -producer_stop = threading.Event() -consumer_stop = threading.Event() - -class Producer(threading.Thread): - big_msg = b'1' * msg_size - - def run(self): - producer = KafkaProducer(bootstrap_servers='localhost:9092') - self.sent = 0 - - while not producer_stop.is_set(): - producer.send('my-topic', self.big_msg) - self.sent += 1 - producer.flush() - - -class Consumer(threading.Thread): - - def run(self): - consumer = KafkaConsumer(bootstrap_servers='localhost:9092', - auto_offset_reset='earliest') - consumer.subscribe(['my-topic']) - self.valid = 0 - self.invalid = 0 - - for message in consumer: - if len(message.value) == msg_size: - self.valid += 1 - else: - self.invalid += 1 - - if consumer_stop.is_set(): - break - - consumer.close() - -def main(): - threads = [ - Producer(), - Consumer() - ] - - for t in threads: - t.start() - - time.sleep(10) - producer_stop.set() - consumer_stop.set() - print('Messages sent: %d' % threads[0].sent) - print('Messages recvd: %d' % threads[1].valid) - print('Messages invalid: %d' % threads[1].invalid) - -if __name__ == "__main__": - logging.basicConfig( - format='%(asctime)s.%(msecs)s:%(name)s:%(thread)d:%(levelname)s:%(process)d:%(message)s', - level=logging.INFO - ) - main() diff --git a/build_integration.sh b/build_integration.sh deleted file mode 100755 index c020b0fe2..000000000 --- a/build_integration.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -: ${ALL_RELEASES:="0.8.2.2 0.9.0.1 0.10.1.1 0.10.2.2 0.11.0.3 1.0.2 1.1.1 2.0.1 2.1.1 2.2.1 2.3.0 2.4.0 2.5.0"} -: ${SCALA_VERSION:=2.11} -: ${DIST_BASE_URL:=https://archive.apache.org/dist/kafka/} -: ${KAFKA_SRC_GIT:=https://github.com/apache/kafka.git} - -# On travis CI, empty KAFKA_VERSION means skip integration tests -# so we don't try to get binaries -# Otherwise it means test all official releases, so we get all of them! -if [ -z "$KAFKA_VERSION" -a -z "$TRAVIS" ]; then - KAFKA_VERSION=$ALL_RELEASES -fi - -pushd servers - mkdir -p dist - pushd dist - for kafka in $KAFKA_VERSION; do - if [ "$kafka" == "trunk" ]; then - if [ ! -d "$kafka" ]; then - git clone $KAFKA_SRC_GIT $kafka - fi - pushd $kafka - git pull - ./gradlew -PscalaVersion=$SCALA_VERSION -Pversion=$kafka releaseTarGz -x signArchives - popd - # Not sure how to construct the .tgz name accurately, so use a wildcard (ugh) - tar xzvf $kafka/core/build/distributions/kafka_*.tgz -C ../$kafka/ - rm $kafka/core/build/distributions/kafka_*.tgz - rm -rf ../$kafka/kafka-bin - mv ../$kafka/kafka_* ../$kafka/kafka-bin - else - echo "-------------------------------------" - echo "Checking kafka binaries for ${kafka}" - echo - if [ "$kafka" == "0.8.0" ]; then - KAFKA_ARTIFACT="kafka_2.8.0-${kafka}.tar.gz" - else if [ "$kafka" \> "2.4.0" ]; then - KAFKA_ARTIFACT="kafka_2.12-${kafka}.tgz" - else - KAFKA_ARTIFACT="kafka_${SCALA_VERSION}-${kafka}.tgz" - fi - fi - if [ ! 
-f "../$kafka/kafka-bin/bin/kafka-run-class.sh" ]; then - if [ -f "${KAFKA_ARTIFACT}" ]; then - echo "Using cached artifact: ${KAFKA_ARTIFACT}" - else - echo "Downloading kafka ${kafka} tarball" - TARBALL=${DIST_BASE_URL}${kafka}/${KAFKA_ARTIFACT} - if command -v wget 2>/dev/null; then - wget -N $TARBALL - else - echo "wget not found... using curl" - curl -f $TARBALL -o ${KAFKA_ARTIFACT} - fi - fi - echo - echo "Extracting kafka ${kafka} binaries" - tar xzvf ${KAFKA_ARTIFACT} -C ../$kafka/ - rm -rf ../$kafka/kafka-bin - mv ../$kafka/${KAFKA_ARTIFACT/%.t*/} ../$kafka/kafka-bin - if [ ! -f "../$kafka/kafka-bin/bin/kafka-run-class.sh" ]; then - echo "Extraction Failed ($kafka/kafka-bin/bin/kafka-run-class.sh does not exist)!" - exit 1 - fi - else - echo "$kafka is already installed in servers/$kafka/ -- skipping" - fi - fi - echo - done - popd -popd diff --git a/docs/changelog.rst b/docs/changelog.rst index 446b29021..030114a3f 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,457 @@ Changelog ========= +2.2.4 (May 3, 2025) +################### + +Fixes +----- +* Do not `reset_generation` after RebalanceInProgressError; improve CommitFailed error messages (#2614) +* Fix KafkaConsumer.poll() with zero timeout (#2613) +* Fix Fetch._reset_offsets_async() KeyError when fetching from multiple nodes (#2612) + + +2.2.3 (May 1, 2025) +################### + +Fixes +----- +* Ignore leading SECURITY_PROTOCOL:// in bootstrap_servers (#2608) +* Only create fetch requests for ready nodes (#2607) + + +2.2.2 (Apr 30, 2025) +#################### + +Fixes +----- +* Fix lint errors + + +2.2.1 (Apr 29, 2025) +#################### + +Fixes +----- +* Always try ApiVersionsRequest v0, even on broker disconnect (#2603) +* Fix SubscriptionState AttributeError in KafkaConsumer (#2599) + +Documentation +------------- +* Add transactional examples to docs + + +2.2.0 (Apr 28, 2025) +#################### + +KafkaProducer +------------- +* KIP-98: Add idempotent producer support (#2569) +* KIP-98: Transactional Producer (#2587) +* KIP-98: Add offsets support to transactional KafkaProducer (#2590) +* Prefix producer logs w/ client id and transactional id (#2591) +* KAFKA-5429: Ignore produce response if batch was previously aborted +* KIP-91: KafkaProducer `delivery_timeout_ms` +* Default retries -> infinite +* Expand KafkaProducer docstring w/ idempotent and transactional notes +* RecordAccumulator: Use helper method to get/set `_tp_locks`; get dq with lock in reenqueue() + +KafkaConsumer +------------- +* KIP-98: Add Consumer support for `READ_COMMITTED` (#2582) +* KIP-394: handle `MEMBER_ID_REQUIRED` error w/ second join group request (#2598) +* KAFKA-5078: Defer fetch record exception if iterator has already moved across a valid record +* KAFKA-5075: Defer consumer fetcher exception if fetch position has already increased +* KAFKA-4937: Batch offset fetches in the Consumer +* KAFKA-4547: Avoid resetting paused partitions to committed offsets +* KAFKA-6397: Consumer should not block setting positions of unavailable partitions (#2593) + +Potentially Breaking Changes (internal) +--------------------------------------- +* Rename CorruptRecordException -> CorruptRecordError +* Rename Coordinator errors to generic not group (#2585) +* Rename `ClusterMetadata.add_group_coordinator` -> `add_coordinator` + support txn type +* Use SaslAuthenticationFailedError in kafka.conn connection failure; Drop unused AuthenticationFailedError +* Remove old/unused errors; reorder; KafkaTimeout -> retriable +* Drop 
`log_start_offset` from producer RecordMetadata + +Internal +-------- +* MemoryRecords iterator; MemoryRecordsBuilder records() helper +* Convert `DefaultRecordsBuilder.size_in_bytes` to classmethod + +Fixes +----- +* Resolve datetime deprecation warnings (#2589) +* Avoid self refcount in log messages; test thread close on all pythons +* Fix client.wakeup() race from producer/sender close +* Fix ElectionNotNeededError handling in admin client + +Tests +----- +* Move integration tests and fixtures to test/integration/; simplify unit fixtures (#2588) +* Expand Sender test coverage (#2586) +* py2 test fixups +* Drop unused KafkaClient import from `test_fetcher` + + +2.1.6 (May 2, 2025) +################### + +Fixes +----- +* Only create fetch requests for ready nodes (#2607) + + +2.1.5 (Apr 4, 2025) +################### + +Fixes +------ +* Fix python2.7 errors (#2578) + +Improvements +------------ +* Move benchmark scripts to kafka.benchmarks module (#2584) +* Use __slots__ for metrics (#2583) +* Pass `metrics_enabled=False` to disable metrics (#2581) +* Drop unused kafka.producer.buffer / SimpleBufferPool (#2580) +* Raise UnsupportedVersionError from coordinator (#2579) + + +2.1.4 (Mar 28, 2025) +#################### + +Fixes +----- +* Dont block pending FetchRequests when Metadata update requested (#2576) +* Fix MetadataRequest for no topics (#2573) +* Send final error byte x01 on Sasl OAuth failure (#2572) +* Reset SASL state on disconnect (#2571) +* Try import new Sequence before old to avoid DeprecationWarning + +Improvements +------------ +* Update Makefile default to 4.0 broker; add make fixture +* Improve connection state logging (#2574) + + +2.1.3 (Mar 25, 2025) +#################### + +Fixes +----- +* Fix crash when switching to closest compatible api_version in KafkaClient (#2567) +* Fix maximum version to send an OffsetFetchRequest in KafkaAdminClient (#2563) +* Return empty set from consumer.partitions_for_topic when topic not found (#2556) + +Improvements +------------ +* KIP-511: Use ApiVersions v4 on initial connect w/ client_software_name + version (#2558) +* KIP-74: Manage assigned partition order in consumer (#2562) +* KIP-70: Auto-commit offsets on consumer.unsubscribe(), defer assignment changes to rejoin (#2560) +* Use SubscriptionType to track topics/pattern/user assignment (#2565) +* Add optional timeout_ms kwarg to consumer.close() (#2564) +* Move ensure_valid_topic_name to kafka.util; use in client and producer (#2561) + +Testing +------- +* Support KRaft / 4.0 brokers in tests (#2559) +* Test older pythons against 4.0 broker + +Compatibility +------------- +* Add python 3.13 to compatibility list + + +2.1.2 (Mar 17, 2025) +#################### + +Fixes +----- +* Simplify consumer.poll send fetches logic +* Fix crc validation in consumer / fetcher +* Lazy `_unpack_records` in PartitionRecords to fix premature fetch offset advance in consumer.poll() (#2555) +* Debug log fetch records return; separate offsets update log +* Fix Fetcher retriable error handling (#2554) +* Use six.add_metaclass for py2/py3 compatible abc (#2551) + +Improvements +------------ +* Add FetchMetrics class; move topic_fetch_metrics inside aggregator +* DefaultRecordsBatchBuilder: support empty batch +* MemoryRecordsBuilder: support arbitrary offset, skipping offsets +* Add record.validate_crc() for v0/v1 crc checks +* Remove fetcher message_generator / iterator interface +* Add size_in_bytes to ABCRecordBatch and implement for Legacy and Default +* Add magic property to ABCRecord and implement 
for LegacyRecord + + +2.1.1 (Mar 16, 2025) +#################### + +Fixes +----- +* Fix packaging of 2.1.0 in Fedora: testing requires "pytest-timeout". (#2550) +* Improve connection error handling when try_api_versions_check fails all attempts (#2548) +* Add lock synchronization to Future success/failure (#2549) +* Fix StickyPartitionAssignor encode + + +2.1.0 (Mar 15, 2025) +#################### + +Support Kafka Broker 2.1 API Baseline +------------------------------------- +* Add baseline leader_epoch support for ListOffsets v4 / FetchRequest v10 (#2511) +* Support OffsetFetch v5 / OffsetCommit v6 (2.1 baseline) (#2505) +* Support 2.1 baseline consumer group apis (#2503) +* Support FindCoordinatorRequest v2 in consumer and admin client (#2502) +* Support ListOffsets v3 in consumer (#2501) +* Support Fetch Request/Response v6 in consumer (#2500) +* Add support for Metadata Request/Response v7 (#2497) +* Implement Incremental Fetch Sessions / KIP-227 (#2508) +* Implement client-side connection throttling / KIP-219 (#2510) +* Add KafkaClient.api_version(operation) for best available from api_versions (#2495) + +Consumer +-------- +* Timeout coordinator poll / ensure_coordinator_ready / ensure_active_group (#2526) +* Add optional timeout_ms kwarg to remaining consumer/coordinator methods (#2544) +* Check for coordinator.poll failure in KafkaConsumer +* Only mark coordinator dead if connection_delay > 0 (#2530) +* Delay group coordinator until after bootstrap (#2539) +* KAFKA-4160: Ensure rebalance listener not called with coordinator lock (#1438) +* Call default_offset_commit_callback after `_maybe_auto_commit_offsets_async` (#2546) +* Remove legacy/v1 consumer message iterator (#2543) +* Log warning when attempting to list offsets for unknown topic/partition (#2540) +* Add heartbeat thread id to debug logs on start +* Add inner_timeout_ms handler to fetcher; add fallback (#2529) + +Producer +-------- +* KafkaProducer: Flush pending records before close() (#2537) +* Raise immediate error on producer.send after close (#2542) +* Limit producer close timeout to 1sec in __del__; use context managers to close in test_producer +* Use NullLogger in producer atexit cleanup +* Attempt to fix metadata race condition when partitioning in producer.send (#2523) +* Remove unused partial KIP-467 implementation (ProduceResponse batch error details) (#2524) + +AdminClient +----------- +* Implement perform leader election (#2536) +* Support delete_records (#2535) + +Networking +---------- +* Call ApiVersionsRequest during connection, prior to Sasl Handshake (#2493) +* Fake api_versions for old brokers, rename to ApiVersionsRequest, and handle error decoding (#2494) +* Debug log when skipping api_versions request with pre-configured api_version +* Only refresh metadata if connection fails all dns records (#2532) +* Support connections through SOCKS5 proxies (#2531) +* Fix OverflowError when connection_max_idle_ms is 0 or inf (#2538) +* socket.setblocking for eventlet/gevent compatibility +* Support custom per-request timeouts (#2498) +* Include request_timeout_ms in request debug log +* Support client.poll with future and timeout_ms +* mask unused afi var +* Debug log if check_version connection attempt fails + +SASL Modules +------------ +* Refactor Sasl authentication with SaslMechanism abstract base class; support SaslAuthenticate (#2515) +* Add SSPI (Kerberos for Windows) authentication mechanism (#2521) +* Support AWS_MSK_IAM authentication (#2519) +* Cleanup sasl mechanism configuration checks; fix 
gssapi bugs; add sasl_kerberos_name config (#2520) +* Move kafka.oauth.AbstractTokenProvider -> kafka.sasl.oauth.AbstractTokenProvider (#2525) + +Testing +------- +* Bump default python to 3.13 in CI tests (#2541) +* Update pytest log_format: use logger instead of filename; add thread id +* Improve test_consumer_group::test_group logging before group stabilized (#2534) +* Limit test duration to 5mins w/ pytest-timeout +* Fix external kafka/zk fixtures for testing (#2533) +* Disable zookeeper admin server to avoid port conflicts +* Set default pytest log level to debug +* test_group: shorter timeout, more logging, more sleep +* Cache servers/dist in github actions workflow (#2527) +* Remove tox.ini; update testing docs +* Use thread-specific client_id in test_group +* Fix subprocess log warning; specify timeout_ms kwarg in consumer.poll tests +* Only set KAFKA_JVM_PERFORMANCE_OPTS in makefile if unset; add note re: 2.0-2.3 broker testing +* Add kafka command to test.fixtures; raise FileNotFoundError if version not installed + +Documentation +------------- +* Improve ClusterMetadata docs re: node_id/broker_id str/int types +* Document api_version_auto_timeout_ms default; override in group tests + +Fixes +----- +* Signal close to metrics expire_loop +* Add kafka.util timeout_ms_fn +* fixup TopicAuthorizationFailedError construction +* Fix lint issues via ruff check (#2522) +* Make the "mock" dependency optional (only used in Python < 3.3). (#2518) + + +2.0.6 (Mar 4, 2025) +################### + +Networking +---------- +* Improve error handling in `client._maybe_connect` (#2504) +* Client connection / `maybe_refresh_metadata` changes (#2507) +* Improve too-large timeout handling in client poll +* Default `client.check_version` timeout to `api_version_auto_timeout_ms` (#2496) + +Fixes +----- +* Decode and skip transactional control records in consumer (#2499) +* try / except in consumer coordinator `__del__` + +Testing +------- +* test_conn fixup for py2 + +Project Maintenance +------------------- +* Add 2.0 branch for backports + + +2.0.5 (Feb 25, 2025) +#################### + +Networking +---------- +* Remove unused client bootstrap backoff code +* 200ms timeout for client.poll in ensure_active_group and admin client + +Fixes +----- +* Admin client: check_version only if needed, use node_id kwarg for controller +* Check for -1 controller_id in admin client +* Only acquire coordinator lock in heartbeat thread close if not self thread + +Testing +------- +* Also sleep when waiting for consumers in test_describe_consumer_group_exists +* Refactor sasl_integration test_client - wait for node ready; use send future +* Add timeout to test_kafka_consumer +* Add error str to assert_message_count checks +* Retry on error in test fixture create_topic_via_metadata +* Fixup variable interpolation in test fixture error + +Documentation +------------- +* Update compatibility docs +* Include client_id in BrokerConnection __str__ output + +Project Maintenance +------------------- +* Add make targets `servers/*/api_versions` and `servers/*/messages` + + +2.0.4 (Feb 21, 2025) +#################### + +Networking +---------- +* Check for wakeup socket errors on read and close and reinit to reset (#2482) +* Improve client networking backoff / retry (#2480) +* Check for socket and unresolved futures before creating selector in conn.check_version (#2477) +* Handle socket init errors, e.g., when IPv6 is disabled (#2476) + +Fixes +----- +* Avoid self-join in heartbeat thread close (#2488) + +Error Handling 
+-------------- +* Always log broker errors in producer.send (#2478) +* Retain unrecognized broker response error codes with dynamic error class (#2481) +* Update kafka.errors with latest types (#2485) + +Compatibility +------------- +* Do not validate snappy xerial header version and compat fields (for redpanda) (#2483) + +Documentation +------------- +* Added missing docstrings in admin/client.py (#2487) + +Testing +------- +* Update kafka broker test matrix; test against 3.9.0 (#2486) +* Add default resources for new kafka server fixtures (#2484) +* Drop make test-local; add PYTESTS configuration var +* Fix pytest runs when KAFKA_VERSION is not set + +Project Maintenance +------------------- +* Migrate to pyproject.toml / PEP-621 +* Remove old travis files; update compatibility tests link to gha + + +2.0.3 (Feb 12, 2025) +#################### + +Improvements +------------ +* Add optional compression libs to extras_require (#2123, #2387) +* KafkaConsumer: Exit poll if consumer is closed (#2152) +* Support configuration of custom kafka client for Admin/Consumer/Producer (#2144) +* Core Protocol: Add support for flexible versions (#2151) +* (Internal) Allow disabling thread wakeup in _send_request_to_node (#2335) +* Change loglevel of cancelled errors to info (#2467) +* Strip trailing dot off hostname for SSL validation. (#2472) +* Log connection close(error) at ERROR level (#2473) +* Support DescribeLogDirs admin api (#2475) + +Compatibility +------------- +* Support for python 3.12 (#2379, #2382) +* Kafka 2.5 / 2.6 (#2162) +* Try collections.abc imports in vendored selectors34 (#2394) +* Catch OSError when checking for gssapi import for windows compatibility (#2407) +* Update vendored six to 1.16.0 (#2398) + +Documentation +------------- +* Update usage.rst (#2308, #2334) +* Fix typos (#2319, #2207, #2178) +* Fix links to the compatibility page (#2295, #2226) +* Cleanup install instructions for optional libs (#2139) +* Update license_file to license_files (#2462) +* Update some RST documentation syntax (#2463) +* Add .readthedocs.yaml; update copyright date (#2474) + +Fixes +----- +* Use isinstance in builtin crc32 (#2329) +* Use six.viewitems instead of six.iteritems to avoid encoding problems in StickyPartitionAssignor (#2154) +* Fix array encoding TypeError: object of type 'dict_itemiterator' has no len() (#2167) +* Only try to update sensors fetch lag if the unpacked list contains elements (#2158) +* Avoid logging errors during test fixture cleanup (#2458) +* Release coordinator lock before calling maybe_leave_group (#2460) +* Dont raise RuntimeError for dead process in SpawnedService.wait_for() (#2461) +* Cast the size of a MemoryRecordsBuilder object (#2438) +* Fix DescribeConfigsResponse_v1 config_source (#2464) +* Fix base class of DescribeClientQuotasResponse_v0 (#2465) +* Update socketpair w/ CVE-2024-3219 fix (#2468) + +Testing +------- +* Transition CI/CD to GitHub Workflows (#2378, #2392, #2381, #2406, #2419, #2418, #2417, #2456) +* Refactor Makefile (#2457) +* Use assert_called_with in client_async tests (#2375) +* Cover sticky assignor's metadata method with tests (#2161) +* Update fixtures.py to check "127.0.0.1" for auto port assignment (#2384) +* Use -Djava.security.manager=allow for Java 23 sasl tests (#2469) +* Test with Java 23 (#2470) +* Update kafka properties template; disable group rebalance delay (#2471) + 2.0.2 (Sep 29, 2020) #################### @@ -1243,7 +1694,7 @@ Consumers * Improve FailedPayloadsError handling in KafkaConsumer (dpkp PR 398) * 
KafkaConsumer: avoid raising KeyError in task_done (dpkp PR 389) * MultiProcessConsumer -- support configured partitions list (dpkp PR 380) -* Fix SimpleConsumer leadership change handling (dpkp PR 393) +* Fix SimpleConsumer leadership change handling (dpkp PR 393) * Fix SimpleConsumer connection error handling (reAsOn2010 PR 392) * Improve Consumer handling of 'falsy' partition values (wting PR 342) * Fix _offsets call error in KafkaConsumer (hellais PR 376) @@ -1348,7 +1799,7 @@ Internals * Add test timers via nose-timer plugin; list 10 slowest timings by default (dpkp) * Move fetching last known offset logic to a stand alone function (zever - PR 177) * Improve KafkaConnection and add more tests (dpkp - PR 196) -* Raise TypeError if necessary when encoding strings (mdaniel - PR 204) +* Raise TypeError if necessary when encoding strings (mdaniel - PR 204) * Use Travis-CI to publish tagged releases to pypi (tkuhlman / mumrah) * Use official binary tarballs for integration tests and parallelize travis tests (dpkp - PR 193) * Improve new-topic creation handling (wizzat - PR 174) @@ -1362,7 +1813,7 @@ Internals * Fix connection error timeout and improve tests (wizzat - PR 158) * SimpleProducer randomization of initial round robin ordering (alexcb - PR 139) * Fix connection timeout in KafkaClient and KafkaConnection (maciejkula - PR 161) -* Fix seek + commit behavior (wizzat - PR 148) +* Fix seek + commit behavior (wizzat - PR 148) 0.9.0 (Mar 21, 2014) diff --git a/docs/compatibility.rst b/docs/compatibility.rst index b3ad00634..353273114 100644 --- a/docs/compatibility.rst +++ b/docs/compatibility.rst @@ -1,21 +1,21 @@ Compatibility ------------- -.. image:: https://img.shields.io/badge/kafka-2.6%2C%202.5%2C%202.4%2C%202.3%2C%202.2%2C%202.1%2C%202.0%2C%201.1%2C%201.0%2C%200.11%2C%200.10%2C%200.9%2C%200.8-brightgreen.svg +.. image:: https://img.shields.io/badge/kafka-4.0--0.8-brightgreen.svg :target: https://kafka-python.readthedocs.io/compatibility.html .. image:: https://img.shields.io/pypi/pyversions/kafka-python.svg :target: https://pypi.python.org/pypi/kafka-python -kafka-python is compatible with (and tested against) broker versions 2.6 +kafka-python is compatible with (and tested against) broker versions 4.0 through 0.8.0 . kafka-python is not compatible with the 0.8.2-beta release. Because the kafka server protocol is backwards compatible, kafka-python is expected to work with newer broker releases as well. Although kafka-python is tested and expected to work on recent broker versions, -not all features are supported. Specifically, authentication codecs, and -transactional producer/consumer support are not fully implemented. PRs welcome! +not all features are supported. Specifically, transactional producer/consumer +support is not fully implemented. PRs welcome! -kafka-python is tested on python 2.7, 3.4, 3.7, 3.8 and pypy2.7. +kafka-python is tested on python 2.7, and 3.8-3.13. -Builds and tests via Travis-CI. See https://travis-ci.org/dpkp/kafka-python +Builds and tests via Github Actions Workflows. See https://github.com/dpkp/kafka-python/actions diff --git a/docs/conf.py b/docs/conf.py index efa8d0807..6273af0ce 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,11 +13,12 @@ # serve to show the default. import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-#sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('../')) # -- General configuration ------------------------------------------------ @@ -48,7 +49,7 @@ # General information about the project. project = u'kafka-python' -copyright = u'2016 -- Dana Powers, David Arthur, and Contributors' +copyright = u'2025 -- Dana Powers, David Arthur, and Contributors' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -103,7 +104,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the diff --git a/docs/index.rst b/docs/index.rst index 1f2a4ce98..823780929 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,8 +1,8 @@ kafka-python ############ -.. image:: https://img.shields.io/badge/kafka-2.6%2C%202.5%2C%202.4%2C%202.3%2C%202.2%2C%202.1%2C%202.0%2C%201.1%2C%201.0%2C%200.11%2C%200.10%2C%200.9%2C%200.8-brightgreen.svg - :target: https://kafka-python.readthedocs.io/compatibility.html +.. image:: https://img.shields.io/badge/kafka-4.0--0.8-brightgreen.svg + :target: https://kafka-python.readthedocs.io/en/master/compatibility.html .. image:: https://img.shields.io/pypi/pyversions/kafka-python.svg :target: https://pypi.python.org/pypi/kafka-python .. image:: https://coveralls.io/repos/dpkp/kafka-python/badge.svg?branch=master&service=github @@ -31,7 +31,9 @@ failures. See `Compatibility `_ for more details. Please note that the master branch may contain unreleased features. For release documentation, please see readthedocs and/or python's inline help. ->>> pip install kafka-python +.. code:: bash + + pip install kafka-python KafkaConsumer @@ -47,28 +49,56 @@ See `KafkaConsumer `_ for API and configuration detai The consumer iterator returns ConsumerRecords, which are simple namedtuples that expose basic message attributes: topic, partition, offset, key, and value: ->>> from kafka import KafkaConsumer ->>> consumer = KafkaConsumer('my_favorite_topic') ->>> for msg in consumer: -... print (msg) +.. code:: python + + from kafka import KafkaConsumer + consumer = KafkaConsumer('my_favorite_topic') + for msg in consumer: + print (msg) + +.. code:: python + + # join a consumer group for dynamic partition assignment and offset commits + from kafka import KafkaConsumer + consumer = KafkaConsumer('my_favorite_topic', group_id='my_favorite_group') + for msg in consumer: + print (msg) + +.. code:: python + + # manually assign the partition list for the consumer + from kafka import TopicPartition + consumer = KafkaConsumer(bootstrap_servers='localhost:1234') + consumer.assign([TopicPartition('foobar', 2)]) + msg = next(consumer) + +.. code:: python ->>> # join a consumer group for dynamic partition assignment and offset commits ->>> from kafka import KafkaConsumer ->>> consumer = KafkaConsumer('my_favorite_topic', group_id='my_favorite_group') ->>> for msg in consumer: -... 
print (msg) + # Deserialize msgpack-encoded values + consumer = KafkaConsumer(value_deserializer=msgpack.loads) + consumer.subscribe(['msgpackfoo']) + for msg in consumer: + assert isinstance(msg.value, dict) ->>> # manually assign the partition list for the consumer ->>> from kafka import TopicPartition ->>> consumer = KafkaConsumer(bootstrap_servers='localhost:1234') ->>> consumer.assign([TopicPartition('foobar', 2)]) ->>> msg = next(consumer) +.. code-block:: python ->>> # Deserialize msgpack-encoded values ->>> consumer = KafkaConsumer(value_deserializer=msgpack.loads) ->>> consumer.subscribe(['msgpackfoo']) ->>> for msg in consumer: -... assert isinstance(msg.value, dict) + # Access record headers. The returned value is a list of tuples + # with str, bytes for key and value + for msg in consumer: + print (msg.headers) + +.. code-block:: python + + # Read only committed messages from transactional topic + consumer = KafkaConsumer(isolation_level='read_committed') + consumer.subscribe(['txn_topic']) + for msg in consumer: + print(msg) + +.. code-block:: python + + # Get consumer metrics + metrics = consumer.metrics() KafkaProducer @@ -78,36 +108,76 @@ KafkaProducer The class is intended to operate as similarly as possible to the official java client. See `KafkaProducer `_ for more details. ->>> from kafka import KafkaProducer ->>> producer = KafkaProducer(bootstrap_servers='localhost:1234') ->>> for _ in range(100): -... producer.send('foobar', b'some_message_bytes') +.. code:: python + + from kafka import KafkaProducer + producer = KafkaProducer(bootstrap_servers='localhost:1234') + for _ in range(100): + producer.send('foobar', b'some_message_bytes') + +.. code:: python + + # Block until a single message is sent (or timeout) + future = producer.send('foobar', b'another_message') + result = future.get(timeout=60) + +.. code:: python + + # Block until all pending messages are at least put on the network + # NOTE: This does not guarantee delivery or success! It is really + # only useful if you configure internal batching using linger_ms + producer.flush() + +.. code:: python + + # Use a key for hashed-partitioning + producer.send('foobar', key=b'foo', value=b'bar') + +.. code:: python + + # Serialize json messages + import json + producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8')) + producer.send('fizzbuzz', {'foo': 'bar'}) + +.. code:: python + + # Serialize string keys + producer = KafkaProducer(key_serializer=str.encode) + producer.send('flipflap', key='ping', value=b'1234') + +.. code:: python + + # Compress messages + producer = KafkaProducer(compression_type='gzip') + for i in range(1000): + producer.send('foobar', b'msg %d' % i) + +.. code-block:: python ->>> # Block until a single message is sent (or timeout) ->>> future = producer.send('foobar', b'another_message') ->>> result = future.get(timeout=60) + # Use transactions + producer = KafkaProducer(transactional_id='fizzbuzz') + producer.init_transactions() + producer.begin_transaction() + future = producer.send('txn_topic', value=b'yes') + future.get() # wait for successful produce + producer.commit_transaction() # commit the transaction ->>> # Block until all pending messages are at least put on the network ->>> # NOTE: This does not guarantee delivery or success! 
It is really ->>> # only useful if you configure internal batching using linger_ms ->>> producer.flush() + producer.begin_transaction() + future = producer.send('txn_topic', value=b'no') + future.get() # wait for successful produce + producer.abort_transaction() # abort the transaction ->>> # Use a key for hashed-partitioning ->>> producer.send('foobar', key=b'foo', value=b'bar') +.. code-block:: python ->>> # Serialize json messages ->>> import json ->>> producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8')) ->>> producer.send('fizzbuzz', {'foo': 'bar'}) + # Include record headers. The format is list of tuples with string key + # and bytes value. + producer.send('foobar', value=b'c29tZSB2YWx1ZQ==', headers=[('content-encoding', b'base64')]) ->>> # Serialize string keys ->>> producer = KafkaProducer(key_serializer=str.encode) ->>> producer.send('flipflap', key='ping', value=b'1234') +.. code-block:: python ->>> # Compress messages ->>> producer = KafkaProducer(compression_type='gzip') ->>> for i in range(1000): -... producer.send('foobar', b'msg %d' % i) + # Get producer performance metrics + metrics = producer.metrics() Thread safety diff --git a/docs/license.rst b/docs/license.rst index e9d5c9adb..f419915bd 100644 --- a/docs/license.rst +++ b/docs/license.rst @@ -6,5 +6,5 @@ License Apache License, v2.0. See `LICENSE `_. -Copyright 2016, Dana Powers, David Arthur, and Contributors +Copyright 2025, Dana Powers, David Arthur, and Contributors (See `AUTHORS `_). diff --git a/docs/requirements.txt b/docs/requirements.txt index 0f095e074..61a675cab 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ -sphinx -sphinx_rtd_theme +sphinx==8.1.3 +sphinx_rtd_theme==3.0.2 # Install kafka-python in editable mode # This allows the sphinx autodoc module diff --git a/docs/tests.rst b/docs/tests.rst index 561179ca5..c8adb2d76 100644 --- a/docs/tests.rst +++ b/docs/tests.rst @@ -6,12 +6,14 @@ Tests .. image:: https://travis-ci.org/dpkp/kafka-python.svg?branch=master :target: https://travis-ci.org/dpkp/kafka-python -Test environments are managed via tox. The test suite is run via pytest. +The test suite is run via pytest. -Linting is run via pylint, but is generally skipped on pypy due to pylint -compatibility / performance issues. +Linting is run via pylint, but is currently skipped during CI/CD due to +accumulated debt. We'd like to transition to ruff! For test coverage details, see https://coveralls.io/github/dpkp/kafka-python +Coverage reporting is currently disabled as we have transitioned from travis +to GH Actions and have not yet re-enabled coveralls integration. The test suite includes unit tests that mock network interfaces, as well as integration tests that setup and teardown kafka broker (and zookeeper) @@ -21,30 +23,21 @@ fixtures for client / consumer / producer testing. Unit tests ------------------ -To run the tests locally, install tox: +To run the tests locally, install test dependencies: .. code:: bash - pip install tox + pip install -r requirements-dev.txt -For more details, see https://tox.readthedocs.io/en/latest/install.html - -Then simply run tox, optionally setting the python environment. -If unset, tox will loop through all environments. +Then simply run pytest (or make test) from your preferred python + virtualenv. .. 
code:: bash - tox -e py27 - tox -e py35 - - # run protocol tests only - tox -- -v test.test_protocol - - # re-run the last failing test, dropping into pdb - tox -e py27 -- --lf --pdb + # run protocol tests only (via pytest) + pytest test/test_protocol.py - # see available (pytest) options - tox -e py27 -- --help + # Run conn tests only (via make) + PYTESTS=test/test_conn.py make test Integration tests @@ -52,35 +45,8 @@ Integration tests .. code:: bash - KAFKA_VERSION=0.8.2.2 tox -e py27 - KAFKA_VERSION=1.0.1 tox -e py36 - - -Integration tests start Kafka and Zookeeper fixtures. This requires downloading -kafka server binaries: - -.. code:: bash - - ./build_integration.sh - -By default, this will install the broker versions listed in build_integration.sh's `ALL_RELEASES` -into the servers/ directory. To install a specific version, set the `KAFKA_VERSION` variable: - -.. code:: bash - - KAFKA_VERSION=1.0.1 ./build_integration.sh + KAFKA_VERSION=4.0.0 make test -Then to run the tests against a specific Kafka version, simply set the `KAFKA_VERSION` -env variable to the server build you want to use for testing: - -.. code:: bash - - KAFKA_VERSION=1.0.1 tox -e py36 - -To test against the kafka source tree, set KAFKA_VERSION=trunk -[optionally set SCALA_VERSION (defaults to the value set in `build_integration.sh`)] - -.. code:: bash - SCALA_VERSION=2.12 KAFKA_VERSION=trunk ./build_integration.sh - KAFKA_VERSION=trunk tox -e py36 +Integration tests start Kafka and Zookeeper fixtures. Make will download +kafka server binaries automatically if needed. diff --git a/docs/usage.rst b/docs/usage.rst index 1cf1aa414..c001ec049 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -8,6 +8,8 @@ KafkaConsumer .. code:: python from kafka import KafkaConsumer + import json + import msgpack # To consume latest messages and auto-commit offsets consumer = KafkaConsumer('my-topic', @@ -26,7 +28,7 @@ KafkaConsumer # consume json messages KafkaConsumer(value_deserializer=lambda m: json.loads(m.decode('ascii'))) - # consume msgpack + # consume msgpack KafkaConsumer(value_deserializer=msgpack.unpackb) # StopIteration if no message after 1sec @@ -57,6 +59,8 @@ KafkaProducer from kafka import KafkaProducer from kafka.errors import KafkaError + import msgpack + import json producer = KafkaProducer(bootstrap_servers=['broker1:1234']) @@ -100,7 +104,7 @@ KafkaProducer log.error('I am an errback', exc_info=excp) # handle exception - # produce asynchronously with callbacks + # produce asynchronously with callbacks producer.send('my-topic', b'raw_bytes').add_callback(on_send_success).add_errback(on_send_error) # block until all async messages are sent @@ -108,3 +112,52 @@ KafkaProducer # configure multiple retries producer = KafkaProducer(retries=5) + + +ClusterMetadata +============= +.. code:: python + + from kafka.cluster import ClusterMetadata + + clusterMetadata = ClusterMetadata(bootstrap_servers=['broker1:1234']) + + # get all brokers metadata + print(clusterMetadata.brokers()) + + # get specific broker metadata + print(clusterMetadata.broker_metadata('bootstrap-0')) + + # get all partitions of a topic + print(clusterMetadata.partitions_for_topic("topic")) + + # list topics + print(clusterMetadata.topics()) + + +KafkaAdminClient +============= +.. 
code:: python + from kafka import KafkaAdminClient + from kafka.admin import NewTopic + + admin = KafkaAdminClient(bootstrap_servers=['broker1:1234']) + + # create a new topic + topics_list = [] + topics_list.append(NewTopic(name="testtopic", num_partitions=1, replication_factor=1)) + admin.create_topics(topics_list,timeout_ms=None, validate_only=False) + + # delete a topic + admin.delete_topics(['testtopic']) + + # list consumer groups + print(admin.list_consumer_groups()) + + # get consumer group details + print(admin.describe_consumer_groups('cft-plt-qa.connect')) + + # get consumer group offset + print(admin.list_consumer_group_offsets('cft-plt-qa.connect')) + + diff --git a/kafka/__init__.py b/kafka/__init__.py index d5e30affa..41a014072 100644 --- a/kafka/__init__.py +++ b/kafka/__init__.py @@ -4,7 +4,7 @@ from kafka.version import __version__ __author__ = 'Dana Powers' __license__ = 'Apache License 2.0' -__copyright__ = 'Copyright 2016 Dana Powers, David Arthur, and Contributors' +__copyright__ = 'Copyright 2025 Dana Powers, David Arthur, and Contributors' # Set default logging handler to avoid "No handler found" warnings. import logging diff --git a/kafka/admin/client.py b/kafka/admin/client.py index c58da0c52..82aaa68e9 100644 --- a/kafka/admin/client.py +++ b/kafka/admin/client.py @@ -1,9 +1,10 @@ -from __future__ import absolute_import +from __future__ import absolute_import, division from collections import defaultdict import copy import logging import socket +import time from . import ConfigResourceType from kafka.vendor import six @@ -14,15 +15,15 @@ from kafka.coordinator.protocol import ConsumerProtocolMemberMetadata, ConsumerProtocolMemberAssignment, ConsumerProtocol import kafka.errors as Errors from kafka.errors import ( - IncompatibleBrokerVersion, KafkaConfigurationError, NotControllerError, + IncompatibleBrokerVersion, KafkaConfigurationError, UnknownTopicOrPartitionError, UnrecognizedBrokerVersion, IllegalArgumentError) from kafka.metrics import MetricConfig, Metrics from kafka.protocol.admin import ( CreateTopicsRequest, DeleteTopicsRequest, DescribeConfigsRequest, AlterConfigsRequest, CreatePartitionsRequest, ListGroupsRequest, DescribeGroupsRequest, DescribeAclsRequest, CreateAclsRequest, DeleteAclsRequest, - DeleteGroupsRequest -) -from kafka.protocol.commit import GroupCoordinatorRequest, OffsetFetchRequest + DeleteGroupsRequest, DeleteRecordsRequest, DescribeLogDirsRequest, ElectLeadersRequest, ElectionType) +from kafka.protocol.commit import OffsetFetchRequest +from kafka.protocol.find_coordinator import FindCoordinatorRequest from kafka.protocol.metadata import MetadataRequest from kafka.protocol.types import Array from kafka.structs import TopicPartition, OffsetAndMetadata, MemberInformation, GroupInformation @@ -72,7 +73,7 @@ class KafkaAdminClient(object): reconnection attempts will continue periodically with this fixed rate. To avoid connection storms, a randomization factor of 0.2 will be applied to the backoff resulting in a random range between - 20% below and 20% above the computed value. Default: 1000. + 20% below and 20% above the computed value. Default: 30000. request_timeout_ms (int): Client request timeout in milliseconds. Default: 30000. connections_max_idle_ms: Close idle connections after the number of @@ -140,13 +141,17 @@ class KafkaAdminClient(object): Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication. 
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. + sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with + sasl mechanism handshake. If provided, sasl_kerberos_service_name and + sasl_kerberos_domain name are ignored. Default: None. sasl_kerberos_service_name (str): Service name to include in GSSAPI sasl mechanism handshake. Default: 'kafka' sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI sasl mechanism handshake. Default: one of bootstrap servers - sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider - instance. (See kafka.oauth.abstract). Default: None - + sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer + token provider instance. Default: None + socks5_proxy (str): Socks5 proxy url. Default: None + kafka_client (callable): Custom class / callable for creating KafkaClient instances """ DEFAULT_CONFIG = { # client configs @@ -155,7 +160,7 @@ class KafkaAdminClient(object): 'request_timeout_ms': 30000, 'connections_max_idle_ms': 9 * 60 * 1000, 'reconnect_backoff_ms': 50, - 'reconnect_backoff_max_ms': 1000, + 'reconnect_backoff_max_ms': 30000, 'max_in_flight_requests_per_connection': 5, 'receive_buffer_bytes': None, 'send_buffer_bytes': None, @@ -178,14 +183,17 @@ class KafkaAdminClient(object): 'sasl_mechanism': None, 'sasl_plain_username': None, 'sasl_plain_password': None, + 'sasl_kerberos_name': None, 'sasl_kerberos_service_name': 'kafka', 'sasl_kerberos_domain_name': None, 'sasl_oauth_token_provider': None, + 'socks5_proxy': None, # metrics configs 'metric_reporters': [], 'metrics_num_samples': 2, 'metrics_sample_window_ms': 30000, + 'kafka_client': KafkaClient, } def __init__(self, **configs): @@ -205,14 +213,14 @@ def __init__(self, **configs): reporters = [reporter() for reporter in self.config['metric_reporters']] self._metrics = Metrics(metric_config, reporters) - self._client = KafkaClient(metrics=self._metrics, - metric_group_prefix='admin', - **self.config) - self._client.check_version(timeout=(self.config['api_version_auto_timeout_ms'] / 1000)) + self._client = self.config['kafka_client']( + metrics=self._metrics, + metric_group_prefix='admin', + **self.config + ) # Get auto-discovered version from client if necessary - if self.config['api_version'] is None: - self.config['api_version'] = self._client.config['api_version'] + self.config['api_version'] = self._client.config['api_version'] self._closed = False self._refresh_controller_id() @@ -229,58 +237,44 @@ def close(self): self._closed = True log.debug("KafkaAdminClient is now closed.") - def _matching_api_version(self, operation): - """Find the latest version of the protocol operation supported by both - this library and the broker. - - This resolves to the lesser of either the latest api version this - library supports, or the max version supported by the broker. - - :param operation: A list of protocol operation versions from kafka.protocol. - :return: The max matching version number between client and broker. - """ - broker_api_versions = self._client.get_api_versions() - api_key = operation[0].API_KEY - if broker_api_versions is None or api_key not in broker_api_versions: - raise IncompatibleBrokerVersion( - "Kafka broker does not support the '{}' Kafka protocol." - .format(operation[0].__name__)) - min_version, max_version = broker_api_versions[api_key] - version = min(len(operation) - 1, max_version) - if version < min_version: - # max library version is less than min broker version. 
Currently, - # no Kafka versions specify a min msg version. Maybe in the future? - raise IncompatibleBrokerVersion( - "No version of the '{}' Kafka protocol is supported by both the client and broker." - .format(operation[0].__name__)) - return version - def _validate_timeout(self, timeout_ms): """Validate the timeout is set or use the configuration default. - :param timeout_ms: The timeout provided by api call, in milliseconds. - :return: The timeout to use for the operation. + Arguments: + timeout_ms: The timeout provided by api call, in milliseconds. + + Returns: + The timeout to use for the operation. """ return timeout_ms or self.config['request_timeout_ms'] - def _refresh_controller_id(self): + def _refresh_controller_id(self, timeout_ms=30000): """Determine the Kafka cluster controller.""" - version = self._matching_api_version(MetadataRequest) + version = self._client.api_version(MetadataRequest, max_version=6) if 1 <= version <= 6: - request = MetadataRequest[version]() - future = self._send_request_to_node(self._client.least_loaded_node(), request) - - self._wait_for_futures([future]) - - response = future.value - controller_id = response.controller_id - # verify the controller is new enough to support our requests - controller_version = self._client.check_version(controller_id, timeout=(self.config['api_version_auto_timeout_ms'] / 1000)) - if controller_version < (0, 10, 0): - raise IncompatibleBrokerVersion( - "The controller appears to be running Kafka {}. KafkaAdminClient requires brokers >= 0.10.0.0." - .format(controller_version)) - self._controller_id = controller_id + timeout_at = time.time() + timeout_ms / 1000 + while time.time() < timeout_at: + request = MetadataRequest[version]() + future = self._send_request_to_node(self._client.least_loaded_node(), request) + + self._wait_for_futures([future]) + + response = future.value + controller_id = response.controller_id + if controller_id == -1: + log.warning("Controller ID not available, got -1") + time.sleep(1) + continue + # verify the controller is new enough to support our requests + controller_version = self._client.check_version(node_id=controller_id) + if controller_version < (0, 10, 0): + raise IncompatibleBrokerVersion( + "The controller appears to be running Kafka {}. KafkaAdminClient requires brokers >= 0.10.0.0." + .format(controller_version)) + self._controller_id = controller_id + return + else: + raise Errors.NodeNotReadyError('controller') else: raise UnrecognizedBrokerVersion( "Kafka Admin interface cannot determine the controller using MetadataRequest_v{}." @@ -289,43 +283,40 @@ def _refresh_controller_id(self): def _find_coordinator_id_send_request(self, group_id): """Send a FindCoordinatorRequest to a broker. - :param group_id: The consumer group ID. This is typically the group + Arguments: + group_id: The consumer group ID. This is typically the group name as a string. - :return: A message future + + Returns: + A message future """ - # TODO add support for dynamically picking version of - # GroupCoordinatorRequest which was renamed to FindCoordinatorRequest. - # When I experimented with this, the coordinator value returned in - # GroupCoordinatorResponse_v1 didn't match the value returned by - # GroupCoordinatorResponse_v0 and I couldn't figure out why. 
- version = 0 - # version = self._matching_api_version(GroupCoordinatorRequest) + version = self._client.api_version(FindCoordinatorRequest, max_version=2) if version <= 0: - request = GroupCoordinatorRequest[version](group_id) + request = FindCoordinatorRequest[version](group_id) + elif version <= 2: + request = FindCoordinatorRequest[version](group_id, 0) else: raise NotImplementedError( - "Support for GroupCoordinatorRequest_v{} has not yet been added to KafkaAdminClient." + "Support for FindCoordinatorRequest_v{} has not yet been added to KafkaAdminClient." .format(version)) return self._send_request_to_node(self._client.least_loaded_node(), request) def _find_coordinator_id_process_response(self, response): """Process a FindCoordinatorResponse. - :param response: a FindCoordinatorResponse. - :return: The node_id of the broker that is the coordinator. + Arguments: + response: a FindCoordinatorResponse. + + Returns: + The node_id of the broker that is the coordinator. """ - if response.API_VERSION <= 0: - error_type = Errors.for_code(response.error_code) - if error_type is not Errors.NoError: - # Note: When error_type.retriable, Java will retry... see - # KafkaAdminClient's handleFindCoordinatorError method - raise error_type( - "FindCoordinatorRequest failed with response '{}'." - .format(response)) - else: - raise NotImplementedError( - "Support for FindCoordinatorRequest_v{} has not yet been added to KafkaAdminClient." - .format(response.API_VERSION)) + error_type = Errors.for_code(response.error_code) + if error_type is not Errors.NoError: + # Note: When error_type.retriable, Java will retry... see + # KafkaAdminClient's handleFindCoordinatorError method + raise error_type( + "FindCoordinatorRequest failed with response '{}'." + .format(response)) return response.coordinator_id def _find_coordinator_ids(self, group_ids): @@ -335,9 +326,12 @@ def _find_coordinator_ids(self, group_ids): Will block until the FindCoordinatorResponse is received for all groups. Any errors are immediately raised. - :param group_ids: A list of consumer group IDs. This is typically the group + Arguments: + group_ids: A list of consumer group IDs. This is typically the group name as a string. - :return: A dict of {group_id: node_id} where node_id is the id of the + + Returns: + A dict of {group_id: node_id} where node_id is the id of the broker that is the coordinator for the corresponding group. """ groups_futures = { @@ -351,29 +345,39 @@ def _find_coordinator_ids(self, group_ids): } return groups_coordinators - def _send_request_to_node(self, node_id, request): + def _send_request_to_node(self, node_id, request, wakeup=True): """Send a Kafka protocol message to a specific broker. - Returns a future that may be polled for status and results. + Arguments: + node_id: The broker id to which to send the message. + request: The message to send. + + + Keyword Arguments: + wakeup (bool, optional): Optional flag to disable thread-wakeup. + + Returns: + A future object that may be polled for status and results. - :param node_id: The broker id to which to send the message. - :param request: The message to send. - :return: A future object that may be polled for status and results. - :exception: The exception if the message could not be sent. + Raises: + The exception if the message could not be sent. 
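
As an aside, the send/wait/value pattern this helper supports is the same one used by callers such as ``_refresh_controller_id`` earlier in this file. A minimal sketch, assuming an already-constructed ``KafkaAdminClient`` named ``admin``; private attributes and helpers are shown purely to clarify the internal flow, not as public API:

.. code:: python

    from kafka.protocol.metadata import MetadataRequest

    # illustrative only: build a versioned request, send it to a broker,
    # block on the returned future, then read the decoded response
    request = MetadataRequest[1]()                       # topics=None -> all topics
    node_id = admin._client.least_loaded_node()          # any ready broker will do
    future = admin._send_request_to_node(node_id, request)
    admin._wait_for_futures([future])                    # block until the response arrives
    response = future.value                              # decoded MetadataResponse
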
""" while not self._client.ready(node_id): # poll until the connection to broker is ready, otherwise send() # will fail with NodeNotReadyError - self._client.poll() - return self._client.send(node_id, request) + self._client.poll(timeout_ms=200) + return self._client.send(node_id, request, wakeup) def _send_request_to_controller(self, request): """Send a Kafka protocol message to the cluster controller. Will block until the message result is received. - :param request: The message to send. - :return: The Kafka protocol response for the message. + Arguments: + request: The message to send. + + Returns: + The Kafka protocol response for the message. """ tries = 2 # in case our cached self._controller_id is outdated while tries: @@ -389,30 +393,70 @@ def _send_request_to_controller(self, request): # So this is a little brittle in that it assumes all responses have # one of these attributes and that they always unpack into # (topic, error_code) tuples. - topic_error_tuples = (response.topic_errors if hasattr(response, 'topic_errors') - else response.topic_error_codes) - # Also small py2/py3 compatibility -- py3 can ignore extra values - # during unpack via: for x, y, *rest in list_of_values. py2 cannot. - # So for now we have to map across the list and explicitly drop any - # extra values (usually the error_message) - for topic, error_code in map(lambda e: e[:2], topic_error_tuples): + topic_error_tuples = getattr(response, 'topic_errors', getattr(response, 'topic_error_codes', None)) + if topic_error_tuples is not None: + success = self._parse_topic_request_response(topic_error_tuples, request, response, tries) + else: + # Leader Election request has a two layer error response (topic and partition) + success = self._parse_topic_partition_request_response(request, response, tries) + + if success: + return response + raise RuntimeError("This should never happen, please file a bug with full stacktrace if encountered") + + def _parse_topic_request_response(self, topic_error_tuples, request, response, tries): + # Also small py2/py3 compatibility -- py3 can ignore extra values + # during unpack via: for x, y, *rest in list_of_values. py2 cannot. + # So for now we have to map across the list and explicitly drop any + # extra values (usually the error_message) + for topic, error_code in map(lambda e: e[:2], topic_error_tuples): + error_type = Errors.for_code(error_code) + if tries and error_type is Errors.NotControllerError: + # No need to inspect the rest of the errors for + # non-retriable errors because NotControllerError should + # either be thrown for all errors or no errors. + self._refresh_controller_id() + return False + elif error_type is not Errors.NoError: + raise error_type( + "Request '{}' failed with response '{}'." + .format(request, response)) + return True + + def _parse_topic_partition_request_response(self, request, response, tries): + # Also small py2/py3 compatibility -- py3 can ignore extra values + # during unpack via: for x, y, *rest in list_of_values. py2 cannot. 
+ # So for now we have to map across the list and explicitly drop any + # extra values (usually the error_message) + for topic, partition_results in response.replication_election_results: + for partition_id, error_code in map(lambda e: e[:2], partition_results): error_type = Errors.for_code(error_code) - if tries and error_type is NotControllerError: + if tries and error_type is Errors.NotControllerError: # No need to inspect the rest of the errors for # non-retriable errors because NotControllerError should # either be thrown for all errors or no errors. self._refresh_controller_id() - break - elif error_type is not Errors.NoError: + return False + elif error_type not in (Errors.NoError, Errors.ElectionNotNeededError): raise error_type( "Request '{}' failed with response '{}'." .format(request, response)) - else: - return response - raise RuntimeError("This should never happen, please file a bug with full stacktrace if encountered") + return True @staticmethod def _convert_new_topic_request(new_topic): + """ + Build the tuple required by CreateTopicsRequest from a NewTopic object. + + Arguments: + new_topic: A NewTopic instance containing name, partition count, replication factor, + replica assignments, and config entries. + + Returns: + A tuple in the form: + (topic_name, num_partitions, replication_factor, [(partition_id, [replicas])...], + [(config_key, config_value)...]) + """ return ( new_topic.name, new_topic.num_partitions, @@ -428,14 +472,19 @@ def _convert_new_topic_request(new_topic): def create_topics(self, new_topics, timeout_ms=None, validate_only=False): """Create new topics in the cluster. - :param new_topics: A list of NewTopic objects. - :param timeout_ms: Milliseconds to wait for new topics to be created - before the broker returns. - :param validate_only: If True, don't actually create new topics. - Not supported by all versions. Default: False - :return: Appropriate version of CreateTopicResponse class. + Arguments: + new_topics: A list of NewTopic objects. + + Keyword Arguments: + timeout_ms (numeric, optional): Milliseconds to wait for new topics to be created + before the broker returns. + validate_only (bool, optional): If True, don't actually create new topics. + Not supported by all versions. Default: False + + Returns: + Appropriate version of CreateTopicResponse class. """ - version = self._matching_api_version(CreateTopicsRequest) + version = self._client.api_version(CreateTopicsRequest, max_version=3) timeout_ms = self._validate_timeout(timeout_ms) if version == 0: if validate_only: @@ -463,12 +512,17 @@ def create_topics(self, new_topics, timeout_ms=None, validate_only=False): def delete_topics(self, topics, timeout_ms=None): """Delete topics from the cluster. - :param topics: A list of topic name strings. - :param timeout_ms: Milliseconds to wait for topics to be deleted - before the broker returns. - :return: Appropriate version of DeleteTopicsResponse class. + Arguments: + topics ([str]): A list of topic name strings. + + Keyword Arguments: + timeout_ms (numeric, optional): Milliseconds to wait for topics to be deleted + before the broker returns. + + Returns: + Appropriate version of DeleteTopicsResponse class. 
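
For orientation, the public create/delete round trip that this docstring (and ``create_topics`` above) describes is essentially the flow shown in the docs/usage.rst hunk earlier in this diff. A minimal sketch; the broker address and topic name are placeholders:

.. code:: python

    from kafka import KafkaAdminClient
    from kafka.admin import NewTopic

    admin = KafkaAdminClient(bootstrap_servers=['broker1:1234'])  # placeholder address

    # create, then delete, a throwaway topic
    admin.create_topics([NewTopic(name="testtopic", num_partitions=1, replication_factor=1)])
    admin.delete_topics(["testtopic"], timeout_ms=30000)
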
""" - version = self._matching_api_version(DeleteTopicsRequest) + version = self._client.api_version(DeleteTopicsRequest, max_version=3) timeout_ms = self._validate_timeout(timeout_ms) if version <= 3: request = DeleteTopicsRequest[version]( @@ -487,7 +541,7 @@ def _get_cluster_metadata(self, topics=None, auto_topic_creation=False): """ topics == None means "get all topics" """ - version = self._matching_api_version(MetadataRequest) + version = self._client.api_version(MetadataRequest, max_version=5) if version <= 3: if auto_topic_creation: raise IncompatibleBrokerVersion( @@ -510,16 +564,38 @@ def _get_cluster_metadata(self, topics=None, auto_topic_creation=False): return future.value def list_topics(self): + """Retrieve a list of all topic names in the cluster. + + Returns: + A list of topic name strings. + """ metadata = self._get_cluster_metadata(topics=None) obj = metadata.to_object() return [t['topic'] for t in obj['topics']] def describe_topics(self, topics=None): + """Fetch metadata for the specified topics or all topics if None. + + Keyword Arguments: + topics ([str], optional) A list of topic names. If None, metadata for all + topics is retrieved. + + Returns: + A list of dicts describing each topic (including partition info). + """ metadata = self._get_cluster_metadata(topics=topics) obj = metadata.to_object() return obj['topics'] def describe_cluster(self): + """ + Fetch cluster-wide metadata such as the list of brokers, the controller ID, + and the cluster ID. + + + Returns: + A dict with cluster-wide metadata, excluding topic details. + """ metadata = self._get_cluster_metadata() obj = metadata.to_object() obj.pop('topics') # We have 'describe_topics' for this @@ -527,6 +603,15 @@ def describe_cluster(self): @staticmethod def _convert_describe_acls_response_to_acls(describe_response): + """Convert a DescribeAclsResponse into a list of ACL objects and a KafkaError. + + Arguments: + describe_response: The response object from the DescribeAclsRequest. + + Returns: + A tuple of (list_of_acl_objects, error) where error is an instance + of KafkaError (NoError if successful). + """ version = describe_response.API_VERSION error = Errors.for_code(describe_response.error_code) @@ -566,11 +651,14 @@ def describe_acls(self, acl_filter): The cluster must be configured with an authorizer for this to work, or you will get a SecurityDisabledError - :param acl_filter: an ACLFilter object - :return: tuple of a list of matching ACL objects and a KafkaError (NoError if successful) + Arguments: + acl_filter: an ACLFilter object + + Returns: + tuple of a list of matching ACL objects and a KafkaError (NoError if successful) """ - version = self._matching_api_version(DescribeAclsRequest) + version = self._client.api_version(DescribeAclsRequest, max_version=1) if version == 0: request = DescribeAclsRequest[version]( resource_type=acl_filter.resource_pattern.resource_type, @@ -612,6 +700,14 @@ def describe_acls(self, acl_filter): @staticmethod def _convert_create_acls_resource_request_v0(acl): + """Convert an ACL object into the CreateAclsRequest v0 format. + + Arguments: + acl: An ACL object with resource pattern and permissions. + + Returns: + A tuple: (resource_type, resource_name, principal, host, operation, permission_type). + """ return ( acl.resource_pattern.resource_type, @@ -624,7 +720,14 @@ def _convert_create_acls_resource_request_v0(acl): @staticmethod def _convert_create_acls_resource_request_v1(acl): + """Convert an ACL object into the CreateAclsRequest v1 format. 
+ + Arguments: + acl: An ACL object with resource pattern and permissions. + Returns: + A tuple: (resource_type, resource_name, pattern_type, principal, host, operation, permission_type). + """ return ( acl.resource_pattern.resource_type, acl.resource_pattern.resource_name, @@ -637,6 +740,19 @@ def _convert_create_acls_resource_request_v1(acl): @staticmethod def _convert_create_acls_response_to_acls(acls, create_response): + """Parse CreateAclsResponse and correlate success/failure with original ACL objects. + + Arguments: + acls: A list of ACL objects that were requested for creation. + create_response: The broker's CreateAclsResponse object. + + Returns: + A dict with: + { + 'succeeded': [list of ACL objects successfully created], + 'failed': [(acl_object, KafkaError), ...] + } + """ version = create_response.API_VERSION creations_error = [] @@ -665,15 +781,18 @@ def create_acls(self, acls): This endpoint only accepts a list of concrete ACL objects, no ACLFilters. Throws TopicAlreadyExistsError if topic is already present. - :param acls: a list of ACL objects - :return: dict of successes and failures + Arguments: + acls: a list of ACL objects + + Returns: + dict of successes and failures """ for acl in acls: if not isinstance(acl, ACL): raise IllegalArgumentError("acls must contain ACL objects") - version = self._matching_api_version(CreateAclsRequest) + version = self._client.api_version(CreateAclsRequest, max_version=1) if version == 0: request = CreateAclsRequest[version]( creations=[self._convert_create_acls_resource_request_v0(acl) for acl in acls] @@ -696,6 +815,14 @@ def create_acls(self, acls): @staticmethod def _convert_delete_acls_resource_request_v0(acl): + """Convert an ACLFilter object into the DeleteAclsRequest v0 format. + + Arguments: + acl: An ACLFilter object identifying the ACLs to be deleted. + + Returns: + A tuple: (resource_type, resource_name, principal, host, operation, permission_type). + """ return ( acl.resource_pattern.resource_type, acl.resource_pattern.resource_name, @@ -707,6 +834,14 @@ def _convert_delete_acls_resource_request_v0(acl): @staticmethod def _convert_delete_acls_resource_request_v1(acl): + """Convert an ACLFilter object into the DeleteAclsRequest v1 format. + + Arguments: + acl: An ACLFilter object identifying the ACLs to be deleted. + + Returns: + A tuple: (resource_type, resource_name, pattern_type, principal, host, operation, permission_type). + """ return ( acl.resource_pattern.resource_type, acl.resource_pattern.resource_name, @@ -719,6 +854,16 @@ def _convert_delete_acls_resource_request_v1(acl): @staticmethod def _convert_delete_acls_response_to_matching_acls(acl_filters, delete_response): + """Parse the DeleteAclsResponse and map the results back to each input ACLFilter. + + Arguments: + acl_filters: A list of ACLFilter objects that were provided in the request. + delete_response: The response from the DeleteAclsRequest. + + Returns: + A list of tuples of the form: + (acl_filter, [(matching_acl, KafkaError), ...], filter_level_error). + """ version = delete_response.API_VERSION filter_result_list = [] for i, filter_responses in enumerate(delete_response.filter_responses): @@ -757,8 +902,11 @@ def delete_acls(self, acl_filters): Deletes all ACLs matching the list of input ACLFilter - :param acl_filters: a list of ACLFilter - :return: a list of 3-tuples corresponding to the list of input filters. + Arguments: + acl_filters: a list of ACLFilter + + Returns: + a list of 3-tuples corresponding to the list of input filters. 
The tuples hold (the input ACLFilter, list of affected ACLs, KafkaError instance) """ @@ -766,7 +914,7 @@ def delete_acls(self, acl_filters): if not isinstance(acl, ACLFilter): raise IllegalArgumentError("acl_filters must contain ACLFilter type objects") - version = self._matching_api_version(DeleteAclsRequest) + version = self._client.api_version(DeleteAclsRequest, max_version=1) if version == 0: request = DeleteAclsRequest[version]( @@ -790,6 +938,14 @@ def delete_acls(self, acl_filters): @staticmethod def _convert_describe_config_resource_request(config_resource): + """Convert a ConfigResource into the format required by DescribeConfigsRequest. + + Arguments: + config_resource: A ConfigResource with resource_type, name, and optional config keys. + + Returns: + A tuple: (resource_type, resource_name, [list_of_config_keys] or None). + """ return ( config_resource.resource_type, config_resource.name, @@ -801,13 +957,18 @@ def _convert_describe_config_resource_request(config_resource): def describe_configs(self, config_resources, include_synonyms=False): """Fetch configuration parameters for one or more Kafka resources. - :param config_resources: An list of ConfigResource objects. - Any keys in ConfigResource.configs dict will be used to filter the - result. Setting the configs dict to None will get all values. An - empty dict will get zero values (as per Kafka protocol). - :param include_synonyms: If True, return synonyms in response. Not - supported by all versions. Default: False. - :return: Appropriate version of DescribeConfigsResponse class. + Arguments: + config_resources: An list of ConfigResource objects. + Any keys in ConfigResource.configs dict will be used to filter the + result. Setting the configs dict to None will get all values. An + empty dict will get zero values (as per Kafka protocol). + + Keyword Arguments: + include_synonyms (bool, optional): If True, return synonyms in response. Not + supported by all versions. Default: False. + + Returns: + Appropriate version of DescribeConfigsResponse class. """ # Break up requests by type - a broker config request must be sent to the specific broker. @@ -822,7 +983,7 @@ def describe_configs(self, config_resources, include_synonyms=False): topic_resources.append(self._convert_describe_config_resource_request(config_resource)) futures = [] - version = self._matching_api_version(DescribeConfigsRequest) + version = self._client.api_version(DescribeConfigsRequest, max_version=2) if version == 0: if include_synonyms: raise IncompatibleBrokerVersion( @@ -876,6 +1037,14 @@ def describe_configs(self, config_resources, include_synonyms=False): @staticmethod def _convert_alter_config_resource_request(config_resource): + """Convert a ConfigResource into the format required by AlterConfigsRequest. + + Arguments: + config_resource: A ConfigResource with resource_type, name, and config (key, value) pairs. + + Returns: + A tuple: (resource_type, resource_name, [(config_key, config_value), ...]). + """ return ( config_resource.resource_type, config_resource.name, @@ -893,10 +1062,13 @@ def alter_configs(self, config_resources): least-loaded node. See the comment in the source code for details. We would happily accept a PR fixing this. - :param config_resources: A list of ConfigResource objects. - :return: Appropriate version of AlterConfigsResponse class. + Arguments: + config_resources: A list of ConfigResource objects. + + Returns: + Appropriate version of AlterConfigsResponse class. 
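
An illustrative sketch of the describe/alter pair documented above. The topic name and config key are placeholders, and the ``ConfigResource`` / ``ConfigResourceType`` imports assume the usual ``kafka.admin`` exports rather than anything introduced in this diff:

.. code:: python

    from kafka import KafkaAdminClient
    from kafka.admin import ConfigResource, ConfigResourceType

    admin = KafkaAdminClient(bootstrap_servers=['broker1:1234'])  # placeholder address

    # fetch current config values for one topic
    resource = ConfigResource(ConfigResourceType.TOPIC, "testtopic")
    print(admin.describe_configs([resource]))

    # push a single overridden value back (note the caveat above about
    # broker-level configs currently being routed to the least-loaded node)
    admin.alter_configs([ConfigResource(ConfigResourceType.TOPIC, "testtopic",
                                        configs={"retention.ms": "3600000"})])
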
""" - version = self._matching_api_version(AlterConfigsRequest) + version = self._client.api_version(AlterConfigsRequest, max_version=1) if version <= 1: request = AlterConfigsRequest[version]( resources=[self._convert_alter_config_resource_request(config_resource) for config_resource in config_resources] @@ -925,6 +1097,15 @@ def alter_configs(self, config_resources): @staticmethod def _convert_create_partitions_request(topic_name, new_partitions): + """Convert a NewPartitions object into the tuple format for CreatePartitionsRequest. + + Arguments: + topic_name: The name of the existing topic. + new_partitions: A NewPartitions instance with total_count and new_assignments. + + Returns: + A tuple: (topic_name, (total_count, [list_of_assignments])). + """ return ( topic_name, ( @@ -936,14 +1117,19 @@ def _convert_create_partitions_request(topic_name, new_partitions): def create_partitions(self, topic_partitions, timeout_ms=None, validate_only=False): """Create additional partitions for an existing topic. - :param topic_partitions: A map of topic name strings to NewPartition objects. - :param timeout_ms: Milliseconds to wait for new partitions to be - created before the broker returns. - :param validate_only: If True, don't actually create new partitions. - Default: False - :return: Appropriate version of CreatePartitionsResponse class. + Arguments: + topic_partitions: A map of topic name strings to NewPartition objects. + + Keyword Arguments: + timeout_ms (numeric, optional): Milliseconds to wait for new partitions to be + created before the broker returns. + validate_only (bool, optional): If True, don't actually create new partitions. + Default: False + + Returns: + Appropriate version of CreatePartitionsResponse class. """ - version = self._matching_api_version(CreatePartitionsRequest) + version = self._client.api_version(CreatePartitionsRequest, max_version=1) timeout_ms = self._validate_timeout(timeout_ms) if version <= 1: request = CreatePartitionsRequest[version]( @@ -957,8 +1143,118 @@ def create_partitions(self, topic_partitions, timeout_ms=None, validate_only=Fal .format(version)) return self._send_request_to_controller(request) - # delete records protocol not yet implemented - # Note: send the request to the partition leaders + def _get_leader_for_partitions(self, partitions, timeout_ms=None): + """Finds ID of the leader node for every given topic partition. + + Will raise UnknownTopicOrPartitionError if for some partition no leader can be found. + + :param partitions: ``[TopicPartition]``: partitions for which to find leaders. + :param timeout_ms: ``float``: Timeout in milliseconds, if None (default), will be read from + config. 
+ + :return: Dictionary with ``{leader_id -> {partitions}}`` + """ + timeout_ms = self._validate_timeout(timeout_ms) + + partitions = set(partitions) + topics = set(tp.topic for tp in partitions) + + response = self._get_cluster_metadata(topics=topics).to_object() + + leader2partitions = defaultdict(list) + valid_partitions = set() + for topic in response.get("topics", ()): + for partition in topic.get("partitions", ()): + t2p = TopicPartition(topic=topic["topic"], partition=partition["partition"]) + if t2p in partitions: + leader2partitions[partition["leader"]].append(t2p) + valid_partitions.add(t2p) + + if len(partitions) != len(valid_partitions): + unknown = set(partitions) - valid_partitions + raise UnknownTopicOrPartitionError( + "The following partitions are not known: %s" + % ", ".join(str(x) for x in unknown) + ) + + return leader2partitions + + def delete_records(self, records_to_delete, timeout_ms=None, partition_leader_id=None): + """Delete records whose offset is smaller than the given offset of the corresponding partition. + + :param records_to_delete: ``{TopicPartition: int}``: The earliest available offsets for the + given partitions. + :param timeout_ms: ``float``: Timeout in milliseconds, if None (default), will be read from + config. + :param partition_leader_id: ``str``: If specified, all deletion requests will be sent to + this node. No check is performed verifying that this is indeed the leader for all + listed partitions: use with caution. + + :return: Dictionary {topicPartition -> metadata}, where metadata is returned by the broker. + See DeleteRecordsResponse for possible fields. error_code for all partitions is + guaranteed to be zero, otherwise an exception is raised. + """ + timeout_ms = self._validate_timeout(timeout_ms) + responses = [] + version = self._client.api_version(DeleteRecordsRequest, max_version=0) + if version is None: + raise IncompatibleBrokerVersion("Broker does not support DeleteGroupsRequest") + + # We want to make as few requests as possible + # If a single node serves as a partition leader for multiple partitions (and/or + # topics), we can send all of those in a single request. 
+ # For that we store {leader -> {partitions for leader}}, and do 1 request per leader + if partition_leader_id is None: + leader2partitions = self._get_leader_for_partitions( + set(records_to_delete), timeout_ms + ) + else: + leader2partitions = {partition_leader_id: set(records_to_delete)} + + for leader, partitions in leader2partitions.items(): + topic2partitions = defaultdict(list) + for partition in partitions: + topic2partitions[partition.topic].append(partition) + + request = DeleteRecordsRequest[version]( + topics=[ + (topic, [(tp.partition, records_to_delete[tp]) for tp in partitions]) + for topic, partitions in topic2partitions.items() + ], + timeout_ms=timeout_ms + ) + future = self._send_request_to_node(leader, request) + self._wait_for_futures([future]) + + responses.append(future.value.to_object()) + + partition2result = {} + partition2error = {} + for response in responses: + for topic in response["topics"]: + for partition in topic["partitions"]: + tp = TopicPartition(topic["name"], partition["partition_index"]) + partition2result[tp] = partition + if partition["error_code"] != 0: + partition2error[tp] = partition["error_code"] + + if partition2error: + if len(partition2error) == 1: + key, error = next(iter(partition2error.items())) + raise Errors.for_code(error)( + "Error deleting records from topic %s partition %s" % (key.topic, key.partition) + ) + else: + raise Errors.BrokerResponseError( + "The following errors occured when trying to delete records: " + + ", ".join( + "%s(partition=%d): %s" % + (partition.topic, partition.partition, Errors.for_code(error).__name__) + for partition, error in partition2error.items() + ) + ) + + return partition2result # create delegation token protocol not yet implemented # Note: send the request to the least_loaded_node() @@ -975,12 +1271,14 @@ def create_partitions(self, topic_partitions, timeout_ms=None, validate_only=Fal def _describe_consumer_groups_send_request(self, group_id, group_coordinator_id, include_authorized_operations=False): """Send a DescribeGroupsRequest to the group's coordinator. - :param group_id: The group name as a string - :param group_coordinator_id: The node_id of the groups' coordinator - broker. - :return: A message future. + Arguments: + group_id: The group name as a string + group_coordinator_id: The node_id of the groups' coordinator broker. + + Returns: + A message future. """ - version = self._matching_api_version(DescribeGroupsRequest) + version = self._client.api_version(DescribeGroupsRequest, max_version=3) if version <= 2: if include_authorized_operations: raise IncompatibleBrokerVersion( @@ -1061,18 +1359,23 @@ def describe_consumer_groups(self, group_ids, group_coordinator_id=None, include Any errors are immediately raised. - :param group_ids: A list of consumer group IDs. These are typically the - group names as strings. - :param group_coordinator_id: The node_id of the groups' coordinator - broker. If set to None, it will query the cluster for each group to - find that group's coordinator. Explicitly specifying this can be - useful for avoiding extra network round trips if you already know - the group coordinator. This is only useful when all the group_ids - have the same coordinator, otherwise it will error. Default: None. - :param include_authorized_operations: Whether or not to include - information about the operations a group is allowed to perform. - Only supported on API version >= v3. Default: False. - :return: A list of group descriptions. 
For now the group descriptions + Arguments: + group_ids: A list of consumer group IDs. These are typically the + group names as strings. + + Keyword Arguments: + group_coordinator_id (int, optional): The node_id of the groups' coordinator + broker. If set to None, it will query the cluster for each group to + find that group's coordinator. Explicitly specifying this can be + useful for avoiding extra network round trips if you already know + the group coordinator. This is only useful when all the group_ids + have the same coordinator, otherwise it will error. Default: None. + include_authorized_operations (bool, optional): Whether or not to include + information about the operations a group is allowed to perform. + Only supported on API version >= v3. Default: False. + + Returns: + A list of group descriptions. For now the group descriptions are the raw results from the DescribeGroupsResponse. Long-term, we plan to change this to return namedtuples as well as decoding the partition assignments. @@ -1103,10 +1406,13 @@ def describe_consumer_groups(self, group_ids, group_coordinator_id=None, include def _list_consumer_groups_send_request(self, broker_id): """Send a ListGroupsRequest to a broker. - :param broker_id: The broker's node_id. - :return: A message future + Arguments: + broker_id (int): The broker's node_id. + + Returns: + A message future """ - version = self._matching_api_version(ListGroupsRequest) + version = self._client.api_version(ListGroupsRequest, max_version=2) if version <= 2: request = ListGroupsRequest[version]() else: @@ -1144,15 +1450,20 @@ def list_consumer_groups(self, broker_ids=None): As soon as any error is encountered, it is immediately raised. - :param broker_ids: A list of broker node_ids to query for consumer - groups. If set to None, will query all brokers in the cluster. - Explicitly specifying broker(s) can be useful for determining which - consumer groups are coordinated by those broker(s). Default: None - :return list: List of tuples of Consumer Groups. - :exception GroupCoordinatorNotAvailableError: The coordinator is not - available, so cannot process requests. - :exception GroupLoadInProgressError: The coordinator is loading and - hence can't process requests. + Keyword Arguments: + broker_ids ([int], optional): A list of broker node_ids to query for consumer + groups. If set to None, will query all brokers in the cluster. + Explicitly specifying broker(s) can be useful for determining which + consumer groups are coordinated by those broker(s). Default: None + + Returns: + list: List of tuples of Consumer Groups. + + Raises: + CoordinatorNotAvailableError: The coordinator is not + available, so cannot process requests. + CoordinatorLoadInProgressError: The coordinator is loading and + hence can't process requests. """ # While we return a list, internally use a set to prevent duplicates # because if a group coordinator fails after being queried, and its @@ -1172,13 +1483,20 @@ def _list_consumer_group_offsets_send_request(self, group_id, group_coordinator_id, partitions=None): """Send an OffsetFetchRequest to a broker. - :param group_id: The consumer group id name for which to fetch offsets. - :param group_coordinator_id: The node_id of the group's coordinator - broker. - :return: A message future + Arguments: + group_id (str): The consumer group id name for which to fetch offsets. + group_coordinator_id (int): The node_id of the group's coordinator broker. + + Keyword Arguments: + partitions: A list of TopicPartitions for which to fetch + offsets. 
On brokers >= 0.10.2, this can be set to None to fetch all + known offsets for the consumer group. Default: None. + + Returns: + A message future """ - version = self._matching_api_version(OffsetFetchRequest) - if version <= 3: + version = self._client.api_version(OffsetFetchRequest, max_version=5) + if version <= 5: if partitions is None: if version <= 1: raise ValueError( @@ -1203,11 +1521,14 @@ def _list_consumer_group_offsets_send_request(self, group_id, def _list_consumer_group_offsets_process_response(self, response): """Process an OffsetFetchResponse. - :param response: an OffsetFetchResponse. - :return: A dictionary composed of TopicPartition keys and - OffsetAndMetada values. + Arguments: + response: an OffsetFetchResponse. + + Returns: + A dictionary composed of TopicPartition keys and + OffsetAndMetadata values. """ - if response.API_VERSION <= 3: + if response.API_VERSION <= 5: # OffsetFetchResponse_v1 lacks a top-level error_code if response.API_VERSION > 1: @@ -1219,16 +1540,21 @@ def _list_consumer_group_offsets_process_response(self, response): .format(response)) # transform response into a dictionary with TopicPartition keys and - # OffsetAndMetada values--this is what the Java AdminClient returns + # OffsetAndMetadata values--this is what the Java AdminClient returns offsets = {} for topic, partitions in response.topics: - for partition, offset, metadata, error_code in partitions: + for partition_data in partitions: + if response.API_VERSION <= 4: + partition, offset, metadata, error_code = partition_data + leader_epoch = -1 + else: + partition, offset, leader_epoch, metadata, error_code = partition_data error_type = Errors.for_code(error_code) if error_type is not Errors.NoError: raise error_type( "Unable to fetch consumer group offsets for topic {}, partition {}" .format(topic, partition)) - offsets[TopicPartition(topic, partition)] = OffsetAndMetadata(offset, metadata) + offsets[TopicPartition(topic, partition)] = OffsetAndMetadata(offset, metadata, leader_epoch) else: raise NotImplementedError( "Support for OffsetFetchResponse_v{} has not yet been added to KafkaAdminClient." @@ -1245,17 +1571,22 @@ def list_consumer_group_offsets(self, group_id, group_coordinator_id=None, As soon as any error is encountered, it is immediately raised. - :param group_id: The consumer group id name for which to fetch offsets. - :param group_coordinator_id: The node_id of the group's coordinator - broker. If set to None, will query the cluster to find the group - coordinator. Explicitly specifying this can be useful to prevent - that extra network round trip if you already know the group - coordinator. Default: None. - :param partitions: A list of TopicPartitions for which to fetch - offsets. On brokers >= 0.10.2, this can be set to None to fetch all - known offsets for the consumer group. Default: None. - :return dictionary: A dictionary with TopicPartition keys and - OffsetAndMetada values. Partitions that are not specified and for + Arguments: + group_id (str): The consumer group id name for which to fetch offsets. + + Keyword Arguments: + group_coordinator_id (int, optional): The node_id of the group's coordinator + broker. If set to None, will query the cluster to find the group + coordinator. Explicitly specifying this can be useful to prevent + that extra network round trip if you already know the group + coordinator. Default: None. + partitions: A list of TopicPartitions for which to fetch + offsets. 
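# Standalone sketch of the version-dependent unpacking above: OffsetFetch
# partition entries are 4-tuples through v4 and gain a leader_epoch field in
# v5 (the tuple values below are made up for illustration).
def unpack_offset_fetch_partition(partition_data, api_version):
    if api_version <= 4:
        partition, offset, metadata, error_code = partition_data
        leader_epoch = -1   # sentinel when the broker does not report an epoch
    else:
        partition, offset, leader_epoch, metadata, error_code = partition_data
    return partition, offset, leader_epoch, metadata, error_code

print(unpack_offset_fetch_partition((0, 42, '', 0), api_version=4))
print(unpack_offset_fetch_partition((0, 42, 7, '', 0), api_version=5))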
On brokers >= 0.10.2, this can be set to None to fetch all + known offsets for the consumer group. Default: None. + + Returns: + dictionary: A dictionary with TopicPartition keys and + OffsetAndMetadata values. Partitions that are not specified and for which the group_id does not have a recorded offset are omitted. An offset value of `-1` indicates the group_id has no offset for that TopicPartition. A `-1` can only happen for partitions that are @@ -1278,14 +1609,19 @@ def delete_consumer_groups(self, group_ids, group_coordinator_id=None): The result needs checking for potential errors. - :param group_ids: The consumer group ids of the groups which are to be deleted. - :param group_coordinator_id: The node_id of the broker which is the coordinator for - all the groups. Use only if all groups are coordinated by the same broker. - If set to None, will query the cluster to find the coordinator for every single group. - Explicitly specifying this can be useful to prevent - that extra network round trips if you already know the group - coordinator. Default: None. - :return: A list of tuples (group_id, KafkaError) + Arguments: + group_ids ([str]): The consumer group ids of the groups which are to be deleted. + + Keyword Arguments: + group_coordinator_id (int, optional): The node_id of the broker which is + the coordinator for all the groups. Use only if all groups are coordinated + by the same broker. If set to None, will query the cluster to find the coordinator + for every single group. Explicitly specifying this can be useful to prevent + that extra network round trips if you already know the group coordinator. + Default: None. + + Returns: + A list of tuples (group_id, KafkaError) """ if group_coordinator_id is not None: futures = [self._delete_consumer_groups_send_request(group_ids, group_coordinator_id)] @@ -1306,6 +1642,14 @@ def delete_consumer_groups(self, group_ids, group_coordinator_id=None): return results def _convert_delete_groups_response(self, response): + """Parse the DeleteGroupsResponse, mapping group IDs to their respective errors. + + Arguments: + response: A DeleteGroupsResponse object from the broker. + + Returns: + A list of (group_id, KafkaError) for each deleted group. + """ if response.API_VERSION <= 1: results = [] for group_id, error_code in response.results: @@ -1317,14 +1661,16 @@ def _convert_delete_groups_response(self, response): .format(response.API_VERSION)) def _delete_consumer_groups_send_request(self, group_ids, group_coordinator_id): - """Send a DeleteGroups request to a broker. + """Send a DeleteGroupsRequest to the specified broker (the group coordinator). - :param group_ids: The consumer group ids of the groups which are to be deleted. - :param group_coordinator_id: The node_id of the broker which is the coordinator for - all the groups. - :return: A message future + Arguments: + group_ids ([str]): A list of consumer group IDs to be deleted. + group_coordinator_id (int): The node_id of the broker coordinating these groups. + + Returns: + A future representing the in-flight DeleteGroupsRequest. 
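# Hedged usage sketch for delete_consumer_groups() as documented above;
# assumes a reachable broker and that the named groups are inactive
# (group names are illustrative).
from kafka.admin import KafkaAdminClient

admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
for group_id, error in admin.delete_consumer_groups(['old-group-1', 'old-group-2']):
    # Per the docstring above, each tuple pairs the group id with a KafkaError;
    # kafka.errors.NoError means the group was deleted successfully.
    print(group_id, error)
admin.close()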
""" - version = self._matching_api_version(DeleteGroupsRequest) + version = self._client.api_version(DeleteGroupsRequest, max_version=1) if version <= 1: request = DeleteGroupsRequest[version](group_ids) else: @@ -1333,10 +1679,80 @@ def _delete_consumer_groups_send_request(self, group_ids, group_coordinator_id): .format(version)) return self._send_request_to_node(group_coordinator_id, request) + @staticmethod + def _convert_topic_partitions(topic_partitions): + return [ + ( + topic, + partition_ids + ) + for topic, partition_ids in topic_partitions.items() + ] + + def _get_all_topic_partitions(self): + return [ + ( + topic, + [partition_info.partition for partition_info in self._client.cluster._partitions[topic].values()] + ) + for topic in self._client.cluster.topics() + ] + + def _get_topic_partitions(self, topic_partitions): + if topic_partitions is None: + return self._get_all_topic_partitions() + return self._convert_topic_partitions(topic_partitions) + + def perform_leader_election(self, election_type, topic_partitions=None, timeout_ms=None): + """Perform leader election on the topic partitions. + + :param election_type: Type of election to attempt. 0 for Perferred, 1 for Unclean + :param topic_partitions: A map of topic name strings to partition ids list. + By default, will run on all topic partitions + :param timeout_ms: Milliseconds to wait for the leader election process to complete + before the broker returns. + + :return: Appropriate version of ElectLeadersResponse class. + """ + version = self._client.api_version(ElectLeadersRequest, max_version=1) + timeout_ms = self._validate_timeout(timeout_ms) + request = ElectLeadersRequest[version]( + election_type=ElectionType(election_type), + topic_partitions=self._get_topic_partitions(topic_partitions), + timeout=timeout_ms, + ) + # TODO convert structs to a more pythonic interface + return self._send_request_to_controller(request) + def _wait_for_futures(self, futures): + """Block until all futures complete. If any fail, raise the encountered exception. + + Arguments: + futures: A list of Future objects awaiting results. + + Raises: + The first encountered exception if a future fails. + """ while not all(future.succeeded() for future in futures): for future in futures: self._client.poll(future=future) if future.failed(): raise future.exception # pylint: disable-msg=raising-bad-type + + def describe_log_dirs(self): + """Send a DescribeLogDirsRequest request to a broker. + + Returns: + A message future + """ + version = self._client.api_version(DescribeLogDirsRequest, max_version=0) + if version <= 0: + request = DescribeLogDirsRequest[version]() + future = self._send_request_to_node(self._client.least_loaded_node(), request) + self._wait_for_futures([future]) + else: + raise NotImplementedError( + "Support for DescribeLogDirsRequest_v{} has not yet been added to KafkaAdminClient." 
+ .format(version)) + return future.value diff --git a/benchmarks/README.md b/kafka/benchmarks/README.md similarity index 100% rename from benchmarks/README.md rename to kafka/benchmarks/README.md diff --git a/kafka/benchmarks/__init__.py b/kafka/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/consumer_performance.py b/kafka/benchmarks/consumer_performance.py old mode 100755 new mode 100644 similarity index 67% rename from benchmarks/consumer_performance.py rename to kafka/benchmarks/consumer_performance.py index 9e3b6a919..c35a164c2 --- a/benchmarks/consumer_performance.py +++ b/kafka/benchmarks/consumer_performance.py @@ -4,43 +4,16 @@ from __future__ import absolute_import, print_function import argparse -import logging import pprint import sys import threading +import time import traceback -from kafka.vendor.six.moves import range - -from kafka import KafkaConsumer, KafkaProducer -from test.fixtures import KafkaFixture, ZookeeperFixture - -logging.basicConfig(level=logging.ERROR) - - -def start_brokers(n): - print('Starting {0} {1}-node cluster...'.format(KafkaFixture.kafka_version, n)) - print('-> 1 Zookeeper') - zk = ZookeeperFixture.instance() - print('---> {0}:{1}'.format(zk.host, zk.port)) - print() - - partitions = min(n, 3) - replicas = min(n, 3) - print('-> {0} Brokers [{1} partitions / {2} replicas]'.format(n, partitions, replicas)) - brokers = [ - KafkaFixture.instance(i, zk, zk_chroot='', - partitions=partitions, replicas=replicas) - for i in range(n) - ] - for broker in brokers: - print('---> {0}:{1}'.format(broker.host, broker.port)) - print() - return brokers +from kafka import KafkaConsumer class ConsumerPerformance(object): - @staticmethod def run(args): try: @@ -53,28 +26,17 @@ def run(args): pass if v == 'None': v = None + elif v == 'False': + v = False + elif v == 'True': + v = True props[k] = v - if args.brokers: - brokers = start_brokers(args.brokers) - props['bootstrap_servers'] = ['{0}:{1}'.format(broker.host, broker.port) - for broker in brokers] - print('---> bootstrap_servers={0}'.format(props['bootstrap_servers'])) - print() - - print('-> Producing records') - record = bytes(bytearray(args.record_size)) - producer = KafkaProducer(compression_type=args.fixture_compression, - **props) - for i in range(args.num_records): - producer.send(topic=args.topic, value=record) - producer.flush() - producer.close() - print('-> OK!') - print() - print('Initializing Consumer...') + props['bootstrap_servers'] = args.bootstrap_servers props['auto_offset_reset'] = 'earliest' + if 'group_id' not in props: + props['group_id'] = 'kafka-consumer-benchmark' if 'consumer_timeout_ms' not in props: props['consumer_timeout_ms'] = 10000 props['metrics_sample_window_ms'] = args.stats_interval * 1000 @@ -92,14 +54,18 @@ def run(args): print('-> OK!') print() + start_time = time.time() records = 0 for msg in consumer: records += 1 if records >= args.num_records: break - print('Consumed {0} records'.format(records)) + end_time = time.time() timer_stop.set() + timer.join() + print('Consumed {0} records'.format(records)) + print('Execution time:', end_time - start_time, 'secs') except Exception: exc_info = sys.exc_info() @@ -143,18 +109,17 @@ def get_args_parser(): parser = argparse.ArgumentParser( description='This tool is used to verify the consumer performance.') + parser.add_argument( + '--bootstrap-servers', type=str, nargs='+', default=(), + help='host:port for cluster bootstrap servers') parser.add_argument( '--topic', type=str, - 
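# Standalone sketch of the key=value coercion used by the benchmark above:
# argparse hands the --consumer-config values over as strings, which are
# mapped to int / None / bool where possible, mirroring the try/except int()
# plus the 'None' / 'False' / 'True' branches in run().
def parse_props(pairs):
    props = {}
    for prop in pairs:
        k, v = prop.split('=')
        try:
            v = int(v)
        except ValueError:
            pass
        if v == 'None':
            v = None
        elif v == 'False':
            v = False
        elif v == 'True':
            v = True
        props[k] = v
    return props

print(parse_props(['consumer_timeout_ms=5000', 'check_crcs=False', 'group_id=bench']))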
help='Topic for consumer test', + help='Topic for consumer test (default: kafka-python-benchmark-test)', default='kafka-python-benchmark-test') parser.add_argument( '--num-records', type=int, - help='number of messages to consume', + help='number of messages to consume (default: 1000000)', default=1000000) - parser.add_argument( - '--record-size', type=int, - help='message size in bytes', - default=100) parser.add_argument( '--consumer-config', type=str, nargs='+', default=(), help='kafka consumer related configuration properties like ' @@ -162,13 +127,9 @@ def get_args_parser(): parser.add_argument( '--fixture-compression', type=str, help='specify a compression type for use with broker fixtures / producer') - parser.add_argument( - '--brokers', type=int, - help='Number of kafka brokers to start', - default=0) parser.add_argument( '--stats-interval', type=int, - help='Interval in seconds for stats reporting to console', + help='Interval in seconds for stats reporting to console (default: 5)', default=5) parser.add_argument( '--raw-metrics', action='store_true', diff --git a/kafka/benchmarks/load_example.py b/kafka/benchmarks/load_example.py new file mode 100644 index 000000000..29796a74c --- /dev/null +++ b/kafka/benchmarks/load_example.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +from __future__ import print_function + +import argparse +import logging +import threading +import time + +from kafka import KafkaConsumer, KafkaProducer + + +class Producer(threading.Thread): + + def __init__(self, bootstrap_servers, topic, stop_event, msg_size): + super(Producer, self).__init__() + self.bootstrap_servers = bootstrap_servers + self.topic = topic + self.stop_event = stop_event + self.big_msg = b'1' * msg_size + + def run(self): + producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers) + self.sent = 0 + + while not self.stop_event.is_set(): + producer.send(self.topic, self.big_msg) + self.sent += 1 + producer.flush() + producer.close() + + +class Consumer(threading.Thread): + def __init__(self, bootstrap_servers, topic, stop_event, msg_size): + super(Consumer, self).__init__() + self.bootstrap_servers = bootstrap_servers + self.topic = topic + self.stop_event = stop_event + self.msg_size = msg_size + + def run(self): + consumer = KafkaConsumer(bootstrap_servers=self.bootstrap_servers, + auto_offset_reset='earliest') + consumer.subscribe([self.topic]) + self.valid = 0 + self.invalid = 0 + + for message in consumer: + if len(message.value) == self.msg_size: + self.valid += 1 + else: + print('Invalid message:', len(message.value), self.msg_size) + self.invalid += 1 + + if self.stop_event.is_set(): + break + consumer.close() + + +def get_args_parser(): + parser = argparse.ArgumentParser( + description='This tool is used to demonstrate consumer and producer load.') + + parser.add_argument( + '--bootstrap-servers', type=str, nargs='+', default=('localhost:9092'), + help='host:port for cluster bootstrap servers (default: localhost:9092)') + parser.add_argument( + '--topic', type=str, + help='Topic for load test (default: kafka-python-benchmark-load-example)', + default='kafka-python-benchmark-load-example') + parser.add_argument( + '--msg-size', type=int, + help='Message size, in bytes, for load test (default: 524288)', + default=524288) + parser.add_argument( + '--load-time', type=int, + help='number of seconds to run load test (default: 10)', + default=10) + parser.add_argument( + '--log-level', type=str, + help='Optional logging level for load test: ERROR|INFO|DEBUG etc', + default=None) + 
return parser + + +def main(args): + if args.log_level: + logging.basicConfig( + format='%(asctime)s.%(msecs)s:%(name)s:%(thread)d:%(levelname)s:%(process)d:%(message)s', + level=getattr(logging, args.log_level)) + producer_stop = threading.Event() + consumer_stop = threading.Event() + threads = [ + Producer(args.bootstrap_servers, args.topic, producer_stop, args.msg_size), + Consumer(args.bootstrap_servers, args.topic, consumer_stop, args.msg_size) + ] + + for t in threads: + t.start() + + time.sleep(args.load_time) + producer_stop.set() + consumer_stop.set() + print('Messages sent: %d' % threads[0].sent) + print('Messages recvd: %d' % threads[1].valid) + print('Messages invalid: %d' % threads[1].invalid) + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/benchmarks/producer_performance.py b/kafka/benchmarks/producer_performance.py old mode 100755 new mode 100644 similarity index 71% rename from benchmarks/producer_performance.py rename to kafka/benchmarks/producer_performance.py index c0de6fd23..1a1092960 --- a/benchmarks/producer_performance.py +++ b/kafka/benchmarks/producer_performance.py @@ -7,37 +7,15 @@ import pprint import sys import threading +import time import traceback from kafka.vendor.six.moves import range from kafka import KafkaProducer -from test.fixtures import KafkaFixture, ZookeeperFixture - - -def start_brokers(n): - print('Starting {0} {1}-node cluster...'.format(KafkaFixture.kafka_version, n)) - print('-> 1 Zookeeper') - zk = ZookeeperFixture.instance() - print('---> {0}:{1}'.format(zk.host, zk.port)) - print() - - partitions = min(n, 3) - replicas = min(n, 3) - print('-> {0} Brokers [{1} partitions / {2} replicas]'.format(n, partitions, replicas)) - brokers = [ - KafkaFixture.instance(i, zk, zk_chroot='', - partitions=partitions, replicas=replicas) - for i in range(n) - ] - for broker in brokers: - print('---> {0}:{1}'.format(broker.host, broker.port)) - print() - return brokers class ProducerPerformance(object): - @staticmethod def run(args): try: @@ -50,18 +28,14 @@ def run(args): pass if v == 'None': v = None + elif v == 'False': + v = False + elif v == 'True': + v = True props[k] = v - if args.brokers: - brokers = start_brokers(args.brokers) - props['bootstrap_servers'] = ['{0}:{1}'.format(broker.host, broker.port) - for broker in brokers] - print("---> bootstrap_servers={0}".format(props['bootstrap_servers'])) - print() - print('-> OK!') - print() - print('Initializing producer...') + props['bootstrap_servers'] = args.bootstrap_servers record = bytes(bytearray(args.record_size)) props['metrics_sample_window_ms'] = args.stats_interval * 1000 @@ -79,11 +53,29 @@ def run(args): print('-> OK!') print() - for i in range(args.num_records): - producer.send(topic=args.topic, value=record) - producer.flush() - + def _benchmark(): + results = [] + for i in range(args.num_records): + results.append(producer.send(topic=args.topic, value=record)) + print("Send complete...") + producer.flush() + producer.close() + count_success, count_failure = 0, 0 + for r in results: + if r.succeeded(): + count_success += 1 + elif r.failed(): + count_failure += 1 + else: + raise ValueError(r) + print("%d suceeded, %d failed" % (count_success, count_failure)) + + start_time = time.time() + _benchmark() + end_time = time.time() timer_stop.set() + timer.join() + print('Execution time:', end_time - start_time, 'secs') except Exception: exc_info = sys.exc_info() @@ -101,6 +93,8 @@ def __init__(self, interval, producer, event=None, 
raw_metrics=False): def print_stats(self): metrics = self.producer.metrics() + if not metrics: + return if self.raw_metrics: pprint.pprint(metrics) else: @@ -125,29 +119,28 @@ def get_args_parser(): parser = argparse.ArgumentParser( description='This tool is used to verify the producer performance.') + parser.add_argument( + '--bootstrap-servers', type=str, nargs='+', default=(), + help='host:port for cluster bootstrap server') parser.add_argument( '--topic', type=str, - help='Topic name for test', + help='Topic name for test (default: kafka-python-benchmark-test)', default='kafka-python-benchmark-test') parser.add_argument( '--num-records', type=int, - help='number of messages to produce', + help='number of messages to produce (default: 1000000)', default=1000000) parser.add_argument( '--record-size', type=int, - help='message size in bytes', + help='message size in bytes (default: 100)', default=100) parser.add_argument( '--producer-config', type=str, nargs='+', default=(), help='kafka producer related configuaration properties like ' 'bootstrap_servers,client_id etc..') - parser.add_argument( - '--brokers', type=int, - help='Number of kafka brokers to start', - default=0) parser.add_argument( '--stats-interval', type=int, - help='Interval in seconds for stats reporting to console', + help='Interval in seconds for stats reporting to console (default: 5)', default=5) parser.add_argument( '--raw-metrics', action='store_true', diff --git a/benchmarks/record_batch_compose.py b/kafka/benchmarks/record_batch_compose.py similarity index 89% rename from benchmarks/record_batch_compose.py rename to kafka/benchmarks/record_batch_compose.py index 5bdefa7af..5b07fd59a 100644 --- a/benchmarks/record_batch_compose.py +++ b/kafka/benchmarks/record_batch_compose.py @@ -71,7 +71,8 @@ def func(loops, magic): return res -runner = pyperf.Runner() -runner.bench_time_func('batch_append_v0', func, 0) -runner.bench_time_func('batch_append_v1', func, 1) -runner.bench_time_func('batch_append_v2', func, 2) +if __name__ == '__main__': + runner = pyperf.Runner() + runner.bench_time_func('batch_append_v0', func, 0) + runner.bench_time_func('batch_append_v1', func, 1) + runner.bench_time_func('batch_append_v2', func, 2) diff --git a/benchmarks/record_batch_read.py b/kafka/benchmarks/record_batch_read.py similarity index 90% rename from benchmarks/record_batch_read.py rename to kafka/benchmarks/record_batch_read.py index aa5e9c1e5..2ef32298d 100644 --- a/benchmarks/record_batch_read.py +++ b/kafka/benchmarks/record_batch_read.py @@ -76,7 +76,8 @@ def func(loops, magic): return res -runner = pyperf.Runner() -runner.bench_time_func('batch_read_v0', func, 0) -runner.bench_time_func('batch_read_v1', func, 1) -runner.bench_time_func('batch_read_v2', func, 2) +if __name__ == '__main__': + runner = pyperf.Runner() + runner.bench_time_func('batch_read_v0', func, 0) + runner.bench_time_func('batch_read_v1', func, 1) + runner.bench_time_func('batch_read_v2', func, 2) diff --git a/benchmarks/varint_speed.py b/kafka/benchmarks/varint_speed.py similarity index 81% rename from benchmarks/varint_speed.py rename to kafka/benchmarks/varint_speed.py index fd63d0ac1..b2628a1b5 100644 --- a/benchmarks/varint_speed.py +++ b/kafka/benchmarks/varint_speed.py @@ -113,8 +113,6 @@ def encode_varint_1(num): raise ValueError("Out of double range") return buf[:i + 1] -_assert_valid_enc(encode_varint_1) - def encode_varint_2(value, int2byte=six.int2byte): value = (value << 1) ^ (value >> 63) @@ -128,8 +126,6 @@ def encode_varint_2(value, 
int2byte=six.int2byte): value >>= 7 return res + int2byte(bits) -_assert_valid_enc(encode_varint_2) - def encode_varint_3(value, buf): append = buf.append @@ -145,12 +141,6 @@ def encode_varint_3(value, buf): return value -for encoded, decoded in test_data: - res = bytearray() - encode_varint_3(decoded, res) - assert res == encoded - - def encode_varint_4(value, int2byte=six.int2byte): value = (value << 1) ^ (value >> 63) @@ -185,12 +175,6 @@ def encode_varint_4(value, int2byte=six.int2byte): return res + int2byte(bits) -_assert_valid_enc(encode_varint_4) - -# import dis -# dis.dis(encode_varint_4) - - def encode_varint_5(value, buf, pos=0): value = (value << 1) ^ (value >> 63) @@ -204,12 +188,6 @@ def encode_varint_5(value, buf, pos=0): buf[pos] = bits return pos + 1 -for encoded, decoded in test_data: - res = bytearray(10) - written = encode_varint_5(decoded, res) - assert res[:written] == encoded - - def encode_varint_6(value, buf): append = buf.append value = (value << 1) ^ (value >> 63) @@ -253,12 +231,6 @@ def encode_varint_6(value, buf): return i -for encoded, decoded in test_data: - res = bytearray() - encode_varint_6(decoded, res) - assert res == encoded - - def size_of_varint_1(value): """ Number of bytes needed to encode an integer in variable-length format. """ @@ -271,8 +243,6 @@ def size_of_varint_1(value): break return res -_assert_valid_size(size_of_varint_1) - def size_of_varint_2(value): """ Number of bytes needed to encode an integer in variable-length format. @@ -298,8 +268,6 @@ def size_of_varint_2(value): return 9 return 10 -_assert_valid_size(size_of_varint_2) - if six.PY3: def _read_byte(memview, pos): @@ -351,8 +319,6 @@ def decode_varint_1(buffer, pos=0): # Normalize sign return (value >> 1) ^ -(value & 1), i + 1 -_assert_valid_dec(decode_varint_1) - def decode_varint_2(buffer, pos=0): result = 0 @@ -369,9 +335,6 @@ def decode_varint_2(buffer, pos=0): raise ValueError("Out of int64 range") -_assert_valid_dec(decode_varint_2) - - def decode_varint_3(buffer, pos=0): result = buffer[pos] if not (result & 0x81): @@ -393,51 +356,79 @@ def decode_varint_3(buffer, pos=0): raise ValueError("Out of int64 range") -_assert_valid_dec(decode_varint_3) - -# import dis -# dis.dis(decode_varint_3) - -runner = pyperf.Runner() -# Encode algorithms returning a bytes result -for bench_func in [ - encode_varint_1, - encode_varint_2, - encode_varint_4]: - for i, value in enumerate(BENCH_VALUES_ENC): - runner.bench_func( - '{}_{}byte'.format(bench_func.__name__, i + 1), - bench_func, value) - -# Encode algorithms writing to the buffer -for bench_func in [ - encode_varint_3, - encode_varint_5, - encode_varint_6]: - for i, value in enumerate(BENCH_VALUES_ENC): - fname = bench_func.__name__ - runner.timeit( - '{}_{}byte'.format(fname, i + 1), - stmt="{}({}, buffer)".format(fname, value), - setup="from __main__ import {}; buffer = bytearray(10)".format( - fname) - ) - -# Size algorithms -for bench_func in [ - size_of_varint_1, - size_of_varint_2]: - for i, value in enumerate(BENCH_VALUES_ENC): - runner.bench_func( - '{}_{}byte'.format(bench_func.__name__, i + 1), - bench_func, value) - -# Decode algorithms -for bench_func in [ - decode_varint_1, - decode_varint_2, - decode_varint_3]: - for i, value in enumerate(BENCH_VALUES_DEC): - runner.bench_func( - '{}_{}byte'.format(bench_func.__name__, i + 1), - bench_func, value) +if __name__ == '__main__': + _assert_valid_enc(encode_varint_1) + _assert_valid_enc(encode_varint_2) + + for encoded, decoded in test_data: + res = bytearray() + 
encode_varint_3(decoded, res) + assert res == encoded + + _assert_valid_enc(encode_varint_4) + + # import dis + # dis.dis(encode_varint_4) + + for encoded, decoded in test_data: + res = bytearray(10) + written = encode_varint_5(decoded, res) + assert res[:written] == encoded + + for encoded, decoded in test_data: + res = bytearray() + encode_varint_6(decoded, res) + assert res == encoded + + _assert_valid_size(size_of_varint_1) + _assert_valid_size(size_of_varint_2) + _assert_valid_dec(decode_varint_1) + _assert_valid_dec(decode_varint_2) + _assert_valid_dec(decode_varint_3) + + # import dis + # dis.dis(decode_varint_3) + + runner = pyperf.Runner() + # Encode algorithms returning a bytes result + for bench_func in [ + encode_varint_1, + encode_varint_2, + encode_varint_4]: + for i, value in enumerate(BENCH_VALUES_ENC): + runner.bench_func( + '{}_{}byte'.format(bench_func.__name__, i + 1), + bench_func, value) + + # Encode algorithms writing to the buffer + for bench_func in [ + encode_varint_3, + encode_varint_5, + encode_varint_6]: + for i, value in enumerate(BENCH_VALUES_ENC): + fname = bench_func.__name__ + runner.timeit( + '{}_{}byte'.format(fname, i + 1), + stmt="{}({}, buffer)".format(fname, value), + setup="from __main__ import {}; buffer = bytearray(10)".format( + fname) + ) + + # Size algorithms + for bench_func in [ + size_of_varint_1, + size_of_varint_2]: + for i, value in enumerate(BENCH_VALUES_ENC): + runner.bench_func( + '{}_{}byte'.format(bench_func.__name__, i + 1), + bench_func, value) + + # Decode algorithms + for bench_func in [ + decode_varint_1, + decode_varint_2, + decode_varint_3]: + for i, value in enumerate(BENCH_VALUES_DEC): + runner.bench_func( + '{}_{}byte'.format(bench_func.__name__, i + 1), + bench_func, value) diff --git a/kafka/client_async.py b/kafka/client_async.py index 58f22d4ec..7d466574f 100644 --- a/kafka/client_async.py +++ b/kafka/client_async.py @@ -19,17 +19,18 @@ from kafka.vendor import six from kafka.cluster import ClusterMetadata -from kafka.conn import BrokerConnection, ConnectionStates, collect_hosts, get_ip_port_afi +from kafka.conn import BrokerConnection, ConnectionStates, get_ip_port_afi from kafka import errors as Errors from kafka.future import Future from kafka.metrics import AnonMeasurable from kafka.metrics.stats import Avg, Count, Rate from kafka.metrics.stats.rate import TimeUnit +from kafka.protocol.broker_api_versions import BROKER_API_VERSIONS from kafka.protocol.metadata import MetadataRequest -from kafka.util import Dict, WeakMethod +from kafka.util import Dict, Timer, WeakMethod, ensure_valid_topic_name # Although this looks unused, it actually monkey-patches socket.socketpair() # and should be left in as long as we're using socket.socketpair() in this file -from kafka.vendor import socketpair +from kafka.vendor import socketpair # noqa: F401 from kafka.version import __version__ if six.PY2: @@ -75,7 +76,7 @@ class KafkaClient(object): reconnection attempts will continue periodically with this fixed rate. To avoid connection storms, a randomization factor of 0.2 will be applied to the backoff resulting in a random range between - 20% below and 20% above the computed value. Default: 1000. + 20% below and 20% above the computed value. Default: 30000. request_timeout_ms (int): Client request timeout in milliseconds. Default: 30000. 
connections_max_idle_ms: Close idle connections after the number of @@ -101,6 +102,9 @@ class KafkaClient(object): which we force a refresh of metadata even if we haven't seen any partition leadership changes to proactively discover any new brokers or partitions. Default: 300000 + allow_auto_create_topics (bool): Enable/disable auto topic creation + on metadata request. Only available with api_version >= (0, 11). + Default: True security_protocol (str): Protocol used to communicate with brokers. Valid values are: PLAINTEXT, SSL, SASL_PLAINTEXT, SASL_SSL. Default: PLAINTEXT. @@ -129,12 +133,24 @@ class KafkaClient(object): format. If no cipher can be selected (because compile-time options or other configuration forbids use of all the specified ciphers), an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers - api_version (tuple): Specify which Kafka API version to use. If set - to None, KafkaClient will attempt to infer the broker version by - probing various APIs. Example: (0, 10, 2). Default: None + api_version (tuple): Specify which Kafka API version to use. If set to + None, the client will attempt to determine the broker version via + ApiVersionsRequest API or, for brokers earlier than 0.10, probing + various known APIs. Dynamic version checking is performed eagerly + during __init__ and can raise NoBrokersAvailableError if no connection + was made before timeout (see api_version_auto_timeout_ms below). + Different versions enable different functionality. + + Examples: + (3, 9) most recent broker release, enable all supported features + (0, 10, 0) enables sasl authentication + (0, 8, 0) enables basic functionality only + + Default: None api_version_auto_timeout_ms (int): number of milliseconds to throw a timeout exception from the constructor when checking the broker - api version. Only applies if api_version is None + api version. Only applies if api_version set to None. + Default: 2000 selector (selectors.BaseSelector): Provide a specific selector implementation to use for I/O multiplexing. Default: selectors.DefaultSelector @@ -148,12 +164,16 @@ class KafkaClient(object): Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication. Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. + sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with + sasl mechanism handshake. If provided, sasl_kerberos_service_name and + sasl_kerberos_domain name are ignored. Default: None. sasl_kerberos_service_name (str): Service name to include in GSSAPI sasl mechanism handshake. Default: 'kafka' sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI sasl mechanism handshake. Default: one of bootstrap servers - sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider - instance. (See kafka.oauth.abstract). Default: None + sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer + token provider instance. Default: None + socks5_proxy (str): Socks5 proxy URL. 
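# Hedged construction sketch for the client options documented above; assumes
# a reachable broker at localhost:9092. Setting api_version explicitly skips
# the eager version probe that runs when it is left as None.
from kafka.client_async import KafkaClient

client = KafkaClient(
    bootstrap_servers='localhost:9092',
    api_version=(3, 9),               # most recent release, all features
    reconnect_backoff_max_ms=30000,   # the new default noted above
    allow_auto_create_topics=False,   # only honored with api_version >= (0, 11)
)
print(client.config['api_version'])
client.close()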
Default: None """ DEFAULT_CONFIG = { @@ -164,7 +184,7 @@ class KafkaClient(object): 'wakeup_timeout_ms': 3000, 'connections_max_idle_ms': 9 * 60 * 1000, 'reconnect_backoff_ms': 50, - 'reconnect_backoff_max_ms': 1000, + 'reconnect_backoff_max_ms': 30000, 'max_in_flight_requests_per_connection': 5, 'receive_buffer_bytes': None, 'send_buffer_bytes': None, @@ -172,6 +192,7 @@ class KafkaClient(object): 'sock_chunk_bytes': 4096, # undocumented experimental option 'sock_chunk_buffer_count': 1000, # undocumented experimental option 'retry_backoff_ms': 100, + 'allow_auto_create_topics': True, 'metadata_max_age_ms': 300000, 'security_protocol': 'PLAINTEXT', 'ssl_context': None, @@ -190,9 +211,11 @@ class KafkaClient(object): 'sasl_mechanism': None, 'sasl_plain_username': None, 'sasl_plain_password': None, + 'sasl_kerberos_name': None, 'sasl_kerberos_service_name': 'kafka', 'sasl_kerberos_domain_name': None, - 'sasl_oauth_token_provider': None + 'sasl_oauth_token_provider': None, + 'socks5_proxy': None, } def __init__(self, **configs): @@ -204,8 +227,9 @@ def __init__(self, **configs): # these properties need to be set on top of the initialization pipeline # because they are used when __del__ method is called self._closed = False - self._wake_r, self._wake_w = socket.socketpair() self._selector = self.config['selector']() + self._init_wakeup_socketpair() + self._wake_lock = threading.Lock() self.cluster = ClusterMetadata(**self.config) self._topics = set() # empty set will fetch all topic metadata @@ -214,12 +238,10 @@ def __init__(self, **configs): self._api_versions = None self._connecting = set() self._sending = set() - self._refresh_on_disconnects = True + + # Not currently used, but data is collected internally self._last_bootstrap = 0 self._bootstrap_fails = 0 - self._wake_r.setblocking(False) - self._wake_w.settimeout(self.config['wakeup_timeout_ms'] / 1000.0) - self._wake_lock = threading.Lock() self._lock = threading.RLock() @@ -228,7 +250,6 @@ def __init__(self, **configs): # lock above. 
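# Standalone sketch of the api_version normalization above: an exact match in
# BROKER_API_VERSIONS is used directly, a shorter tuple is disambiguated by
# appending (0,), and anything else falls back to the closest known version
# that is <= the configured one (the 'known' set below is a made-up subset).
def resolve_api_version(configured, known_versions):
    if configured in known_versions:
        return configured
    if configured + (0,) in known_versions:
        return configured + (0,)
    for candidate in sorted(known_versions, reverse=True):
        if candidate <= configured:
            return candidate          # closest compatible lower version
    raise ValueError('Unrecognized broker version: %s' % (configured,))

known = {(0, 10, 0), (0, 11, 0), (1, 0, 0), (2, 6, 0), (3, 9, 0)}
print(resolve_api_version((0, 11), known))    # -> (0, 11, 0)
print(resolve_api_version((2, 8, 1), known))  # -> (2, 6, 0)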
self._pending_completion = collections.deque() - self._selector.register(self._wake_r, selectors.EVENT_READ) self._idle_expiry_manager = IdleConnectionManager(self.config['connections_max_idle_ms']) self._sensors = None if self.config['metrics']: @@ -236,26 +257,48 @@ def __init__(self, **configs): self.config['metric_group_prefix'], weakref.proxy(self._conns)) - self._num_bootstrap_hosts = len(collect_hosts(self.config['bootstrap_servers'])) - # Check Broker Version if not set explicitly if self.config['api_version'] is None: - check_timeout = self.config['api_version_auto_timeout_ms'] / 1000 - self.config['api_version'] = self.check_version(timeout=check_timeout) - - def _can_bootstrap(self): - effective_failures = self._bootstrap_fails // self._num_bootstrap_hosts - backoff_factor = 2 ** effective_failures - backoff_ms = min(self.config['reconnect_backoff_ms'] * backoff_factor, - self.config['reconnect_backoff_max_ms']) + self.config['api_version'] = self.check_version() + elif self.config['api_version'] in BROKER_API_VERSIONS: + self._api_versions = BROKER_API_VERSIONS[self.config['api_version']] + elif (self.config['api_version'] + (0,)) in BROKER_API_VERSIONS: + log.warning('Configured api_version %s is ambiguous; using %s', + self.config['api_version'], self.config['api_version'] + (0,)) + self.config['api_version'] = self.config['api_version'] + (0,) + self._api_versions = BROKER_API_VERSIONS[self.config['api_version']] + else: + compatible_version = None + for v in sorted(BROKER_API_VERSIONS.keys(), reverse=True): + if v <= self.config['api_version']: + compatible_version = v + break + if compatible_version: + log.warning('Configured api_version %s not supported; using %s', + self.config['api_version'], compatible_version) + self.config['api_version'] = compatible_version + self._api_versions = BROKER_API_VERSIONS[compatible_version] + else: + raise Errors.UnrecognizedBrokerVersion(self.config['api_version']) - backoff_ms *= random.uniform(0.8, 1.2) + def _init_wakeup_socketpair(self): + self._wake_r, self._wake_w = socket.socketpair() + self._wake_r.setblocking(False) + self._wake_w.settimeout(self.config['wakeup_timeout_ms'] / 1000.0) + self._waking = False + self._selector.register(self._wake_r, selectors.EVENT_READ) - next_at = self._last_bootstrap + backoff_ms / 1000.0 - now = time.time() - if next_at > now: - return False - return True + def _close_wakeup_socketpair(self): + if self._wake_r is not None: + try: + self._selector.unregister(self._wake_r) + except (KeyError, ValueError, TypeError): + pass + self._wake_r.close() + if self._wake_w is not None: + self._wake_w.close() + self._wake_r = None + self._wake_w = None def _can_connect(self, node_id): if node_id not in self._conns: @@ -267,7 +310,7 @@ def _can_connect(self, node_id): def _conn_state_change(self, node_id, sock, conn): with self._lock: - if conn.connecting(): + if conn.state is ConnectionStates.CONNECTING: # SSL connections can enter this state 2x (second during Handshake) if node_id not in self._connecting: self._connecting.add(node_id) @@ -279,7 +322,19 @@ def _conn_state_change(self, node_id, sock, conn): if self.cluster.is_bootstrap(node_id): self._last_bootstrap = time.time() - elif conn.connected(): + elif conn.state is ConnectionStates.API_VERSIONS_SEND: + try: + self._selector.register(sock, selectors.EVENT_WRITE, conn) + except KeyError: + self._selector.modify(sock, selectors.EVENT_WRITE, conn) + + elif conn.state in (ConnectionStates.API_VERSIONS_RECV, ConnectionStates.AUTHENTICATING): + try: + 
self._selector.register(sock, selectors.EVENT_READ, conn) + except KeyError: + self._selector.modify(sock, selectors.EVENT_READ, conn) + + elif conn.state is ConnectionStates.CONNECTED: log.debug("Node %s connected", node_id) if node_id in self._connecting: self._connecting.remove(node_id) @@ -296,6 +351,8 @@ def _conn_state_change(self, node_id, sock, conn): if self.cluster.is_bootstrap(node_id): self._bootstrap_fails = 0 + if self._api_versions is None: + self._api_versions = conn._api_versions else: for node_id in list(self._conns.keys()): @@ -308,7 +365,7 @@ def _conn_state_change(self, node_id, sock, conn): self._connecting.remove(node_id) try: self._selector.unregister(sock) - except KeyError: + except (KeyError, ValueError): pass if self._sensors: @@ -327,7 +384,7 @@ def _conn_state_change(self, node_id, sock, conn): elif self.cluster.is_bootstrap(node_id): self._bootstrap_fails += 1 - elif self._refresh_on_disconnects and not self._closed and not idle_disconnect: + elif conn.connect_failed() and not self._closed and not idle_disconnect: log.warning("Node %s connection failed -- refreshing metadata", node_id) self.cluster.request_update() @@ -343,6 +400,11 @@ def maybe_connect(self, node_id, wakeup=True): return True return False + def connection_failed(self, node_id): + if node_id not in self._conns: + return False + return self._conns[node_id].connect_failed() + def _should_recycle_connection(self, conn): # Never recycle unless disconnected if not conn.disconnected(): @@ -353,7 +415,7 @@ def _should_recycle_connection(self, conn): if broker is None: return False - host, _, afi = get_ip_port_afi(broker.host) + host, _, _ = get_ip_port_afi(broker.host) if conn.host != host or conn.port != broker.port: log.info("Broker metadata change detected for node %s" " from %s:%s to %s:%s", conn.node_id, conn.host, conn.port, @@ -362,14 +424,24 @@ def _should_recycle_connection(self, conn): return False - def _maybe_connect(self, node_id): - """Idempotent non-blocking connection attempt to the given node id.""" + def _init_connect(self, node_id): + """Idempotent non-blocking connection attempt to the given node id. + + Returns True if connection object exists and is connected / connecting + """ with self._lock: conn = self._conns.get(node_id) + # Check if existing connection should be recreated because host/port changed + if conn is not None and self._should_recycle_connection(conn): + self._conns.pop(node_id).close() + conn = None + if conn is None: broker = self.cluster.broker_metadata(node_id) - assert broker, 'Broker id %s not in current metadata' % (node_id,) + if broker is None: + log.debug('Broker id %s not in current metadata', node_id) + return False log.debug("Initiating connection to node %s at %s:%s", node_id, broker.host, broker.port) @@ -381,16 +453,9 @@ def _maybe_connect(self, node_id): **self.config) self._conns[node_id] = conn - # Check if existing connection should be recreated because host/port changed - elif self._should_recycle_connection(conn): - self._conns.pop(node_id) - return False - - elif conn.connected(): - return True - - conn.connect() - return conn.connected() + if conn.disconnected(): + conn.connect() + return not conn.disconnected() def ready(self, node_id, metadata_priority=True): """Check whether a node is connected and ok to send more requests. 
@@ -416,8 +481,7 @@ def connected(self, node_id): def _close(self): if not self._closed: self._closed = True - self._wake_r.close() - self._wake_w.close() + self._close_wakeup_socketpair() self._selector.close() def close(self, node_id=None): @@ -464,9 +528,8 @@ def is_disconnected(self, node_id): def connection_delay(self, node_id): """ Return the number of milliseconds to wait, based on the connection - state, before attempting to send data. When disconnected, this respects - the reconnect backoff time. When connecting, returns 0 to allow - non-blocking connect to finish. When connected, returns a very large + state, before attempting to send data. When connecting or disconnected, + this respects the reconnect backoff time. When connected, returns a very large number to handle slow/stalled connections. Arguments: @@ -480,6 +543,16 @@ def connection_delay(self, node_id): return 0 return conn.connection_delay() + def throttle_delay(self, node_id): + """ + Return the number of milliseconds to wait until a broker is no longer throttled. + When disconnected / connecting, returns 0. + """ + conn = self._conns.get(node_id) + if conn is None: + return 0 + return conn.throttle_delay() + def is_ready(self, node_id, metadata_priority=True): """Check whether a node is ready to send more requests. @@ -512,7 +585,7 @@ def _can_send_request(self, node_id): return False return conn.connected() and conn.can_send_more() - def send(self, node_id, request, wakeup=True): + def send(self, node_id, request, wakeup=True, request_timeout_ms=None): """Send a request to a specific node. Bytes are placed on an internal per-connection send-queue. Actual network I/O will be triggered in a subsequent call to .poll() @@ -520,7 +593,13 @@ def send(self, node_id, request, wakeup=True): Arguments: node_id (int): destination node request (Struct): request object (not-encoded) - wakeup (bool): optional flag to disable thread-wakeup + + Keyword Arguments: + wakeup (bool, optional): optional flag to disable thread-wakeup. + request_timeout_ms (int, optional): Provide custom timeout in milliseconds. + If response is not processed before timeout, client will fail the + request and close the connection. 
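# Hedged sketch of driving a request with send()/poll() as described above;
# assumes `client` is an already-bootstrapped KafkaClient and node 0 exists.
from kafka.protocol.metadata import MetadataRequest

def fetch_metadata(client, node_id=0):
    # send() only queues the request; network I/O happens inside poll()
    future = client.send(node_id, MetadataRequest[1]([]), request_timeout_ms=5000)
    client.poll(future=future)
    if future.succeeded():
        return future.value
    raise future.exception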
+ Default: None (uses value from client configuration) Raises: AssertionError: if node_id is not in current cluster metadata @@ -536,8 +615,9 @@ def send(self, node_id, request, wakeup=True): # conn.send will queue the request internally # we will need to call send_pending_requests() # to trigger network I/O - future = conn.send(request, blocking=False) - self._sending.add(conn) + future = conn.send(request, blocking=False, request_timeout_ms=request_timeout_ms) + if not future.is_done: + self._sending.add(conn) # Wakeup signal is useful in case another thread is # blocked waiting for incoming network traffic while holding @@ -563,12 +643,9 @@ def poll(self, timeout_ms=None, future=None): Returns: list: responses received (can be empty) """ - if future is not None: - timeout_ms = 100 - elif timeout_ms is None: - timeout_ms = self.config['request_timeout_ms'] - elif not isinstance(timeout_ms, (int, float)): + if not isinstance(timeout_ms, (int, float, type(None))): raise TypeError('Invalid type for timeout: %s' % type(timeout_ms)) + timer = Timer(timeout_ms) # Loop for futures, break after first loop if None responses = [] @@ -579,24 +656,30 @@ def poll(self, timeout_ms=None, future=None): # Attempt to complete pending connections for node_id in list(self._connecting): - self._maybe_connect(node_id) - - # Send a metadata request if needed + # False return means no more connection progress is possible + # Connected nodes will update _connecting via state_change callback + if not self._init_connect(node_id): + # It's possible that the connection attempt triggered a state change + # but if not, make sure to remove from _connecting list + if node_id in self._connecting: + self._connecting.remove(node_id) + + # Send a metadata request if needed (or initiate new connection) metadata_timeout_ms = self._maybe_refresh_metadata() # If we got a future that is already done, don't block in _poll if future is not None and future.is_done: timeout = 0 else: + user_timeout_ms = timer.timeout_ms if timeout_ms is not None else self.config['request_timeout_ms'] idle_connection_timeout_ms = self._idle_expiry_manager.next_check_ms() + request_timeout_ms = self._next_ifr_request_timeout_ms() + log.debug("Timeouts: user %f, metadata %f, idle connection %f, request %f", user_timeout_ms, metadata_timeout_ms, idle_connection_timeout_ms, request_timeout_ms) timeout = min( - timeout_ms, + user_timeout_ms, metadata_timeout_ms, idle_connection_timeout_ms, - self.config['request_timeout_ms']) - # if there are no requests in flight, do not block longer than the retry backoff - if self.in_flight_request_count() == 0: - timeout = min(timeout, self.config['retry_backoff_ms']) + request_timeout_ms) timeout = max(0, timeout) # avoid negative timeouts self._poll(timeout / 1000) @@ -607,7 +690,11 @@ def poll(self, timeout_ms=None, future=None): # If all we had was a timeout (future is None) - only do one poll # If we do have a future, we keep looping until it is done - if future is None or future.is_done: + if future is None: + break + elif future.is_done: + break + elif timeout_ms is not None and timer.expired: break return responses @@ -615,6 +702,8 @@ def poll(self, timeout_ms=None, future=None): def _register_send_sockets(self): while self._sending: conn = self._sending.pop() + if conn._sock is None: + continue try: key = self._selector.get_key(conn._sock) events = key.events | selectors.EVENT_WRITE @@ -623,6 +712,11 @@ def _register_send_sockets(self): self._selector.register(conn._sock, selectors.EVENT_WRITE, conn) def 
_poll(self, timeout): + # Python throws OverflowError if timeout is > 2147483647 milliseconds + # (though the param to selector.select is in seconds) + # so convert any too-large timeout to blocking + if timeout > 2147483: + timeout = None # This needs to be locked, but since it is only called from within the # locked section of poll(), there is no additional lock acquisition here processed = set() @@ -695,11 +789,13 @@ def _poll(self, timeout): for conn in six.itervalues(self._conns): if conn.requests_timed_out(): + timed_out = conn.timed_out_ifrs() + timeout_ms = (timed_out[0][2] - timed_out[0][1]) * 1000 log.warning('%s timed out after %s ms. Closing connection.', - conn, conn.config['request_timeout_ms']) + conn, timeout_ms) conn.close(error=Errors.RequestTimedOutError( 'Request timed out after %s ms' % - conn.config['request_timeout_ms'])) + timeout_ms)) if self._sensors: self._sensors.io_time.record((time.time() - end_select) * 1000000000) @@ -737,16 +833,17 @@ def _fire_pending_completed_requests(self): break future.success(response) responses.append(response) + return responses def least_loaded_node(self): """Choose the node with fewest outstanding requests, with fallbacks. - This method will prefer a node with an existing connection and no - in-flight-requests. If no such node is found, a node will be chosen - randomly from disconnected nodes that are not "blacked out" (i.e., + This method will prefer a node with an existing connection (not throttled) + with no in-flight-requests. If no such node is found, a node will be chosen + randomly from all nodes that are not throttled or "blacked out" (i.e., are not subject to a reconnect backoff). If no node metadata has been - obtained, will return a bootstrap node (subject to exponential backoff). + obtained, will return a bootstrap node. Returns: node_id or None if no suitable node was found @@ -758,11 +855,11 @@ def least_loaded_node(self): found = None for node_id in nodes: conn = self._conns.get(node_id) - connected = conn is not None and conn.connected() - blacked_out = conn is not None and conn.blacked_out() + connected = conn is not None and conn.connected() and conn.can_send_more() + blacked_out = conn is not None and (conn.blacked_out() or conn.throttled()) curr_inflight = len(conn.in_flight_requests) if conn is not None else 0 if connected and curr_inflight == 0: - # if we find an established connection + # if we find an established connection (not throttled) # with no in-flight requests, we can stop right away return node_id elif not blacked_out and curr_inflight < inflight: @@ -772,6 +869,24 @@ def least_loaded_node(self): return found + def _refresh_delay_ms(self, node_id): + conn = self._conns.get(node_id) + if conn is not None and conn.connected(): + return self.throttle_delay(node_id) + else: + return self.connection_delay(node_id) + + def least_loaded_node_refresh_ms(self): + """Return connection or throttle delay in milliseconds for next available node. + + This method is used primarily for retry/backoff during metadata refresh + during / after a cluster outage, in which there are no available nodes. + + Returns: + float: delay_ms + """ + return min([self._refresh_delay_ms(broker.nodeId) for broker in self.cluster.brokers()]) + def set_topics(self, topics): """Set specific topics to track for metadata. 
@@ -796,19 +911,31 @@ def add_topic(self, topic): Returns: Future: resolves after metadata request/response + + Raises: + TypeError: if topic is not a string + ValueError: if topic is invalid: must be chars (a-zA-Z0-9._-), and less than 250 length """ + ensure_valid_topic_name(topic) + if topic in self._topics: return Future().success(set(self._topics)) self._topics.add(topic) return self.cluster.request_update() + def _next_ifr_request_timeout_ms(self): + if self._conns: + return min([conn.next_ifr_request_timeout_ms() for conn in six.itervalues(self._conns)]) + else: + return float('inf') + # This method should be locked when running multi-threaded def _maybe_refresh_metadata(self, wakeup=False): """Send a metadata request if needed. Returns: - int: milliseconds until next refresh + float: milliseconds until next refresh """ ttl = self.cluster.ttl() wait_for_in_progress_ms = self.config['request_timeout_ms'] if self._metadata_refresh_in_progress else 0 @@ -822,18 +949,44 @@ def _maybe_refresh_metadata(self, wakeup=False): # least_loaded_node() node_id = self.least_loaded_node() if node_id is None: - log.debug("Give up sending metadata request since no node is available"); - return self.config['reconnect_backoff_ms'] + next_connect_ms = self.least_loaded_node_refresh_ms() + log.debug("Give up sending metadata request since no node is available. (reconnect delay %d ms)", next_connect_ms) + return next_connect_ms + if not self._can_send_request(node_id): + # If there's any connection establishment underway, wait until it completes. This prevents + # the client from unnecessarily connecting to additional nodes while a previous connection + # attempt has not been completed. + if self._connecting: + return float('inf') + + elif self._can_connect(node_id): + log.debug("Initializing connection to node %s for metadata request", node_id) + self._connecting.add(node_id) + if not self._init_connect(node_id): + if node_id in self._connecting: + self._connecting.remove(node_id) + # Connection attempt failed immediately, need to retry with a different node + return self.config['reconnect_backoff_ms'] + else: + # Existing connection throttled or max in flight requests. + return self.throttle_delay(node_id) or self.config['request_timeout_ms'] + + # Recheck node_id in case we were able to connect immediately above if self._can_send_request(node_id): topics = list(self._topics) if not topics and self.cluster.is_bootstrap(node_id): topics = list(self.config['bootstrap_topics_filter']) - if self.cluster.need_all_topic_metadata or not topics: - topics = [] if self.config['api_version'] < (0, 10) else None - api_version = 0 if self.config['api_version'] < (0, 10) else 1 - request = MetadataRequest[api_version](topics) + api_version = self.api_version(MetadataRequest, max_version=7) + if self.cluster.need_all_topic_metadata: + topics = MetadataRequest[api_version].ALL_TOPICS + elif not topics: + topics = MetadataRequest[api_version].NO_TOPICS + if api_version >= 4: + request = MetadataRequest[api_version](topics, self.config['allow_auto_create_topics']) + else: + request = MetadataRequest[api_version](topics) log.debug("Sending metadata request %s to node %s", request, node_id) future = self.send(node_id, request, wakeup=wakeup) future.add_callback(self.cluster.update_metadata) @@ -846,103 +999,146 @@ def refresh_done(val_or_error): future.add_errback(refresh_done) return self.config['request_timeout_ms'] - # If there's any connection establishment underway, wait until it completes. 
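# Standalone sketch of the metadata request construction above: v4+ requests
# carry the allow_auto_create_topics flag, and the ALL_TOPICS / NO_TOPICS
# sentinels replace the older "None vs []" convention.
from kafka.protocol.metadata import MetadataRequest

def build_metadata_request(api_version, topics, allow_auto_create_topics=True):
    if topics is None:
        topics = MetadataRequest[api_version].ALL_TOPICS
    elif not topics:
        topics = MetadataRequest[api_version].NO_TOPICS
    if api_version >= 4:
        return MetadataRequest[api_version](topics, allow_auto_create_topics)
    return MetadataRequest[api_version](topics)

print(build_metadata_request(7, None))           # all topics, auto-create allowed
print(build_metadata_request(1, ['my-topic']))   # explicit topic list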
This prevents - # the client from unnecessarily connecting to additional nodes while a previous connection - # attempt has not been completed. + # Should only get here if still connecting if self._connecting: + return float('inf') + else: return self.config['reconnect_backoff_ms'] - if self.maybe_connect(node_id, wakeup=wakeup): - log.debug("Initializing connection to node %s for metadata request", node_id) - return self.config['reconnect_backoff_ms'] - - # connected but can't send more, OR connecting - # In either case we just need to wait for a network event - # to let us know the selected connection might be usable again. - return float('inf') - def get_api_versions(self): """Return the ApiVersions map, if available. - Note: A call to check_version must previously have succeeded and returned - version 0.10.0 or later + Note: Only available after bootstrap; requires broker version 0.10.0 or later. Returns: a map of dict mapping {api_key : (min_version, max_version)}, or None if ApiVersion is not supported by the kafka cluster. """ return self._api_versions - def check_version(self, node_id=None, timeout=2, strict=False): + def check_version(self, node_id=None, timeout=None, **kwargs): """Attempt to guess the version of a Kafka broker. - Note: It is possible that this method blocks longer than the - specified timeout. This can happen if the entire cluster - is down and the client enters a bootstrap backoff sleep. - This is only possible if node_id is None. + Keyword Arguments: + node_id (str, optional): Broker node id from cluster metadata. If None, attempts + to connect to any available broker until version is identified. + Default: None + timeout (num, optional): Maximum time in seconds to try to check broker version. + If unable to identify version before timeout, raise error (see below). + Default: api_version_auto_timeout_ms / 1000 - Returns: version tuple, i.e. (0, 10), (0, 9), (0, 8, 2), ... + Returns: version tuple, i.e. (3, 9), (2, 0), (0, 10, 2) etc Raises: NodeNotReadyError (if node_id is provided) NoBrokersAvailable (if node_id is None) - UnrecognizedBrokerVersion: please file bug if seen! - AssertionError (if strict=True): please file bug if seen! 
""" - self._lock.acquire() - end = time.time() + timeout - while time.time() < end: - - # It is possible that least_loaded_node falls back to bootstrap, - # which can block for an increasing backoff period - try_node = node_id or self.least_loaded_node() - if try_node is None: - self._lock.release() - raise Errors.NoBrokersAvailable() - self._maybe_connect(try_node) - conn = self._conns[try_node] - - # We will intentionally cause socket failures - # These should not trigger metadata refresh - self._refresh_on_disconnects = False - try: - remaining = end - time.time() - version = conn.check_version(timeout=remaining, strict=strict, topics=list(self.config['bootstrap_topics_filter'])) - if version >= (0, 10, 0): - # cache the api versions map if it's available (starting - # in 0.10 cluster version) - self._api_versions = conn.get_api_versions() - self._lock.release() - return version - except Errors.NodeNotReadyError: - # Only raise to user if this is a node-specific request + timeout = timeout or (self.config['api_version_auto_timeout_ms'] / 1000) + with self._lock: + end = time.time() + timeout + while time.time() < end: + time_remaining = max(end - time.time(), 0) + if node_id is not None and self.connection_delay(node_id) > 0: + sleep_time = min(time_remaining, self.connection_delay(node_id) / 1000.0) + if sleep_time > 0: + time.sleep(sleep_time) + continue + try_node = node_id or self.least_loaded_node() + if try_node is None: + sleep_time = min(time_remaining, self.least_loaded_node_refresh_ms() / 1000.0) + if sleep_time > 0: + log.warning('No node available during check_version; sleeping %.2f secs', sleep_time) + time.sleep(sleep_time) + continue + log.debug('Attempting to check version with node %s', try_node) + if not self._init_connect(try_node): + if try_node == node_id: + raise Errors.NodeNotReadyError("Connection failed to %s" % node_id) + else: + continue + conn = self._conns[try_node] + + while conn.connecting() and time.time() < end: + timeout_ms = min((end - time.time()) * 1000, 200) + self.poll(timeout_ms=timeout_ms) + + if conn._api_version is not None: + return conn._api_version + else: + log.debug('Failed to identify api_version after connection attempt to %s', conn) + + # Timeout + else: if node_id is not None: - self._lock.release() - raise - finally: - self._refresh_on_disconnects = True + raise Errors.NodeNotReadyError(node_id) + else: + raise Errors.NoBrokersAvailable() - # Timeout - else: - self._lock.release() - raise Errors.NoBrokersAvailable() + def api_version(self, operation, max_version=None): + """Find the latest version of the protocol operation supported by both + this library and the broker. + + This resolves to the lesser of either the latest api version this + library supports, or the max version supported by the broker. + + Arguments: + operation: A list of protocol operation versions from kafka.protocol. + + Keyword Arguments: + max_version (int, optional): Provide an alternate maximum api version + to reflect limitations in user code. + + Returns: + int: The highest api version number compatible between client and broker. 
+ + Raises: IncompatibleBrokerVersion if no matching version is found + """ + # Cap max_version at the largest available version in operation list + max_version = min(len(operation) - 1, max_version if max_version is not None else float('inf')) + broker_api_versions = self._api_versions + api_key = operation[0].API_KEY + if broker_api_versions is None or api_key not in broker_api_versions: + raise Errors.IncompatibleBrokerVersion( + "Kafka broker does not support the '{}' Kafka protocol." + .format(operation[0].__name__)) + broker_min_version, broker_max_version = broker_api_versions[api_key] + version = min(max_version, broker_max_version) + if version < broker_min_version: + # max library version is less than min broker version. Currently, + # no Kafka versions specify a min msg version. Maybe in the future? + raise Errors.IncompatibleBrokerVersion( + "No version of the '{}' Kafka protocol is supported by both the client and broker." + .format(operation[0].__name__)) + return version def wakeup(self): + if self._closed or self._waking or self._wake_w is None: + return with self._wake_lock: try: self._wake_w.sendall(b'x') - except socket.timeout: + self._waking = True + except socket.timeout as e: log.warning('Timeout to send to wakeup socket!') - raise Errors.KafkaTimeoutError() - except socket.error: - log.warning('Unable to send to wakeup socket!') + raise Errors.KafkaTimeoutError(e) + except socket.error as e: + log.warning('Unable to send to wakeup socket! %s', e) + raise e def _clear_wake_fd(self): # reading from wake socket should only happen in a single thread - while True: - try: - self._wake_r.recv(1024) - except socket.error: - break + with self._wake_lock: + self._waking = False + while True: + try: + if not self._wake_r.recv(1024): + # Non-blocking socket returns empty on error + log.warning("Error reading wakeup socket. Rebuilding socketpair.") + self._close_wakeup_socketpair() + self._init_wakeup_socketpair() + break + except socket.error: + # Non-blocking socket raises when socket is ok but no data available to read + break def _maybe_close_oldest_connection(self): expired_connection = self._idle_expiry_manager.poll_expired_connection() @@ -962,6 +1158,39 @@ def bootstrap_connected(self): else: return False + def await_ready(self, node_id, timeout_ms=30000): + """ + Invokes `poll` to discard pending disconnects, followed by `client.ready` and 0 or more `client.poll` + invocations until the connection to `node` is ready, the timeoutMs expires or the connection fails. + + It returns `true` if the call completes normally or `false` if the timeoutMs expires. If the connection fails, + an `IOException` is thrown instead. Note that if the `NetworkClient` has been configured with a positive + connection timeoutMs, it is possible for this method to raise an `IOException` for a previous connection which + has recently disconnected. + + This method is useful for implementing blocking behaviour on top of the non-blocking `NetworkClient`, use it with + care. + """ + timer = Timer(timeout_ms) + self.poll(timeout_ms=0) + if self.is_ready(node_id): + return True + + while not self.is_ready(node_id) and not timer.expired: + if self.connection_failed(node_id): + raise Errors.KafkaConnectionError("Connection to %s failed." 
% (node_id,)) + self.maybe_connect(node_id) + self.poll(timeout_ms=timer.timeout_ms) + return self.is_ready(node_id) + + def send_and_receive(self, node_id, request): + future = self.send(node_id, request) + self.poll(future=future) + assert future.is_done + if future.failed(): + raise future.exception + return future.value + # OrderedDict requires python2.7+ try: @@ -998,7 +1227,7 @@ def is_expired(self, conn_id): def next_check_ms(self): now = time.time() - if not self.lru_connections: + if not self.lru_connections or self.next_idle_close_check_time == float('inf'): return float('inf') elif self.next_idle_close_check_time <= now: return 0 diff --git a/kafka/cluster.py b/kafka/cluster.py index 438baf29d..d6ec82dba 100644 --- a/kafka/cluster.py +++ b/kafka/cluster.py @@ -3,13 +3,15 @@ import collections import copy import logging +import random +import re import threading import time from kafka.vendor import six from kafka import errors as Errors -from kafka.conn import collect_hosts +from kafka.conn import get_ip_port_afi from kafka.future import Future from kafka.structs import BrokerMetadata, PartitionMetadata, TopicPartition @@ -21,7 +23,7 @@ class ClusterMetadata(object): A class to manage kafka cluster metadata. This class does not perform any IO. It simply updates internal state - given API responses (MetadataResponse, GroupCoordinatorResponse). + given API responses (MetadataResponse, FindCoordinatorResponse). Keyword Arguments: retry_backoff_ms (int): Milliseconds to backoff when retrying on @@ -47,7 +49,7 @@ def __init__(self, **configs): self._brokers = {} # node_id -> BrokerMetadata self._partitions = {} # topic -> partition -> PartitionMetadata self._broker_partitions = collections.defaultdict(set) # node_id -> {TopicPartition...} - self._groups = {} # group_name -> node_id + self._coordinators = {} # (coord_type, coord_key) -> node_id self._last_refresh_ms = 0 self._last_successful_refresh_ms = 0 self._need_update = True @@ -58,6 +60,7 @@ def __init__(self, **configs): self.unauthorized_topics = set() self.internal_topics = set() self.controller = None + self.cluster_id = None self.config = copy.copy(self.DEFAULT_CONFIG) for key in self.config: @@ -92,7 +95,7 @@ def broker_metadata(self, broker_id): """Get BrokerMetadata Arguments: - broker_id (int): node_id for a broker to check + broker_id (int or str): node_id for a broker to check Returns: BrokerMetadata or None if not found @@ -111,6 +114,7 @@ def partitions_for_topic(self, topic): Returns: set: {partition (int), ...} + None if topic not found. """ if topic not in self._partitions: return None @@ -140,11 +144,14 @@ def leader_for_partition(self, partition): return None return self._partitions[partition.topic][partition.partition].leader + def leader_epoch_for_partition(self, partition): + return self._partitions[partition.topic][partition.partition].leader_epoch + def partitions_for_broker(self, broker_id): """Return TopicPartitions for which the broker is a leader. Arguments: - broker_id (int): node id for a broker + broker_id (int or str): node id for a broker Returns: set: {TopicPartition, ...} @@ -159,10 +166,10 @@ def coordinator_for_group(self, group): group (str): name of consumer group Returns: - int: node_id for group coordinator + node_id (int or str) for group coordinator, -1 if coordinator unknown None if the group does not exist. 
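A blocking request/response sketch built on the await_ready and send_and_receive helpers added to KafkaClient above; the broker address is a placeholder and a reachable broker is assumed:

from kafka import KafkaClient
from kafka.protocol.metadata import MetadataRequest

client = KafkaClient(bootstrap_servers='localhost:9092')
node_id = client.least_loaded_node()
if node_id is not None and client.await_ready(node_id, timeout_ms=10000):
    response = client.send_and_receive(node_id, MetadataRequest[1]([]))
    print(response)
client.close()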
""" - return self._groups.get(group) + return self._coordinators.get(('group', group)) def ttl(self): """Milliseconds until metadata should be refreshed""" @@ -197,6 +204,10 @@ def request_update(self): self._future = Future() return self._future + @property + def need_update(self): + return self._need_update + def topics(self, exclude_internal_topics=True): """Get set of known topics. @@ -236,7 +247,7 @@ def update_metadata(self, metadata): """ # In the common case where we ask for a single topic and get back an # error, we should fail the future - if len(metadata.topics) == 1 and metadata.topics[0][0] != 0: + if len(metadata.topics) == 1 and metadata.topics[0][0] != Errors.NoError.errno: error_code, topic = metadata.topics[0][:2] error = Errors.for_code(error_code)(topic) return self.failed_update(error) @@ -261,6 +272,11 @@ def update_metadata(self, metadata): else: _new_controller = _new_brokers.get(metadata.controller_id) + if metadata.API_VERSION < 2: + _new_cluster_id = None + else: + _new_cluster_id = metadata.cluster_id + _new_partitions = {} _new_broker_partitions = collections.defaultdict(set) _new_unauthorized_topics = set() @@ -277,10 +293,21 @@ def update_metadata(self, metadata): error_type = Errors.for_code(error_code) if error_type is Errors.NoError: _new_partitions[topic] = {} - for p_error, partition, leader, replicas, isr in partitions: + for partition_data in partitions: + leader_epoch = -1 + offline_replicas = [] + if metadata.API_VERSION >= 7: + p_error, partition, leader, leader_epoch, replicas, isr, offline_replicas = partition_data + elif metadata.API_VERSION >= 5: + p_error, partition, leader, replicas, isr, offline_replicas = partition_data + else: + p_error, partition, leader, replicas, isr = partition_data + _new_partitions[topic][partition] = PartitionMetadata( - topic=topic, partition=partition, leader=leader, - replicas=replicas, isr=isr, error=p_error) + topic=topic, partition=partition, + leader=leader, leader_epoch=leader_epoch, + replicas=replicas, isr=isr, offline_replicas=offline_replicas, + error=p_error) if leader != -1: _new_broker_partitions[leader].add( TopicPartition(topic, partition)) @@ -306,6 +333,7 @@ def update_metadata(self, metadata): with self._lock: self._brokers = _new_brokers self.controller = _new_controller + self.cluster_id = _new_cluster_id self._partitions = _new_partitions self._broker_partitions = _new_broker_partitions self.unauthorized_topics = _new_unauthorized_topics @@ -342,24 +370,25 @@ def remove_listener(self, listener): """Remove a previously added listener callback""" self._listeners.remove(listener) - def add_group_coordinator(self, group, response): - """Update with metadata for a group coordinator + def add_coordinator(self, response, coord_type, coord_key): + """Update with metadata for a group or txn coordinator Arguments: - group (str): name of group from GroupCoordinatorRequest - response (GroupCoordinatorResponse): broker response + response (FindCoordinatorResponse): broker response + coord_type (str): 'group' or 'transaction' + coord_key (str): consumer_group or transactional_id Returns: string: coordinator node_id if metadata is updated, None on error """ - log.debug("Updating coordinator for %s: %s", group, response) + log.debug("Updating coordinator for %s/%s: %s", coord_type, coord_key, response) error_type = Errors.for_code(response.error_code) if error_type is not Errors.NoError: - log.error("GroupCoordinatorResponse error: %s", error_type) - self._groups[group] = -1 + 
log.error("FindCoordinatorResponse error: %s", error_type) + self._coordinators[(coord_type, coord_key)] = -1 return - # Use a coordinator-specific node id so that group requests + # Use a coordinator-specific node id so that requests # get a dedicated connection node_id = 'coordinator-{}'.format(response.coordinator_id) coordinator = BrokerMetadata( @@ -368,9 +397,9 @@ def add_group_coordinator(self, group, response): response.port, None) - log.info("Group coordinator for %s is %s", group, coordinator) + log.info("Coordinator for %s/%s is %s", coord_type, coord_key, coordinator) self._coordinator_brokers[node_id] = coordinator - self._groups[group] = node_id + self._coordinators[(coord_type, coord_key)] = node_id return node_id def with_partitions(self, partitions_to_add): @@ -379,7 +408,7 @@ def with_partitions(self, partitions_to_add): new_metadata._brokers = copy.deepcopy(self._brokers) new_metadata._partitions = copy.deepcopy(self._partitions) new_metadata._broker_partitions = copy.deepcopy(self._broker_partitions) - new_metadata._groups = copy.deepcopy(self._groups) + new_metadata._coordinators = copy.deepcopy(self._coordinators) new_metadata.internal_topics = copy.deepcopy(self.internal_topics) new_metadata.unauthorized_topics = copy.deepcopy(self.unauthorized_topics) @@ -393,5 +422,26 @@ def with_partitions(self, partitions_to_add): return new_metadata def __str__(self): - return 'ClusterMetadata(brokers: %d, topics: %d, groups: %d)' % \ - (len(self._brokers), len(self._partitions), len(self._groups)) + return 'ClusterMetadata(brokers: %d, topics: %d, coordinators: %d)' % \ + (len(self._brokers), len(self._partitions), len(self._coordinators)) + + +def collect_hosts(hosts, randomize=True): + """ + Collects a comma-separated set of hosts (host:port) and optionally + randomize the returned list. + """ + + if isinstance(hosts, six.string_types): + hosts = hosts.strip().split(',') + + result = [] + for host_port in hosts: + # ignore leading SECURITY_PROTOCOL:// to mimic java client + host_port = re.sub('^.*://', '', host_port) + host, port, afi = get_ip_port_afi(host_port) + result.append((host, port, afi)) + + if randomize: + random.shuffle(result) + return result diff --git a/kafka/codec.py b/kafka/codec.py index 917400e74..b73df060d 100644 --- a/kafka/codec.py +++ b/kafka/codec.py @@ -187,14 +187,21 @@ def _detect_xerial_stream(payload): The version is the version of this format as written by xerial, in the wild this is currently 1 as such we only support v1. - Compat is there to claim the miniumum supported version that + Compat is there to claim the minimum supported version that can read a xerial block stream, presently in the wild this is 1. """ if len(payload) > 16: - header = struct.unpack('!' + _XERIAL_V1_FORMAT, bytes(payload)[:16]) - return header == _XERIAL_V1_HEADER + magic = struct.unpack('!' + _XERIAL_V1_FORMAT[:8], bytes(payload)[:8]) + version, compat = struct.unpack('!' + _XERIAL_V1_FORMAT[8:], bytes(payload)[8:16]) + # Until there is more than one way to do xerial blocking, the version + compat + # fields can be ignored. Also some producers (i.e., redpanda) are known to + # incorrectly encode these as little-endian, and that causes us to fail decoding + # when we otherwise would have succeeded. 
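A round-trip sketch of the xerial-framed snappy helpers in kafka.codec that the detection above supports; it is guarded with has_snappy() because the python-snappy dependency is optional:

from kafka.codec import has_snappy, snappy_encode, snappy_decode

if has_snappy():
    payload = b'kafka-python' * 512
    # Produce an xerial block stream, then verify it decodes back to the original bytes
    framed = snappy_encode(payload, xerial_compatible=True, xerial_blocksize=32 * 1024)
    assert snappy_decode(framed) == payload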
+ # See https://github.com/dpkp/kafka-python/issues/2414 + if magic == _XERIAL_V1_HEADER[:8]: + return True return False diff --git a/kafka/conn.py b/kafka/conn.py index cac354875..c9cdd595f 100644 --- a/kafka/conn.py +++ b/kafka/conn.py @@ -4,7 +4,7 @@ import errno import io import logging -from random import shuffle, uniform +from random import uniform # selectors in stdlib as of py3.4 try: @@ -14,7 +14,6 @@ from kafka.vendor import selectors34 as selectors import socket -import struct import threading import time @@ -23,16 +22,21 @@ import kafka.errors as Errors from kafka.future import Future from kafka.metrics.stats import Avg, Count, Max, Rate -from kafka.oauth.abstract import AbstractTokenProvider -from kafka.protocol.admin import SaslHandShakeRequest, DescribeAclsRequest_v2, DescribeClientQuotasRequest +from kafka.protocol.admin import DescribeAclsRequest, DescribeClientQuotasRequest, ListGroupsRequest +from kafka.protocol.api_versions import ApiVersionsRequest +from kafka.protocol.broker_api_versions import BROKER_API_VERSIONS from kafka.protocol.commit import OffsetFetchRequest -from kafka.protocol.offset import OffsetRequest -from kafka.protocol.produce import ProduceRequest -from kafka.protocol.metadata import MetadataRequest from kafka.protocol.fetch import FetchRequest +from kafka.protocol.find_coordinator import FindCoordinatorRequest +from kafka.protocol.list_offsets import ListOffsetsRequest +from kafka.protocol.metadata import MetadataRequest from kafka.protocol.parser import KafkaProtocol -from kafka.protocol.types import Int32, Int8 -from kafka.scram import ScramClient +from kafka.protocol.produce import ProduceRequest +from kafka.protocol.sasl_authenticate import SaslAuthenticateRequest +from kafka.protocol.sasl_handshake import SaslHandshakeRequest +from kafka.protocol.types import Int32 +from kafka.sasl import get_sasl_mechanism +from kafka.socks5_wrapper import Socks5Wrapper from kafka.version import __version__ @@ -45,10 +49,6 @@ DEFAULT_KAFKA_PORT = 9092 -SASL_QOP_AUTH = 1 -SASL_QOP_AUTH_INT = 2 -SASL_QOP_AUTH_CONF = 4 - try: import ssl ssl_available = True @@ -74,15 +74,6 @@ class SSLWantReadError(Exception): class SSLWantWriteError(Exception): pass -# needed for SASL_GSSAPI authentication: -try: - import gssapi - from gssapi.raw.misc import GSSError -except ImportError: - #no gssapi available, will disable gssapi mechanism - gssapi = None - GSSError = None - AFI_NAMES = { socket.AF_UNSPEC: "unspecified", @@ -92,12 +83,13 @@ class SSLWantWriteError(Exception): class ConnectionStates(object): - DISCONNECTING = '' DISCONNECTED = '' CONNECTING = '' HANDSHAKE = '' CONNECTED = '' AUTHENTICATING = '' + API_VERSIONS_SEND = '' + API_VERSIONS_RECV = '' class BrokerConnection(object): @@ -109,6 +101,10 @@ class BrokerConnection(object): server-side log entries that correspond to this client. Also submitted to GroupCoordinator for logging with respect to consumer group administration. Default: 'kafka-python-{version}' + client_software_name (str): Sent to kafka broker for KIP-511. + Default: 'kafka-python' + client_software_version (str): Sent to kafka broker for KIP-511. + Default: The kafka-python version (via kafka.version). reconnect_backoff_ms (int): The amount of time in milliseconds to wait before attempting to reconnect to a given host. Default: 50. @@ -120,7 +116,7 @@ class BrokerConnection(object): reconnection attempts will continue periodically with this fixed rate. 
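These backoff settings are normally supplied via the high-level clients and passed down to each BrokerConnection; a configuration sketch with illustrative values (the address, and a reachable broker, are assumptions):

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    bootstrap_servers='localhost:9092',
    reconnect_backoff_ms=50,          # initial delay between reconnect attempts
    reconnect_backoff_max_ms=30000,   # cap on the exponential backoff (the new default)
    request_timeout_ms=30000,
)
consumer.close()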
To avoid connection storms, a randomization factor of 0.2 will be applied to the backoff resulting in a random range between - 20% below and 20% above the computed value. Default: 1000. + 20% below and 20% above the computed value. Default: 30000. request_timeout_ms (int): Client request timeout in milliseconds. Default: 30000. max_in_flight_requests_per_connection (int): Requests are pipelined @@ -165,11 +161,11 @@ class BrokerConnection(object): or other configuration forbids use of all the specified ciphers), an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers api_version (tuple): Specify which Kafka API version to use. - Accepted values are: (0, 8, 0), (0, 8, 1), (0, 8, 2), (0, 9), - (0, 10). Default: (0, 8, 2) + Must be None or >= (0, 10, 0) to enable SASL authentication. + Default: None api_version_auto_timeout_ms (int): number of milliseconds to throw a timeout exception from the constructor when checking the broker - api version. Only applies if api_version is None + api version. Only applies if api_version is None. Default: 2000. selector (selectors.BaseSelector): Provide a specific selector implementation to use for I/O multiplexing. Default: selectors.DefaultSelector @@ -185,20 +181,26 @@ class BrokerConnection(object): Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication. Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. + sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with + sasl mechanism handshake. If provided, sasl_kerberos_service_name and + sasl_kerberos_domain name are ignored. Default: None. sasl_kerberos_service_name (str): Service name to include in GSSAPI sasl mechanism handshake. Default: 'kafka' sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI sasl mechanism handshake. Default: one of bootstrap servers - sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider - instance. (See kafka.oauth.abstract). Default: None + sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer + token provider instance. Default: None + socks5_proxy (str): Socks5 proxy url. 
Default: None """ DEFAULT_CONFIG = { 'client_id': 'kafka-python-' + __version__, + 'client_software_name': 'kafka-python', + 'client_software_version': __version__, 'node_id': 0, 'request_timeout_ms': 30000, 'reconnect_backoff_ms': 50, - 'reconnect_backoff_max_ms': 1000, + 'reconnect_backoff_max_ms': 30000, 'max_in_flight_requests_per_connection': 5, 'receive_buffer_bytes': None, 'send_buffer_bytes': None, @@ -214,7 +216,8 @@ class BrokerConnection(object): 'ssl_crlfile': None, 'ssl_password': None, 'ssl_ciphers': None, - 'api_version': (0, 8, 2), # default to most restrictive + 'api_version': None, + 'api_version_auto_timeout_ms': 2000, 'selector': selectors.DefaultSelector, 'state_change_callback': lambda node_id, sock, conn: True, 'metrics': None, @@ -222,12 +225,19 @@ class BrokerConnection(object): 'sasl_mechanism': None, 'sasl_plain_username': None, 'sasl_plain_password': None, + 'sasl_kerberos_name': None, 'sasl_kerberos_service_name': 'kafka', 'sasl_kerberos_domain_name': None, - 'sasl_oauth_token_provider': None + 'sasl_oauth_token_provider': None, + 'socks5_proxy': None, } SECURITY_PROTOCOLS = ('PLAINTEXT', 'SSL', 'SASL_PLAINTEXT', 'SASL_SSL') - SASL_MECHANISMS = ('PLAIN', 'GSSAPI', 'OAUTHBEARER', "SCRAM-SHA-256", "SCRAM-SHA-512") + VERSION_CHECKS = ( + ((0, 9), ListGroupsRequest[0]()), + ((0, 8, 2), FindCoordinatorRequest[0]('kafka-python-default-group')), + ((0, 8, 1), OffsetFetchRequest[0]('kafka-python-default-group', [])), + ((0, 8, 0), MetadataRequest[0]([])), + ) def __init__(self, host, port, afi, **configs): self.host = host @@ -236,6 +246,11 @@ def __init__(self, host, port, afi, **configs): self._sock_afi = afi self._sock_addr = None self._api_versions = None + self._api_version = None + self._check_version_idx = None + self._api_versions_idx = 4 # version of ApiVersionsRequest to try on first connect + self._throttle_time = None + self._socks5_proxy = None self.config = copy.copy(self.DEFAULT_CONFIG) for key in self.config: @@ -259,23 +274,8 @@ def __init__(self, host, port, afi, **configs): if self.config['security_protocol'] in ('SSL', 'SASL_SSL'): assert ssl_available, "Python wasn't built with SSL support" - if self.config['security_protocol'] in ('SASL_PLAINTEXT', 'SASL_SSL'): - assert self.config['sasl_mechanism'] in self.SASL_MECHANISMS, ( - 'sasl_mechanism must be in ' + ', '.join(self.SASL_MECHANISMS)) - if self.config['sasl_mechanism'] in ('PLAIN', 'SCRAM-SHA-256', 'SCRAM-SHA-512'): - assert self.config['sasl_plain_username'] is not None, ( - 'sasl_plain_username required for PLAIN or SCRAM sasl' - ) - assert self.config['sasl_plain_password'] is not None, ( - 'sasl_plain_password required for PLAIN or SCRAM sasl' - ) - if self.config['sasl_mechanism'] == 'GSSAPI': - assert gssapi is not None, 'GSSAPI lib not available' - assert self.config['sasl_kerberos_service_name'] is not None, 'sasl_kerberos_service_name required for GSSAPI sasl' - if self.config['sasl_mechanism'] == 'OAUTHBEARER': - token_provider = self.config['sasl_oauth_token_provider'] - assert token_provider is not None, 'sasl_oauth_token_provider required for OAUTHBEARER sasl' - assert callable(getattr(token_provider, "token", None)), 'sasl_oauth_token_provider must implement method #token()' + self._init_sasl_mechanism() + # This is not a general lock / this class is not generally thread-safe yet # However, to avoid pushing responsibility for maintaining # per-connection locks to the upstream client, we will use this lock to @@ -300,6 +300,8 @@ def __init__(self, host, port, afi, **configs): 
self._ssl_context = None if self.config['ssl_context'] is not None: self._ssl_context = self.config['ssl_context'] + self._api_versions_future = None + self._api_versions_check_timeout = self.config['api_version_auto_timeout_ms'] self._sasl_auth_future = None self.last_attempt = 0 self._gai = [] @@ -309,11 +311,17 @@ def __init__(self, host, port, afi, **configs): self.config['metric_group_prefix'], self.node_id) + def _init_sasl_mechanism(self): + if self.config['security_protocol'] in ('SASL_PLAINTEXT', 'SASL_SSL'): + self._sasl_mechanism = get_sasl_mechanism(self.config['sasl_mechanism'])(**self.config) + else: + self._sasl_mechanism = None + def _dns_lookup(self): self._gai = dns_lookup(self.host, self.port, self.afi) if not self._gai: - log.error('DNS lookup failed for %s:%i (%s)', - self.host, self.port, self.afi) + log.error('%s: DNS lookup failed for %s:%i (%s)', + self, self.host, self.port, self.afi) return False return True @@ -359,6 +367,7 @@ def connect_blocking(self, timeout=float('inf')): def connect(self): """Attempt to connect and return ConnectionState""" if self.state is ConnectionStates.DISCONNECTED and not self.blacked_out(): + self.state = ConnectionStates.CONNECTING self.last_attempt = time.time() next_lookup = self._next_afi_sockaddr() if not next_lookup: @@ -368,14 +377,21 @@ def connect(self): log.debug('%s: creating new socket', self) assert self._sock is None self._sock_afi, self._sock_addr = next_lookup - self._sock = socket.socket(self._sock_afi, socket.SOCK_STREAM) + try: + if self.config["socks5_proxy"] is not None: + self._socks5_proxy = Socks5Wrapper(self.config["socks5_proxy"], self.afi) + self._sock = self._socks5_proxy.socket(self._sock_afi, socket.SOCK_STREAM) + else: + self._sock = socket.socket(self._sock_afi, socket.SOCK_STREAM) + except (socket.error, OSError) as e: + self.close(e) + return self.state for option in self.config['socket_options']: log.debug('%s: setting socket option %s', self, option) self._sock.setsockopt(*option) self._sock.setblocking(False) - self.state = ConnectionStates.CONNECTING self.config['state_change_callback'](self.node_id, self._sock, self) log.info('%s: connecting to %s:%d [%s %s]', self, self.host, self.port, self._sock_addr, AFI_NAMES[self._sock_afi]) @@ -385,7 +401,10 @@ def connect(self): # to check connection status ret = None try: - ret = self._sock.connect_ex(self._sock_addr) + if self._socks5_proxy: + ret = self._socks5_proxy.connect_ex(self._sock_addr) + else: + ret = self._sock.connect_ex(self._sock_addr) except socket.error as err: ret = err.errno @@ -394,28 +413,20 @@ def connect(self): log.debug('%s: established TCP connection', self) if self.config['security_protocol'] in ('SSL', 'SASL_SSL'): - log.debug('%s: initiating SSL handshake', self) self.state = ConnectionStates.HANDSHAKE + log.debug('%s: initiating SSL handshake', self) self.config['state_change_callback'](self.node_id, self._sock, self) # _wrap_ssl can alter the connection state -- disconnects on failure self._wrap_ssl() - - elif self.config['security_protocol'] == 'SASL_PLAINTEXT': - log.debug('%s: initiating SASL authentication', self) - self.state = ConnectionStates.AUTHENTICATING - self.config['state_change_callback'](self.node_id, self._sock, self) - else: - # security_protocol PLAINTEXT - log.info('%s: Connection complete.', self) - self.state = ConnectionStates.CONNECTED - self._reset_reconnect_backoff() + self.state = ConnectionStates.API_VERSIONS_SEND + log.debug('%s: checking broker Api Versions', self) 
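With the connect() changes above, a socket now passes through the new API_VERSIONS states before any SASL exchange; a rough sketch of the expected progression for a SASL_SSL connection (the ordering here is illustrative, not asserted by the library):

from kafka.conn import ConnectionStates

expected_order = [
    ConnectionStates.CONNECTING,
    ConnectionStates.HANDSHAKE,           # SSL handshake, SSL / SASL_SSL only
    ConnectionStates.API_VERSIONS_SEND,   # send ApiVersionsRequest
    ConnectionStates.API_VERSIONS_RECV,   # await ApiVersionsResponse
    ConnectionStates.AUTHENTICATING,      # SASL_* only
    ConnectionStates.CONNECTED,
]
print(expected_order)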
self.config['state_change_callback'](self.node_id, self._sock, self) # Connection failed # WSAEINVAL == 10022, but errno.WSAEINVAL is not available on non-win systems elif ret not in (errno.EINPROGRESS, errno.EALREADY, errno.EWOULDBLOCK, 10022): - log.error('Connect attempt to %s returned error %s.' + log.error('%s: Connect attempt returned error %s.' ' Disconnecting.', self, ret) errstr = errno.errorcode.get(ret, 'UNKNOWN') self.close(Errors.KafkaConnectionError('{} {}'.format(ret, errstr))) @@ -428,22 +439,32 @@ def connect(self): if self.state is ConnectionStates.HANDSHAKE: if self._try_handshake(): log.debug('%s: completed SSL handshake.', self) - if self.config['security_protocol'] == 'SASL_SSL': - log.debug('%s: initiating SASL authentication', self) - self.state = ConnectionStates.AUTHENTICATING - else: - log.info('%s: Connection complete.', self) - self.state = ConnectionStates.CONNECTED - self._reset_reconnect_backoff() + self.state = ConnectionStates.API_VERSIONS_SEND + log.debug('%s: checking broker Api Versions', self) self.config['state_change_callback'](self.node_id, self._sock, self) + if self.state in (ConnectionStates.API_VERSIONS_SEND, ConnectionStates.API_VERSIONS_RECV): + if self._try_api_versions_check(): + # _try_api_versions_check has side-effects: possibly disconnected on socket errors + if self.state in (ConnectionStates.API_VERSIONS_SEND, ConnectionStates.API_VERSIONS_RECV): + if self.config['security_protocol'] in ('SASL_PLAINTEXT', 'SASL_SSL'): + self.state = ConnectionStates.AUTHENTICATING + log.debug('%s: initiating SASL authentication', self) + self.config['state_change_callback'](self.node_id, self._sock, self) + else: + # security_protocol PLAINTEXT + self.state = ConnectionStates.CONNECTED + log.info('%s: Connection complete.', self) + self._reset_reconnect_backoff() + self.config['state_change_callback'](self.node_id, self._sock, self) + if self.state is ConnectionStates.AUTHENTICATING: assert self.config['security_protocol'] in ('SASL_PLAINTEXT', 'SASL_SSL') if self._try_authenticate(): # _try_authenticate has side-effects: possibly disconnected on socket errors if self.state is ConnectionStates.AUTHENTICATING: - log.info('%s: Connection complete.', self) self.state = ConnectionStates.CONNECTED + log.info('%s: Connection complete.', self) self._reset_reconnect_backoff() self.config['state_change_callback'](self.node_id, self._sock, self) @@ -452,7 +473,7 @@ def connect(self): # Connection timed out request_timeout = self.config['request_timeout_ms'] / 1000.0 if time.time() > request_timeout + self.last_attempt: - log.error('Connection attempt to %s timed out', self) + log.error('%s: Connection attempt timed out', self) self.close(Errors.KafkaConnectionError('timeout')) return self.state @@ -496,7 +517,7 @@ def _wrap_ssl(self): try: self._sock = self._ssl_context.wrap_socket( self._sock, - server_hostname=self.host, + server_hostname=self.host.rstrip("."), do_handshake_on_connect=False) except ssl.SSLError as e: log.exception('%s: Failed to wrap socket in SSLContext!', self) @@ -511,20 +532,136 @@ def _try_handshake(self): except (SSLWantReadError, SSLWantWriteError): pass except (SSLZeroReturnError, ConnectionError, TimeoutError, SSLEOFError): - log.warning('SSL connection closed by server during handshake.') + log.warning('%s: SSL connection closed by server during handshake.', self) self.close(Errors.KafkaConnectionError('SSL connection closed by server during handshake')) # Other SSLErrors will be raised to user return False - def 
_try_authenticate(self): - assert self.config['api_version'] is None or self.config['api_version'] >= (0, 10) + def _try_api_versions_check(self): + if self._api_versions_future is None: + if self.config['api_version'] is not None: + self._api_version = self.config['api_version'] + # api_version will be normalized by KafkaClient, so this should not happen + if self._api_version not in BROKER_API_VERSIONS: + raise Errors.UnrecognizedBrokerVersion('api_version %s not found in kafka.protocol.broker_api_versions' % (self._api_version,)) + self._api_versions = BROKER_API_VERSIONS[self._api_version] + log.debug('%s: Using pre-configured api_version %s for ApiVersions', self, self._api_version) + return True + elif self._check_version_idx is None: + version = self._api_versions_idx + if version >= 3: + request = ApiVersionsRequest[version]( + client_software_name=self.config['client_software_name'], + client_software_version=self.config['client_software_version'], + _tagged_fields={}) + else: + request = ApiVersionsRequest[version]() + future = Future() + self._api_versions_check_timeout /= 2 + response = self._send(request, blocking=True, request_timeout_ms=self._api_versions_check_timeout) + response.add_callback(self._handle_api_versions_response, future) + response.add_errback(self._handle_api_versions_failure, future) + self._api_versions_future = future + self.state = ConnectionStates.API_VERSIONS_RECV + self.config['state_change_callback'](self.node_id, self._sock, self) + elif self._check_version_idx < len(self.VERSION_CHECKS): + version, request = self.VERSION_CHECKS[self._check_version_idx] + future = Future() + self._api_versions_check_timeout /= 2 + response = self._send(request, blocking=True, request_timeout_ms=self._api_versions_check_timeout) + response.add_callback(self._handle_check_version_response, future, version) + response.add_errback(self._handle_check_version_failure, future) + self._api_versions_future = future + self.state = ConnectionStates.API_VERSIONS_RECV + self.config['state_change_callback'](self.node_id, self._sock, self) + else: + self.close(Errors.KafkaConnectionError('Unable to determine broker version.')) + return False + + for r, f in self.recv(): + f.success(r) + + # A connection error during blocking send could trigger close() which will reset the future + if self._api_versions_future is None: + return False + elif self._api_versions_future.failed(): + ex = self._api_versions_future.exception + if not isinstance(ex, Errors.KafkaConnectionError): + raise ex + return self._api_versions_future.succeeded() + def _handle_api_versions_response(self, future, response): + error_type = Errors.for_code(response.error_code) + if error_type is not Errors.NoError: + future.failure(error_type()) + if error_type is Errors.UnsupportedVersionError: + self._api_versions_idx -= 1 + for api_version_data in response.api_versions: + api_key, min_version, max_version = api_version_data[:3] + # If broker provides a lower max_version, skip to that + if api_key == response.API_KEY: + self._api_versions_idx = min(self._api_versions_idx, max_version) + break + if self._api_versions_idx >= 0: + self._api_versions_future = None + self.state = ConnectionStates.API_VERSIONS_SEND + self.config['state_change_callback'](self.node_id, self._sock, self) + else: + self.close(error=error_type()) + return + self._api_versions = dict([ + (api_version_data[0], (api_version_data[1], api_version_data[2])) + for api_version_data in response.api_versions + ]) + self._api_version = 
self._infer_broker_version_from_api_versions(self._api_versions) + log.info('%s: Broker version identified as %s', self, '.'.join(map(str, self._api_version))) + future.success(self._api_version) + self.connect() + + def _handle_api_versions_failure(self, future, ex): + future.failure(ex) + # Modern brokers should not disconnect on unrecognized api-versions request, + # but in case they do we always want to try v0 as a fallback + # otherwise switch to check_version probe. + if self._api_versions_idx > 0: + self._api_versions_idx = 0 + else: + self._check_version_idx = 0 + # after failure connection is closed, so state should already be DISCONNECTED + + def _handle_check_version_response(self, future, version, _response): + log.info('%s: Broker version identified as %s', self, '.'.join(map(str, version))) + log.info('Set configuration api_version=%s to skip auto' + ' check_version requests on startup', version) + self._api_versions = BROKER_API_VERSIONS[version] + self._api_version = version + future.success(version) + self.connect() + + def _handle_check_version_failure(self, future, ex): + future.failure(ex) + self._check_version_idx += 1 + # after failure connection is closed, so state should already be DISCONNECTED + + def _sasl_handshake_version(self): + if self._api_versions is None: + raise RuntimeError('_api_versions not set') + if SaslHandshakeRequest[0].API_KEY not in self._api_versions: + raise Errors.UnsupportedVersionError('SaslHandshake') + + # Build a SaslHandshakeRequest message + min_version, max_version = self._api_versions[SaslHandshakeRequest[0].API_KEY] + if min_version > 1: + raise Errors.UnsupportedVersionError('SaslHandshake %s' % min_version) + return min(max_version, 1) + + def _try_authenticate(self): if self._sasl_auth_future is None: - # Build a SaslHandShakeRequest message - request = SaslHandShakeRequest[0](self.config['sasl_mechanism']) + version = self._sasl_handshake_version() + request = SaslHandshakeRequest[version](self.config['sasl_mechanism']) future = Future() - sasl_response = self._send(request) + sasl_response = self._send(request, blocking=True) sasl_response.add_callback(self._handle_sasl_handshake_response, future) sasl_response.add_errback(lambda f, e: f.failure(e), future) self._sasl_auth_future = future @@ -549,23 +686,18 @@ def _handle_sasl_handshake_response(self, future, response): return future.failure(error_type(self)) if self.config['sasl_mechanism'] not in response.enabled_mechanisms: - return future.failure( + future.failure( Errors.UnsupportedSaslMechanismError( 'Kafka broker does not support %s sasl mechanism. Enabled mechanisms are: %s' % (self.config['sasl_mechanism'], response.enabled_mechanisms))) - elif self.config['sasl_mechanism'] == 'PLAIN': - return self._try_authenticate_plain(future) - elif self.config['sasl_mechanism'] == 'GSSAPI': - return self._try_authenticate_gssapi(future) - elif self.config['sasl_mechanism'] == 'OAUTHBEARER': - return self._try_authenticate_oauth(future) - elif self.config['sasl_mechanism'].startswith("SCRAM-SHA-"): - return self._try_authenticate_scram(future) else: - return future.failure( - Errors.UnsupportedSaslMechanismError( - 'kafka-python does not support SASL mechanism %s' % - self.config['sasl_mechanism'])) + self._sasl_authenticate(future) + + assert future.is_done, 'SASL future not complete after mechanism processing!' 
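The per-mechanism handling now lives behind kafka.sasl.get_sasl_mechanism, but application-level configuration is unchanged; a sketch for SCRAM over SASL_SSL (host and credentials are placeholders, and a reachable broker is assumed):

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='broker.example.com:9093',
    security_protocol='SASL_SSL',
    sasl_mechanism='SCRAM-SHA-512',
    sasl_plain_username='alice',
    sasl_plain_password='secret',
)
producer.send('my.topic', b'hello')
producer.flush()
producer.close()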
+ if future.failed(): + self.close(error=future.exception) + else: + self.connect() def _send_bytes(self, data): """Send some data via non-blocking IO @@ -594,6 +726,7 @@ def _send_bytes(self, data): return total_sent def _send_bytes_blocking(self, data): + self._sock.setblocking(True) self._sock.settimeout(self.config['request_timeout_ms'] / 1000) total_sent = 0 try: @@ -605,8 +738,10 @@ def _send_bytes_blocking(self, data): return total_sent finally: self._sock.settimeout(0.0) + self._sock.setblocking(False) def _recv_bytes_blocking(self, n): + self._sock.setblocking(True) self._sock.settimeout(self.config['request_timeout_ms'] / 1000) try: data = b'' @@ -618,225 +753,76 @@ def _recv_bytes_blocking(self, n): return data finally: self._sock.settimeout(0.0) + self._sock.setblocking(False) - def _try_authenticate_plain(self, future): - if self.config['security_protocol'] == 'SASL_PLAINTEXT': - log.warning('%s: Sending username and password in the clear', self) - - data = b'' - # Send PLAIN credentials per RFC-4616 - msg = bytes('\0'.join([self.config['sasl_plain_username'], - self.config['sasl_plain_username'], - self.config['sasl_plain_password']]).encode('utf-8')) - size = Int32.encode(len(msg)) - - err = None - close = False - with self._lock: - if not self._can_send_recv(): - err = Errors.NodeNotReadyError(str(self)) - close = False - else: - try: - self._send_bytes_blocking(size + msg) - - # The server will send a zero sized message (that is Int32(0)) on success. - # The connection is closed on failure - data = self._recv_bytes_blocking(4) - - except (ConnectionError, TimeoutError) as e: - log.exception("%s: Error receiving reply from server", self) - err = Errors.KafkaConnectionError("%s: %s" % (self, e)) - close = True - - if err is not None: - if close: + def _send_sasl_authenticate(self, sasl_auth_bytes): + version = self._sasl_handshake_version() + if version == 1: + request = SaslAuthenticateRequest[0](sasl_auth_bytes) + self._send(request, blocking=True) + else: + log.debug('%s: Sending %d raw sasl auth bytes to server', self, len(sasl_auth_bytes)) + try: + self._send_bytes_blocking(Int32.encode(len(sasl_auth_bytes)) + sasl_auth_bytes) + except (ConnectionError, TimeoutError) as e: + log.exception("%s: Error sending sasl auth bytes to server", self) + err = Errors.KafkaConnectionError("%s: %s" % (self, e)) self.close(error=err) - return future.failure(err) - - if data != b'\x00\x00\x00\x00': - error = Errors.AuthenticationFailedError('Unrecognized response during authentication') - return future.failure(error) - - log.info('%s: Authenticated as %s via PLAIN', self, self.config['sasl_plain_username']) - return future.success(True) - def _try_authenticate_scram(self, future): - if self.config['security_protocol'] == 'SASL_PLAINTEXT': - log.warning('%s: Exchanging credentials in the clear', self) + def _recv_sasl_authenticate(self): + version = self._sasl_handshake_version() + # GSSAPI mechanism does not get a final recv in old non-framed mode + if version == 0 and self._sasl_mechanism.is_done(): + return b'' - scram_client = ScramClient( - self.config['sasl_plain_username'], self.config['sasl_plain_password'], self.config['sasl_mechanism'] - ) - - err = None - close = False - with self._lock: - if not self._can_send_recv(): - err = Errors.NodeNotReadyError(str(self)) - close = False - else: - try: - client_first = scram_client.first_message().encode('utf-8') - size = Int32.encode(len(client_first)) - self._send_bytes_blocking(size + client_first) - - (data_len,) = 
struct.unpack('>i', self._recv_bytes_blocking(4)) - server_first = self._recv_bytes_blocking(data_len).decode('utf-8') - scram_client.process_server_first_message(server_first) - - client_final = scram_client.final_message().encode('utf-8') - size = Int32.encode(len(client_final)) - self._send_bytes_blocking(size + client_final) - - (data_len,) = struct.unpack('>i', self._recv_bytes_blocking(4)) - server_final = self._recv_bytes_blocking(data_len).decode('utf-8') - scram_client.process_server_final_message(server_final) - - except (ConnectionError, TimeoutError) as e: - log.exception("%s: Error receiving reply from server", self) - err = Errors.KafkaConnectionError("%s: %s" % (self, e)) - close = True + try: + data = self._recv_bytes_blocking(4) + nbytes = Int32.decode(io.BytesIO(data)) + data += self._recv_bytes_blocking(nbytes) + except (ConnectionError, TimeoutError) as e: + log.exception("%s: Error receiving sasl auth bytes from server", self) + err = Errors.KafkaConnectionError("%s: %s" % (self, e)) + self.close(error=err) + return - if err is not None: - if close: - self.close(error=err) - return future.failure(err) - - log.info( - '%s: Authenticated as %s via %s', self, self.config['sasl_plain_username'], self.config['sasl_mechanism'] - ) - return future.success(True) - - def _try_authenticate_gssapi(self, future): - kerberos_damin_name = self.config['sasl_kerberos_domain_name'] or self.host - auth_id = self.config['sasl_kerberos_service_name'] + '@' + kerberos_damin_name - gssapi_name = gssapi.Name( - auth_id, - name_type=gssapi.NameType.hostbased_service - ).canonicalize(gssapi.MechType.kerberos) - log.debug('%s: GSSAPI name: %s', self, gssapi_name) + if version == 1: + ((correlation_id, response),) = self._protocol.receive_bytes(data) + (future, timestamp, _timeout) = self.in_flight_requests.pop(correlation_id) + latency_ms = (time.time() - timestamp) * 1000 + if self._sensors: + self._sensors.request_time.record(latency_ms) + log.debug('%s: Response %d (%s ms): %s', self, correlation_id, latency_ms, response) - err = None - close = False - with self._lock: + error_type = Errors.for_code(response.error_code) + if error_type is not Errors.NoError: + log.error("%s: SaslAuthenticate error: %s (%s)", + self, error_type.__name__, response.error_message) + self.close(error=error_type(response.error_message)) + return + return response.auth_bytes + else: + # unframed bytes w/ SaslHandhake v0 + log.debug('%s: Received %d raw sasl auth bytes from server', self, nbytes) + return data[4:] + + def _sasl_authenticate(self, future): + while not self._sasl_mechanism.is_done(): + send_token = self._sasl_mechanism.auth_bytes() + self._send_sasl_authenticate(send_token) if not self._can_send_recv(): - err = Errors.NodeNotReadyError(str(self)) - close = False - else: - # Establish security context and negotiate protection level - # For reference RFC 2222, section 7.2.1 - try: - # Exchange tokens until authentication either succeeds or fails - client_ctx = gssapi.SecurityContext(name=gssapi_name, usage='initiate') - received_token = None - while not client_ctx.complete: - # calculate an output token from kafka token (or None if first iteration) - output_token = client_ctx.step(received_token) - - # pass output token to kafka, or send empty response if the security - # context is complete (output token is None in that case) - if output_token is None: - self._send_bytes_blocking(Int32.encode(0)) - else: - msg = output_token - size = Int32.encode(len(msg)) - self._send_bytes_blocking(size + msg) - - # 
The server will send a token back. Processing of this token either - # establishes a security context, or it needs further token exchange. - # The gssapi will be able to identify the needed next step. - # The connection is closed on failure. - header = self._recv_bytes_blocking(4) - (token_size,) = struct.unpack('>i', header) - received_token = self._recv_bytes_blocking(token_size) - - # Process the security layer negotiation token, sent by the server - # once the security context is established. - - # unwraps message containing supported protection levels and msg size - msg = client_ctx.unwrap(received_token).message - # Kafka currently doesn't support integrity or confidentiality security layers, so we - # simply set QoP to 'auth' only (first octet). We reuse the max message size proposed - # by the server - msg = Int8.encode(SASL_QOP_AUTH & Int8.decode(io.BytesIO(msg[0:1]))) + msg[1:] - # add authorization identity to the response, GSS-wrap and send it - msg = client_ctx.wrap(msg + auth_id.encode(), False).message - size = Int32.encode(len(msg)) - self._send_bytes_blocking(size + msg) - - except (ConnectionError, TimeoutError) as e: - log.exception("%s: Error receiving reply from server", self) - err = Errors.KafkaConnectionError("%s: %s" % (self, e)) - close = True - except Exception as e: - err = e - close = True - - if err is not None: - if close: - self.close(error=err) - return future.failure(err) + return future.failure(Errors.KafkaConnectionError("%s: Connection failure during Sasl Authenticate" % self)) - log.info('%s: Authenticated as %s via GSSAPI', self, gssapi_name) - return future.success(True) - - def _try_authenticate_oauth(self, future): - data = b'' - - msg = bytes(self._build_oauth_client_request().encode("utf-8")) - size = Int32.encode(len(msg)) - - err = None - close = False - with self._lock: - if not self._can_send_recv(): - err = Errors.NodeNotReadyError(str(self)) - close = False + recv_token = self._recv_sasl_authenticate() + if recv_token is None: + return future.failure(Errors.KafkaConnectionError("%s: Connection failure during Sasl Authenticate" % self)) else: - try: - # Send SASL OAuthBearer request with OAuth token - self._send_bytes_blocking(size + msg) - - # The server will send a zero sized message (that is Int32(0)) on success. - # The connection is closed on failure - data = self._recv_bytes_blocking(4) - - except (ConnectionError, TimeoutError) as e: - log.exception("%s: Error receiving reply from server", self) - err = Errors.KafkaConnectionError("%s: %s" % (self, e)) - close = True - - if err is not None: - if close: - self.close(error=err) - return future.failure(err) + self._sasl_mechanism.receive(recv_token) - if data != b'\x00\x00\x00\x00': - error = Errors.AuthenticationFailedError('Unrecognized response during authentication') - return future.failure(error) - - log.info('%s: Authenticated via OAuth', self) - return future.success(True) - - def _build_oauth_client_request(self): - token_provider = self.config['sasl_oauth_token_provider'] - return "n,,\x01auth=Bearer {}{}\x01\x01".format(token_provider.token(), self._token_extensions()) - - def _token_extensions(self): - """ - Return a string representation of the OPTIONAL key-value pairs that can be sent with an OAUTHBEARER - initial request. 
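For OAUTHBEARER, callers now subclass the token provider referenced in the connection config docs earlier in this patch (kafka.sasl.oauth.AbstractTokenProvider) and pass an instance via sasl_oauth_token_provider; a minimal sketch with a static placeholder token and placeholder broker address:

from kafka import KafkaConsumer
from kafka.sasl.oauth import AbstractTokenProvider

class StaticTokenProvider(AbstractTokenProvider):
    def token(self):
        # Placeholder; a real provider would fetch and refresh a token here.
        return 'my-oauth-token'

consumer = KafkaConsumer(
    bootstrap_servers='broker.example.com:9093',
    security_protocol='SASL_SSL',
    sasl_mechanism='OAUTHBEARER',
    sasl_oauth_token_provider=StaticTokenProvider(),
)
consumer.close()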
- """ - token_provider = self.config['sasl_oauth_token_provider'] - - # Only run if the #extensions() method is implemented by the clients Token Provider class - # Builds up a string separated by \x01 via a dict of key value pairs - if callable(getattr(token_provider, "extensions", None)) and len(token_provider.extensions()) > 0: - msg = "\x01".join(["{}={}".format(k, v) for k, v in token_provider.extensions().items()]) - return "\x01" + msg + if self._sasl_mechanism.is_authenticated(): + log.info('%s: %s', self, self._sasl_mechanism.auth_details()) + return future.success(True) else: - return "" + return future.failure(Errors.SaslAuthenticationFailedError('Failed to authenticate via SASL %s' % self.config['sasl_mechanism'])) def blacked_out(self): """ @@ -844,20 +830,43 @@ def blacked_out(self): re-establish a connection yet """ if self.state is ConnectionStates.DISCONNECTED: - if time.time() < self.last_attempt + self._reconnect_backoff: - return True + return self.connection_delay() > 0 return False + def throttled(self): + """ + Return True if we are connected but currently throttled. + """ + if self.state is not ConnectionStates.CONNECTED: + return False + return self.throttle_delay() > 0 + + def throttle_delay(self): + """ + Return the number of milliseconds to wait until connection is no longer throttled. + """ + if self._throttle_time is not None: + remaining_ms = (self._throttle_time - time.time()) * 1000 + if remaining_ms > 0: + return remaining_ms + else: + self._throttle_time = None + return 0 + return 0 + def connection_delay(self): """ Return the number of milliseconds to wait, based on the connection - state, before attempting to send data. When disconnected, this respects - the reconnect backoff time. When connecting or connected, returns a very + state, before attempting to send data. When connecting or disconnected, + this respects the reconnect backoff time. When connected, returns a very large number to handle slow/stalled connections. """ - time_waited = time.time() - (self.last_attempt or 0) - if self.state is ConnectionStates.DISCONNECTED: - return max(self._reconnect_backoff - time_waited, 0) * 1000 + if self.disconnected() or self.connecting(): + if len(self._gai) > 0: + return 0 + else: + time_waited = time.time() - self.last_attempt + return max(self._reconnect_backoff - time_waited, 0) * 1000 else: # When connecting or connected, we should be able to delay # indefinitely since other events (connection or data acked) will @@ -873,16 +882,33 @@ def connecting(self): different states, such as SSL handshake, authorization, etc).""" return self.state in (ConnectionStates.CONNECTING, ConnectionStates.HANDSHAKE, - ConnectionStates.AUTHENTICATING) + ConnectionStates.AUTHENTICATING, + ConnectionStates.API_VERSIONS_SEND, + ConnectionStates.API_VERSIONS_RECV) + + def initializing(self): + """Returns True if socket is connected but full connection is not complete. 
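A stand-alone sketch of the throttle bookkeeping used by throttled() / throttle_delay() above: the broker's throttle_time_ms is folded into an absolute deadline and the remaining wait is reported in milliseconds (class and attribute names are simplified from the connection code):

import time

class ThrottleTracker(object):
    def __init__(self):
        self._throttle_time = None   # absolute deadline in seconds since epoch, or None

    def maybe_throttle(self, throttle_time_ms):
        # Extend the throttle deadline if the broker reports a non-zero throttle time
        if throttle_time_ms:
            deadline = time.time() + throttle_time_ms / 1000.0
            self._throttle_time = max(deadline, self._throttle_time or 0)

    def throttle_delay(self):
        # Milliseconds remaining until the connection is no longer throttled
        if self._throttle_time is not None:
            remaining_ms = (self._throttle_time - time.time()) * 1000
            if remaining_ms > 0:
                return remaining_ms
            self._throttle_time = None
        return 0

tracker = ThrottleTracker()
tracker.maybe_throttle(250)
assert 0 < tracker.throttle_delay() <= 250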
+ During this time the connection may send api requests to the broker to + check api versions and perform SASL authentication.""" + return self.state in (ConnectionStates.AUTHENTICATING, + ConnectionStates.API_VERSIONS_SEND, + ConnectionStates.API_VERSIONS_RECV) def disconnected(self): """Return True iff socket is closed""" return self.state is ConnectionStates.DISCONNECTED + def connect_failed(self): + """Return True iff connection attempt failed after attempting all dns records""" + return self.disconnected() and self.last_attempt >= 0 and len(self._gai) == 0 + def _reset_reconnect_backoff(self): self._failures = 0 self._reconnect_backoff = self.config['reconnect_backoff_ms'] / 1000.0 + def _reconnect_jitter_pct(self): + return uniform(0.8, 1.2) + def _update_reconnect_backoff(self): # Do not mark as failure if there are more dns entries available to try if len(self._gai) > 0: @@ -891,7 +917,7 @@ def _update_reconnect_backoff(self): self._failures += 1 self._reconnect_backoff = self.config['reconnect_backoff_ms'] * 2 ** (self._failures - 1) self._reconnect_backoff = min(self._reconnect_backoff, self.config['reconnect_backoff_max_ms']) - self._reconnect_backoff *= uniform(0.8, 1.2) + self._reconnect_backoff *= self._reconnect_jitter_pct() self._reconnect_backoff /= 1000.0 log.debug('%s: reconnect backoff %s after %s failures', self, self._reconnect_backoff, self._failures) @@ -916,9 +942,12 @@ def close(self, error=None): with self._lock: if self.state is ConnectionStates.DISCONNECTED: return - log.info('%s: Closing connection. %s', self, error or '') - self._update_reconnect_backoff() + log.log(logging.ERROR if error else logging.INFO, '%s: Closing connection. %s', self, error or '') + if error: + self._update_reconnect_backoff() + self._api_versions_future = None self._sasl_auth_future = None + self._init_sasl_mechanism() self._protocol = KafkaProtocol( client_id=self.config['client_id'], api_version=self.config['api_version']) @@ -938,27 +967,43 @@ def close(self, error=None): # drop lock before state change callback and processing futures self.config['state_change_callback'](self.node_id, sock, self) - sock.close() - for (_correlation_id, (future, _timestamp)) in ifrs: + if sock: + sock.close() + for (_correlation_id, (future, _timestamp, _timeout)) in ifrs: future.failure(error) def _can_send_recv(self): """Return True iff socket is ready for requests / responses""" - return self.state in (ConnectionStates.AUTHENTICATING, - ConnectionStates.CONNECTED) + return self.connected() or self.initializing() + + def send(self, request, blocking=True, request_timeout_ms=None): + """Queue request for async network send, return Future() + + Arguments: + request (Request): kafka protocol request object to send. + + Keyword Arguments: + blocking (bool, optional): Whether to immediately send via + blocking socket I/O. Default: True. + request_timeout_ms: Custom timeout in milliseconds for request. 
+ Default: None (uses value from connection configuration) - def send(self, request, blocking=True): - """Queue request for async network send, return Future()""" + Returns: future + """ future = Future() if self.connecting(): return future.failure(Errors.NodeNotReadyError(str(self))) elif not self.connected(): return future.failure(Errors.KafkaConnectionError(str(self))) elif not self.can_send_more(): + # very small race here, but prefer it over breaking abstraction to check self._throttle_time + if self.throttled(): + return future.failure(Errors.ThrottlingQuotaExceededError(str(self))) return future.failure(Errors.TooManyInFlightRequests(str(self))) - return self._send(request, blocking=blocking) + return self._send(request, blocking=blocking, request_timeout_ms=request_timeout_ms) - def _send(self, request, blocking=True): + def _send(self, request, blocking=True, request_timeout_ms=None): + request_timeout_ms = request_timeout_ms or self.config['request_timeout_ms'] future = Future() with self._lock: if not self._can_send_recv(): @@ -969,11 +1014,12 @@ def _send(self, request, blocking=True): correlation_id = self._protocol.send_request(request) - log.debug('%s Request %d: %s', self, correlation_id, request) + log.debug('%s: Request %d (timeout_ms %s): %s', self, correlation_id, request_timeout_ms, request) if request.expect_response(): - sent_time = time.time() assert correlation_id not in self.in_flight_requests, 'Correlation ID already in-flight!' - self.in_flight_requests[correlation_id] = (future, sent_time) + sent_time = time.time() + timeout_at = sent_time + (request_timeout_ms / 1000) + self.in_flight_requests[correlation_id] = (future, sent_time, timeout_at) else: future.success(None) @@ -1002,7 +1048,7 @@ def send_pending_requests(self): return True except (ConnectionError, TimeoutError) as e: - log.exception("Error sending request data to %s", self) + log.exception("%s: Error sending request data", self) error = Errors.KafkaConnectionError("%s: %s" % (self, e)) self.close(error=error) return False @@ -1035,13 +1081,31 @@ def send_pending_requests_v2(self): return len(self._send_buffer) == 0 except (ConnectionError, TimeoutError, Exception) as e: - log.exception("Error sending request data to %s", self) + log.exception("%s: Error sending request data", self) error = Errors.KafkaConnectionError("%s: %s" % (self, e)) self.close(error=error) return False + def _maybe_throttle(self, response): + throttle_time_ms = getattr(response, 'throttle_time_ms', 0) + if self._sensors: + self._sensors.throttle_time.record(throttle_time_ms) + if not throttle_time_ms: + if self._throttle_time is not None: + self._throttle_time = None + return + # Client side throttling enabled in v2.0 brokers + # prior to that throttling (if present) was managed broker-side + if self.config['api_version'] is not None and self.config['api_version'] >= (2, 0): + throttle_time = time.time() + throttle_time_ms / 1000 + self._throttle_time = max(throttle_time, self._throttle_time or 0) + log.warning("%s: %s throttled by broker (%d ms)", self, + response.__class__.__name__, throttle_time_ms) + def can_send_more(self): - """Return True unless there are max_in_flight_requests_per_connection.""" + """Check for throttling / quota violations and max in-flight-requests""" + if self.throttle_delay() > 0: + return False max_ifrs = self.config['max_in_flight_requests_per_connection'] return len(self.in_flight_requests) < max_ifrs @@ -1052,18 +1116,20 @@ def recv(self): """ responses = self._recv() if not responses and 
self.requests_timed_out(): - log.warning('%s timed out after %s ms. Closing connection.', - self, self.config['request_timeout_ms']) + timed_out = self.timed_out_ifrs() + timeout_ms = (timed_out[0][2] - timed_out[0][1]) * 1000 + log.warning('%s: timed out after %s ms. Closing connection.', + self, timeout_ms) self.close(error=Errors.RequestTimedOutError( 'Request timed out after %s ms' % - self.config['request_timeout_ms'])) + timeout_ms)) return () # augment responses w/ correlation_id, future, and timestamp for i, (correlation_id, response) in enumerate(responses): try: with self._lock: - (future, timestamp) = self.in_flight_requests.pop(correlation_id) + (future, timestamp, _timeout) = self.in_flight_requests.pop(correlation_id) except KeyError: self.close(Errors.KafkaConnectionError('Received unrecognized correlation id')) return () @@ -1071,7 +1137,8 @@ def recv(self): if self._sensors: self._sensors.request_time.record(latency_ms) - log.debug('%s Response %d (%s ms): %s', self, correlation_id, latency_ms, response) + log.debug('%s: Response %d (%s ms): %s', self, correlation_id, latency_ms, response) + self._maybe_throttle(response) responses[i] = (response, future) return responses @@ -1082,7 +1149,7 @@ def _recv(self): err = None with self._lock: if not self._can_send_recv(): - log.warning('%s cannot recv: socket not connected', self) + log.warning('%s: cannot recv: socket not connected', self) return () while len(recvd) < self.config['sock_chunk_buffer_count']: @@ -1132,36 +1199,30 @@ def _recv(self): return () def requests_timed_out(self): + return self.next_ifr_request_timeout_ms() == 0 + + def timed_out_ifrs(self): + now = time.time() + ifrs = sorted(self.in_flight_requests.values(), reverse=True, key=lambda ifr: ifr[2]) + return list(filter(lambda ifr: ifr[2] <= now, ifrs)) + + def next_ifr_request_timeout_ms(self): with self._lock: if self.in_flight_requests: - get_timestamp = lambda v: v[1] - oldest_at = min(map(get_timestamp, - self.in_flight_requests.values())) - timeout = self.config['request_timeout_ms'] / 1000.0 - if time.time() >= oldest_at + timeout: - return True - return False - - def _handle_api_version_response(self, response): - error_type = Errors.for_code(response.error_code) - assert error_type is Errors.NoError, "API version check failed" - self._api_versions = dict([ - (api_key, (min_version, max_version)) - for api_key, min_version, max_version in response.api_versions - ]) - return self._api_versions + def get_timeout(v): + return v[2] + next_timeout = min(map(get_timeout, + self.in_flight_requests.values())) + return max(0, (next_timeout - time.time()) * 1000) + else: + return float('inf') def get_api_versions(self): - if self._api_versions is not None: - return self._api_versions - - version = self.check_version() - if version < (0, 10, 0): - raise Errors.UnsupportedVersionError( - "ApiVersion not supported by cluster version {} < 0.10.0" - .format(version)) - # _api_versions is set as a side effect of check_versions() on a cluster - # that supports 0.10.0 or later + # _api_versions is set as a side effect of first connection + # which should typically be bootstrap, but call check_version + # if that hasn't happened yet + if self._api_versions is None: + self.check_version() return self._api_versions def _infer_broker_version_from_api_versions(self, api_versions): @@ -1169,140 +1230,69 @@ def _infer_broker_version_from_api_versions(self, api_versions): # in reverse order. 
As soon as we find one that works, return it test_cases = [ # format (, ) - ((2, 6, 0), DescribeClientQuotasRequest[0]), - ((2, 5, 0), DescribeAclsRequest_v2), - ((2, 4, 0), ProduceRequest[8]), - ((2, 3, 0), FetchRequest[11]), - ((2, 2, 0), OffsetRequest[5]), - ((2, 1, 0), FetchRequest[10]), - ((2, 0, 0), FetchRequest[8]), - ((1, 1, 0), FetchRequest[7]), - ((1, 0, 0), MetadataRequest[5]), - ((0, 11, 0), MetadataRequest[4]), + # Make sure to update consumer_integration test check when adding newer versions. + # ((3, 9), FetchRequest[17]), + # ((3, 8), ProduceRequest[11]), + # ((3, 7), FetchRequest[16]), + # ((3, 6), AddPartitionsToTxnRequest[4]), + # ((3, 5), FetchRequest[15]), + # ((3, 4), StopReplicaRequest[3]), # broker-internal api... + # ((3, 3), DescribeAclsRequest[3]), + # ((3, 2), JoinGroupRequest[9]), + # ((3, 1), FetchRequest[13]), + # ((3, 0), ListOffsetsRequest[7]), + # ((2, 8), ProduceRequest[9]), + # ((2, 7), FetchRequest[12]), + # ((2, 6), ListGroupsRequest[4]), + # ((2, 5), JoinGroupRequest[7]), + ((2, 6), DescribeClientQuotasRequest[0]), + ((2, 5), DescribeAclsRequest[2]), + ((2, 4), ProduceRequest[8]), + ((2, 3), FetchRequest[11]), + ((2, 2), ListOffsetsRequest[5]), + ((2, 1), FetchRequest[10]), + ((2, 0), FetchRequest[8]), + ((1, 1), FetchRequest[7]), + ((1, 0), MetadataRequest[5]), + ((0, 11), MetadataRequest[4]), ((0, 10, 2), OffsetFetchRequest[2]), ((0, 10, 1), MetadataRequest[2]), ] # Get the best match of test cases - for broker_version, struct in sorted(test_cases, reverse=True): - if struct.API_KEY not in api_versions: + for broker_version, proto_struct in sorted(test_cases, reverse=True): + if proto_struct.API_KEY not in api_versions: continue - min_version, max_version = api_versions[struct.API_KEY] - if min_version <= struct.API_VERSION <= max_version: + min_version, max_version = api_versions[proto_struct.API_KEY] + if min_version <= proto_struct.API_VERSION <= max_version: return broker_version - # We know that ApiVersionResponse is only supported in 0.10+ + # We know that ApiVersionsResponse is only supported in 0.10+ # so if all else fails, choose that return (0, 10, 0) - def check_version(self, timeout=2, strict=False, topics=[]): + def check_version(self, timeout=2, **kwargs): """Attempt to guess the broker version. + Keyword Arguments: + timeout (numeric, optional): Maximum number of seconds to block attempting + to connect and check version. Default 2 + Note: This is a blocking call. - Returns: version tuple, i.e. (0, 10), (0, 9), (0, 8, 2), ... + Returns: version tuple, i.e. (3, 9), (2, 4), etc ... + + Raises: NodeNotReadyError on timeout """ timeout_at = time.time() + timeout - log.info('Probing node %s broker version', self.node_id) - # Monkeypatch some connection configurations to avoid timeouts - override_config = { - 'request_timeout_ms': timeout * 1000, - 'max_in_flight_requests_per_connection': 5 - } - stashed = {} - for key in override_config: - stashed[key] = self.config[key] - self.config[key] = override_config[key] - - def reset_override_configs(): - for key in stashed: - self.config[key] = stashed[key] - - # kafka kills the connection when it doesn't recognize an API request - # so we can send a test request and then follow immediately with a - # vanilla MetadataRequest. 
If the server did not recognize the first - # request, both will be failed with a ConnectionError that wraps - # socket.error (32, 54, or 104) - from kafka.protocol.admin import ApiVersionRequest, ListGroupsRequest - from kafka.protocol.commit import OffsetFetchRequest, GroupCoordinatorRequest - - test_cases = [ - # All cases starting from 0.10 will be based on ApiVersionResponse - ((0, 10), ApiVersionRequest[0]()), - ((0, 9), ListGroupsRequest[0]()), - ((0, 8, 2), GroupCoordinatorRequest[0]('kafka-python-default-group')), - ((0, 8, 1), OffsetFetchRequest[0]('kafka-python-default-group', [])), - ((0, 8, 0), MetadataRequest[0](topics)), - ] - - for version, request in test_cases: - if not self.connect_blocking(timeout_at - time.time()): - reset_override_configs() - raise Errors.NodeNotReadyError() - f = self.send(request) - # HACK: sleeping to wait for socket to send bytes - time.sleep(0.1) - # when broker receives an unrecognized request API - # it abruptly closes our socket. - # so we attempt to send a second request immediately - # that we believe it will definitely recognize (metadata) - # the attempt to write to a disconnected socket should - # immediately fail and allow us to infer that the prior - # request was unrecognized - mr = self.send(MetadataRequest[0](topics)) - - selector = self.config['selector']() - selector.register(self._sock, selectors.EVENT_READ) - while not (f.is_done and mr.is_done): - selector.select(1) - for response, future in self.recv(): - future.success(response) - selector.close() - - if f.succeeded(): - if isinstance(request, ApiVersionRequest[0]): - # Starting from 0.10 kafka broker we determine version - # by looking at ApiVersionResponse - api_versions = self._handle_api_version_response(f.value) - version = self._infer_broker_version_from_api_versions(api_versions) - log.info('Broker version identified as %s', '.'.join(map(str, version))) - log.info('Set configuration api_version=%s to skip auto' - ' check_version requests on startup', version) - break - - # Only enable strict checking to verify that we understand failure - # modes. For most users, the fact that the request failed should be - # enough to rule out a particular broker version. - if strict: - # If the socket flush hack did not work (which should force the - # connection to close and fail all pending requests), then we - # get a basic Request Timeout. This is not ideal, but we'll deal - if isinstance(f.exception, Errors.RequestTimedOutError): - pass - - # 0.9 brokers do not close the socket on unrecognized api - # requests (bug...). 
In this case we expect to see a correlation - # id mismatch - elif (isinstance(f.exception, Errors.CorrelationIdError) and - version == (0, 10)): - pass - elif six.PY2: - assert isinstance(f.exception.args[0], socket.error) - assert f.exception.args[0].errno in (32, 54, 104) - else: - assert isinstance(f.exception.args[0], ConnectionError) - log.info("Broker is not v%s -- it did not recognize %s", - version, request.__class__.__name__) + if not self.connect_blocking(timeout_at - time.time()): + raise Errors.NodeNotReadyError() else: - reset_override_configs() - raise Errors.UnrecognizedBrokerVersion() - - reset_override_configs() - return version + return self._api_version def __str__(self): - return "" % ( - self.node_id, self.host, self.port, self.state, + return "" % ( + self.config['client_id'], self.node_id, self.host, self.port, self.state, AFI_NAMES[self._sock_afi], self._sock_addr) @@ -1359,6 +1349,16 @@ def __init__(self, metrics, metric_group_prefix, node_id): 'The maximum request latency in ms.'), Max()) + throttle_time = metrics.sensor('throttle-time') + throttle_time.add(metrics.metric_name( + 'throttle-time-avg', metric_group_name, + 'The average throttle time in ms.'), + Avg()) + throttle_time.add(metrics.metric_name( + 'throttle-time-max', metric_group_name, + 'The maximum throttle time in ms.'), + Max()) + # if one sensor of the metrics has been registered for the connection, # then all other sensors should have been registered; and vice versa node_str = 'node-{0}'.format(node_id) @@ -1410,9 +1410,23 @@ def __init__(self, metrics, metric_group_prefix, node_id): 'The maximum request latency in ms.'), Max()) + throttle_time = metrics.sensor( + node_str + '.throttle', + parents=[metrics.get_sensor('throttle-time')]) + throttle_time.add(metrics.metric_name( + 'throttle-time-avg', metric_group_name, + 'The average throttle time in ms.'), + Avg()) + throttle_time.add(metrics.metric_name( + 'throttle-time-max', metric_group_name, + 'The maximum throttle time in ms.'), + Max()) + + self.bytes_sent = metrics.sensor(node_str + '.bytes-sent') self.bytes_received = metrics.sensor(node_str + '.bytes-received') self.request_time = metrics.sensor(node_str + '.latency') + self.throttle_time = metrics.sensor(node_str + '.throttle') def _address_family(address): @@ -1482,32 +1496,6 @@ def get_ip_port_afi(host_and_port_str): return host, port, af -def collect_hosts(hosts, randomize=True): - """ - Collects a comma-separated set of hosts (host:port) and optionally - randomize the returned list. 
- """ - - if isinstance(hosts, six.string_types): - hosts = hosts.strip().split(',') - - result = [] - afi = socket.AF_INET - for host_port in hosts: - - host, port, afi = get_ip_port_afi(host_port) - - if port < 0: - port = DEFAULT_KAFKA_PORT - - result.append((host, port, afi)) - - if randomize: - shuffle(result) - - return result - - def is_inet_4_or_6(gai): """Given a getaddrinfo struct, return True iff ipv4 or ipv6""" return gai[0] in (socket.AF_INET, socket.AF_INET6) diff --git a/kafka/consumer/fetcher.py b/kafka/consumer/fetcher.py index 7ff9daf7b..42e2d660c 100644 --- a/kafka/consumer/fetcher.py +++ b/kafka/consumer/fetcher.py @@ -1,9 +1,9 @@ -from __future__ import absolute_import +from __future__ import absolute_import, division import collections import copy +import itertools import logging -import random import sys import time @@ -12,13 +12,14 @@ import kafka.errors as Errors from kafka.future import Future from kafka.metrics.stats import Avg, Count, Max, Rate -from kafka.protocol.fetch import FetchRequest -from kafka.protocol.offset import ( - OffsetRequest, OffsetResetStrategy, UNKNOWN_OFFSET +from kafka.protocol.fetch import FetchRequest, AbortedTransaction +from kafka.protocol.list_offsets import ( + ListOffsetsRequest, OffsetResetStrategy, UNKNOWN_OFFSET ) from kafka.record import MemoryRecords from kafka.serializer import Deserializer -from kafka.structs import TopicPartition, OffsetAndTimestamp +from kafka.structs import TopicPartition, OffsetAndMetadata, OffsetAndTimestamp +from kafka.util import Timer log = logging.getLogger(__name__) @@ -27,8 +28,13 @@ READ_UNCOMMITTED = 0 READ_COMMITTED = 1 +ISOLATION_LEVEL_CONFIG = { + 'read_uncommitted': READ_UNCOMMITTED, + 'read_committed': READ_COMMITTED, +} + ConsumerRecord = collections.namedtuple("ConsumerRecord", - ["topic", "partition", "offset", "timestamp", "timestamp_type", + ["topic", "partition", "leader_epoch", "offset", "timestamp", "timestamp_type", "key", "value", "headers", "checksum", "serialized_key_size", "serialized_value_size", "serialized_header_size"]) @@ -37,6 +43,10 @@ "partition_data", "metric_aggregator"]) +ExceptionMetadata = collections.namedtuple("ExceptionMetadata", + ["partition", "fetched_offset", "exception"]) + + class NoOffsetForPartitionError(Errors.KafkaError): pass @@ -55,13 +65,15 @@ class Fetcher(six.Iterator): 'max_partition_fetch_bytes': 1048576, 'max_poll_records': sys.maxsize, 'check_crcs': True, - 'iterator_refetch_records': 1, # undocumented -- interface may change + 'metrics': None, 'metric_group_prefix': 'consumer', - 'api_version': (0, 8, 0), - 'retry_backoff_ms': 100 + 'request_timeout_ms': 30000, + 'retry_backoff_ms': 100, + 'enable_incremental_fetch_sessions': True, + 'isolation_level': 'read_uncommitted', } - def __init__(self, client, subscriptions, metrics, **configs): + def __init__(self, client, subscriptions, **configs): """Initialize a Kafka Message Fetcher. Keyword Arguments: @@ -69,6 +81,8 @@ def __init__(self, client, subscriptions, metrics, **configs): raw message key and returns a deserialized key. value_deserializer (callable, optional): Any callable that takes a raw message value and returns a deserialized value. + enable_incremental_fetch_sessions: (bool): Use incremental fetch sessions + when available / supported by kafka broker. See KIP-227. Default: True. fetch_min_bytes (int): Minimum amount of data the server should return for a fetch request, otherwise wait up to fetch_max_wait_ms for more data to accumulate. Default: 1. 
@@ -97,20 +111,33 @@ def __init__(self, client, subscriptions, metrics, **configs): consumed. This ensures no on-the-wire or on-disk corruption to the messages occurred. This check adds some overhead, so it may be disabled in cases seeking extreme performance. Default: True + isolation_level (str): Configure KIP-98 transactional consumer by + setting to 'read_committed'. This will cause the consumer to + skip records from aborted tranactions. Default: 'read_uncommitted' """ self.config = copy.copy(self.DEFAULT_CONFIG) for key in self.config: if key in configs: self.config[key] = configs[key] + if self.config['isolation_level'] not in ISOLATION_LEVEL_CONFIG: + raise Errors.KafkaConfigurationError('Unrecognized isolation_level') + self._client = client self._subscriptions = subscriptions self._completed_fetches = collections.deque() # Unparsed responses self._next_partition_records = None # Holds a single PartitionRecords until fully consumed self._iterator = None self._fetch_futures = collections.deque() - self._sensors = FetchManagerMetrics(metrics, self.config['metric_group_prefix']) - self._isolation_level = READ_UNCOMMITTED + if self.config['metrics']: + self._sensors = FetchManagerMetrics(self.config['metrics'], self.config['metric_group_prefix']) + else: + self._sensors = None + self._isolation_level = ISOLATION_LEVEL_CONFIG[self.config['isolation_level']] + self._session_handlers = {} + self._nodes_with_pending_fetch_requests = set() + self._cached_list_offsets_exception = None + self._next_in_line_exception_metadata = None def send_fetches(self): """Send FetchRequests for all assigned partitions that do not already have @@ -120,29 +147,18 @@ def send_fetches(self): List of Futures: each future resolves to a FetchResponse """ futures = [] - for node_id, request in six.iteritems(self._create_fetch_requests()): - if self._client.ready(node_id): - log.debug("Sending FetchRequest to node %s", node_id) - future = self._client.send(node_id, request, wakeup=False) - future.add_callback(self._handle_fetch_response, request, time.time()) - future.add_errback(log.error, 'Fetch to node %s failed: %s', node_id) - futures.append(future) + for node_id, (request, fetch_offsets) in six.iteritems(self._create_fetch_requests()): + log.debug("Sending FetchRequest to node %s", node_id) + self._nodes_with_pending_fetch_requests.add(node_id) + future = self._client.send(node_id, request, wakeup=False) + future.add_callback(self._handle_fetch_response, node_id, fetch_offsets, time.time()) + future.add_errback(self._handle_fetch_error, node_id) + future.add_both(self._clear_pending_fetch_request, node_id) + futures.append(future) self._fetch_futures.extend(futures) self._clean_done_fetch_futures() return futures - def reset_offsets_if_needed(self, partitions): - """Lookup and set offsets for any partitions which are awaiting an - explicit reset. - - Arguments: - partitions (set of TopicPartitions): the partitions to reset - """ - for tp in partitions: - # TODO: If there are several offsets to reset, we could submit offset requests in parallel - if self._subscriptions.is_assigned(tp) and self._subscriptions.is_offset_reset_needed(tp): - self._reset_offset(tp) - def _clean_done_fetch_futures(self): while True: if not self._fetch_futures: @@ -156,97 +172,34 @@ def in_flight_fetches(self): self._clean_done_fetch_futures() return bool(self._fetch_futures) - def update_fetch_positions(self, partitions): - """Update the fetch positions for the provided partitions. 
- - Arguments: - partitions (list of TopicPartitions): partitions to update - - Raises: - NoOffsetForPartitionError: if no offset is stored for a given - partition and no reset policy is available - """ - # reset the fetch position to the committed position - for tp in partitions: - if not self._subscriptions.is_assigned(tp): - log.warning("partition %s is not assigned - skipping offset" - " update", tp) - continue - elif self._subscriptions.is_fetchable(tp): - log.warning("partition %s is still fetchable -- skipping offset" - " update", tp) - continue - - if self._subscriptions.is_offset_reset_needed(tp): - self._reset_offset(tp) - elif self._subscriptions.assignment[tp].committed is None: - # there's no committed position, so we need to reset with the - # default strategy - self._subscriptions.need_offset_reset(tp) - self._reset_offset(tp) - else: - committed = self._subscriptions.assignment[tp].committed.offset - log.debug("Resetting offset for partition %s to the committed" - " offset %s", tp, committed) - self._subscriptions.seek(tp, committed) - - def get_offsets_by_times(self, timestamps, timeout_ms): - offsets = self._retrieve_offsets(timestamps, timeout_ms) - for tp in timestamps: - if tp not in offsets: - offsets[tp] = None - else: - offset, timestamp = offsets[tp] - offsets[tp] = OffsetAndTimestamp(offset, timestamp) - return offsets - - def beginning_offsets(self, partitions, timeout_ms): - return self.beginning_or_end_offset( - partitions, OffsetResetStrategy.EARLIEST, timeout_ms) - - def end_offsets(self, partitions, timeout_ms): - return self.beginning_or_end_offset( - partitions, OffsetResetStrategy.LATEST, timeout_ms) - - def beginning_or_end_offset(self, partitions, timestamp, timeout_ms): - timestamps = dict([(tp, timestamp) for tp in partitions]) - offsets = self._retrieve_offsets(timestamps, timeout_ms) - for tp in timestamps: - offsets[tp] = offsets[tp][0] - return offsets - - def _reset_offset(self, partition): - """Reset offsets for the given partition using the offset reset strategy. + def reset_offsets_if_needed(self): + """Reset offsets for the given partitions using the offset reset strategy. 
Arguments: - partition (TopicPartition): the partition that needs reset offset + partitions ([TopicPartition]): the partitions that need offsets reset Raises: NoOffsetForPartitionError: if no offset reset strategy is defined + KafkaTimeoutError if timeout_ms provided """ - timestamp = self._subscriptions.assignment[partition].reset_strategy - if timestamp is OffsetResetStrategy.EARLIEST: - strategy = 'earliest' - elif timestamp is OffsetResetStrategy.LATEST: - strategy = 'latest' - else: - raise NoOffsetForPartitionError(partition) + # Raise exception from previous offset fetch if there is one + exc, self._cached_list_offsets_exception = self._cached_list_offsets_exception, None + if exc: + raise exc - log.debug("Resetting offset for partition %s to %s offset.", - partition, strategy) - offsets = self._retrieve_offsets({partition: timestamp}) + partitions = self._subscriptions.partitions_needing_reset() + if not partitions: + return - if partition in offsets: - offset = offsets[partition][0] + offset_resets = dict() + for tp in partitions: + ts = self._subscriptions.assignment[tp].reset_strategy + if ts: + offset_resets[tp] = ts - # we might lose the assignment while fetching the offset, - # so check it is still active - if self._subscriptions.is_assigned(partition): - self._subscriptions.seek(partition, offset) - else: - log.debug("Could not find offset for partition %s since it is probably deleted" % (partition,)) + self._reset_offsets_async(offset_resets) - def _retrieve_offsets(self, timestamps, timeout_ms=float("inf")): + def offsets_by_times(self, timestamps, timeout_ms=None): """Fetch offset for each partition passed in ``timestamps`` map. Blocks until offsets are obtained, a non-retriable exception is raised @@ -256,57 +209,83 @@ def _retrieve_offsets(self, timestamps, timeout_ms=float("inf")): timestamps: {TopicPartition: int} dict with timestamps to fetch offsets by. -1 for the latest available, -2 for the earliest available. Otherwise timestamp is treated as epoch milliseconds. + timeout_ms (int, optional): The maximum time in milliseconds to block. Returns: - {TopicPartition: (int, int)}: Mapping of partition to - retrieved offset and timestamp. If offset does not exist for + {TopicPartition: OffsetAndTimestamp}: Mapping of partition to + retrieved offset, timestamp, and leader_epoch. If offset does not exist for the provided timestamp, that partition will be missing from this mapping. 
+ + Raises: + KafkaTimeoutError if timeout_ms provided """ + offsets = self._fetch_offsets_by_times(timestamps, timeout_ms) + for tp in timestamps: + if tp not in offsets: + offsets[tp] = None + return offsets + + def _fetch_offsets_by_times(self, timestamps, timeout_ms=None): if not timestamps: return {} - start_time = time.time() - remaining_ms = timeout_ms + timer = Timer(timeout_ms, "Failed to get offsets by timestamps in %s ms" % (timeout_ms,)) timestamps = copy.copy(timestamps) - while remaining_ms > 0: + fetched_offsets = dict() + while True: if not timestamps: return {} - future = self._send_offset_requests(timestamps) - self._client.poll(future=future, timeout_ms=remaining_ms) + future = self._send_list_offsets_requests(timestamps) + self._client.poll(future=future, timeout_ms=timer.timeout_ms) + + # Timeout w/o future completion + if not future.is_done: + break if future.succeeded(): - return future.value - if not future.retriable(): - raise future.exception # pylint: disable-msg=raising-bad-type + fetched_offsets.update(future.value[0]) + if not future.value[1]: + return fetched_offsets - elapsed_ms = (time.time() - start_time) * 1000 - remaining_ms = timeout_ms - elapsed_ms - if remaining_ms < 0: - break + timestamps = {tp: timestamps[tp] for tp in future.value[1]} - if future.exception.invalid_metadata: + elif not future.retriable(): + raise future.exception # pylint: disable-msg=raising-bad-type + + if future.exception.invalid_metadata or self._client.cluster.need_update: refresh_future = self._client.cluster.request_update() - self._client.poll(future=refresh_future, timeout_ms=remaining_ms) - - # Issue #1780 - # Recheck partition existence after after a successful metadata refresh - if refresh_future.succeeded() and isinstance(future.exception, Errors.StaleMetadata): - log.debug("Stale metadata was raised, and we now have an updated metadata. Rechecking partition existence") - unknown_partition = future.exception.args[0] # TopicPartition from StaleMetadata - if self._client.cluster.leader_for_partition(unknown_partition) is None: - log.debug("Removed partition %s from offsets retrieval" % (unknown_partition, )) - timestamps.pop(unknown_partition) + self._client.poll(future=refresh_future, timeout_ms=timer.timeout_ms) + + if not future.is_done: + break else: - time.sleep(self.config['retry_backoff_ms'] / 1000.0) + if timer.timeout_ms is None or timer.timeout_ms > self.config['retry_backoff_ms']: + time.sleep(self.config['retry_backoff_ms'] / 1000) + else: + time.sleep(timer.timeout_ms / 1000) - elapsed_ms = (time.time() - start_time) * 1000 - remaining_ms = timeout_ms - elapsed_ms + timer.maybe_raise() raise Errors.KafkaTimeoutError( "Failed to get offsets by timestamps in %s ms" % (timeout_ms,)) + def beginning_offsets(self, partitions, timeout_ms): + return self.beginning_or_end_offset( + partitions, OffsetResetStrategy.EARLIEST, timeout_ms) + + def end_offsets(self, partitions, timeout_ms): + return self.beginning_or_end_offset( + partitions, OffsetResetStrategy.LATEST, timeout_ms) + + def beginning_or_end_offset(self, partitions, timestamp, timeout_ms): + timestamps = dict([(tp, timestamp) for tp in partitions]) + offsets = self._fetch_offsets_by_times(timestamps, timeout_ms) + for tp in timestamps: + offsets[tp] = offsets[tp].offset + return offsets + def fetched_records(self, max_records=None, update_offsets=True): """Returns previously fetched records and updates consumed offsets. 
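
Editor's note: the timestamp lookups above surface through the public consumer API as offsets_for_times, beginning_offsets, and end_offsets. A hedged usage sketch follows; the broker address and topic name are placeholders, and the assumption is that these KafkaConsumer methods delegate to the Fetcher calls shown here.

import time

from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(bootstrap_servers='localhost:9092')  # placeholder broker
tp = TopicPartition('my-topic', 0)                            # placeholder topic

# Look up the earliest offset whose timestamp is >= one hour ago (epoch milliseconds).
one_hour_ago_ms = int(time.time() * 1000) - 60 * 60 * 1000
by_time = consumer.offsets_for_times({tp: one_hour_ago_ms})   # {tp: OffsetAndTimestamp or None}

earliest = consumer.beginning_offsets([tp])[tp]  # first available offset
latest = consumer.end_offsets([tp])[tp]          # offset of the next message to be written
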
@@ -316,7 +295,7 @@ def fetched_records(self, max_records=None, update_offsets=True): Raises: OffsetOutOfRangeError: if no subscription offset_reset_strategy - CorruptRecordException: if message crc validation fails (check_crcs + CorruptRecordError: if message crc validation fails (check_crcs must be set to True) RecordTooLargeError: if a message is larger than the currently configured max_partition_fetch_bytes @@ -333,20 +312,40 @@ def fetched_records(self, max_records=None, update_offsets=True): max_records = self.config['max_poll_records'] assert max_records > 0 + if self._next_in_line_exception_metadata is not None: + exc_meta = self._next_in_line_exception_metadata + self._next_in_line_exception_metadata = None + tp = exc_meta.partition + if self._subscriptions.is_fetchable(tp) and self._subscriptions.position(tp).offset == exc_meta.fetched_offset: + raise exc_meta.exception + drained = collections.defaultdict(list) records_remaining = max_records + # Needed to construct ExceptionMetadata if any exception is found when processing completed_fetch + fetched_partition = None + fetched_offset = -1 - while records_remaining > 0: - if not self._next_partition_records: - if not self._completed_fetches: - break - completion = self._completed_fetches.popleft() - self._next_partition_records = self._parse_fetched_data(completion) - else: - records_remaining -= self._append(drained, - self._next_partition_records, - records_remaining, - update_offsets) + try: + while records_remaining > 0: + if not self._next_partition_records: + if not self._completed_fetches: + break + completion = self._completed_fetches.popleft() + fetched_partition = completion.topic_partition + fetched_offset = completion.fetched_offset + self._next_partition_records = self._parse_fetched_data(completion) + else: + fetched_partition = self._next_partition_records.topic_partition + fetched_offset = self._next_partition_records.next_fetch_offset + records_remaining -= self._append(drained, + self._next_partition_records, + records_remaining, + update_offsets) + except Exception as e: + if not drained: + raise e + # To be thrown in the next call of this method + self._next_in_line_exception_metadata = ExceptionMetadata(fetched_partition, fetched_offset, e) return dict(drained), bool(self._completed_fetches) def _append(self, drained, part, max_records, update_offsets): @@ -354,163 +353,101 @@ def _append(self, drained, part, max_records, update_offsets): return 0 tp = part.topic_partition - fetch_offset = part.fetch_offset if not self._subscriptions.is_assigned(tp): # this can happen when a rebalance happened before # fetched records are returned to the consumer's poll call log.debug("Not returning fetched records for partition %s" " since it is no longer assigned", tp) + elif not self._subscriptions.is_fetchable(tp): + # this can happen when a partition is paused before + # fetched records are returned to the consumer's poll call + log.debug("Not returning fetched records for assigned partition" + " %s since it is no longer fetchable", tp) + else: # note that the position should always be available # as long as the partition is still assigned position = self._subscriptions.assignment[tp].position - if not self._subscriptions.is_fetchable(tp): - # this can happen when a partition is paused before - # fetched records are returned to the consumer's poll call - log.debug("Not returning fetched records for assigned partition" - " %s since it is no longer fetchable", tp) - - elif fetch_offset == position: - # we are ensured to 
have at least one record since we already checked for emptiness + if part.next_fetch_offset == position.offset: + log.debug("Returning fetched records at offset %d for assigned" + " partition %s", position.offset, tp) part_records = part.take(max_records) - next_offset = part_records[-1].offset + 1 - - log.log(0, "Returning fetched records at offset %d for assigned" - " partition %s and update position to %s", position, - tp, next_offset) - - for record in part_records: - drained[tp].append(record) - - if update_offsets: - self._subscriptions.assignment[tp].position = next_offset + # list.extend([]) is a noop, but because drained is a defaultdict + # we should avoid initializing the default list unless there are records + if part_records: + drained[tp].extend(part_records) + # We want to increment subscription position if (1) we're using consumer.poll(), + # or (2) we didn't return any records (consumer iterator will update position + # when each message is yielded). There may be edge cases where we re-fetch records + # that we'll end up skipping, but for now we'll live with that. + highwater = self._subscriptions.assignment[tp].highwater + if highwater is not None and self._sensors: + self._sensors.records_fetch_lag.record(highwater - part.next_fetch_offset) + if update_offsets or not part_records: + # TODO: save leader_epoch + log.debug("Updating fetch position for assigned partition %s to %s (leader epoch %s)", + tp, part.next_fetch_offset, part.leader_epoch) + self._subscriptions.assignment[tp].position = OffsetAndMetadata(part.next_fetch_offset, '', -1) return len(part_records) else: # these records aren't next in line based on the last consumed # position, ignore them they must be from an obsolete request log.debug("Ignoring fetched records for %s at offset %s since" - " the current position is %d", tp, part.fetch_offset, - position) + " the current position is %d", tp, part.next_fetch_offset, + position.offset) - part.discard() + part.drain() return 0 - def _message_generator(self): - """Iterate over fetched_records""" - while self._next_partition_records or self._completed_fetches: - - if not self._next_partition_records: - completion = self._completed_fetches.popleft() - self._next_partition_records = self._parse_fetched_data(completion) - continue - - # Send additional FetchRequests when the internal queue is low - # this should enable moderate pipelining - if len(self._completed_fetches) <= self.config['iterator_refetch_records']: - self.send_fetches() - - tp = self._next_partition_records.topic_partition - - # We can ignore any prior signal to drop pending message sets - # because we are starting from a fresh one where fetch_offset == position - # i.e., the user seek()'d to this position - self._subscriptions.assignment[tp].drop_pending_message_set = False - - for msg in self._next_partition_records.take(): - - # Because we are in a generator, it is possible for - # subscription state to change between yield calls - # so we need to re-check on each loop - # this should catch assignment changes, pauses - # and resets via seek_to_beginning / seek_to_end - if not self._subscriptions.is_fetchable(tp): - log.debug("Not returning fetched records for partition %s" - " since it is no longer fetchable", tp) - self._next_partition_records = None - break - - # If there is a seek during message iteration, - # we should stop unpacking this message set and - # wait for a new fetch response that aligns with the - # new seek position - elif 
self._subscriptions.assignment[tp].drop_pending_message_set: - log.debug("Skipping remainder of message set for partition %s", tp) - self._subscriptions.assignment[tp].drop_pending_message_set = False - self._next_partition_records = None - break - - # Compressed messagesets may include earlier messages - elif msg.offset < self._subscriptions.assignment[tp].position: - log.debug("Skipping message offset: %s (expecting %s)", - msg.offset, - self._subscriptions.assignment[tp].position) - continue - - self._subscriptions.assignment[tp].position = msg.offset + 1 - yield msg - - self._next_partition_records = None + def _reset_offset_if_needed(self, partition, timestamp, offset): + # we might lose the assignment while fetching the offset, or the user might seek to a different offset, + # so verify it is still assigned and still in need of the requested reset + if not self._subscriptions.is_assigned(partition): + log.debug("Skipping reset of partition %s since it is no longer assigned", partition) + elif not self._subscriptions.is_offset_reset_needed(partition): + log.debug("Skipping reset of partition %s since reset is no longer needed", partition) + elif timestamp and not timestamp == self._subscriptions.assignment[partition].reset_strategy: + log.debug("Skipping reset of partition %s since an alternative reset has been requested", partition) + else: + log.info("Resetting offset for partition %s to offset %s.", partition, offset) + self._subscriptions.seek(partition, offset) - def _unpack_message_set(self, tp, records): - try: - batch = records.next_batch() - while batch is not None: - - # LegacyRecordBatch cannot access either base_offset or last_offset_delta - try: - self._subscriptions.assignment[tp].last_offset_from_message_batch = batch.base_offset + \ - batch.last_offset_delta - except AttributeError: - pass - - for record in batch: - key_size = len(record.key) if record.key is not None else -1 - value_size = len(record.value) if record.value is not None else -1 - key = self._deserialize( - self.config['key_deserializer'], - tp.topic, record.key) - value = self._deserialize( - self.config['value_deserializer'], - tp.topic, record.value) - headers = record.headers - header_size = sum( - len(h_key.encode("utf-8")) + (len(h_val) if h_val is not None else 0) for h_key, h_val in - headers) if headers else -1 - yield ConsumerRecord( - tp.topic, tp.partition, record.offset, record.timestamp, - record.timestamp_type, key, value, headers, record.checksum, - key_size, value_size, header_size) + def _reset_offsets_async(self, timestamps): + timestamps_by_node = self._group_list_offset_requests(timestamps) - batch = records.next_batch() + for node_id, timestamps_and_epochs in six.iteritems(timestamps_by_node): + if not self._client.ready(node_id): + continue + partitions = set(timestamps_and_epochs.keys()) + expire_at = time.time() + self.config['request_timeout_ms'] / 1000 + self._subscriptions.set_reset_pending(partitions, expire_at) + + def on_success(timestamps_and_epochs, result): + fetched_offsets, partitions_to_retry = result + if partitions_to_retry: + self._subscriptions.reset_failed(partitions_to_retry, time.time() + self.config['retry_backoff_ms'] / 1000) + self._client.cluster.request_update() + + for partition, offset in six.iteritems(fetched_offsets): + ts, _epoch = timestamps_and_epochs[partition] + self._reset_offset_if_needed(partition, ts, offset.offset) + + def on_failure(partitions, error): + self._subscriptions.reset_failed(partitions, time.time() + 
self.config['retry_backoff_ms'] / 1000) + self._client.cluster.request_update() - # If unpacking raises StopIteration, it is erroneously - # caught by the generator. We want all exceptions to be raised - # back to the user. See Issue 545 - except StopIteration as e: - log.exception('StopIteration raised unpacking messageset') - raise RuntimeError('StopIteration raised unpacking messageset') + if not getattr(error, 'retriable', False): + if not self._cached_list_offsets_exception: + self._cached_list_offsets_exception = error + else: + log.error("Discarding error in ListOffsetResponse because another error is pending: %s", error) - def __iter__(self): # pylint: disable=non-iterator-returned - return self + future = self._send_list_offsets_request(node_id, timestamps_and_epochs) + future.add_callback(on_success, timestamps_and_epochs) + future.add_errback(on_failure, partitions) - def __next__(self): - if not self._iterator: - self._iterator = self._message_generator() - try: - return next(self._iterator) - except StopIteration: - self._iterator = None - raise - - def _deserialize(self, f, topic, bytes_): - if not f: - return bytes_ - if isinstance(f, Deserializer): - return f.deserialize(topic, bytes_) - return f(bytes_) - - def _send_offset_requests(self, timestamps): + def _send_list_offsets_requests(self, timestamps): """Fetch offsets for each partition in timestamps dict. This may send request to multiple nodes, based on who is Leader for partition. @@ -521,80 +458,98 @@ def _send_offset_requests(self, timestamps): Returns: Future: resolves to a mapping of retrieved offsets """ - timestamps_by_node = collections.defaultdict(dict) - for partition, timestamp in six.iteritems(timestamps): - node_id = self._client.cluster.leader_for_partition(partition) - if node_id is None: - self._client.add_topic(partition.topic) - log.debug("Partition %s is unknown for fetching offset," - " wait for metadata refresh", partition) - return Future().failure(Errors.StaleMetadata(partition)) - elif node_id == -1: - log.debug("Leader for partition %s unavailable for fetching " - "offset, wait for metadata refresh", partition) - return Future().failure( - Errors.LeaderNotAvailableError(partition)) - else: - timestamps_by_node[node_id][partition] = timestamp + timestamps_by_node = self._group_list_offset_requests(timestamps) + if not timestamps_by_node: + return Future().failure(Errors.StaleMetadata()) - # Aggregate results until we have all + # Aggregate results until we have all responses list_offsets_future = Future() - responses = [] - node_count = len(timestamps_by_node) + fetched_offsets = dict() + partitions_to_retry = set() + remaining_responses = [len(timestamps_by_node)] # list for mutable / 2.7 hack - def on_success(value): - responses.append(value) - if len(responses) == node_count: - offsets = {} - for r in responses: - offsets.update(r) - list_offsets_future.success(offsets) + def on_success(remaining_responses, value): + remaining_responses[0] -= 1 # noqa: F823 + fetched_offsets.update(value[0]) + partitions_to_retry.update(value[1]) + if not remaining_responses[0] and not list_offsets_future.is_done: + list_offsets_future.success((fetched_offsets, partitions_to_retry)) def on_fail(err): if not list_offsets_future.is_done: list_offsets_future.failure(err) for node_id, timestamps in six.iteritems(timestamps_by_node): - _f = self._send_offset_request(node_id, timestamps) - _f.add_callback(on_success) + _f = self._send_list_offsets_request(node_id, timestamps) + _f.add_callback(on_success, 
remaining_responses) _f.add_errback(on_fail) return list_offsets_future - def _send_offset_request(self, node_id, timestamps): + def _group_list_offset_requests(self, timestamps): + timestamps_by_node = collections.defaultdict(dict) + for partition, timestamp in six.iteritems(timestamps): + node_id = self._client.cluster.leader_for_partition(partition) + if node_id is None: + self._client.add_topic(partition.topic) + log.debug("Partition %s is unknown for fetching offset", partition) + self._client.cluster.request_update() + elif node_id == -1: + log.debug("Leader for partition %s unavailable for fetching " + "offset, wait for metadata refresh", partition) + self._client.cluster.request_update() + else: + leader_epoch = -1 + timestamps_by_node[node_id][partition] = (timestamp, leader_epoch) + return dict(timestamps_by_node) + + def _send_list_offsets_request(self, node_id, timestamps_and_epochs): + version = self._client.api_version(ListOffsetsRequest, max_version=4) + if self.config['isolation_level'] == 'read_committed' and version < 2: + raise Errors.UnsupportedVersionError('read_committed isolation level requires ListOffsetsRequest >= v2') by_topic = collections.defaultdict(list) - for tp, timestamp in six.iteritems(timestamps): - if self.config['api_version'] >= (0, 10, 1): + for tp, (timestamp, leader_epoch) in six.iteritems(timestamps_and_epochs): + if version >= 4: + data = (tp.partition, leader_epoch, timestamp) + elif version >= 1: data = (tp.partition, timestamp) else: data = (tp.partition, timestamp, 1) by_topic[tp.topic].append(data) - if self.config['api_version'] >= (0, 10, 1): - request = OffsetRequest[1](-1, list(six.iteritems(by_topic))) + if version <= 1: + request = ListOffsetsRequest[version]( + -1, + list(six.iteritems(by_topic))) else: - request = OffsetRequest[0](-1, list(six.iteritems(by_topic))) + request = ListOffsetsRequest[version]( + -1, + self._isolation_level, + list(six.iteritems(by_topic))) # Client returns a future that only fails on network issues # so create a separate future and attach a callback to update it # based on response error codes future = Future() + log.debug("Sending ListOffsetRequest %s to broker %s", request, node_id) _f = self._client.send(node_id, request) - _f.add_callback(self._handle_offset_response, future) + _f.add_callback(self._handle_list_offsets_response, future) _f.add_errback(lambda e: future.failure(e)) return future - def _handle_offset_response(self, future, response): - """Callback for the response of the list offset call above. 
+ def _handle_list_offsets_response(self, future, response): + """Callback for the response of the ListOffsets api call Arguments: future (Future): the future to update based on response - response (OffsetResponse): response from the server + response (ListOffsetsResponse): response from the server Raises: AssertionError: if response does not match partition """ - timestamp_offset_map = {} + fetched_offsets = dict() + partitions_to_retry = set() + unauthorized_topics = set() for topic, part_data in response.topics: for partition_info in part_data: partition, error_code = partition_info[:2] @@ -603,58 +558,60 @@ def _handle_offset_response(self, future, response): if error_type is Errors.NoError: if response.API_VERSION == 0: offsets = partition_info[2] - assert len(offsets) <= 1, 'Expected OffsetResponse with one offset' + assert len(offsets) <= 1, 'Expected ListOffsetsResponse with one offset' if not offsets: offset = UNKNOWN_OFFSET else: offset = offsets[0] - log.debug("Handling v0 ListOffsetResponse response for %s. " - "Fetched offset %s", partition, offset) - if offset != UNKNOWN_OFFSET: - timestamp_offset_map[partition] = (offset, None) - else: + timestamp = None + leader_epoch = -1 + elif response.API_VERSION <= 3: timestamp, offset = partition_info[2:] - log.debug("Handling ListOffsetResponse response for %s. " - "Fetched offset %s, timestamp %s", - partition, offset, timestamp) - if offset != UNKNOWN_OFFSET: - timestamp_offset_map[partition] = (offset, timestamp) + leader_epoch = -1 + else: + timestamp, offset, leader_epoch = partition_info[2:] + log.debug("Handling ListOffsetsResponse response for %s. " + "Fetched offset %s, timestamp %s, leader_epoch %s", + partition, offset, timestamp, leader_epoch) + if offset != UNKNOWN_OFFSET: + fetched_offsets[partition] = OffsetAndTimestamp(offset, timestamp, leader_epoch) elif error_type is Errors.UnsupportedForMessageFormatError: - # The message format on the broker side is before 0.10.0, - # we simply put None in the response. + # The message format on the broker side is before 0.10.0, which means it does not + # support timestamps. We treat this case the same as if we weren't able to find an + # offset corresponding to the requested timestamp and leave it out of the result. log.debug("Cannot search by timestamp for partition %s because the" " message format version is before 0.10.0", partition) - elif error_type is Errors.NotLeaderForPartitionError: + elif error_type in (Errors.NotLeaderForPartitionError, + Errors.ReplicaNotAvailableError, + Errors.KafkaStorageError): log.debug("Attempt to fetch offsets for partition %s failed due" - " to obsolete leadership information, retrying.", - partition) - future.failure(error_type(partition)) - return + " to %s, retrying.", error_type.__name__, partition) + partitions_to_retry.add(partition) elif error_type is Errors.UnknownTopicOrPartitionError: - log.warning("Received unknown topic or partition error in ListOffset " - "request for partition %s. The topic/partition " + - "may not exist or the user may not have Describe access " - "to it.", partition) - future.failure(error_type(partition)) - return + log.warning("Received unknown topic or partition error in ListOffsets " + "request for partition %s. 
The topic/partition " + + "may not exist or the user may not have Describe access " + "to it.", partition) + partitions_to_retry.add(partition) + elif error_type is Errors.TopicAuthorizationFailedError: + unauthorized_topics.add(topic) else: log.warning("Attempt to fetch offsets for partition %s failed due to:" - " %s", partition, error_type) - future.failure(error_type(partition)) - return - if not future.is_done: - future.success(timestamp_offset_map) + " %s", partition, error_type.__name__) + partitions_to_retry.add(partition) + if unauthorized_topics: + future.failure(Errors.TopicAuthorizationFailedError(unauthorized_topics)) + else: + future.success((fetched_offsets, partitions_to_retry)) def _fetchable_partitions(self): fetchable = self._subscriptions.fetchable_partitions() # do not fetch a partition if we have a pending fetch response to process + discard = {fetch.topic_partition for fetch in self._completed_fetches} current = self._next_partition_records - pending = copy.copy(self._completed_fetches) if current: - fetchable.discard(current.topic_partition) - for fetch in pending: - fetchable.discard(fetch.topic_partition) - return fetchable + discard.add(current.topic_partition) + return [tp for tp in fetchable if tp not in discard] def _create_fetch_requests(self): """Create fetch requests for all assigned partitions, grouped by node. @@ -662,25 +619,16 @@ def _create_fetch_requests(self): FetchRequests skipped if no leader, or node has requests in flight Returns: - dict: {node_id: FetchRequest, ...} (version depends on api_version) + dict: {node_id: (FetchRequest, {TopicPartition: fetch_offset}), ...} (version depends on client api_versions) """ # create the fetch info as a dict of lists of partition info tuples # which can be passed to FetchRequest() via .items() - fetchable = collections.defaultdict(lambda: collections.defaultdict(list)) + version = self._client.api_version(FetchRequest, max_version=10) + fetchable = collections.defaultdict(collections.OrderedDict) for partition in self._fetchable_partitions(): node_id = self._client.cluster.leader_for_partition(partition) - # advance position for any deleted compacted messages if required - if self._subscriptions.assignment[partition].last_offset_from_message_batch: - next_offset_from_batch_header = self._subscriptions.assignment[partition].last_offset_from_message_batch + 1 - if next_offset_from_batch_header > self._subscriptions.assignment[partition].position: - log.debug( - "Advance position for partition %s from %s to %s (last message batch location plus one)" - " to correct for deleted compacted messages", - partition, self._subscriptions.assignment[partition].position, next_offset_from_batch_header) - self._subscriptions.assignment[partition].position = next_offset_from_batch_header - position = self._subscriptions.assignment[partition].position # fetch if there is a leader and no in-flight requests @@ -689,104 +637,161 @@ def _create_fetch_requests(self): " Requesting metadata update", partition) self._client.cluster.request_update() - elif self._client.in_flight_request_count(node_id) == 0: - partition_info = ( - partition.partition, - position, - self.config['max_partition_fetch_bytes'] - ) - fetchable[node_id][partition.topic].append(partition_info) - log.debug("Adding fetch request for partition %s at offset %d", - partition, position) - else: - log.log(0, "Skipping fetch for partition %s because there is an inflight request to node %s", + elif not self._client.connected(node_id) and 
self._client.connection_delay(node_id) > 0: + # If we try to send during the reconnect backoff window, then the request is just + # going to be failed anyway before being sent, so skip the send for now + log.debug("Skipping fetch for partition %s because node %s is awaiting reconnect backoff", partition, node_id) - if self.config['api_version'] >= (0, 11, 0): - version = 4 - elif self.config['api_version'] >= (0, 10, 1): - version = 3 - elif self.config['api_version'] >= (0, 10): - version = 2 - elif self.config['api_version'] == (0, 9): - version = 1 - else: - version = 0 + elif self._client.throttle_delay(node_id) > 0: + # If we try to send while throttled, then the request is just + # going to be failed anyway before being sent, so skip the send for now + log.debug("Skipping fetch for partition %s because node %s is throttled", + partition, node_id) + + elif not self._client.ready(node_id): + # Until we support send request queues, any attempt to send to a not-ready node will be + # immediately failed with NodeNotReadyError. + log.debug("Skipping fetch for partition %s because connection to leader node is not ready yet") + + elif node_id in self._nodes_with_pending_fetch_requests: + log.debug("Skipping fetch for partition %s because there is a pending fetch request to node %s", + partition, node_id) + + else: + # Leader is connected and does not have a pending fetch request + if version < 5: + partition_info = ( + partition.partition, + position.offset, + self.config['max_partition_fetch_bytes'] + ) + elif version <= 8: + partition_info = ( + partition.partition, + position.offset, + -1, # log_start_offset is used internally by brokers / replicas only + self.config['max_partition_fetch_bytes'], + ) + else: + partition_info = ( + partition.partition, + position.leader_epoch, + position.offset, + -1, # log_start_offset is used internally by brokers / replicas only + self.config['max_partition_fetch_bytes'], + ) + + fetchable[node_id][partition] = partition_info + log.debug("Adding fetch request for partition %s at offset %d", + partition, position.offset) + requests = {} - for node_id, partition_data in six.iteritems(fetchable): - if version < 3: - requests[node_id] = FetchRequest[version]( + for node_id, next_partitions in six.iteritems(fetchable): + if version >= 7 and self.config['enable_incremental_fetch_sessions']: + if node_id not in self._session_handlers: + self._session_handlers[node_id] = FetchSessionHandler(node_id) + session = self._session_handlers[node_id].build_next(next_partitions) + else: + # No incremental fetch support + session = FetchRequestData(next_partitions, None, FetchMetadata.LEGACY) + + if version <= 2: + request = FetchRequest[version]( + -1, # replica_id + self.config['fetch_max_wait_ms'], + self.config['fetch_min_bytes'], + session.to_send) + elif version == 3: + request = FetchRequest[version]( + -1, # replica_id + self.config['fetch_max_wait_ms'], + self.config['fetch_min_bytes'], + self.config['fetch_max_bytes'], + session.to_send) + elif version <= 6: + request = FetchRequest[version]( -1, # replica_id self.config['fetch_max_wait_ms'], self.config['fetch_min_bytes'], - partition_data.items()) + self.config['fetch_max_bytes'], + self._isolation_level, + session.to_send) else: - # As of version == 3 partitions will be returned in order as - # they are requested, so to avoid starvation with - # `fetch_max_bytes` option we need this shuffle - # NOTE: we do have partition_data in random order due to usage - # of unordered structures like dicts, but that does 
not - # guarantee equal distribution, and starting in Python3.6 - # dicts retain insert order. - partition_data = list(partition_data.items()) - random.shuffle(partition_data) - if version == 3: - requests[node_id] = FetchRequest[version]( - -1, # replica_id - self.config['fetch_max_wait_ms'], - self.config['fetch_min_bytes'], - self.config['fetch_max_bytes'], - partition_data) + # Through v8 + request = FetchRequest[version]( + -1, # replica_id + self.config['fetch_max_wait_ms'], + self.config['fetch_min_bytes'], + self.config['fetch_max_bytes'], + self._isolation_level, + session.id, + session.epoch, + session.to_send, + session.to_forget) + + fetch_offsets = {} + for tp, partition_data in six.iteritems(next_partitions): + if version <= 8: + offset = partition_data[1] else: - requests[node_id] = FetchRequest[version]( - -1, # replica_id - self.config['fetch_max_wait_ms'], - self.config['fetch_min_bytes'], - self.config['fetch_max_bytes'], - self._isolation_level, - partition_data) + offset = partition_data[2] + fetch_offsets[tp] = offset + + requests[node_id] = (request, fetch_offsets) + return requests - def _handle_fetch_response(self, request, send_time, response): + def _handle_fetch_response(self, node_id, fetch_offsets, send_time, response): """The callback for fetch completion""" - fetch_offsets = {} - for topic, partitions in request.topics: - for partition_data in partitions: - partition, offset = partition_data[:2] - fetch_offsets[TopicPartition(topic, partition)] = offset + if response.API_VERSION >= 7 and self.config['enable_incremental_fetch_sessions']: + if node_id not in self._session_handlers: + log.error("Unable to find fetch session handler for node %s. Ignoring fetch response", node_id) + return + if not self._session_handlers[node_id].handle_response(response): + return partitions = set([TopicPartition(topic, partition_data[0]) for topic, partitions in response.topics for partition_data in partitions]) - metric_aggregator = FetchResponseMetricAggregator(self._sensors, partitions) + if self._sensors: + metric_aggregator = FetchResponseMetricAggregator(self._sensors, partitions) + else: + metric_aggregator = None - # randomized ordering should improve balance for short-lived consumers - random.shuffle(response.topics) for topic, partitions in response.topics: - random.shuffle(partitions) for partition_data in partitions: tp = TopicPartition(topic, partition_data[0]) + fetch_offset = fetch_offsets[tp] completed_fetch = CompletedFetch( - tp, fetch_offsets[tp], + tp, fetch_offset, response.API_VERSION, partition_data[1:], metric_aggregator ) self._completed_fetches.append(completed_fetch) - if response.API_VERSION >= 1: - self._sensors.fetch_throttle_time_sensor.record(response.throttle_time_ms) - self._sensors.fetch_latency.record((time.time() - send_time) * 1000) + if self._sensors: + self._sensors.fetch_latency.record((time.time() - send_time) * 1000) + + def _handle_fetch_error(self, node_id, exception): + level = logging.INFO if isinstance(exception, Errors.Cancelled) else logging.ERROR + log.log(level, 'Fetch to node %s failed: %s', node_id, exception) + if node_id in self._session_handlers: + self._session_handlers[node_id].handle_error(exception) + + def _clear_pending_fetch_request(self, node_id, _): + try: + self._nodes_with_pending_fetch_requests.remove(node_id) + except KeyError: + pass def _parse_fetched_data(self, completed_fetch): tp = completed_fetch.topic_partition fetch_offset = completed_fetch.fetched_offset - num_bytes = 0 - records_count = 0 - 
parsed_records = None - error_code, highwater = completed_fetch.partition_data[:2] error_type = Errors.for_code(error_code) + parsed_records = None try: if not self._subscriptions.is_fetchable(tp): @@ -796,118 +801,498 @@ def _parse_fetched_data(self, completed_fetch): " since it is no longer fetchable", tp) elif error_type is Errors.NoError: - self._subscriptions.assignment[tp].highwater = highwater - # we are interested in this fetch only if the beginning # offset (of the *request*) matches the current consumed position # Note that the *response* may return a messageset that starts # earlier (e.g., compressed messages) or later (e.g., compacted topic) position = self._subscriptions.assignment[tp].position - if position is None or position != fetch_offset: + if position is None or position.offset != fetch_offset: log.debug("Discarding fetch response for partition %s" " since its offset %d does not match the" " expected offset %d", tp, fetch_offset, - position) + position.offset) return None records = MemoryRecords(completed_fetch.partition_data[-1]) - if records.has_next(): - log.debug("Adding fetched record for partition %s with" - " offset %d to buffered record list", tp, - position) - unpacked = list(self._unpack_message_set(tp, records)) - parsed_records = self.PartitionRecords(fetch_offset, tp, unpacked) - if unpacked: - last_offset = unpacked[-1].offset - self._sensors.records_fetch_lag.record(highwater - last_offset) - num_bytes = records.valid_bytes() - records_count = len(unpacked) - elif records.size_in_bytes() > 0: - # we did not read a single message from a non-empty - # buffer because that message's size is larger than - # fetch size, in this case record this exception - record_too_large_partitions = {tp: fetch_offset} - raise RecordTooLargeError( - "There are some messages at [Partition=Offset]: %s " - " whose size is larger than the fetch size %s" - " and hence cannot be ever returned." - " Increase the fetch size, or decrease the maximum message" - " size the broker will allow." % ( - record_too_large_partitions, - self.config['max_partition_fetch_bytes']), - record_too_large_partitions) - self._sensors.record_topic_fetch_metrics(tp.topic, num_bytes, records_count) + aborted_transactions = None + if completed_fetch.response_version >= 11: + aborted_transactions = completed_fetch.partition_data[-3] + elif completed_fetch.response_version >= 4: + aborted_transactions = completed_fetch.partition_data[-2] + log.debug("Preparing to read %s bytes of data for partition %s with offset %d", + records.size_in_bytes(), tp, fetch_offset) + parsed_records = self.PartitionRecords(fetch_offset, tp, records, + key_deserializer=self.config['key_deserializer'], + value_deserializer=self.config['value_deserializer'], + check_crcs=self.config['check_crcs'], + isolation_level=self._isolation_level, + aborted_transactions=aborted_transactions, + metric_aggregator=completed_fetch.metric_aggregator, + on_drain=self._on_partition_records_drain) + if not records.has_next() and records.size_in_bytes() > 0: + if completed_fetch.response_version < 3: + # Implement the pre KIP-74 behavior of throwing a RecordTooLargeException. + record_too_large_partitions = {tp: fetch_offset} + raise RecordTooLargeError( + "There are some messages at [Partition=Offset]: %s " + " whose size is larger than the fetch size %s" + " and hence cannot be ever returned. Please condier upgrading your broker to 0.10.1.0 or" + " newer to avoid this issue. 
Alternatively, increase the fetch size on the client (using" + " max_partition_fetch_bytes)" % ( + record_too_large_partitions, + self.config['max_partition_fetch_bytes']), + record_too_large_partitions) + else: + # This should not happen with brokers that support FetchRequest/Response V3 or higher (i.e. KIP-74) + raise Errors.KafkaError("Failed to make progress reading messages at %s=%s." + " Received a non-empty fetch response from the server, but no" + " complete records were found." % (tp, fetch_offset)) + + if highwater >= 0: + self._subscriptions.assignment[tp].highwater = highwater elif error_type in (Errors.NotLeaderForPartitionError, - Errors.UnknownTopicOrPartitionError): + Errors.ReplicaNotAvailableError, + Errors.UnknownTopicOrPartitionError, + Errors.KafkaStorageError): + log.debug("Error fetching partition %s: %s", tp, error_type.__name__) self._client.cluster.request_update() elif error_type is Errors.OffsetOutOfRangeError: position = self._subscriptions.assignment[tp].position - if position is None or position != fetch_offset: + if position is None or position.offset != fetch_offset: log.debug("Discarding stale fetch response for partition %s" " since the fetched offset %d does not match the" - " current offset %d", tp, fetch_offset, position) + " current offset %d", tp, fetch_offset, position.offset) elif self._subscriptions.has_default_offset_reset_policy(): log.info("Fetch offset %s is out of range for topic-partition %s", fetch_offset, tp) - self._subscriptions.need_offset_reset(tp) + self._subscriptions.request_offset_reset(tp) else: raise Errors.OffsetOutOfRangeError({tp: fetch_offset}) elif error_type is Errors.TopicAuthorizationFailedError: log.warning("Not authorized to read from topic %s.", tp.topic) - raise Errors.TopicAuthorizationFailedError(set(tp.topic)) - elif error_type is Errors.UnknownError: - log.warning("Unknown error fetching data for topic-partition %s", tp) + raise Errors.TopicAuthorizationFailedError(set([tp.topic])) + elif getattr(error_type, 'retriable', False): + log.debug("Retriable error fetching partition %s: %s", tp, error_type()) + if getattr(error_type, 'invalid_metadata', False): + self._client.cluster.request_update() else: raise error_type('Unexpected error while fetching data') finally: - completed_fetch.metric_aggregator.record(tp, num_bytes, records_count) + if parsed_records is None and completed_fetch.metric_aggregator: + completed_fetch.metric_aggregator.record(tp, 0, 0) + + if error_type is not Errors.NoError: + # we move the partition to the end if there was an error. This way, it's more likely that partitions for + # the same topic can remain together (allowing for more efficient serialization). + self._subscriptions.move_partition_to_end(tp) return parsed_records + def _on_partition_records_drain(self, partition_records): + # we move the partition to the end if we received some bytes. This way, it's more likely that partitions + # for the same topic can remain together (allowing for more efficient serialization). 
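# A standalone sketch, not the library's move_partition_to_end implementation,
# of the "move the partition to the end" rotation described above. It only
# assumes an ordered assignment mapping; the topics, partitions, and state
# values are illustrative.
from collections import OrderedDict
from kafka.structs import TopicPartition

assignment = OrderedDict([
    (TopicPartition('events', 0), 'state-a'),
    (TopicPartition('events', 1), 'state-b'),
    (TopicPartition('logs', 0), 'state-c'),
])

# After draining bytes from events-0, rotate it to the back so the next fetch
# round favors the remaining partitions first while keeping same-topic
# partitions grouped together.
assignment.move_to_end(TopicPartition('events', 0))
assert list(assignment) == [TopicPartition('events', 1),
                            TopicPartition('logs', 0),
                            TopicPartition('events', 0)]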
+ if partition_records.bytes_read > 0: + self._subscriptions.move_partition_to_end(partition_records.topic_partition) + + def close(self): + if self._next_partition_records is not None: + self._next_partition_records.drain() + self._next_in_line_exception_metadata = None + class PartitionRecords(object): - def __init__(self, fetch_offset, tp, messages): + def __init__(self, fetch_offset, tp, records, + key_deserializer=None, value_deserializer=None, + check_crcs=True, isolation_level=READ_UNCOMMITTED, + aborted_transactions=None, # raw data from response / list of (producer_id, first_offset) tuples + metric_aggregator=None, on_drain=lambda x: None): self.fetch_offset = fetch_offset self.topic_partition = tp - self.messages = messages + self.leader_epoch = -1 + self.next_fetch_offset = fetch_offset + self.bytes_read = 0 + self.records_read = 0 + self.isolation_level = isolation_level + self.aborted_producer_ids = set() + self.aborted_transactions = collections.deque( + sorted([AbortedTransaction(*data) for data in aborted_transactions] if aborted_transactions else [], + key=lambda txn: txn.first_offset) + ) + self.metric_aggregator = metric_aggregator + self.check_crcs = check_crcs + self.record_iterator = itertools.dropwhile( + self._maybe_skip_record, + self._unpack_records(tp, records, key_deserializer, value_deserializer)) + self.on_drain = on_drain + self._next_inline_exception = None + + def _maybe_skip_record(self, record): # When fetching an offset that is in the middle of a # compressed batch, we will get all messages in the batch. # But we want to start 'take' at the fetch_offset # (or the next highest offset in case the message was compacted) - for i, msg in enumerate(messages): - if msg.offset < fetch_offset: - log.debug("Skipping message offset: %s (expecting %s)", - msg.offset, fetch_offset) + if record.offset < self.fetch_offset: + log.debug("Skipping message offset: %s (expecting %s)", + record.offset, self.fetch_offset) + return True + else: + return False + + # For truthiness evaluation + def __bool__(self): + return self.record_iterator is not None + + # py2 + __nonzero__ = __bool__ + + def drain(self): + if self.record_iterator is not None: + self.record_iterator = None + self._next_inline_exception = None + if self.metric_aggregator: + self.metric_aggregator.record(self.topic_partition, self.bytes_read, self.records_read) + self.on_drain(self) + + def _maybe_raise_next_inline_exception(self): + if self._next_inline_exception: + exc, self._next_inline_exception = self._next_inline_exception, None + raise exc + + def take(self, n=None): + self._maybe_raise_next_inline_exception() + records = [] + try: + # Note that records.extend(iter) will extend partially when exception raised mid-stream + records.extend(itertools.islice(self.record_iterator, 0, n)) + except Exception as e: + if not records: + raise e + # To be thrown in the next call of this method + self._next_inline_exception = e + return records + + def _unpack_records(self, tp, records, key_deserializer, value_deserializer): + try: + batch = records.next_batch() + last_batch = None + while batch is not None: + last_batch = batch + + if self.check_crcs and not batch.validate_crc(): + raise Errors.CorruptRecordError( + "Record batch for partition %s at offset %s failed crc check" % ( + self.topic_partition, batch.base_offset)) + + + # Try DefaultsRecordBatch / message log format v2 + # base_offset, last_offset_delta, aborted transactions, and control batches + if batch.magic == 2: + self.leader_epoch = 
batch.leader_epoch + if self.isolation_level == READ_COMMITTED and batch.has_producer_id(): + # remove from the aborted transaction queue all aborted transactions which have begun + # before the current batch's last offset and add the associated producerIds to the + # aborted producer set + self._consume_aborted_transactions_up_to(batch.last_offset) + + producer_id = batch.producer_id + if self._contains_abort_marker(batch): + try: + self.aborted_producer_ids.remove(producer_id) + except KeyError: + pass + elif self._is_batch_aborted(batch): + log.debug("Skipping aborted record batch from partition %s with producer_id %s and" + " offsets %s to %s", + self.topic_partition, producer_id, batch.base_offset, batch.last_offset) + self.next_fetch_offset = batch.next_offset + batch = records.next_batch() + continue + + # Control batches have a single record indicating whether a transaction + # was aborted or committed. These are not returned to the consumer. + if batch.is_control_batch: + self.next_fetch_offset = batch.next_offset + batch = records.next_batch() + continue + + for record in batch: + if self.check_crcs and not record.validate_crc(): + raise Errors.CorruptRecordError( + "Record for partition %s at offset %s failed crc check" % ( + self.topic_partition, record.offset)) + key_size = len(record.key) if record.key is not None else -1 + value_size = len(record.value) if record.value is not None else -1 + key = self._deserialize(key_deserializer, tp.topic, record.key) + value = self._deserialize(value_deserializer, tp.topic, record.value) + headers = record.headers + header_size = sum( + len(h_key.encode("utf-8")) + (len(h_val) if h_val is not None else 0) for h_key, h_val in + headers) if headers else -1 + self.records_read += 1 + self.bytes_read += record.size_in_bytes + self.next_fetch_offset = record.offset + 1 + yield ConsumerRecord( + tp.topic, tp.partition, self.leader_epoch, record.offset, record.timestamp, + record.timestamp_type, key, value, headers, record.checksum, + key_size, value_size, header_size) + + batch = records.next_batch() else: - self.message_idx = i - break + # Message format v2 preserves the last offset in a batch even if the last record is removed + # through compaction. By using the next offset computed from the last offset in the batch, + # we ensure that the offset of the next fetch will point to the next batch, which avoids + # unnecessary re-fetching of the same batch (in the worst case, the consumer could get stuck + # fetching the same batch repeatedly). + if last_batch and last_batch.magic == 2: + self.next_fetch_offset = last_batch.next_offset + self.drain() + + # If unpacking raises StopIteration, it is erroneously + # caught by the generator. We want all exceptions to be raised + # back to the user. 
See Issue 545 + except StopIteration: + log.exception('StopIteration raised unpacking messageset') + raise RuntimeError('StopIteration raised unpacking messageset') + + def _deserialize(self, f, topic, bytes_): + if not f: + return bytes_ + if isinstance(f, Deserializer): + return f.deserialize(topic, bytes_) + return f(bytes_) + + def _consume_aborted_transactions_up_to(self, offset): + if not self.aborted_transactions: + return + + while self.aborted_transactions and self.aborted_transactions[0].first_offset <= offset: + self.aborted_producer_ids.add(self.aborted_transactions.popleft().producer_id) + + def _is_batch_aborted(self, batch): + return batch.is_transactional and batch.producer_id in self.aborted_producer_ids + + def _contains_abort_marker(self, batch): + if not batch.is_control_batch: + return False + record = next(batch) + if not record: + return False + return record.abort + + +class FetchSessionHandler(object): + """ + FetchSessionHandler maintains the fetch session state for connecting to a broker. + Using the protocol outlined by KIP-227, clients can create incremental fetch sessions. + These sessions allow the client to fetch information about a set of partition over + and over, without explicitly enumerating all the partitions in the request and the + response. + + FetchSessionHandler tracks the partitions which are in the session. It also + determines which partitions need to be included in each fetch request, and what + the attached fetch session metadata should be for each request. + """ + + def __init__(self, node_id): + self.node_id = node_id + self.next_metadata = FetchMetadata.INITIAL + self.session_partitions = {} + + def build_next(self, next_partitions): + """ + Arguments: + next_partitions (dict): TopicPartition -> TopicPartitionState + + Returns: + FetchRequestData + """ + if self.next_metadata.is_full: + log.debug("Built full fetch %s for node %s with %s partition(s).", + self.next_metadata, self.node_id, len(next_partitions)) + self.session_partitions = next_partitions + return FetchRequestData(next_partitions, None, self.next_metadata) + + prev_tps = set(self.session_partitions.keys()) + next_tps = set(next_partitions.keys()) + log.debug("Building incremental partitions from next: %s, previous: %s", next_tps, prev_tps) + added = next_tps - prev_tps + for tp in added: + self.session_partitions[tp] = next_partitions[tp] + removed = prev_tps - next_tps + for tp in removed: + self.session_partitions.pop(tp) + altered = set() + for tp in next_tps & prev_tps: + if next_partitions[tp] != self.session_partitions[tp]: + self.session_partitions[tp] = next_partitions[tp] + altered.add(tp) + + log.debug("Built incremental fetch %s for node %s. 
Added %s, altered %s, removed %s out of %s", + self.next_metadata, self.node_id, added, altered, removed, self.session_partitions.keys()) + to_send = collections.OrderedDict({tp: next_partitions[tp] for tp in next_partitions if tp in (added | altered)}) + return FetchRequestData(to_send, removed, self.next_metadata) + + def handle_response(self, response): + if response.error_code != Errors.NoError.errno: + error_type = Errors.for_code(response.error_code) + log.info("Node %s was unable to process the fetch request with %s: %s.", + self.node_id, self.next_metadata, error_type()) + if error_type is Errors.FetchSessionIdNotFoundError: + self.next_metadata = FetchMetadata.INITIAL + else: + self.next_metadata = self.next_metadata.next_close_existing() + return False + + response_tps = self._response_partitions(response) + session_tps = set(self.session_partitions.keys()) + if self.next_metadata.is_full: + if response_tps != session_tps: + log.info("Node %s sent an invalid full fetch response with extra %s / omitted %s", + self.node_id, response_tps - session_tps, session_tps - response_tps) + self.next_metadata = FetchMetadata.INITIAL + return False + elif response.session_id == FetchMetadata.INVALID_SESSION_ID: + log.debug("Node %s sent a full fetch response with %s partitions", + self.node_id, len(response_tps)) + self.next_metadata = FetchMetadata.INITIAL + return True + elif response.session_id == FetchMetadata.THROTTLED_SESSION_ID: + log.debug("Node %s sent a empty full fetch response due to a quota violation (%s partitions)", + self.node_id, len(response_tps)) + # Keep current metadata + return True else: - self.message_idx = 0 - self.messages = None + # The server created a new incremental fetch session. + log.debug("Node %s sent a full fetch response that created a new incremental fetch session %s" + " with %s response partitions", + self.node_id, response.session_id, + len(response_tps)) + self.next_metadata = FetchMetadata.new_incremental(response.session_id) + return True + else: + if response_tps - session_tps: + log.info("Node %s sent an invalid incremental fetch response with extra partitions %s", + self.node_id, response_tps - session_tps) + self.next_metadata = self.next_metadata.next_close_existing() + return False + elif response.session_id == FetchMetadata.INVALID_SESSION_ID: + # The incremental fetch session was closed by the server. + log.debug("Node %s sent an incremental fetch response closing session %s" + " with %s response partitions (%s implied)", + self.node_id, self.next_metadata.session_id, + len(response_tps), len(self.session_partitions) - len(response_tps)) + self.next_metadata = FetchMetadata.INITIAL + return True + elif response.session_id == FetchMetadata.THROTTLED_SESSION_ID: + log.debug("Node %s sent a empty incremental fetch response due to a quota violation (%s partitions)", + self.node_id, len(response_tps)) + # Keep current metadata + return True + else: + # The incremental fetch session was continued by the server. 
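# A standalone sketch of the added / removed / altered bookkeeping that
# build_next() performs when extending an incremental fetch session (KIP-227);
# it is not the handler itself, and the partitions and opaque position values
# are illustrative.
from kafka.structs import TopicPartition

previous = {TopicPartition('t', 0): 100, TopicPartition('t', 1): 5}
wanted = {TopicPartition('t', 1): 7, TopicPartition('t', 2): 0}

added = set(wanted) - set(previous)          # newly fetched partitions -> to_send
removed = set(previous) - set(wanted)        # dropped partitions -> to_forget
altered = {tp for tp in set(wanted) & set(previous)
           if wanted[tp] != previous[tp]}    # changed positions -> to_send
to_send = {tp: wanted[tp] for tp in wanted if tp in (added | altered)}

assert added == {TopicPartition('t', 2)}
assert removed == {TopicPartition('t', 0)}
assert altered == {TopicPartition('t', 1)}
assert to_send == {TopicPartition('t', 1): 7, TopicPartition('t', 2): 0}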
+ log.debug("Node %s sent an incremental fetch response for session %s" + " with %s response partitions (%s implied)", + self.node_id, response.session_id, + len(response_tps), len(self.session_partitions) - len(response_tps)) + self.next_metadata = self.next_metadata.next_incremental() + return True + + def handle_error(self, _exception): + self.next_metadata = self.next_metadata.next_close_existing() + + def _response_partitions(self, response): + return {TopicPartition(topic, partition_data[0]) + for topic, partitions in response.topics + for partition_data in partitions} + + +class FetchMetadata(object): + __slots__ = ('session_id', 'epoch') + + MAX_EPOCH = 2147483647 + INVALID_SESSION_ID = 0 # used by clients with no session. + THROTTLED_SESSION_ID = -1 # returned with empty response on quota violation + INITIAL_EPOCH = 0 # client wants to create or recreate a session. + FINAL_EPOCH = -1 # client wants to close any existing session, and not create a new one. + + def __init__(self, session_id, epoch): + self.session_id = session_id + self.epoch = epoch + + @property + def is_full(self): + return self.epoch == self.INITIAL_EPOCH or self.epoch == self.FINAL_EPOCH + + @classmethod + def next_epoch(cls, prev_epoch): + if prev_epoch < 0: + return cls.FINAL_EPOCH + elif prev_epoch == cls.MAX_EPOCH: + return 1 + else: + return prev_epoch + 1 - # For truthiness evaluation we need to define __len__ or __nonzero__ - def __len__(self): - if self.messages is None or self.message_idx >= len(self.messages): - return 0 - return len(self.messages) - self.message_idx + def next_close_existing(self): + return self.__class__(self.session_id, self.INITIAL_EPOCH) - def discard(self): - self.messages = None + @classmethod + def new_incremental(cls, session_id): + return cls(session_id, cls.next_epoch(cls.INITIAL_EPOCH)) - def take(self, n=None): - if not len(self): - return [] - if n is None or n > len(self): - n = len(self) - next_idx = self.message_idx + n - res = self.messages[self.message_idx:next_idx] - self.message_idx = next_idx - # fetch_offset should be incremented by 1 to parallel the - # subscription position (also incremented by 1) - self.fetch_offset = max(self.fetch_offset, res[-1].offset + 1) - return res + def next_incremental(self): + return self.__class__(self.session_id, self.next_epoch(self.epoch)) + +FetchMetadata.INITIAL = FetchMetadata(FetchMetadata.INVALID_SESSION_ID, FetchMetadata.INITIAL_EPOCH) +FetchMetadata.LEGACY = FetchMetadata(FetchMetadata.INVALID_SESSION_ID, FetchMetadata.FINAL_EPOCH) + + +class FetchRequestData(object): + __slots__ = ('_to_send', '_to_forget', '_metadata') + + def __init__(self, to_send, to_forget, metadata): + self._to_send = to_send or dict() # {TopicPartition: (partition, ...)} + self._to_forget = to_forget or set() # {TopicPartition} + self._metadata = metadata + + @property + def metadata(self): + return self._metadata + + @property + def id(self): + return self._metadata.session_id + + @property + def epoch(self): + return self._metadata.epoch + + @property + def to_send(self): + # Return as list of [(topic, [(partition, ...), ...]), ...] + # so it can be passed directly to encoder + partition_data = collections.defaultdict(list) + for tp, partition_info in six.iteritems(self._to_send): + partition_data[tp.topic].append(partition_info) + return list(partition_data.items()) + + @property + def to_forget(self): + # Return as list of [(topic, (partiiton, ...)), ...] 
+ # so it an be passed directly to encoder + partition_data = collections.defaultdict(list) + for tp in self._to_forget: + partition_data[tp.topic].append(tp.partition) + return list(partition_data.items()) + + +class FetchMetrics(object): + __slots__ = ('total_bytes', 'total_records') + + def __init__(self): + self.total_bytes = 0 + self.total_records = 0 class FetchResponseMetricAggregator(object): @@ -920,8 +1305,8 @@ class FetchResponseMetricAggregator(object): def __init__(self, sensors, partitions): self.sensors = sensors self.unrecorded_partitions = partitions - self.total_bytes = 0 - self.total_records = 0 + self.fetch_metrics = FetchMetrics() + self.topic_fetch_metrics = collections.defaultdict(FetchMetrics) def record(self, partition, num_bytes, num_records): """ @@ -930,13 +1315,17 @@ def record(self, partition, num_bytes, num_records): have reported, we write the metric. """ self.unrecorded_partitions.remove(partition) - self.total_bytes += num_bytes - self.total_records += num_records + self.fetch_metrics.total_bytes += num_bytes + self.fetch_metrics.total_records += num_records + self.topic_fetch_metrics[partition.topic].total_bytes += num_bytes + self.topic_fetch_metrics[partition.topic].total_records += num_records # once all expected partitions from the fetch have reported in, record the metrics if not self.unrecorded_partitions: - self.sensors.bytes_fetched.record(self.total_bytes) - self.sensors.records_fetched.record(self.total_records) + self.sensors.bytes_fetched.record(self.fetch_metrics.total_bytes) + self.sensors.records_fetched.record(self.fetch_metrics.total_records) + for topic, metrics in six.iteritems(self.topic_fetch_metrics): + self.sensors.record_topic_fetch_metrics(topic, metrics.total_bytes, metrics.total_records) class FetchManagerMetrics(object): @@ -970,12 +1359,6 @@ def __init__(self, metrics, prefix): self.records_fetch_lag.add(metrics.metric_name('records-lag-max', self.group_name, 'The maximum lag in terms of number of records for any partition in self window'), Max()) - self.fetch_throttle_time_sensor = metrics.sensor('fetch-throttle-time') - self.fetch_throttle_time_sensor.add(metrics.metric_name('fetch-throttle-time-avg', self.group_name, - 'The average throttle time in ms'), Avg()) - self.fetch_throttle_time_sensor.add(metrics.metric_name('fetch-throttle-time-max', self.group_name, - 'The maximum throttle time in ms'), Max()) - def record_topic_fetch_metrics(self, topic, num_bytes, num_records): # record bytes fetched name = '.'.join(['topic', topic, 'bytes-fetched']) diff --git a/kafka/consumer/group.py b/kafka/consumer/group.py index 4fd57ae9c..ce3cf9203 100644 --- a/kafka/consumer/group.py +++ b/kafka/consumer/group.py @@ -5,7 +5,7 @@ import socket import time -from kafka.errors import KafkaConfigurationError, UnsupportedVersionError +from kafka.errors import KafkaConfigurationError, KafkaTimeoutError, UnsupportedVersionError from kafka.vendor import six @@ -16,8 +16,9 @@ from kafka.coordinator.assignors.range import RangePartitionAssignor from kafka.coordinator.assignors.roundrobin import RoundRobinPartitionAssignor from kafka.metrics import MetricConfig, Metrics -from kafka.protocol.offset import OffsetResetStrategy -from kafka.structs import TopicPartition +from kafka.protocol.list_offsets import OffsetResetStrategy +from kafka.structs import OffsetAndMetadata, TopicPartition +from kafka.util import Timer from kafka.version import __version__ log = logging.getLogger(__name__) @@ -60,6 +61,8 @@ class KafkaConsumer(six.Iterator): raw 
message key and returns a deserialized key. value_deserializer (callable): Any callable that takes a raw message value and returns a deserialized value. + enable_incremental_fetch_sessions: (bool): Use incremental fetch sessions + when available / supported by kafka broker. See KIP-227. Default: True. fetch_min_bytes (int): Minimum amount of data the server should return for a fetch request, otherwise wait up to fetch_max_wait_ms for more data to accumulate. Default: 1. @@ -98,7 +101,7 @@ class KafkaConsumer(six.Iterator): reconnection attempts will continue periodically with this fixed rate. To avoid connection storms, a randomization factor of 0.2 will be applied to the backoff resulting in a random range between - 20% below and 20% above the computed value. Default: 1000. + 20% below and 20% above the computed value. Default: 30000. max_in_flight_requests_per_connection (int): Requests are pipelined to kafka brokers up to this number of maximum requests per broker connection. Default: 5. @@ -118,6 +121,12 @@ class KafkaConsumer(six.Iterator): consumed. This ensures no on-the-wire or on-disk corruption to the messages occurred. This check adds some overhead, so it may be disabled in cases seeking extreme performance. Default: True + isolation_level (str): Configure KIP-98 transactional consumer by + setting to 'read_committed'. This will cause the consumer to + skip records from aborted tranactions. Default: 'read_uncommitted' + allow_auto_create_topics (bool): Enable/disable auto topic creation + on metadata request. Only available with api_version >= (0, 11). + Default: True metadata_max_age_ms (int): The period of time in milliseconds after which we force a refresh of metadata, even if we haven't seen any partition leadership changes to proactively discover any new @@ -195,10 +204,17 @@ class KafkaConsumer(six.Iterator): or other configuration forbids use of all the specified ciphers), an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers api_version (tuple): Specify which Kafka API version to use. If set to - None, the client will attempt to infer the broker version by probing - various APIs. Different versions enable different functionality. + None, the client will attempt to determine the broker version via + ApiVersionsRequest API or, for brokers earlier than 0.10, probing + various known APIs. Dynamic version checking is performed eagerly + during __init__ and can raise NoBrokersAvailableError if no connection + was made before timeout (see api_version_auto_timeout_ms below). + Different versions enable different functionality. Examples: + (3, 9) most recent broker release, enable all supported features + (0, 11) enables message format v2 (internal) + (0, 10, 0) enables sasl authentication and message format v1 (0, 9) enables full group coordination features with automatic partition assignment and rebalancing, (0, 8, 2) enables kafka-storage offset commits with manual @@ -212,6 +228,7 @@ class KafkaConsumer(six.Iterator): api_version_auto_timeout_ms (int): number of milliseconds to throw a timeout exception from the constructor when checking the broker api version. Only applies if api_version set to None. + Default: 2000 connections_max_idle_ms: Close idle connections after the number of milliseconds specified by this config. The broker closes idle connections after connections.max.idle.ms, so this avoids hitting @@ -220,6 +237,7 @@ class KafkaConsumer(six.Iterator): metric_reporters (list): A list of classes to use as metrics reporters. 
Implementing the AbstractMetricsReporter interface allows plugging in classes that will be notified of new metric creation. Default: [] + metrics_enabled (bool): Whether to track metrics on this instance. Default True. metrics_num_samples (int): The number of samples maintained to compute metrics. Default: 2 metrics_sample_window_ms (int): The maximum age in milliseconds of @@ -238,12 +256,17 @@ class KafkaConsumer(six.Iterator): Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication. Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. + sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with + sasl mechanism handshake. If provided, sasl_kerberos_service_name and + sasl_kerberos_domain name are ignored. Default: None. sasl_kerberos_service_name (str): Service name to include in GSSAPI sasl mechanism handshake. Default: 'kafka' sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI sasl mechanism handshake. Default: one of bootstrap servers - sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider - instance. (See kafka.oauth.abstract). Default: None + sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer + token provider instance. Default: None + socks5_proxy (str): Socks5 proxy URL. Default: None + kafka_client (callable): Custom class / callable for creating KafkaClient instances Note: Configuration parameters are described in more detail at @@ -255,6 +278,7 @@ class KafkaConsumer(six.Iterator): 'group_id': None, 'key_deserializer': None, 'value_deserializer': None, + 'enable_incremental_fetch_sessions': True, 'fetch_max_wait_ms': 500, 'fetch_min_bytes': 1, 'fetch_max_bytes': 52428800, @@ -262,13 +286,15 @@ class KafkaConsumer(six.Iterator): 'request_timeout_ms': 305000, # chosen to be higher than the default of max_poll_interval_ms 'retry_backoff_ms': 100, 'reconnect_backoff_ms': 50, - 'reconnect_backoff_max_ms': 1000, + 'reconnect_backoff_max_ms': 30000, 'max_in_flight_requests_per_connection': 5, 'auto_offset_reset': 'latest', 'enable_auto_commit': True, 'auto_commit_interval_ms': 5000, 'default_offset_commit_callback': lambda offsets, response: True, 'check_crcs': True, + 'isolation_level': 'read_uncommitted', + 'allow_auto_create_topics': True, 'metadata_max_age_ms': 5 * 60 * 1000, 'partition_assignment_strategy': (RangePartitionAssignor, RoundRobinPartitionAssignor), 'max_poll_records': 500, @@ -294,6 +320,7 @@ class KafkaConsumer(six.Iterator): 'api_version_auto_timeout_ms': 2000, 'connections_max_idle_ms': 9 * 60 * 1000, 'metric_reporters': [], + 'metrics_enabled': True, 'metrics_num_samples': 2, 'metrics_sample_window_ms': 30000, 'metric_group_prefix': 'consumer', @@ -302,10 +329,12 @@ class KafkaConsumer(six.Iterator): 'sasl_mechanism': None, 'sasl_plain_username': None, 'sasl_plain_password': None, + 'sasl_kerberos_name': None, 'sasl_kerberos_service_name': 'kafka', 'sasl_kerberos_domain_name': None, 'sasl_oauth_token_provider': None, - 'legacy_iterator': False, # enable to revert to < 1.4.7 iterator + 'socks5_proxy': None, + 'kafka_client': KafkaClient, } DEFAULT_SESSION_TIMEOUT_MS_0_9 = 30000 @@ -335,13 +364,15 @@ def __init__(self, *topics, **configs): "fetch_max_wait_ms ({})." 
.format(connections_max_idle_ms, request_timeout_ms, fetch_max_wait_ms)) - metrics_tags = {'client-id': self.config['client_id']} - metric_config = MetricConfig(samples=self.config['metrics_num_samples'], - time_window_ms=self.config['metrics_sample_window_ms'], - tags=metrics_tags) - reporters = [reporter() for reporter in self.config['metric_reporters']] - self._metrics = Metrics(metric_config, reporters) - # TODO _metrics likely needs to be passed to KafkaClient, etc. + if self.config['metrics_enabled']: + metrics_tags = {'client-id': self.config['client_id']} + metric_config = MetricConfig(samples=self.config['metrics_num_samples'], + time_window_ms=self.config['metrics_sample_window_ms'], + tags=metrics_tags) + reporters = [reporter() for reporter in self.config['metric_reporters']] + self._metrics = Metrics(metric_config, reporters) + else: + self._metrics = None # api_version was previously a str. Accept old format for now if isinstance(self.config['api_version'], str): @@ -353,11 +384,10 @@ def __init__(self, *topics, **configs): log.warning('use api_version=%s [tuple] -- "%s" as str is deprecated', str(self.config['api_version']), str_version) - self._client = KafkaClient(metrics=self._metrics, **self.config) + self._client = self.config['kafka_client'](metrics=self._metrics, **self.config) - # Get auto-discovered version from client if necessary - if self.config['api_version'] is None: - self.config['api_version'] = self._client.config['api_version'] + # Get auto-discovered / normalized version from client + self.config['api_version'] = self._client.config['api_version'] # Coordinator configurations are different for older brokers # max_poll_interval_ms is not supported directly -- it must the be @@ -380,9 +410,9 @@ def __init__(self, *topics, **configs): self._subscription = SubscriptionState(self.config['auto_offset_reset']) self._fetcher = Fetcher( - self._client, self._subscription, self._metrics, **self.config) + self._client, self._subscription, metrics=self._metrics, **self.config) self._coordinator = ConsumerCoordinator( - self._client, self._subscription, self._metrics, + self._client, self._subscription, metrics=self._metrics, assignors=self.config['partition_assignment_strategy'], **self.config) self._closed = False @@ -422,8 +452,15 @@ def assign(self, partitions): no rebalance operation triggered when group membership or cluster and topic metadata change. """ - self._subscription.assign_from_user(partitions) - self._client.set_topics([tp.topic for tp in partitions]) + if not partitions: + self.unsubscribe() + else: + # make sure the offsets of topic partitions the consumer is unsubscribing from + # are committed since there will be no following rebalance + self._coordinator.maybe_auto_commit_offsets_now() + self._subscription.assign_from_user(partitions) + self._client.set_topics([tp.topic for tp in partitions]) + log.debug("Subscribed to partition(s): %s", partitions) def assignment(self): """Get the TopicPartitions currently assigned to this consumer. @@ -441,20 +478,23 @@ def assignment(self): """ return self._subscription.assigned_partitions() - def close(self, autocommit=True): + def close(self, autocommit=True, timeout_ms=None): """Close the consumer, waiting indefinitely for any needed cleanup. Keyword Arguments: autocommit (bool): If auto-commit is configured for this consumer, this optional flag causes the consumer to attempt to commit any pending consumed offsets prior to close. 
Default: True + timeout_ms (num, optional): Milliseconds to wait for auto-commit. + Default: None """ if self._closed: return log.debug("Closing the KafkaConsumer.") self._closed = True - self._coordinator.close(autocommit=autocommit) - self._metrics.close() + self._coordinator.close(autocommit=autocommit, timeout_ms=timeout_ms) + if self._metrics: + self._metrics.close() self._client.close() try: self.config['key_deserializer'].close() @@ -500,7 +540,7 @@ def commit_async(self, offsets=None, callback=None): offsets, callback=callback) return future - def commit(self, offsets=None): + def commit(self, offsets=None, timeout_ms=None): """Commit offsets to kafka, blocking until success or error. This commits offsets only to Kafka. The offsets committed using this API @@ -524,17 +564,16 @@ def commit(self, offsets=None): assert self.config['group_id'] is not None, 'Requires group_id' if offsets is None: offsets = self._subscription.all_consumed_offsets() - self._coordinator.commit_offsets_sync(offsets) + self._coordinator.commit_offsets_sync(offsets, timeout_ms=timeout_ms) - def committed(self, partition, metadata=False): + def committed(self, partition, metadata=False, timeout_ms=None): """Get the last committed offset for the given partition. This offset will be used as the position for the consumer in the event of a failure. - This call may block to do a remote call if the partition in question - isn't assigned to this consumer or if the consumer hasn't yet - initialized its cache of committed offsets. + This call will block to do a remote call to get the latest committed + offsets from the server. Arguments: partition (TopicPartition): The partition to check. @@ -543,28 +582,19 @@ def committed(self, partition, metadata=False): Returns: The last committed offset (int or OffsetAndMetadata), or None if there was no prior commit. + + Raises: + KafkaTimeoutError if timeout_ms provided + BrokerResponseErrors if OffsetFetchRequest raises an error. """ assert self.config['api_version'] >= (0, 8, 1), 'Requires >= Kafka 0.8.1' assert self.config['group_id'] is not None, 'Requires group_id' if not isinstance(partition, TopicPartition): raise TypeError('partition must be a TopicPartition namedtuple') - if self._subscription.is_assigned(partition): - committed = self._subscription.assignment[partition].committed - if committed is None: - self._coordinator.refresh_committed_offsets_if_needed() - committed = self._subscription.assignment[partition].committed - else: - commit_map = self._coordinator.fetch_committed_offsets([partition]) - if partition in commit_map: - committed = commit_map[partition] - else: - committed = None - - if committed is not None: - if metadata: - return committed - else: - return committed.offset + committed = self._coordinator.fetch_committed_offsets([partition], timeout_ms=timeout_ms) + if partition not in committed: + return None + return committed[partition] if metadata else committed[partition].offset def _fetch_all_topic_metadata(self): """A blocking call that fetches topic metadata for all topics in the @@ -609,7 +639,7 @@ def partitions_for_topic(self, topic): if partitions is None: self._fetch_all_topic_metadata() partitions = cluster.partitions_for_topic(topic) - return partitions + return partitions or set() def poll(self, timeout_ms=0, max_records=None, update_offsets=True): """Fetch data from assigned topics / partitions. 
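To make the revised commit()/committed() semantics above concrete, the following is a hedged usage sketch against the public consumer API. The broker address, topic, and group id are placeholders, a reachable broker is assumed, and the timeout_ms keyword on committed() is the one introduced by this change.

from kafka import KafkaConsumer, TopicPartition
from kafka.errors import KafkaTimeoutError

consumer = KafkaConsumer(bootstrap_servers='localhost:9092',  # placeholder broker
                         group_id='demo-group',               # placeholder group id
                         enable_auto_commit=False)
tp = TopicPartition('demo-topic', 0)                          # placeholder topic
consumer.assign([tp])

records = consumer.poll(timeout_ms=1000)
if records:
    consumer.commit()  # blocking commit of all consumed offsets

try:
    # committed() now always round-trips to the group coordinator; timeout_ms bounds that call
    offset = consumer.committed(tp, timeout_ms=5000)
    print(offset)  # committed offset (int), or None if nothing was ever committed
except KafkaTimeoutError:
    print('offset fetch timed out')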
@@ -649,59 +679,54 @@ def poll(self, timeout_ms=0, max_records=None, update_offsets=True): assert not self._closed, 'KafkaConsumer is closed' # Poll for new data until the timeout expires - start = time.time() - remaining = timeout_ms + timer = Timer(timeout_ms) while not self._closed: - records = self._poll_once(remaining, max_records, update_offsets=update_offsets) + records = self._poll_once(timer, max_records, update_offsets=update_offsets) if records: return records - - elapsed_ms = (time.time() - start) * 1000 - remaining = timeout_ms - elapsed_ms - - if remaining <= 0: + elif timer.expired: break - return {} - def _poll_once(self, timeout_ms, max_records, update_offsets=True): + def _poll_once(self, timer, max_records, update_offsets=True): """Do one round of polling. In addition to checking for new data, this does any needed heart-beating, auto-commits, and offset updates. Arguments: - timeout_ms (int): The maximum time in milliseconds to block. + timer (Timer): The maximum time in milliseconds to block. Returns: dict: Map of topic to list of records (may be empty). """ - self._coordinator.poll() + if not self._coordinator.poll(timeout_ms=timer.timeout_ms): + return {} - # Fetch positions if we have partitions we're subscribed to that we - # don't know the offset for - if not self._subscription.has_all_fetch_positions(): - self._update_fetch_positions(self._subscription.missing_fetch_positions()) + has_all_fetch_positions = self._update_fetch_positions(timeout_ms=timer.timeout_ms) # If data is available already, e.g. from a previous network client # poll() call to commit, then just return it immediately records, partial = self._fetcher.fetched_records(max_records, update_offsets=update_offsets) + log.debug('Fetched records: %s, %s', records, partial) + # Before returning the fetched records, we can send off the + # next round of fetches and avoid block waiting for their + # responses to enable pipelining while the user is handling the + # fetched records. + if not partial: + log.debug("Sending fetches") + futures = self._fetcher.send_fetches() + if len(futures): + self._client.poll(timeout_ms=0) + if records: - # Before returning the fetched records, we can send off the - # next round of fetches and avoid block waiting for their - # responses to enable pipelining while the user is handling the - # fetched records. 
- if not partial: - futures = self._fetcher.send_fetches() - if len(futures): - self._client.poll(timeout_ms=0) return records - # Send any new fetches (won't resend pending fetches) - futures = self._fetcher.send_fetches() - if len(futures): - self._client.poll(timeout_ms=0) + # We do not want to be stuck blocking in poll if we are missing some positions + # since the offset lookup may be backing off after a failure + poll_timeout_ms = min(timer.timeout_ms, self._coordinator.time_to_next_poll() * 1000) + if not has_all_fetch_positions: + poll_timeout_ms = min(poll_timeout_ms, self.config['retry_backoff_ms']) - timeout_ms = min(timeout_ms, self._coordinator.time_to_next_poll() * 1000) - self._client.poll(timeout_ms=timeout_ms) + self._client.poll(timeout_ms=poll_timeout_ms) # after the long poll, we should check whether the group needs to rebalance # prior to returning data so that the group can stabilize faster if self._coordinator.need_rejoin(): @@ -710,23 +735,29 @@ def _poll_once(self, timeout_ms, max_records, update_offsets=True): records, _ = self._fetcher.fetched_records(max_records, update_offsets=update_offsets) return records - def position(self, partition): + def position(self, partition, timeout_ms=None): """Get the offset of the next record that will be fetched Arguments: partition (TopicPartition): Partition to check Returns: - int: Offset + int: Offset or None """ if not isinstance(partition, TopicPartition): raise TypeError('partition must be a TopicPartition namedtuple') assert self._subscription.is_assigned(partition), 'Partition is not assigned' - offset = self._subscription.assignment[partition].position - if offset is None: - self._update_fetch_positions([partition]) - offset = self._subscription.assignment[partition].position - return offset + + timer = Timer(timeout_ms) + position = self._subscription.assignment[partition].position + while position is None: + # batch update fetch positions for any partitions without a valid position + if self._update_fetch_positions(timeout_ms=timer.timeout_ms): + position = self._subscription.assignment[partition].position + elif timer.expired: + return None + else: + return position.offset def highwater(self, partition): """Last known highwater offset for a partition. @@ -820,8 +851,7 @@ def seek(self, partition, offset): assert partition in self._subscription.assigned_partitions(), 'Unassigned partition' log.debug("Seeking to offset %s for partition %s", offset, partition) self._subscription.assignment[partition].seek(offset) - if not self.config['legacy_iterator']: - self._iterator = None + self._iterator = None def seek_to_beginning(self, *partitions): """Seek to the oldest available offset for partitions. @@ -845,9 +875,8 @@ def seek_to_beginning(self, *partitions): for tp in partitions: log.debug("Seeking to beginning of partition %s", tp) - self._subscription.need_offset_reset(tp, OffsetResetStrategy.EARLIEST) - if not self.config['legacy_iterator']: - self._iterator = None + self._subscription.request_offset_reset(tp, OffsetResetStrategy.EARLIEST) + self._iterator = None def seek_to_end(self, *partitions): """Seek to the most recent available offset for partitions. 
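As a companion to the position()/seek() changes above, here is a hedged usage sketch. The bootstrap server and topic name are placeholders, a reachable broker is assumed, and position()'s timeout_ms keyword is the one added by this change.

from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(bootstrap_servers='localhost:9092')  # placeholder broker
tp = TopicPartition('demo-topic', 0)                          # placeholder topic
consumer.assign([tp])

consumer.seek(tp, 42)               # set an explicit fetch position
assert consumer.position(tp) == 42  # known locally, no broker round-trip needed

consumer.seek_to_beginning(tp)      # request a reset to the earliest offset
# The reset is resolved via an asynchronous list-offsets lookup, so position()
# may block up to timeout_ms and returns None if the lookup does not complete.
print(consumer.position(tp, timeout_ms=5000))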
@@ -871,9 +900,8 @@ def seek_to_end(self, *partitions): for tp in partitions: log.debug("Seeking to end of partition %s", tp) - self._subscription.need_offset_reset(tp, OffsetResetStrategy.LATEST) - if not self.config['legacy_iterator']: - self._iterator = None + self._subscription.request_offset_reset(tp, OffsetResetStrategy.LATEST) + self._iterator = None def subscribe(self, topics=(), pattern=None, listener=None): """Subscribe to a list of topics, or a topic regex pattern. @@ -944,13 +972,16 @@ def subscription(self): def unsubscribe(self): """Unsubscribe from all topics and clear all assigned partitions.""" + # make sure the offsets of topic partitions the consumer is unsubscribing from + # are committed since there will be no following rebalance + self._coordinator.maybe_auto_commit_offsets_now() self._subscription.unsubscribe() - self._coordinator.close() + if self.config['api_version'] >= (0, 9): + self._coordinator.maybe_leave_group() self._client.cluster.need_all_topic_metadata = False self._client.set_topics([]) log.debug("Unsubscribed all topics or patterns and assigned partitions") - if not self.config['legacy_iterator']: - self._iterator = None + self._iterator = None def metrics(self, raw=False): """Get metrics on consumer performance. @@ -962,6 +993,8 @@ def metrics(self, raw=False): This is an unstable interface. It may change in future releases without warning. """ + if not self._metrics: + return if raw: return self._metrics.metrics.copy() @@ -1017,7 +1050,7 @@ def offsets_for_times(self, timestamps): raise ValueError( "The target time for partition {} is {}. The target time " "cannot be negative.".format(tp, ts)) - return self._fetcher.get_offsets_by_times( + return self._fetcher.offsets_by_times( timestamps, self.config['request_timeout_ms']) def beginning_offsets(self, partitions): @@ -1083,7 +1116,7 @@ def _use_consumer_group(self): return False return True - def _update_fetch_positions(self, partitions): + def _update_fetch_positions(self, timeout_ms=None): """Set the fetch position to the committed position (if there is one) or reset it using the offset reset policy the user has configured. @@ -1091,30 +1124,39 @@ def _update_fetch_positions(self, partitions): partitions (List[TopicPartition]): The partitions that need updating fetch positions. + Returns True if fetch positions updated, False if timeout + Raises: NoOffsetForPartitionError: If no offset is stored for a given partition and no offset reset policy is defined. """ - # Lookup any positions for partitions which are awaiting reset (which may be the - # case if the user called :meth:`seek_to_beginning` or :meth:`seek_to_end`. We do - # this check first to avoid an unnecessary lookup of committed offsets (which - # typically occurs when the user is manually assigning partitions and managing - # their own offsets). 
- self._fetcher.reset_offsets_if_needed(partitions) - - if not self._subscription.has_all_fetch_positions(): - # if we still don't have offsets for all partitions, then we should either seek - # to the last committed position or reset using the auto reset policy - if (self.config['api_version'] >= (0, 8, 1) and - self.config['group_id'] is not None): - # first refresh commits for all assigned partitions - self._coordinator.refresh_committed_offsets_if_needed() - - # Then, do any offset lookups in case some positions are not known - self._fetcher.update_fetch_positions(partitions) + if self._subscription.has_all_fetch_positions(): + return True + + if (self.config['api_version'] >= (0, 8, 1) and + self.config['group_id'] is not None): + try: + # If there are any partitions which do not have a valid position and are not + # awaiting reset, then we need to fetch committed offsets. We will only do a + # coordinator lookup if there are partitions which have missing positions, so + # a consumer with manually assigned partitions can avoid a coordinator dependence + # by always ensuring that assigned partitions have an initial position. + self._coordinator.refresh_committed_offsets_if_needed(timeout_ms=timeout_ms) + except KafkaTimeoutError: + pass + + # If there are partitions still needing a position and a reset policy is defined, + # request reset using the default policy. If no reset strategy is defined and there + # are partitions with a missing position, then we will raise an exception. + self._subscription.reset_missing_positions() + + # Finally send an asynchronous request to lookup and update the positions of any + # partitions which are awaiting reset. + self._fetcher.reset_offsets_if_needed() + return False def _message_generator_v2(self): - timeout_ms = 1000 * (self._consumer_timeout - time.time()) + timeout_ms = 1000 * max(0, self._consumer_timeout - time.time()) record_map = self.poll(timeout_ms=timeout_ms, update_offsets=False) for tp, records in six.iteritems(record_map): # Generators are stateful, and it is possible that the tp / records @@ -1129,72 +1171,15 @@ def _message_generator_v2(self): log.debug("Not returning fetched records for partition %s" " since it is no longer fetchable", tp) break - self._subscription.assignment[tp].position = record.offset + 1 + self._subscription.assignment[tp].position = OffsetAndMetadata(record.offset + 1, '', -1) yield record - def _message_generator(self): - assert self.assignment() or self.subscription() is not None, 'No topic subscription or manual partition assignment' - while time.time() < self._consumer_timeout: - - self._coordinator.poll() - - # Fetch offsets for any subscribed partitions that we arent tracking yet - if not self._subscription.has_all_fetch_positions(): - partitions = self._subscription.missing_fetch_positions() - self._update_fetch_positions(partitions) - - poll_ms = min((1000 * (self._consumer_timeout - time.time())), self.config['retry_backoff_ms']) - self._client.poll(timeout_ms=poll_ms) - - # after the long poll, we should check whether the group needs to rebalance - # prior to returning data so that the group can stabilize faster - if self._coordinator.need_rejoin(): - continue - - # We need to make sure we at least keep up with scheduled tasks, - # like heartbeats, auto-commits, and metadata refreshes - timeout_at = self._next_timeout() - - # Short-circuit the fetch iterator if we are already timed out - # to avoid any unintentional interaction with fetcher setup - if time.time() > timeout_at: - continue - - for 
msg in self._fetcher: - yield msg - if time.time() > timeout_at: - log.debug("internal iterator timeout - breaking for poll") - break - self._client.poll(timeout_ms=0) - - # An else block on a for loop only executes if there was no break - # so this should only be called on a StopIteration from the fetcher - # We assume that it is safe to init_fetches when fetcher is done - # i.e., there are no more records stored internally - else: - self._fetcher.send_fetches() - - def _next_timeout(self): - timeout = min(self._consumer_timeout, - self._client.cluster.ttl() / 1000.0 + time.time(), - self._coordinator.time_to_next_poll() + time.time()) - return timeout - def __iter__(self): # pylint: disable=non-iterator-returned return self def __next__(self): if self._closed: raise StopIteration('KafkaConsumer closed') - # Now that the heartbeat thread runs in the background - # there should be no reason to maintain a separate iterator - # but we'll keep it available for a few releases just in case - if self.config['legacy_iterator']: - return self.next_v1() - else: - return self.next_v2() - - def next_v2(self): self._set_consumer_timeout() while time.time() < self._consumer_timeout: if not self._iterator: @@ -1205,17 +1190,6 @@ def next_v2(self): self._iterator = None raise StopIteration() - def next_v1(self): - if not self._iterator: - self._iterator = self._message_generator() - - self._set_consumer_timeout() - try: - return next(self._iterator) - except StopIteration: - self._iterator = None - raise - def _set_consumer_timeout(self): # consumer_timeout_ms can be used to stop iteration early if self.config['consumer_timeout_ms'] >= 0: diff --git a/kafka/consumer/subscription_state.py b/kafka/consumer/subscription_state.py index 08842d133..cc3675b1d 100644 --- a/kafka/consumer/subscription_state.py +++ b/kafka/consumer/subscription_state.py @@ -1,18 +1,39 @@ from __future__ import absolute_import import abc +from collections import OrderedDict +try: + from collections.abc import Sequence +except ImportError: + from collections import Sequence +try: + # enum in stdlib as of py3.4 + from enum import IntEnum # pylint: disable=import-error +except ImportError: + # vendored backport module + from kafka.vendor.enum34 import IntEnum import logging +import random import re +import time from kafka.vendor import six -from kafka.errors import IllegalStateError -from kafka.protocol.offset import OffsetResetStrategy +import kafka.errors as Errors +from kafka.protocol.list_offsets import OffsetResetStrategy from kafka.structs import OffsetAndMetadata +from kafka.util import ensure_valid_topic_name log = logging.getLogger(__name__) +class SubscriptionType(IntEnum): + NONE = 0 + AUTO_TOPICS = 1 + AUTO_PATTERN = 2 + USER_ASSIGNED = 3 + + class SubscriptionState(object): """ A class for tracking the topics, partitions, and offsets for the consumer. @@ -32,10 +53,6 @@ class SubscriptionState(object): Note that pause state as well as fetch/consumed positions are not preserved when partition assignment is changed whether directly by the user or through a group rebalance. - - This class also maintains a cache of the latest commit position for each of - the assigned partitions. This is updated through committed() and can be used - to set the initial fetch position (e.g. Fetcher._reset_offset() ). 
""" _SUBSCRIPTION_EXCEPTION_MESSAGE = ( "You must choose only one way to configure your consumer:" @@ -43,10 +60,6 @@ class SubscriptionState(object): " (2) subscribe to topics matching a regex pattern," " (3) assign itself specific topic-partitions.") - # Taken from: https://github.com/apache/kafka/blob/39eb31feaeebfb184d98cc5d94da9148c2319d81/clients/src/main/java/org/apache/kafka/common/internals/Topic.java#L29 - _MAX_NAME_LENGTH = 249 - _TOPIC_LEGAL_CHARS = re.compile('^[a-zA-Z0-9._-]+$') - def __init__(self, offset_reset_strategy='earliest'): """Initialize a SubscriptionState instance @@ -64,14 +77,21 @@ def __init__(self, offset_reset_strategy='earliest'): self._default_offset_reset_strategy = offset_reset_strategy self.subscription = None # set() or None + self.subscription_type = SubscriptionType.NONE self.subscribed_pattern = None # regex str or None self._group_subscription = set() self._user_assignment = set() - self.assignment = dict() - self.listener = None - - # initialize to true for the consumers to fetch offset upon starting up - self.needs_fetch_committed_offsets = True + self.assignment = OrderedDict() + self.rebalance_listener = None + self.listeners = [] + + def _set_subscription_type(self, subscription_type): + if not isinstance(subscription_type, SubscriptionType): + raise ValueError('SubscriptionType enum required') + if self.subscription_type == SubscriptionType.NONE: + self.subscription_type = subscription_type + elif self.subscription_type != subscription_type: + raise Errors.IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) def subscribe(self, topics=(), pattern=None, listener=None): """Subscribe to a list of topics, or a topic regex pattern. @@ -108,38 +128,24 @@ def subscribe(self, topics=(), pattern=None, listener=None): guaranteed, however, that the partitions revoked/assigned through this interface are from topics subscribed in this call. """ - if self._user_assignment or (topics and pattern): - raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) assert topics or pattern, 'Must provide topics or pattern' + if (topics and pattern): + raise Errors.IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) - if pattern: + elif pattern: + self._set_subscription_type(SubscriptionType.AUTO_PATTERN) log.info('Subscribing to pattern: /%s/', pattern) self.subscription = set() self.subscribed_pattern = re.compile(pattern) else: + if isinstance(topics, str) or not isinstance(topics, Sequence): + raise TypeError('Topics must be a list (or non-str sequence)') + self._set_subscription_type(SubscriptionType.AUTO_TOPICS) self.change_subscription(topics) if listener and not isinstance(listener, ConsumerRebalanceListener): raise TypeError('listener must be a ConsumerRebalanceListener') - self.listener = listener - - def _ensure_valid_topic_name(self, topic): - """ Ensures that the topic name is valid according to the kafka source. """ - - # See Kafka Source: - # https://github.com/apache/kafka/blob/39eb31feaeebfb184d98cc5d94da9148c2319d81/clients/src/main/java/org/apache/kafka/common/internals/Topic.java - if topic is None: - raise TypeError('All topics must not be None') - if not isinstance(topic, six.string_types): - raise TypeError('All topics must be strings') - if len(topic) == 0: - raise ValueError('All topics must be non-empty strings') - if topic == '.' or topic == '..': - raise ValueError('Topic name cannot be "." 
or ".."') - if len(topic) > self._MAX_NAME_LENGTH: - raise ValueError('Topic name is illegal, it can\'t be longer than {0} characters, topic: "{1}"'.format(self._MAX_NAME_LENGTH, topic)) - if not self._TOPIC_LEGAL_CHARS.match(topic): - raise ValueError('Topic name "{0}" is illegal, it contains a character other than ASCII alphanumerics, ".", "_" and "-"'.format(topic)) + self.rebalance_listener = listener def change_subscription(self, topics): """Change the topic subscription. @@ -154,8 +160,8 @@ def change_subscription(self, topics): - a topic name is '.' or '..' or - a topic name does not consist of ASCII-characters/'-'/'_'/'.' """ - if self._user_assignment: - raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + if not self.partitions_auto_assigned(): + raise Errors.IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) if isinstance(topics, six.string_types): topics = [topics] @@ -166,17 +172,12 @@ def change_subscription(self, topics): return for t in topics: - self._ensure_valid_topic_name(t) + ensure_valid_topic_name(t) log.info('Updating subscribed topics to: %s', topics) self.subscription = set(topics) self._group_subscription.update(topics) - # Remove any assigned partitions which are no longer subscribed to - for tp in set(self.assignment.keys()): - if tp.topic not in self.subscription: - del self.assignment[tp] - def group_subscribe(self, topics): """Add topics to the current group subscription. @@ -186,14 +187,14 @@ def group_subscribe(self, topics): Arguments: topics (list of str): topics to add to the group subscription """ - if self._user_assignment: - raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + if not self.partitions_auto_assigned(): + raise Errors.IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) self._group_subscription.update(topics) def reset_group_subscription(self): """Reset the group's subscription to only contain topics subscribed by this consumer.""" - if self._user_assignment: - raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + if not self.partitions_auto_assigned(): + raise Errors.IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) assert self.subscription is not None, 'Subscription required' self._group_subscription.intersection_update(self.subscription) @@ -215,20 +216,11 @@ def assign_from_user(self, partitions): Raises: IllegalStateError: if consumer has already called subscribe() """ - if self.subscription is not None: - raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) - + self._set_subscription_type(SubscriptionType.USER_ASSIGNED) if self._user_assignment != set(partitions): self._user_assignment = set(partitions) - - for partition in partitions: - if partition not in self.assignment: - self._add_assigned_partition(partition) - - for tp in set(self.assignment.keys()) - self._user_assignment: - del self.assignment[tp] - - self.needs_fetch_committed_offsets = True + self._set_assignment({partition: self.assignment.get(partition, TopicPartitionState()) + for partition in partitions}) def assign_from_subscribed(self, assignments): """Update the assignment to the specified partitions @@ -243,25 +235,36 @@ def assign_from_subscribed(self, assignments): consumer instance. """ if not self.partitions_auto_assigned(): - raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + raise Errors.IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) for tp in assignments: if tp.topic not in self.subscription: raise ValueError("Assigned partition %s for non-subscribed topic." 
% (tp,)) - # after rebalancing, we always reinitialize the assignment state - self.assignment.clear() - for tp in assignments: - self._add_assigned_partition(tp) - self.needs_fetch_committed_offsets = True + # randomized ordering should improve balance for short-lived consumers + self._set_assignment({partition: TopicPartitionState() for partition in assignments}, randomize=True) log.info("Updated partition assignment: %s", assignments) + def _set_assignment(self, partition_states, randomize=False): + """Batch partition assignment by topic (self.assignment is OrderedDict)""" + self.assignment.clear() + topics = [tp.topic for tp in six.iterkeys(partition_states)] + if randomize: + random.shuffle(topics) + topic_partitions = OrderedDict({topic: [] for topic in topics}) + for tp in six.iterkeys(partition_states): + topic_partitions[tp.topic].append(tp) + for topic in six.iterkeys(topic_partitions): + for tp in topic_partitions[topic]: + self.assignment[tp] = partition_states[tp] + def unsubscribe(self): """Clear all topic subscriptions and partition assignments""" self.subscription = None self._user_assignment.clear() self.assignment.clear() self.subscribed_pattern = None + self.subscription_type = SubscriptionType.NONE def group_subscription(self): """Get the topic subscription for the group. @@ -289,8 +292,10 @@ def seek(self, partition, offset): Arguments: partition (TopicPartition): partition for seek operation - offset (int): message offset in partition + offset (int or OffsetAndMetadata): message offset in partition """ + if not isinstance(offset, (int, OffsetAndMetadata)): + raise TypeError("offset must be type in or OffsetAndMetadata") self.assignment[partition].seek(offset) def assigned_partitions(self): @@ -303,26 +308,26 @@ def paused_partitions(self): if self.is_paused(partition)) def fetchable_partitions(self): - """Return set of TopicPartitions that should be Fetched.""" - fetchable = set() + """Return ordered list of TopicPartitions that should be Fetched.""" + fetchable = list() for partition, state in six.iteritems(self.assignment): if state.is_fetchable(): - fetchable.add(partition) + fetchable.append(partition) return fetchable def partitions_auto_assigned(self): """Return True unless user supplied partitions manually.""" - return self.subscription is not None + return self.subscription_type in (SubscriptionType.AUTO_TOPICS, SubscriptionType.AUTO_PATTERN) def all_consumed_offsets(self): """Returns consumed offsets as {TopicPartition: OffsetAndMetadata}""" all_consumed = {} for partition, state in six.iteritems(self.assignment): if state.has_valid_position: - all_consumed[partition] = OffsetAndMetadata(state.position, '') + all_consumed[partition] = state.position return all_consumed - def need_offset_reset(self, partition, offset_reset_strategy=None): + def request_offset_reset(self, partition, offset_reset_strategy=None): """Mark partition for offset reset using specified or default strategy. 
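To illustrate the widened seek() signature above (a sketch written for this review, not part of the patch; the topic and offsets are made up), a plain int is wrapped into an OffsetAndMetadata with empty metadata and a leader_epoch of -1, while an OffsetAndMetadata supplied by the caller is stored as-is:

from kafka.consumer.subscription_state import SubscriptionState
from kafka.structs import TopicPartition, OffsetAndMetadata

tp = TopicPartition('example-topic', 0)            # hypothetical partition
state = SubscriptionState()
state.assign_from_user([tp])                       # manual assignment, just for the sketch

state.seek(tp, 42)                                 # int is wrapped internally
assert state.position(tp) == OffsetAndMetadata(42, '', -1)

state.seek(tp, OffsetAndMetadata(7, 'note', -1))   # OffsetAndMetadata stored unchanged
assert state.position(tp).metadata == 'note'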
Arguments: @@ -331,7 +336,11 @@ def need_offset_reset(self, partition, offset_reset_strategy=None): """ if offset_reset_strategy is None: offset_reset_strategy = self._default_offset_reset_strategy - self.assignment[partition].await_reset(offset_reset_strategy) + self.assignment[partition].reset(offset_reset_strategy) + + def set_reset_pending(self, partitions, next_allowed_reset_time): + for partition in partitions: + self.assignment[partition].set_reset_pending(next_allowed_reset_time) def has_default_offset_reset_policy(self): """Return True if default offset reset policy is Earliest or Latest""" @@ -341,7 +350,7 @@ def is_offset_reset_needed(self, partition): return self.assignment[partition].awaiting_reset def has_all_fetch_positions(self): - for state in self.assignment.values(): + for state in six.itervalues(self.assignment): if not state.has_valid_position: return False return True @@ -349,10 +358,32 @@ def has_all_fetch_positions(self): def missing_fetch_positions(self): missing = set() for partition, state in six.iteritems(self.assignment): - if not state.has_valid_position: + if state.is_missing_position(): missing.add(partition) return missing + def has_valid_position(self, partition): + return partition in self.assignment and self.assignment[partition].has_valid_position + + def reset_missing_positions(self): + partitions_with_no_offsets = set() + for tp, state in six.iteritems(self.assignment): + if state.is_missing_position(): + if self._default_offset_reset_strategy == OffsetResetStrategy.NONE: + partitions_with_no_offsets.add(tp) + else: + state.reset(self._default_offset_reset_strategy) + + if partitions_with_no_offsets: + raise Errors.NoOffsetForPartitionError(partitions_with_no_offsets) + + def partitions_needing_reset(self): + partitions = set() + for tp, state in six.iteritems(self.assignment): + if state.awaiting_reset and state.is_reset_allowed(): + partitions.add(tp) + return partitions + def is_assigned(self, partition): return partition in self.assignment @@ -368,26 +399,34 @@ def pause(self, partition): def resume(self, partition): self.assignment[partition].resume() - def _add_assigned_partition(self, partition): - self.assignment[partition] = TopicPartitionState() + def reset_failed(self, partitions, next_retry_time): + for partition in partitions: + self.assignment[partition].reset_failed(next_retry_time) + + def move_partition_to_end(self, partition): + if partition in self.assignment: + try: + self.assignment.move_to_end(partition) + except AttributeError: + state = self.assignment.pop(partition) + self.assignment[partition] = state + + def position(self, partition): + return self.assignment[partition].position class TopicPartitionState(object): def __init__(self): - self.committed = None # last committed OffsetAndMetadata - self.has_valid_position = False # whether we have valid position self.paused = False # whether this partition has been paused by the user - self.awaiting_reset = False # whether we are awaiting reset - self.reset_strategy = None # the reset strategy if awaitingReset is set - self._position = None # offset exposed to the user + self.reset_strategy = None # the reset strategy if awaiting_reset is set + self._position = None # OffsetAndMetadata exposed to the user self.highwater = None - self.drop_pending_message_set = False - # The last message offset hint available from a message batch with - # magic=2 which includes deleted compacted messages - self.last_offset_from_message_batch = None + self.drop_pending_record_batch = False + 
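The move_partition_to_end() helper above uses OrderedDict.move_to_end() when it exists and falls back to a pop/re-insert on older interpreters. A standalone sketch of that compatibility pattern (written for illustration, not taken from the patch):

from collections import OrderedDict

def move_to_end_compat(ordered, key):
    """Move `key` to the end of `ordered`, with a pop/re-insert fallback."""
    try:
        ordered.move_to_end(key)          # available on Python 3
    except AttributeError:
        value = ordered.pop(key)          # Python 2 fallback: re-insert at the end
        ordered[key] = value

d = OrderedDict([('a', 1), ('b', 2), ('c', 3)])
move_to_end_compat(d, 'a')
assert list(d) == ['b', 'c', 'a']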
self.next_allowed_retry_time = None def _set_position(self, offset): assert self.has_valid_position, 'Valid position required' + assert isinstance(offset, OffsetAndMetadata) self._position = offset def _get_position(self): @@ -395,20 +434,37 @@ def _get_position(self): position = property(_get_position, _set_position, None, "last position") - def await_reset(self, strategy): - self.awaiting_reset = True + def reset(self, strategy): + assert strategy is not None self.reset_strategy = strategy self._position = None - self.last_offset_from_message_batch = None - self.has_valid_position = False + self.next_allowed_retry_time = None + + def is_reset_allowed(self): + return self.next_allowed_retry_time is None or self.next_allowed_retry_time < time.time() + + @property + def awaiting_reset(self): + return self.reset_strategy is not None + + def set_reset_pending(self, next_allowed_retry_time): + self.next_allowed_retry_time = next_allowed_retry_time + + def reset_failed(self, next_allowed_retry_time): + self.next_allowed_retry_time = next_allowed_retry_time + + @property + def has_valid_position(self): + return self._position is not None + + def is_missing_position(self): + return not self.has_valid_position and not self.awaiting_reset def seek(self, offset): - self._position = offset - self.awaiting_reset = False + self._position = offset if isinstance(offset, OffsetAndMetadata) else OffsetAndMetadata(offset, '', -1) self.reset_strategy = None - self.has_valid_position = True - self.drop_pending_message_set = True - self.last_offset_from_message_batch = None + self.drop_pending_record_batch = True + self.next_allowed_retry_time = None def pause(self): self.paused = True @@ -420,6 +476,7 @@ def is_fetchable(self): return not self.paused and self.has_valid_position +@six.add_metaclass(abc.ABCMeta) class ConsumerRebalanceListener(object): """ A callback interface that the user can implement to trigger custom actions @@ -461,8 +518,6 @@ class ConsumerRebalanceListener(object): taking over that partition has their on_partitions_assigned() callback called to load the state. 
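As the docstring above describes, a typical use of ConsumerRebalanceListener is to checkpoint offsets when partitions are revoked and restore them when partitions are assigned. A minimal, hypothetical sketch (the offset store and its save/load methods are placeholders invented here, not kafka-python APIs):

from kafka import ConsumerRebalanceListener

class SaveOffsetsOnRebalance(ConsumerRebalanceListener):
    """Illustrative listener that checkpoints positions in an external store."""

    def __init__(self, consumer, store):
        self.consumer = consumer
        self.store = store                          # hypothetical external offset store

    def on_partitions_revoked(self, revoked):
        for tp in revoked:                          # persist position before losing the partition
            self.store.save(tp, self.consumer.position(tp))

    def on_partitions_assigned(self, assigned):
        for tp in assigned:                         # resume from the externally stored offset
            self.consumer.seek(tp, self.store.load(tp))

# usage (illustrative): consumer.subscribe(['example-topic'], listener=SaveOffsetsOnRebalance(consumer, store))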
""" - __metaclass__ = abc.ABCMeta - @abc.abstractmethod def on_partitions_revoked(self, revoked): """ diff --git a/kafka/coordinator/assignors/sticky/sticky_assignor.py b/kafka/coordinator/assignors/sticky/sticky_assignor.py index dce714f1a..69f68f564 100644 --- a/kafka/coordinator/assignors/sticky/sticky_assignor.py +++ b/kafka/coordinator/assignors/sticky/sticky_assignor.py @@ -2,7 +2,6 @@ from collections import defaultdict, namedtuple from copy import deepcopy -from kafka.cluster import ClusterMetadata from kafka.coordinator.assignors.abstract import AbstractPartitionAssignor from kafka.coordinator.assignors.sticky.partition_movements import PartitionMovements from kafka.coordinator.assignors.sticky.sorted_set import SortedSet @@ -660,7 +659,7 @@ def _metadata(cls, topics, member_assignment_partitions, generation=-1): partitions_by_topic = defaultdict(list) for topic_partition in member_assignment_partitions: partitions_by_topic[topic_partition.topic].append(topic_partition.partition) - data = StickyAssignorUserDataV1(six.viewitems(partitions_by_topic), generation) + data = StickyAssignorUserDataV1(list(partitions_by_topic.items()), generation) user_data = data.encode() return ConsumerProtocolMemberMetadata(cls.version, list(topics), user_data) diff --git a/kafka/coordinator/base.py b/kafka/coordinator/base.py index 5e41309df..1592f9154 100644 --- a/kafka/coordinator/base.py +++ b/kafka/coordinator/base.py @@ -14,9 +14,9 @@ from kafka.future import Future from kafka.metrics import AnonMeasurable from kafka.metrics.stats import Avg, Count, Max, Rate -from kafka.protocol.commit import GroupCoordinatorRequest, OffsetCommitRequest -from kafka.protocol.group import (HeartbeatRequest, JoinGroupRequest, - LeaveGroupRequest, SyncGroupRequest) +from kafka.protocol.find_coordinator import FindCoordinatorRequest +from kafka.protocol.group import HeartbeatRequest, JoinGroupRequest, LeaveGroupRequest, SyncGroupRequest, DEFAULT_GENERATION_ID, UNKNOWN_MEMBER_ID +from kafka.util import Timer log = logging.getLogger('kafka.coordinator') @@ -33,10 +33,17 @@ def __init__(self, generation_id, member_id, protocol): self.member_id = member_id self.protocol = protocol -Generation.NO_GENERATION = Generation( - OffsetCommitRequest[2].DEFAULT_GENERATION_ID, - JoinGroupRequest[0].UNKNOWN_MEMBER_ID, - None) + @property + def is_valid(self): + return self.generation_id != DEFAULT_GENERATION_ID + + def __eq__(self, other): + return (self.generation_id == other.generation_id and + self.member_id == other.member_id and + self.protocol == other.protocol) + + +Generation.NO_GENERATION = Generation(DEFAULT_GENERATION_ID, UNKNOWN_MEMBER_ID, None) class UnjoinedGroupException(Errors.KafkaError): @@ -87,10 +94,11 @@ class BaseCoordinator(object): 'max_poll_interval_ms': 300000, 'retry_backoff_ms': 100, 'api_version': (0, 10, 1), + 'metrics': None, 'metric_group_prefix': '', } - def __init__(self, client, metrics, **configs): + def __init__(self, client, **configs): """ Keyword Arguments: group_id (str): name of the consumer group to join for dynamic @@ -133,8 +141,11 @@ def __init__(self, client, metrics, **configs): self.coordinator_id = None self._find_coordinator_future = None self._generation = Generation.NO_GENERATION - self.sensors = GroupCoordinatorMetrics(self.heartbeat, metrics, - self.config['metric_group_prefix']) + if self.config['metrics']: + self._sensors = GroupCoordinatorMetrics(self.heartbeat, self.config['metrics'], + self.config['metric_group_prefix']) + else: + self._sensors = None @abc.abstractmethod 
def protocol_type(self): @@ -166,7 +177,7 @@ def group_protocols(self): pass @abc.abstractmethod - def _on_join_prepare(self, generation, member_id): + def _on_join_prepare(self, generation, member_id, timeout_ms=None): """Invoked prior to each group join or rejoin. This is typically used to perform any cleanup from the previous @@ -232,16 +243,22 @@ def coordinator(self): """ if self.coordinator_id is None: return None - elif self._client.is_disconnected(self.coordinator_id): + elif self._client.is_disconnected(self.coordinator_id) and self._client.connection_delay(self.coordinator_id) > 0: self.coordinator_dead('Node Disconnected') return None else: return self.coordinator_id - def ensure_coordinator_ready(self): - """Block until the coordinator for this group is known - (and we have an active connection -- java client uses unsent queue). + def ensure_coordinator_ready(self, timeout_ms=None): + """Block until the coordinator for this group is known. + + Keyword Arguments: + timeout_ms (numeric, optional): Maximum number of milliseconds to + block waiting to find coordinator. Default: None. + + Returns: True is coordinator found before timeout_ms, else False """ + timer = Timer(timeout_ms) with self._client._lock, self._lock: while self.coordinator_unknown(): @@ -249,24 +266,43 @@ def ensure_coordinator_ready(self): # so we will just pick a node at random and treat # it as the "coordinator" if self.config['api_version'] < (0, 8, 2): - self.coordinator_id = self._client.least_loaded_node() - if self.coordinator_id is not None: + maybe_coordinator_id = self._client.least_loaded_node() + if maybe_coordinator_id is None or self._client.cluster.is_bootstrap(maybe_coordinator_id): + future = Future().failure(Errors.NoBrokersAvailable()) + else: + self.coordinator_id = maybe_coordinator_id self._client.maybe_connect(self.coordinator_id) - continue + if timer.expired: + return False + else: + continue + else: + future = self.lookup_coordinator() + + self._client.poll(future=future, timeout_ms=timer.timeout_ms) - future = self.lookup_coordinator() - self._client.poll(future=future) + if not future.is_done: + return False if future.failed(): if future.retriable(): if getattr(future.exception, 'invalid_metadata', False): log.debug('Requesting metadata for group coordinator request: %s', future.exception) metadata_update = self._client.cluster.request_update() - self._client.poll(future=metadata_update) + self._client.poll(future=metadata_update, timeout_ms=timer.timeout_ms) + if not metadata_update.is_done: + return False else: - time.sleep(self.config['retry_backoff_ms'] / 1000) + if timeout_ms is None or timer.timeout_ms > self.config['retry_backoff_ms']: + time.sleep(self.config['retry_backoff_ms'] / 1000) + else: + time.sleep(timer.timeout_ms / 1000) else: raise future.exception # pylint: disable-msg=raising-bad-type + if timer.expired: + return False + else: + return True def _reset_find_coordinator_future(self, result): self._find_coordinator_future = None @@ -330,103 +366,139 @@ def time_to_next_heartbeat(self): return float('inf') return self.heartbeat.time_to_next_heartbeat() + def _reset_join_group_future(self): + with self._lock: + self.join_future = None + + def _initiate_join_group(self): + with self._lock: + # we store the join future in case we are woken up by the user + # after beginning the rebalance in the call to poll below. + # This ensures that we do not mistakenly attempt to rejoin + # before the pending rebalance has completed. 
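ensure_coordinator_ready() and the join-group path above now thread an optional timeout_ms through every blocking wait via kafka.util.Timer. The helper below is a simplified stand-in written for illustration (not the library's Timer class), showing the bounded wait-with-backoff pattern these loops follow:

import time

class IllustrativeTimer(object):
    """Stand-in for the timeout bookkeeping used in the patch."""
    def __init__(self, timeout_ms):
        self._deadline = None if timeout_ms is None else time.time() + timeout_ms / 1000.0

    @property
    def timeout_ms(self):
        if self._deadline is None:
            return None
        return max(0, int((self._deadline - time.time()) * 1000))

    @property
    def expired(self):
        return self._deadline is not None and time.time() >= self._deadline

def wait_until(condition, timeout_ms=None, retry_backoff_ms=100):
    """Return True when condition() holds, or False if timeout_ms elapses first."""
    timer = IllustrativeTimer(timeout_ms)
    while not condition():
        if timer.expired:
            return False
        # sleep the smaller of the backoff and the time remaining
        remaining = timer.timeout_ms
        sleep_ms = retry_backoff_ms if remaining is None else min(retry_backoff_ms, remaining)
        time.sleep(sleep_ms / 1000.0)
    return True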
+ if self.join_future is None: + self.state = MemberState.REBALANCING + self.join_future = self._send_join_group_request() + + # handle join completion in the callback so that the + # callback will be invoked even if the consumer is woken up + # before finishing the rebalance + self.join_future.add_callback(self._handle_join_success) + + # we handle failures below after the request finishes. + # If the join completes after having been woken up, the + # exception is ignored and we will rejoin + self.join_future.add_errback(self._handle_join_failure) + + return self.join_future + def _handle_join_success(self, member_assignment_bytes): + # handle join completion in the callback so that the callback + # will be invoked even if the consumer is woken up before + # finishing the rebalance with self._lock: log.info("Successfully joined group %s with generation %s", self.group_id, self._generation.generation_id) self.state = MemberState.STABLE - self.rejoin_needed = False if self._heartbeat_thread: self._heartbeat_thread.enable() def _handle_join_failure(self, _): + # we handle failures below after the request finishes. + # if the join completes after having been woken up, + # the exception is ignored and we will rejoin with self._lock: self.state = MemberState.UNJOINED - def ensure_active_group(self): - """Ensure that the group is active (i.e. joined and synced)""" - with self._client._lock, self._lock: - if self._heartbeat_thread is None: - self._start_heartbeat_thread() - - while self.need_rejoin() or self._rejoin_incomplete(): - self.ensure_coordinator_ready() - - # call on_join_prepare if needed. We set a flag - # to make sure that we do not call it a second - # time if the client is woken up before a pending - # rebalance completes. This must be called on each - # iteration of the loop because an event requiring - # a rebalance (such as a metadata refresh which - # changes the matched subscription set) can occur - # while another rebalance is still in progress. - if not self.rejoining: - self._on_join_prepare(self._generation.generation_id, - self._generation.member_id) - self.rejoining = True - - # ensure that there are no pending requests to the coordinator. - # This is important in particular to avoid resending a pending - # JoinGroup request. - while not self.coordinator_unknown(): - if not self._client.in_flight_request_count(self.coordinator_id): - break - self._client.poll() - else: - continue - - # we store the join future in case we are woken up by the user - # after beginning the rebalance in the call to poll below. - # This ensures that we do not mistakenly attempt to rejoin - # before the pending rebalance has completed. - if self.join_future is None: - # Fence off the heartbeat thread explicitly so that it cannot - # interfere with the join group. Note that this must come after - # the call to _on_join_prepare since we must be able to continue - # sending heartbeats if that callback takes some time. - self._heartbeat_thread.disable() - - self.state = MemberState.REBALANCING - future = self._send_join_group_request() - - self.join_future = future # this should happen before adding callbacks + def ensure_active_group(self, timeout_ms=None): + """Ensure that the group is active (i.e. joined and synced) - # handle join completion in the callback so that the - # callback will be invoked even if the consumer is woken up - # before finishing the rebalance - future.add_callback(self._handle_join_success) - - # we handle failures below after the request finishes. 
- # If the join completes after having been woken up, the - # exception is ignored and we will rejoin - future.add_errback(self._handle_join_failure) - - else: - future = self.join_future - - self._client.poll(future=future) + Keyword Arguments: + timeout_ms (numeric, optional): Maximum number of milliseconds to + block waiting to join group. Default: None. - if future.succeeded(): - self._on_join_complete(self._generation.generation_id, - self._generation.member_id, - self._generation.protocol, - future.value) - self.join_future = None - self.rejoining = False + Returns: True if group initialized before timeout_ms, else False + """ + if self.config['api_version'] < (0, 9): + raise Errors.UnsupportedVersionError('Group Coordinator APIs require 0.9+ broker') + timer = Timer(timeout_ms) + if not self.ensure_coordinator_ready(timeout_ms=timer.timeout_ms): + return False + self._start_heartbeat_thread() + return self.join_group(timeout_ms=timer.timeout_ms) + + def join_group(self, timeout_ms=None): + if self.config['api_version'] < (0, 9): + raise Errors.UnsupportedVersionError('Group Coordinator APIs require 0.9+ broker') + timer = Timer(timeout_ms) + while self.need_rejoin(): + if not self.ensure_coordinator_ready(timeout_ms=timer.timeout_ms): + return False + + # call on_join_prepare if needed. We set a flag + # to make sure that we do not call it a second + # time if the client is woken up before a pending + # rebalance completes. This must be called on each + # iteration of the loop because an event requiring + # a rebalance (such as a metadata refresh which + # changes the matched subscription set) can occur + # while another rebalance is still in progress. + if not self.rejoining: + self._on_join_prepare(self._generation.generation_id, + self._generation.member_id, + timeout_ms=timer.timeout_ms) + self.rejoining = True + + # fence off the heartbeat thread explicitly so that it cannot + # interfere with the join group. # Note that this must come after + # the call to onJoinPrepare since we must be able to continue + # sending heartbeats if that callback takes some time. + self._disable_heartbeat_thread() + + # ensure that there are no pending requests to the coordinator. + # This is important in particular to avoid resending a pending + # JoinGroup request. 
+ while not self.coordinator_unknown(): + if not self._client.in_flight_request_count(self.coordinator_id): + break + poll_timeout_ms = 200 if timer.timeout_ms is None or timer.timeout_ms > 200 else timer.timeout_ms + self._client.poll(timeout_ms=poll_timeout_ms) + if timer.expired: + return False + else: + continue + future = self._initiate_join_group() + self._client.poll(future=future, timeout_ms=timer.timeout_ms) + if future.is_done: + self._reset_join_group_future() + else: + return False + + if future.succeeded(): + self.rejoining = False + self.rejoin_needed = False + self._on_join_complete(self._generation.generation_id, + self._generation.member_id, + self._generation.protocol, + future.value) + return True + else: + exception = future.exception + if isinstance(exception, (Errors.UnknownMemberIdError, + Errors.RebalanceInProgressError, + Errors.IllegalGenerationError, + Errors.MemberIdRequiredError)): + continue + elif not future.retriable(): + raise exception # pylint: disable-msg=raising-bad-type + elif timer.expired: + return False else: - self.join_future = None - exception = future.exception - if isinstance(exception, (Errors.UnknownMemberIdError, - Errors.RebalanceInProgressError, - Errors.IllegalGenerationError)): - continue - elif not future.retriable(): - raise exception # pylint: disable-msg=raising-bad-type - time.sleep(self.config['retry_backoff_ms'] / 1000) - - def _rejoin_incomplete(self): - return self.join_future is not None + if timer.timeout_ms is None or timer.timeout_ms > self.config['retry_backoff_ms']: + time.sleep(self.config['retry_backoff_ms'] / 1000) + else: + time.sleep(timer.timeout_ms / 1000) def _send_join_group_request(self): """Join the group and return the assignment for the next generation. @@ -439,7 +511,7 @@ def _send_join_group_request(self): group leader """ if self.coordinator_unknown(): - e = Errors.GroupCoordinatorNotAvailableError(self.coordinator_id) + e = Errors.CoordinatorNotAvailableError(self.coordinator_id) return Future().failure(e) elif not self._client.ready(self.coordinator_id, metadata_priority=False): @@ -452,25 +524,16 @@ def _send_join_group_request(self): (protocol, metadata if isinstance(metadata, bytes) else metadata.encode()) for protocol, metadata in self.group_protocols() ] - if self.config['api_version'] < (0, 9): - raise Errors.KafkaError('JoinGroupRequest api requires 0.9+ brokers') - elif (0, 9) <= self.config['api_version'] < (0, 10, 1): - request = JoinGroupRequest[0]( - self.group_id, - self.config['session_timeout_ms'], - self._generation.member_id, - self.protocol_type(), - member_metadata) - elif (0, 10, 1) <= self.config['api_version'] < (0, 11, 0): - request = JoinGroupRequest[1]( + version = self._client.api_version(JoinGroupRequest, max_version=4) + if version == 0: + request = JoinGroupRequest[version]( self.group_id, self.config['session_timeout_ms'], - self.config['max_poll_interval_ms'], self._generation.member_id, self.protocol_type(), member_metadata) else: - request = JoinGroupRequest[2]( + request = JoinGroupRequest[version]( self.group_id, self.config['session_timeout_ms'], self.config['max_poll_interval_ms'], @@ -505,7 +568,8 @@ def _handle_join_group_response(self, future, send_time, response): if error_type is Errors.NoError: log.debug("Received successful JoinGroup response for group %s: %s", self.group_id, response) - self.sensors.join_latency.record((time.time() - send_time) * 1000) + if self._sensors: + self._sensors.join_latency.record((time.time() - send_time) * 1000) with self._lock: if 
self.state is not MemberState.REBALANCING: # if the consumer was woken up before a rebalance completes, @@ -524,7 +588,7 @@ def _handle_join_group_response(self, future, send_time, response): else: self._on_join_follower().chain(future) - elif error_type is Errors.GroupLoadInProgressError: + elif error_type is Errors.CoordinatorLoadInProgressError: log.debug("Attempt to join group %s rejected since coordinator %s" " is loading the group.", self.group_id, self.coordinator_id) # backoff and retry @@ -536,8 +600,8 @@ def _handle_join_group_response(self, future, send_time, response): log.debug("Attempt to join group %s failed due to unknown member id", self.group_id) future.failure(error) - elif error_type in (Errors.GroupCoordinatorNotAvailableError, - Errors.NotCoordinatorForGroupError): + elif error_type in (Errors.CoordinatorNotAvailableError, + Errors.NotCoordinatorError): # re-discover the coordinator and retry with backoff self.coordinator_dead(error_type()) log.debug("Attempt to join group %s failed due to obsolete " @@ -554,6 +618,11 @@ def _handle_join_group_response(self, future, send_time, response): future.failure(error) elif error_type is Errors.GroupAuthorizationFailedError: future.failure(error_type(self.group_id)) + elif error_type is Errors.MemberIdRequiredError: + # Broker requires a concrete member id to be allowed to join the group. Update member id + # and send another join group request in next cycle. + self.reset_generation(response.member_id) + future.failure(error_type()) else: # unexpected error, throw the exception error = error_type() @@ -562,7 +631,7 @@ def _handle_join_group_response(self, future, send_time, response): def _on_join_follower(self): # send follower's sync group with an empty assignment - version = 0 if self.config['api_version'] < (0, 11, 0) else 1 + version = self._client.api_version(SyncGroupRequest, max_version=2) request = SyncGroupRequest[version]( self.group_id, self._generation.generation_id, @@ -590,7 +659,7 @@ def _on_join_leader(self, response): except Exception as e: return Future().failure(e) - version = 0 if self.config['api_version'] < (0, 11, 0) else 1 + version = self._client.api_version(SyncGroupRequest, max_version=2) request = SyncGroupRequest[version]( self.group_id, self._generation.generation_id, @@ -605,7 +674,7 @@ def _on_join_leader(self, response): def _send_sync_group_request(self, request): if self.coordinator_unknown(): - e = Errors.GroupCoordinatorNotAvailableError(self.coordinator_id) + e = Errors.CoordinatorNotAvailableError(self.coordinator_id) return Future().failure(e) # We assume that coordinator is ready if we're sending SyncGroup @@ -624,7 +693,8 @@ def _send_sync_group_request(self, request): def _handle_sync_group_response(self, future, send_time, response): error_type = Errors.for_code(response.error_code) if error_type is Errors.NoError: - self.sensors.sync_latency.record((time.time() - send_time) * 1000) + if self._sensors: + self._sensors.sync_latency.record((time.time() - send_time) * 1000) future.success(response.member_assignment) return @@ -642,8 +712,8 @@ def _handle_sync_group_response(self, future, send_time, response): log.debug("SyncGroup for group %s failed due to %s", self.group_id, error) self.reset_generation() future.failure(error) - elif error_type in (Errors.GroupCoordinatorNotAvailableError, - Errors.NotCoordinatorForGroupError): + elif error_type in (Errors.CoordinatorNotAvailableError, + Errors.NotCoordinatorError): error = error_type() log.debug("SyncGroup for group %s failed due to 
%s", self.group_id, error) self.coordinator_dead(error) @@ -660,7 +730,7 @@ def _send_group_coordinator_request(self): Future: resolves to the node id of the coordinator """ node_id = self._client.least_loaded_node() - if node_id is None: + if node_id is None or self._client.cluster.is_bootstrap(node_id): return Future().failure(Errors.NoBrokersAvailable()) elif not self._client.ready(node_id, metadata_priority=False): @@ -669,7 +739,11 @@ def _send_group_coordinator_request(self): log.debug("Sending group coordinator request for group %s to broker %s", self.group_id, node_id) - request = GroupCoordinatorRequest[0](self.group_id) + version = self._client.api_version(FindCoordinatorRequest, max_version=2) + if version == 0: + request = FindCoordinatorRequest[version](self.group_id) + else: + request = FindCoordinatorRequest[version](self.group_id, 0) future = Future() _f = self._client.send(node_id, request) _f.add_callback(self._handle_group_coordinator_response, future) @@ -682,7 +756,7 @@ def _handle_group_coordinator_response(self, future, response): error_type = Errors.for_code(response.error_code) if error_type is Errors.NoError: with self._lock: - coordinator_id = self._client.cluster.add_group_coordinator(self.group_id, response) + coordinator_id = self._client.cluster.add_coordinator(response, 'group', self.group_id) if not coordinator_id: # This could happen if coordinator metadata is different # than broker metadata @@ -696,7 +770,7 @@ def _handle_group_coordinator_response(self, future, response): self.heartbeat.reset_timeouts() future.success(self.coordinator_id) - elif error_type is Errors.GroupCoordinatorNotAvailableError: + elif error_type is Errors.CoordinatorNotAvailableError: log.debug("Group Coordinator Not Available; retry") future.failure(error_type()) elif error_type is Errors.GroupAuthorizationFailedError: @@ -726,10 +800,10 @@ def generation(self): return None return self._generation - def reset_generation(self): - """Reset the generation and memberId because we have fallen out of the group.""" + def reset_generation(self, member_id=UNKNOWN_MEMBER_ID): + """Reset the generation and member_id because we have fallen out of the group.""" with self._lock: - self._generation = Generation.NO_GENERATION + self._generation = Generation(DEFAULT_GENERATION_ID, member_id, None) self.rejoin_needed = True self.state = MemberState.UNJOINED @@ -737,46 +811,62 @@ def request_rejoin(self): self.rejoin_needed = True def _start_heartbeat_thread(self): - if self._heartbeat_thread is None: - log.info('Starting new heartbeat thread') - self._heartbeat_thread = HeartbeatThread(weakref.proxy(self)) - self._heartbeat_thread.daemon = True - self._heartbeat_thread.start() - - def _close_heartbeat_thread(self): - if self._heartbeat_thread is not None: - log.info('Stopping heartbeat thread') - try: - self._heartbeat_thread.close() - except ReferenceError: - pass - self._heartbeat_thread = None + if self.config['api_version'] < (0, 9): + raise Errors.UnsupportedVersionError('Heartbeat APIs require 0.9+ broker') + with self._lock: + if self._heartbeat_thread is None: + log.info('Starting new heartbeat thread') + self._heartbeat_thread = HeartbeatThread(weakref.proxy(self)) + self._heartbeat_thread.daemon = True + self._heartbeat_thread.start() + log.debug("Started heartbeat thread %s", self._heartbeat_thread.ident) + + def _disable_heartbeat_thread(self): + with self._lock: + if self._heartbeat_thread is not None: + self._heartbeat_thread.disable() + + def _close_heartbeat_thread(self, 
timeout_ms=None): + with self._lock: + if self._heartbeat_thread is not None: + log.info('Stopping heartbeat thread') + try: + self._heartbeat_thread.close(timeout_ms=timeout_ms) + except ReferenceError: + pass + self._heartbeat_thread = None def __del__(self): - self._close_heartbeat_thread() + try: + self._close_heartbeat_thread() + except (TypeError, AttributeError): + pass - def close(self): + def close(self, timeout_ms=None): """Close the coordinator, leave the current group, and reset local generation / member_id""" - self._close_heartbeat_thread() - self.maybe_leave_group() + self._close_heartbeat_thread(timeout_ms=timeout_ms) + if self.config['api_version'] >= (0, 9): + self.maybe_leave_group(timeout_ms=timeout_ms) - def maybe_leave_group(self): + def maybe_leave_group(self, timeout_ms=None): """Leave the current group and reset local generation/memberId.""" + if self.config['api_version'] < (0, 9): + raise Errors.UnsupportedVersionError('Group Coordinator APIs require 0.9+ broker') with self._client._lock, self._lock: if (not self.coordinator_unknown() and self.state is not MemberState.UNJOINED - and self._generation is not Generation.NO_GENERATION): + and self._generation.is_valid): # this is a minimal effort attempt to leave the group. we do not # attempt any resending if the request fails or times out. log.info('Leaving consumer group (%s).', self.group_id) - version = 0 if self.config['api_version'] < (0, 11, 0) else 1 + version = self._client.api_version(LeaveGroupRequest, max_version=2) request = LeaveGroupRequest[version](self.group_id, self._generation.member_id) future = self._client.send(self.coordinator_id, request) future.add_callback(self._handle_leave_group_response) future.add_errback(log.error, "LeaveGroup request failed: %s") - self._client.poll(future=future) + self._client.poll(future=future, timeout_ms=timeout_ms) self.reset_generation() @@ -792,14 +882,14 @@ def _handle_leave_group_response(self, response): def _send_heartbeat_request(self): """Send a heartbeat request""" if self.coordinator_unknown(): - e = Errors.GroupCoordinatorNotAvailableError(self.coordinator_id) + e = Errors.CoordinatorNotAvailableError(self.coordinator_id) return Future().failure(e) elif not self._client.ready(self.coordinator_id, metadata_priority=False): e = Errors.NodeNotReadyError(self.coordinator_id) return Future().failure(e) - version = 0 if self.config['api_version'] < (0, 11, 0) else 1 + version = self._client.api_version(HeartbeatRequest, max_version=2) request = HeartbeatRequest[version](self.group_id, self._generation.generation_id, self._generation.member_id) @@ -812,14 +902,15 @@ def _send_heartbeat_request(self): return future def _handle_heartbeat_response(self, future, send_time, response): - self.sensors.heartbeat_latency.record((time.time() - send_time) * 1000) + if self._sensors: + self._sensors.heartbeat_latency.record((time.time() - send_time) * 1000) error_type = Errors.for_code(response.error_code) if error_type is Errors.NoError: log.debug("Received successful heartbeat response for group %s", self.group_id) future.success(None) - elif error_type in (Errors.GroupCoordinatorNotAvailableError, - Errors.NotCoordinatorForGroupError): + elif error_type in (Errors.CoordinatorNotAvailableError, + Errors.NotCoordinatorError): log.warning("Heartbeat failed for group %s: coordinator (node %s)" " is either not started or not valid", self.group_id, self.coordinator()) @@ -912,19 +1003,34 @@ def __init__(self, coordinator): def enable(self): with self.coordinator._lock: + 
log.debug('Enabling heartbeat thread') self.enabled = True self.coordinator.heartbeat.reset_timeouts() self.coordinator._lock.notify() def disable(self): - self.enabled = False + with self.coordinator._lock: + log.debug('Disabling heartbeat thread') + self.enabled = False - def close(self): + def close(self, timeout_ms=None): + if self.closed: + return self.closed = True + + # Generally this should not happen - close() is triggered + # by the coordinator. But in some cases GC may close the coordinator + # from within the heartbeat thread. + if threading.current_thread() == self: + return + with self.coordinator._lock: self.coordinator._lock.notify() + if self.is_alive(): - self.join(self.coordinator.config['heartbeat_interval_ms'] / 1000) + if timeout_ms is None: + timeout_ms = self.coordinator.config['heartbeat_interval_ms'] + self.join(timeout_ms / 1000) if self.is_alive(): log.warning("Heartbeat thread did not fully terminate during close") @@ -952,7 +1058,7 @@ def _run_once(self): # disable here to prevent propagating an exception to this # heartbeat thread # must get client._lock, or maybe deadlock at heartbeat - # failure callbak in consumer poll + # failure callback in consumer poll self.coordinator._client.poll(timeout_ms=0) with self.coordinator._lock: @@ -990,6 +1096,11 @@ def _run_once(self): # foreground thread has stalled in between calls to # poll(), so we explicitly leave the group. log.warning('Heartbeat poll expired, leaving group') + ### XXX + # maybe_leave_group acquires client + coordinator lock; + # if we hold coordinator lock before calling, we risk deadlock + # release() is safe here because this is the last code in the current context + self.coordinator._lock.release() self.coordinator.maybe_leave_group() elif not self.coordinator.heartbeat.should_heartbeat(): diff --git a/kafka/coordinator/consumer.py b/kafka/coordinator/consumer.py index 971f5e802..3db00d72c 100644 --- a/kafka/coordinator/consumer.py +++ b/kafka/coordinator/consumer.py @@ -19,7 +19,7 @@ from kafka.metrics.stats import Avg, Count, Max, Rate from kafka.protocol.commit import OffsetCommitRequest, OffsetFetchRequest from kafka.structs import OffsetAndMetadata, TopicPartition -from kafka.util import WeakMethod +from kafka.util import Timer, WeakMethod log = logging.getLogger(__name__) @@ -39,10 +39,11 @@ class ConsumerCoordinator(BaseCoordinator): 'retry_backoff_ms': 100, 'api_version': (0, 10, 1), 'exclude_internal_topics': True, + 'metrics': None, 'metric_group_prefix': 'consumer' } - def __init__(self, client, subscription, metrics, **configs): + def __init__(self, client, subscription, **configs): """Initialize the coordination manager. Keyword Arguments: @@ -54,7 +55,7 @@ def __init__(self, client, subscription, metrics, **configs): auto_commit_interval_ms (int): milliseconds between automatic offset commits, if enable_auto_commit is True. Default: 5000. default_offset_commit_callback (callable): called as - callback(offsets, exception) response will be either an Exception + callback(offsets, response) response will be either an Exception or None. This callback can be used to trigger custom actions when a commit request completes. assignors (list): List of objects to use to distribute partition @@ -78,7 +79,7 @@ def __init__(self, client, subscription, metrics, **configs): True the only way to receive records from an internal topic is subscribing to it. Requires 0.10+. 
Default: True """ - super(ConsumerCoordinator, self).__init__(client, metrics, **configs) + super(ConsumerCoordinator, self).__init__(client, **configs) self.config = copy.copy(self.DEFAULT_CONFIG) for key in self.config: @@ -94,6 +95,7 @@ def __init__(self, client, subscription, metrics, **configs): self.auto_commit_interval = self.config['auto_commit_interval_ms'] / 1000 self.next_auto_commit_deadline = None self.completed_offset_commits = collections.deque() + self._offset_fetch_futures = dict() if self.config['default_offset_commit_callback'] is None: self.config['default_offset_commit_callback'] = self._default_offset_commit_callback @@ -120,15 +122,21 @@ def __init__(self, client, subscription, metrics, **configs): else: self.next_auto_commit_deadline = time.time() + self.auto_commit_interval - self.consumer_sensors = ConsumerCoordinatorMetrics( - metrics, self.config['metric_group_prefix'], self._subscription) + if self.config['metrics']: + self._consumer_sensors = ConsumerCoordinatorMetrics( + self.config['metrics'], self.config['metric_group_prefix'], self._subscription) + else: + self._consumer_sensors = None self._cluster.request_update() self._cluster.add_listener(WeakMethod(self._handle_metadata_update)) def __del__(self): if hasattr(self, '_cluster') and self._cluster: - self._cluster.remove_listener(WeakMethod(self._handle_metadata_update)) + try: + self._cluster.remove_listener(WeakMethod(self._handle_metadata_update)) + except TypeError: + pass super(ConsumerCoordinator, self).__del__() def protocol_type(self): @@ -200,8 +208,8 @@ def _auto_assign_all_partitions(self): def _build_metadata_snapshot(self, subscription, cluster): metadata_snapshot = {} for topic in subscription.group_subscription(): - partitions = cluster.partitions_for_topic(topic) or [] - metadata_snapshot[topic] = set(partitions) + partitions = cluster.partitions_for_topic(topic) + metadata_snapshot[topic] = partitions or set() return metadata_snapshot def _lookup_assignor(self, name): @@ -222,10 +230,6 @@ def _on_join_complete(self, generation, member_id, protocol, assignment = ConsumerProtocol.ASSIGNMENT.decode(member_assignment_bytes) - # set the flag to refresh last committed offsets - self._subscription.needs_fetch_committed_offsets = True - - # update partition assignment try: self._subscription.assign_from_subscribed(assignment.partitions()) except ValueError as e: @@ -246,16 +250,16 @@ def _on_join_complete(self, generation, member_id, protocol, assigned, self.group_id) # execute the user's callback after rebalance - if self._subscription.listener: + if self._subscription.rebalance_listener: try: - self._subscription.listener.on_partitions_assigned(assigned) + self._subscription.rebalance_listener.on_partitions_assigned(assigned) except Exception: - log.exception("User provided listener %s for group %s" + log.exception("User provided rebalance listener %s for group %s" " failed on partition assignment: %s", - self._subscription.listener, self.group_id, + self._subscription.rebalance_listener, self.group_id, assigned) - def poll(self): + def poll(self, timeout_ms=None): """ Poll for coordinator events. Only applicable if group_id is set, and broker version supports GroupCoordinators. This ensures that the @@ -264,33 +268,43 @@ def poll(self): periodic offset commits if they are enabled. 
""" if self.group_id is None: - return + return True - self._invoke_completed_offset_commit_callbacks() - self.ensure_coordinator_ready() - - if self.config['api_version'] >= (0, 9) and self._subscription.partitions_auto_assigned(): - if self.need_rejoin(): - # due to a race condition between the initial metadata fetch and the - # initial rebalance, we need to ensure that the metadata is fresh - # before joining initially, and then request the metadata update. If - # metadata update arrives while the rebalance is still pending (for - # example, when the join group is still inflight), then we will lose - # track of the fact that we need to rebalance again to reflect the - # change to the topic subscription. Without ensuring that the - # metadata is fresh, any metadata update that changes the topic - # subscriptions and arrives while a rebalance is in progress will - # essentially be ignored. See KAFKA-3949 for the complete - # description of the problem. - if self._subscription.subscribed_pattern: - metadata_update = self._client.cluster.request_update() - self._client.poll(future=metadata_update) - - self.ensure_active_group() - - self.poll_heartbeat() - - self._maybe_auto_commit_offsets_async() + timer = Timer(timeout_ms) + try: + self._invoke_completed_offset_commit_callbacks() + if not self.ensure_coordinator_ready(timeout_ms=timer.timeout_ms): + return False + + if self.config['api_version'] >= (0, 9) and self._subscription.partitions_auto_assigned(): + if self.need_rejoin(): + # due to a race condition between the initial metadata fetch and the + # initial rebalance, we need to ensure that the metadata is fresh + # before joining initially, and then request the metadata update. If + # metadata update arrives while the rebalance is still pending (for + # example, when the join group is still inflight), then we will lose + # track of the fact that we need to rebalance again to reflect the + # change to the topic subscription. Without ensuring that the + # metadata is fresh, any metadata update that changes the topic + # subscriptions and arrives while a rebalance is in progress will + # essentially be ignored. See KAFKA-3949 for the complete + # description of the problem. 
+ if self._subscription.subscribed_pattern: + metadata_update = self._client.cluster.request_update() + self._client.poll(future=metadata_update, timeout_ms=timer.timeout_ms) + if not metadata_update.is_done: + return False + + if not self.ensure_active_group(timeout_ms=timer.timeout_ms): + return False + + self.poll_heartbeat() + + self._maybe_auto_commit_offsets_async() + return True + + except Errors.KafkaTimeoutError: + return False def time_to_next_poll(self): """Return seconds (float) remaining until :meth:`.poll` should be called again""" @@ -340,21 +354,21 @@ def _perform_assignment(self, leader_id, assignment_strategy, members): group_assignment[member_id] = assignment return group_assignment - def _on_join_prepare(self, generation, member_id): + def _on_join_prepare(self, generation, member_id, timeout_ms=None): # commit offsets prior to rebalance if auto-commit enabled - self._maybe_auto_commit_offsets_sync() + self._maybe_auto_commit_offsets_sync(timeout_ms=timeout_ms) # execute the user's callback before rebalance log.info("Revoking previously assigned partitions %s for group %s", self._subscription.assigned_partitions(), self.group_id) - if self._subscription.listener: + if self._subscription.rebalance_listener: try: revoked = set(self._subscription.assigned_partitions()) - self._subscription.listener.on_partitions_revoked(revoked) + self._subscription.rebalance_listener.on_partitions_revoked(revoked) except Exception: - log.exception("User provided subscription listener %s" + log.exception("User provided subscription rebalance listener %s" " for group %s failed on_partitions_revoked", - self._subscription.listener, self.group_id) + self._subscription.rebalance_listener, self.group_id) self._is_leader = False self._subscription.reset_group_subscription() @@ -383,17 +397,19 @@ def need_rejoin(self): return super(ConsumerCoordinator, self).need_rejoin() - def refresh_committed_offsets_if_needed(self): + def refresh_committed_offsets_if_needed(self, timeout_ms=None): """Fetch committed offsets for assigned partitions.""" - if self._subscription.needs_fetch_committed_offsets: - offsets = self.fetch_committed_offsets(self._subscription.assigned_partitions()) - for partition, offset in six.iteritems(offsets): - # verify assignment is still active - if self._subscription.is_assigned(partition): - self._subscription.assignment[partition].committed = offset - self._subscription.needs_fetch_committed_offsets = False - - def fetch_committed_offsets(self, partitions): + missing_fetch_positions = set(self._subscription.missing_fetch_positions()) + try: + offsets = self.fetch_committed_offsets(missing_fetch_positions, timeout_ms=timeout_ms) + except Errors.KafkaTimeoutError: + return False + for partition, offset in six.iteritems(offsets): + log.debug("Setting offset for partition %s to the committed offset %s", partition, offset.offset) + self._subscription.seek(partition, offset.offset) + return True + + def fetch_committed_offsets(self, partitions, timeout_ms=None): """Fetch the current committed offsets for specified partitions Arguments: @@ -401,26 +417,44 @@ def fetch_committed_offsets(self, partitions): Returns: dict: {TopicPartition: OffsetAndMetadata} + + Raises: + KafkaTimeoutError if timeout_ms provided """ if not partitions: return {} + future_key = frozenset(partitions) + timer = Timer(timeout_ms) while True: - self.ensure_coordinator_ready() + self.ensure_coordinator_ready(timeout_ms=timer.timeout_ms) # contact coordinator to fetch committed offsets - future = 
self._send_offset_fetch_request(partitions) - self._client.poll(future=future) + if future_key in self._offset_fetch_futures: + future = self._offset_fetch_futures[future_key] + else: + future = self._send_offset_fetch_request(partitions) + self._offset_fetch_futures[future_key] = future + + self._client.poll(future=future, timeout_ms=timer.timeout_ms) - if future.succeeded(): - return future.value + if future.is_done: + del self._offset_fetch_futures[future_key] - if not future.retriable(): - raise future.exception # pylint: disable-msg=raising-bad-type + if future.succeeded(): + return future.value - time.sleep(self.config['retry_backoff_ms'] / 1000) + elif not future.retriable(): + raise future.exception # pylint: disable-msg=raising-bad-type - def close(self, autocommit=True): + # future failed but is retriable, or is not done yet + if timer.timeout_ms is None or timer.timeout_ms > self.config['retry_backoff_ms']: + time.sleep(self.config['retry_backoff_ms'] / 1000) + else: + time.sleep(timer.timeout_ms / 1000) + timer.maybe_raise() + + def close(self, autocommit=True, timeout_ms=None): """Close the coordinator, leave the current group, and reset local generation / member_id. @@ -431,14 +465,14 @@ def close(self, autocommit=True): """ try: if autocommit: - self._maybe_auto_commit_offsets_sync() + self._maybe_auto_commit_offsets_sync(timeout_ms=timeout_ms) finally: - super(ConsumerCoordinator, self).close() + super(ConsumerCoordinator, self).close(timeout_ms=timeout_ms) def _invoke_completed_offset_commit_callbacks(self): while self.completed_offset_commits: - callback, offsets, exception = self.completed_offset_commits.popleft() - callback(offsets, exception) + callback, offsets, res_or_exc = self.completed_offset_commits.popleft() + callback(offsets, res_or_exc) def commit_offsets_async(self, offsets, callback=None): """Commit specific offsets asynchronously. @@ -478,18 +512,18 @@ def commit_offsets_async(self, offsets, callback=None): return future def _do_commit_offsets_async(self, offsets, callback=None): - assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' + if self.config['api_version'] < (0, 8, 1): + raise Errors.UnsupportedVersionError('OffsetCommitRequest requires 0.8.1+ broker') assert all(map(lambda k: isinstance(k, TopicPartition), offsets)) assert all(map(lambda v: isinstance(v, OffsetAndMetadata), offsets.values())) if callback is None: callback = self.config['default_offset_commit_callback'] - self._subscription.needs_fetch_committed_offsets = True future = self._send_offset_commit_request(offsets) future.add_both(lambda res: self.completed_offset_commits.appendleft((callback, offsets, res))) return future - def commit_offsets_sync(self, offsets): + def commit_offsets_sync(self, offsets, timeout_ms=None): """Commit specific offsets synchronously. 
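Per the updated callback documentation earlier in this diff, the async commit completion callback receives either None on success or an Exception on failure. A hypothetical callback sketch (the offsets and the `coordinator` object are placeholders):

from kafka.structs import TopicPartition, OffsetAndMetadata

def on_commit(offsets, response_or_exc):
    # None on success, an Exception on failure (see default_offset_commit_callback docs)
    if response_or_exc is None:
        print('committed %s' % (offsets,))
    else:
        print('commit failed for %s: %s' % (offsets, response_or_exc))

offsets = {TopicPartition('example-topic', 0): OffsetAndMetadata(100, '', -1)}
# coordinator.commit_offsets_async(offsets, callback=on_commit)   # `coordinator` is hypothetical here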
This method will retry until the commit completes successfully or an @@ -500,7 +534,8 @@ def commit_offsets_sync(self, offsets): Raises error on failure """ - assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' + if self.config['api_version'] < (0, 8, 1): + raise Errors.UnsupportedVersionError('OffsetCommitRequest requires 0.8.1+ broker') assert all(map(lambda k: isinstance(k, TopicPartition), offsets)) assert all(map(lambda v: isinstance(v, OffsetAndMetadata), offsets.values())) @@ -508,24 +543,31 @@ def commit_offsets_sync(self, offsets): if not offsets: return + timer = Timer(timeout_ms) while True: - self.ensure_coordinator_ready() + self.ensure_coordinator_ready(timeout_ms=timer.timeout_ms) future = self._send_offset_commit_request(offsets) - self._client.poll(future=future) + self._client.poll(future=future, timeout_ms=timer.timeout_ms) - if future.succeeded(): - return future.value + if future.is_done: + if future.succeeded(): + return future.value - if not future.retriable(): - raise future.exception # pylint: disable-msg=raising-bad-type + elif not future.retriable(): + raise future.exception # pylint: disable-msg=raising-bad-type - time.sleep(self.config['retry_backoff_ms'] / 1000) + # future failed but is retriable, or it is still pending + if timer.timeout_ms is None or timer.timeout_ms > self.config['retry_backoff_ms']: + time.sleep(self.config['retry_backoff_ms'] / 1000) + else: + time.sleep(timer.timeout_ms / 1000) + timer.maybe_raise() - def _maybe_auto_commit_offsets_sync(self): + def _maybe_auto_commit_offsets_sync(self, timeout_ms=None): if self.config['enable_auto_commit']: try: - self.commit_offsets_sync(self._subscription.all_consumed_offsets()) + self.commit_offsets_sync(self._subscription.all_consumed_offsets(), timeout_ms=timeout_ms) # The three main group membership errors are known and should not # require a stacktrace -- just a warning @@ -553,7 +595,8 @@ def _send_offset_commit_request(self, offsets): Returns: Future: indicating whether the commit was successful or not """ - assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' + if self.config['api_version'] < (0, 8, 1): + raise Errors.UnsupportedVersionError('OffsetCommitRequest requires 0.8.1+ broker') assert all(map(lambda k: isinstance(k, TopicPartition), offsets)) assert all(map(lambda v: isinstance(v, OffsetAndMetadata), offsets.values())) @@ -563,7 +606,7 @@ def _send_offset_commit_request(self, offsets): node_id = self.coordinator() if node_id is None: - return Future().failure(Errors.GroupCoordinatorNotAvailableError) + return Future().failure(Errors.CoordinatorNotAvailableError) # create the offset commit request @@ -571,7 +614,8 @@ def _send_offset_commit_request(self, offsets): for tp, offset in six.iteritems(offsets): offset_data[tp.topic][tp.partition] = offset - if self._subscription.partitions_auto_assigned(): + version = self._client.api_version(OffsetCommitRequest, max_version=6) + if version > 1 and self._subscription.partitions_auto_assigned(): generation = self.generation() else: generation = Generation.NO_GENERATION @@ -579,42 +623,74 @@ def _send_offset_commit_request(self, offsets): # if the generation is None, we are not part of an active group # (and we expect to be). 
The only thing we can do is fail the commit # and let the user rejoin the group in poll() - if self.config['api_version'] >= (0, 9) and generation is None: - return Future().failure(Errors.CommitFailedError()) + if generation is None: + log.info("Failing OffsetCommit request since the consumer is not part of an active group") + return Future().failure(Errors.CommitFailedError('Group rebalance in progress')) - if self.config['api_version'] >= (0, 9): - request = OffsetCommitRequest[2]( + if version == 0: + request = OffsetCommitRequest[version]( self.group_id, + [( + topic, [( + partition, + offset.offset, + offset.metadata + ) for partition, offset in six.iteritems(partitions)] + ) for topic, partitions in six.iteritems(offset_data)] + ) + elif version == 1: + request = OffsetCommitRequest[version]( + self.group_id, + # This api version was only used in v0.8.2, prior to join group apis + # so this always ends up as NO_GENERATION generation.generation_id, generation.member_id, - OffsetCommitRequest[2].DEFAULT_RETENTION_TIME, [( topic, [( partition, offset.offset, + -1, # timestamp, unused offset.metadata ) for partition, offset in six.iteritems(partitions)] ) for topic, partitions in six.iteritems(offset_data)] ) - elif self.config['api_version'] >= (0, 8, 2): - request = OffsetCommitRequest[1]( - self.group_id, -1, '', + elif version <= 4: + request = OffsetCommitRequest[version]( + self.group_id, + generation.generation_id, + generation.member_id, + OffsetCommitRequest[version].DEFAULT_RETENTION_TIME, [( topic, [( partition, offset.offset, - -1, offset.metadata ) for partition, offset in six.iteritems(partitions)] ) for topic, partitions in six.iteritems(offset_data)] ) - elif self.config['api_version'] >= (0, 8, 1): - request = OffsetCommitRequest[0]( + elif version <= 5: + request = OffsetCommitRequest[version]( self.group_id, + generation.generation_id, + generation.member_id, + [( + topic, [( + partition, + offset.offset, + offset.metadata + ) for partition, offset in six.iteritems(partitions)] + ) for topic, partitions in six.iteritems(offset_data)] + ) + else: + request = OffsetCommitRequest[version]( + self.group_id, + generation.generation_id, + generation.member_id, [( topic, [( partition, offset.offset, + offset.leader_epoch, offset.metadata ) for partition, offset in six.iteritems(partitions)] ) for topic, partitions in six.iteritems(offset_data)] @@ -631,7 +707,8 @@ def _send_offset_commit_request(self, offsets): def _handle_offset_commit_response(self, offsets, future, send_time, response): # TODO look at adding request_latency_ms to response (like java kafka) - self.consumer_sensors.commit_latency.record((time.time() - send_time) * 1000) + if self._consumer_sensors: + self._consumer_sensors.commit_latency.record((time.time() - send_time) * 1000) unauthorized_topics = set() for topic, partitions in response.topics: @@ -643,8 +720,6 @@ def _handle_offset_commit_response(self, offsets, future, send_time, response): if error_type is Errors.NoError: log.debug("Group %s committed offset %s for partition %s", self.group_id, offset, tp) - if self._subscription.is_assigned(tp): - self._subscription.assignment[tp].committed = offset elif error_type is Errors.GroupAuthorizationFailedError: log.error("Not authorized to commit offsets for group %s", self.group_id) @@ -659,27 +734,36 @@ def _handle_offset_commit_response(self, offsets, future, send_time, response): " %s", self.group_id, tp, error_type.__name__) future.failure(error_type()) return - elif error_type is 
Errors.GroupLoadInProgressError: + elif error_type is Errors.CoordinatorLoadInProgressError: # just retry log.debug("OffsetCommit for group %s failed: %s", self.group_id, error_type.__name__) future.failure(error_type(self.group_id)) return - elif error_type in (Errors.GroupCoordinatorNotAvailableError, - Errors.NotCoordinatorForGroupError, + elif error_type in (Errors.CoordinatorNotAvailableError, + Errors.NotCoordinatorError, Errors.RequestTimedOutError): log.debug("OffsetCommit for group %s failed: %s", self.group_id, error_type.__name__) self.coordinator_dead(error_type()) future.failure(error_type(self.group_id)) return + elif error_type is Errors.RebalanceInProgressError: + # Consumer never tries to commit offset in between join-group and sync-group, + # and hence on broker-side it is not expected to see a commit offset request + # during CompletingRebalance phase; if it ever happens then broker would return + # this error. In this case we should just treat as a fatal CommitFailed exception. + # However, we do not need to reset generations and just request re-join, such that + # if the caller decides to proceed and poll, it would still try to proceed and re-join normally. + self.request_rejoin() + future.failure(Errors.CommitFailedError('Group rebalance in progress')) + return elif error_type in (Errors.UnknownMemberIdError, - Errors.IllegalGenerationError, - Errors.RebalanceInProgressError): - # need to re-join group + Errors.IllegalGenerationError): + # need reset generation and re-join group error = error_type(self.group_id) - log.debug("OffsetCommit for group %s failed: %s", - self.group_id, error) + log.warning("OffsetCommit for group %s failed: %s", + self.group_id, error) self.reset_generation() future.failure(Errors.CommitFailedError()) return @@ -709,14 +793,15 @@ def _send_offset_fetch_request(self, partitions): Returns: Future: resolves to dict of offsets: {TopicPartition: OffsetAndMetadata} """ - assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' + if self.config['api_version'] < (0, 8, 1): + raise Errors.UnsupportedVersionError('OffsetFetchRequest requires 0.8.1+ broker') assert all(map(lambda k: isinstance(k, TopicPartition), partitions)) if not partitions: return Future().success({}) node_id = self.coordinator() if node_id is None: - return Future().failure(Errors.GroupCoordinatorNotAvailableError) + return Future().failure(Errors.CoordinatorNotAvailableError) # Verify node is ready if not self._client.ready(node_id): @@ -731,16 +816,13 @@ def _send_offset_fetch_request(self, partitions): for tp in partitions: topic_partitions[tp.topic].add(tp.partition) - if self.config['api_version'] >= (0, 8, 2): - request = OffsetFetchRequest[1]( - self.group_id, - list(topic_partitions.items()) - ) - else: - request = OffsetFetchRequest[0]( - self.group_id, - list(topic_partitions.items()) - ) + version = self._client.api_version(OffsetFetchRequest, max_version=5) + # Starting in version 2, the request can contain a null topics array to indicate that offsets should be fetched + # TODO: support + request = OffsetFetchRequest[version]( + self.group_id, + list(topic_partitions.items()) + ) # send the request with a callback future = Future() @@ -750,21 +832,45 @@ def _send_offset_fetch_request(self, partitions): return future def _handle_offset_fetch_response(self, future, response): + if response.API_VERSION >= 2 and response.error_code != Errors.NoError.errno: + error_type = Errors.for_code(response.error_code) + log.debug("Offset fetch failed: %s", 
error_type.__name__) + error = error_type() + if error_type is Errors.CoordinatorLoadInProgressError: + # Retry + future.failure(error) + elif error_type is Errors.NotCoordinatorError: + # re-discover the coordinator and retry + self.coordinator_dead(error) + future.failure(error) + elif error_type is Errors.GroupAuthorizationFailedError: + future.failure(error) + else: + log.error("Unknown error fetching offsets: %s", error) + future.failure(error) + return + offsets = {} for topic, partitions in response.topics: - for partition, offset, metadata, error_code in partitions: + for partition_data in partitions: + partition, offset = partition_data[:2] + if response.API_VERSION >= 5: + leader_epoch, metadata, error_code = partition_data[2:] + else: + metadata, error_code = partition_data[2:] + leader_epoch = -1 # noqa: F841 tp = TopicPartition(topic, partition) error_type = Errors.for_code(error_code) if error_type is not Errors.NoError: error = error_type() log.debug("Group %s failed to fetch offset for partition" " %s: %s", self.group_id, tp, error) - if error_type is Errors.GroupLoadInProgressError: + if error_type is Errors.CoordinatorLoadInProgressError: # just retry future.failure(error) - elif error_type is Errors.NotCoordinatorForGroupError: + elif error_type is Errors.NotCoordinatorError: # re-discover the coordinator and retry - self.coordinator_dead(error_type()) + self.coordinator_dead(error) future.failure(error) elif error_type is Errors.UnknownTopicOrPartitionError: log.warning("OffsetFetchRequest -- unknown topic %s" @@ -779,34 +885,41 @@ def _handle_offset_fetch_response(self, future, response): elif offset >= 0: # record the position with the offset # (-1 indicates no committed offset to fetch) - offsets[tp] = OffsetAndMetadata(offset, metadata) + # TODO: save leader_epoch + offsets[tp] = OffsetAndMetadata(offset, metadata, -1) else: log.debug("Group %s has no committed offset for partition" " %s", self.group_id, tp) future.success(offsets) - def _default_offset_commit_callback(self, offsets, exception): - if exception is not None: - log.error("Offset commit failed: %s", exception) - - def _commit_offsets_async_on_complete(self, offsets, exception): - if exception is not None: + def _default_offset_commit_callback(self, offsets, res_or_exc): + if isinstance(res_or_exc, Exception): log.warning("Auto offset commit failed for group %s: %s", - self.group_id, exception) - if getattr(exception, 'retriable', False): - self.next_auto_commit_deadline = min(time.time() + self.config['retry_backoff_ms'] / 1000, self.next_auto_commit_deadline) + self.group_id, res_or_exc) else: log.debug("Completed autocommit of offsets %s for group %s", offsets, self.group_id) + def _commit_offsets_async_on_complete(self, offsets, res_or_exc): + if isinstance(res_or_exc, Exception) and getattr(res_or_exc, 'retriable', False): + self.next_auto_commit_deadline = min(time.time() + self.config['retry_backoff_ms'] / 1000, self.next_auto_commit_deadline) + self.config['default_offset_commit_callback'](offsets, res_or_exc) + def _maybe_auto_commit_offsets_async(self): if self.config['enable_auto_commit']: if self.coordinator_unknown(): self.next_auto_commit_deadline = time.time() + self.config['retry_backoff_ms'] / 1000 elif time.time() > self.next_auto_commit_deadline: self.next_auto_commit_deadline = time.time() + self.auto_commit_interval - self.commit_offsets_async(self._subscription.all_consumed_offsets(), - self._commit_offsets_async_on_complete) + self._do_auto_commit_offsets_async() + + def 
maybe_auto_commit_offsets_now(self): + if self.config['enable_auto_commit'] and not self.coordinator_unknown(): + self._do_auto_commit_offsets_async() + + def _do_auto_commit_offsets_async(self): + self.commit_offsets_async(self._subscription.all_consumed_offsets(), + self._commit_offsets_async_on_complete) class ConsumerCoordinatorMetrics(object): diff --git a/kafka/errors.py b/kafka/errors.py index b33cf51e2..898582615 100644 --- a/kafka/errors.py +++ b/kafka/errors.py @@ -16,21 +16,44 @@ def __str__(self): super(KafkaError, self).__str__()) +class Cancelled(KafkaError): + retriable = True + + +class CommitFailedError(KafkaError): + def __init__(self, *args): + if not args: + args = ("Commit cannot be completed since the group has already" + " rebalanced and assigned the partitions to another member." + " This means that the time between subsequent calls to poll()" + " was longer than the configured max_poll_interval_ms, which" + " typically implies that the poll loop is spending too much" + " time message processing. You can address this either by" + " increasing the rebalance timeout with max_poll_interval_ms," + " or by reducing the maximum size of batches returned in poll()" + " with max_poll_records.",) + super(CommitFailedError, self).__init__(*args) + + +class IllegalArgumentError(KafkaError): + pass + + class IllegalStateError(KafkaError): pass -class IllegalArgumentError(KafkaError): +class IncompatibleBrokerVersion(KafkaError): pass -class NoBrokersAvailable(KafkaError): - retriable = True - invalid_metadata = True +class KafkaConfigurationError(KafkaError): + pass -class NodeNotReadyError(KafkaError): +class KafkaConnectionError(KafkaError): retriable = True + invalid_metadata = True class KafkaProtocolError(KafkaError): @@ -41,52 +64,46 @@ class CorrelationIdError(KafkaProtocolError): retriable = True -class Cancelled(KafkaError): +class KafkaTimeoutError(KafkaError): retriable = True -class TooManyInFlightRequests(KafkaError): +class MetadataEmptyBrokerList(KafkaError): retriable = True -class StaleMetadata(KafkaError): +class NoBrokersAvailable(KafkaError): retriable = True invalid_metadata = True -class MetadataEmptyBrokerList(KafkaError): +class NoOffsetForPartitionError(KafkaError): + pass + + +class NodeNotReadyError(KafkaError): retriable = True -class UnrecognizedBrokerVersion(KafkaError): +class QuotaViolationError(KafkaError): pass -class IncompatibleBrokerVersion(KafkaError): - pass +class StaleMetadata(KafkaError): + retriable = True + invalid_metadata = True -class CommitFailedError(KafkaError): - def __init__(self, *args, **kwargs): - super(CommitFailedError, self).__init__( - """Commit cannot be completed since the group has already - rebalanced and assigned the partitions to another member. - This means that the time between subsequent calls to poll() - was longer than the configured max_poll_interval_ms, which - typically implies that the poll loop is spending too much - time message processing. You can address this either by - increasing the rebalance timeout with max_poll_interval_ms, - or by reducing the maximum size of batches returned in poll() - with max_poll_records. 
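The async commit callback contract above also changes: instead of an (offsets, exception-or-None) pair, callbacks now receive the broker response on success or the exception on failure in a single res_or_exc argument. A hedged example of a user callback written against that contract, assuming the usual commit_async() entry point forwards it here:

import logging

log = logging.getLogger(__name__)

def on_commit(offsets, res_or_exc):
    # res_or_exc is the OffsetCommitResponse on success, or the exception on failure.
    if isinstance(res_or_exc, Exception):
        log.warning('Commit of %s failed: %s', offsets, res_or_exc)
    else:
        log.debug('Commit of %s acknowledged', offsets)

# e.g. consumer.commit_async(callback=on_commit)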
- """, *args, **kwargs) - - -class AuthenticationMethodNotSupported(KafkaError): +class TooManyInFlightRequests(KafkaError): + retriable = True + + +class UnrecognizedBrokerVersion(KafkaError): pass -class AuthenticationFailedError(KafkaError): - retriable = False +class UnsupportedCodecError(KafkaError): + pass class BrokerResponseError(KafkaError): @@ -101,6 +118,10 @@ def __str__(self): super(BrokerResponseError, self).__str__()) +class AuthorizationError(BrokerResponseError): + pass + + class NoError(BrokerResponseError): errno = 0 message = 'NO_ERROR' @@ -120,14 +141,14 @@ class OffsetOutOfRangeError(BrokerResponseError): ' maintained by the server for the given topic/partition.') -class CorruptRecordException(BrokerResponseError): +class CorruptRecordError(BrokerResponseError): errno = 2 message = 'CORRUPT_MESSAGE' description = ('This message has failed its CRC checksum, exceeds the' ' valid size, or is otherwise corrupt.') # Backward compatibility -InvalidMessageError = CorruptRecordException +CorruptRecordException = CorruptRecordError class UnknownTopicOrPartitionError(BrokerResponseError): @@ -186,7 +207,8 @@ class ReplicaNotAvailableError(BrokerResponseError): message = 'REPLICA_NOT_AVAILABLE' description = ('If replica is expected on a broker, but is not (this can be' ' safely ignored).') - + retriable = True + invalid_metadata = True class MessageSizeTooLargeError(BrokerResponseError): errno = 10 @@ -210,39 +232,35 @@ class OffsetMetadataTooLargeError(BrokerResponseError): ' offset metadata.') -# TODO is this deprecated? https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-ErrorCodes -class StaleLeaderEpochCodeError(BrokerResponseError): +class NetworkExceptionError(BrokerResponseError): errno = 13 - message = 'STALE_LEADER_EPOCH_CODE' + message = 'NETWORK_EXCEPTION' + retriable = True + invalid_metadata = True -class GroupLoadInProgressError(BrokerResponseError): +class CoordinatorLoadInProgressError(BrokerResponseError): errno = 14 - message = 'OFFSETS_LOAD_IN_PROGRESS' - description = ('The broker returns this error code for an offset fetch' - ' request if it is still loading offsets (after a leader' - ' change for that offsets topic partition), or in response' - ' to group membership requests (such as heartbeats) when' - ' group metadata is being loaded by the coordinator.') + message = 'COORDINATOR_LOAD_IN_PROGRESS' + description = ('The broker returns this error code for txn or group requests,' + ' when the coordinator is loading and hence cant process requests') retriable = True -class GroupCoordinatorNotAvailableError(BrokerResponseError): +class CoordinatorNotAvailableError(BrokerResponseError): errno = 15 - message = 'CONSUMER_COORDINATOR_NOT_AVAILABLE' - description = ('The broker returns this error code for group coordinator' - ' requests, offset commits, and most group management' + message = 'COORDINATOR_NOT_AVAILABLE' + description = ('The broker returns this error code for consumer and transaction' ' requests if the offsets topic has not yet been created, or' - ' if the group coordinator is not active.') + ' if the group/txn coordinator is not active.') retriable = True -class NotCoordinatorForGroupError(BrokerResponseError): +class NotCoordinatorError(BrokerResponseError): errno = 16 - message = 'NOT_COORDINATOR_FOR_CONSUMER' - description = ('The broker returns this error code if it receives an offset' - ' fetch or commit request for a group that it is not a' - ' coordinator for.') + message = 
'NOT_COORDINATOR' + description = ('The broker returns this error code if it is not the correct' + ' coordinator for the specified consumer or transaction group') retriable = True @@ -339,21 +357,21 @@ class InvalidCommitOffsetSizeError(BrokerResponseError): ' because of oversize metadata.') -class TopicAuthorizationFailedError(BrokerResponseError): +class TopicAuthorizationFailedError(AuthorizationError): errno = 29 message = 'TOPIC_AUTHORIZATION_FAILED' description = ('Returned by the broker when the client is not authorized to' ' access the requested topic.') -class GroupAuthorizationFailedError(BrokerResponseError): +class GroupAuthorizationFailedError(AuthorizationError): errno = 30 message = 'GROUP_AUTHORIZATION_FAILED' description = ('Returned by the broker when the client is not authorized to' ' access a particular groupId.') -class ClusterAuthorizationFailedError(BrokerResponseError): +class ClusterAuthorizationFailedError(AuthorizationError): errno = 31 message = 'CLUSTER_AUTHORIZATION_FAILED' description = ('Returned by the broker when the client is not authorized to' @@ -441,98 +459,615 @@ class PolicyViolationError(BrokerResponseError): errno = 44 message = 'POLICY_VIOLATION' description = 'Request parameters do not satisfy the configured policy.' + retriable = False + + +class OutOfOrderSequenceNumberError(BrokerResponseError): + errno = 45 + message = 'OUT_OF_ORDER_SEQUENCE_NUMBER' + description = 'The broker received an out of order sequence number.' + retriable = False + + +class DuplicateSequenceNumberError(BrokerResponseError): + errno = 46 + message = 'DUPLICATE_SEQUENCE_NUMBER' + description = 'The broker received a duplicate sequence number.' + retriable = False + + +class InvalidProducerEpochError(BrokerResponseError): + errno = 47 + message = 'INVALID_PRODUCER_EPOCH' + description = 'Producer attempted to produce with an old epoch.' + retriable = False + + +class InvalidTxnStateError(BrokerResponseError): + errno = 48 + message = 'INVALID_TXN_STATE' + description = 'The producer attempted a transactional operation in an invalid state.' + retriable = False + + +class InvalidProducerIdMappingError(BrokerResponseError): + errno = 49 + message = 'INVALID_PRODUCER_ID_MAPPING' + description = 'The producer attempted to use a producer id which is not currently assigned to its transactional id.' + retriable = False + + +class InvalidTransactionTimeoutError(BrokerResponseError): + errno = 50 + message = 'INVALID_TRANSACTION_TIMEOUT' + description = 'The transaction timeout is larger than the maximum value allowed by the broker (as configured by transaction.max.timeout.ms).' + retriable = False + + +class ConcurrentTransactionsError(BrokerResponseError): + errno = 51 + message = 'CONCURRENT_TRANSACTIONS' + description = 'The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing.' + retriable = True + + +class TransactionCoordinatorFencedError(BrokerResponseError): + errno = 52 + message = 'TRANSACTION_COORDINATOR_FENCED' + description = 'Indicates that the transaction coordinator sending a WriteTxnMarker is no longer the current coordinator for a given producer.' + retriable = False + + +class TransactionalIdAuthorizationFailedError(AuthorizationError): + errno = 53 + message = 'TRANSACTIONAL_ID_AUTHORIZATION_FAILED' + description = 'Transactional Id authorization failed.' 
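With the new AuthorizationError base class introduced above, topic, group, cluster, transactional-id and (further down) delegation-token authorization failures can all be handled in one place. A small illustrative snippet:

import kafka.errors as Errors

try:
    raise Errors.GroupAuthorizationFailedError('my-group')
except Errors.AuthorizationError as exc:
    # TopicAuthorizationFailedError, ClusterAuthorizationFailedError and the new
    # transactional/delegation-token variants land here as well.
    print('not authorized:', exc)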
+ retriable = False class SecurityDisabledError(BrokerResponseError): errno = 54 message = 'SECURITY_DISABLED' description = 'Security features are disabled.' + retriable = False + + +class OperationNotAttemptedError(BrokerResponseError): + errno = 55 + message = 'OPERATION_NOT_ATTEMPTED' + description = 'The broker did not attempt to execute this operation. This may happen for batched RPCs where some operations in the batch failed, causing the broker to respond without trying the rest.' + retriable = False + + +class KafkaStorageError(BrokerResponseError): + errno = 56 + message = 'KAFKA_STORAGE_ERROR' + description = 'Disk error when trying to access log file on the disk.' + retriable = True + invalid_metadata = True + + +class LogDirNotFoundError(BrokerResponseError): + errno = 57 + message = 'LOG_DIR_NOT_FOUND' + description = 'The user-specified log directory is not found in the broker config.' + retriable = False + + +class SaslAuthenticationFailedError(BrokerResponseError): + errno = 58 + message = 'SASL_AUTHENTICATION_FAILED' + description = 'SASL Authentication failed.' + retriable = False + + +class UnknownProducerIdError(BrokerResponseError): + errno = 59 + message = 'UNKNOWN_PRODUCER_ID' + description = 'This exception is raised by the broker if it could not locate the producer metadata associated with the producerId in question. This could happen if, for instance, the producer\'s records were deleted because their retention time had elapsed. Once the last records of the producerId are removed, the producer\'s metadata is removed from the broker, and future appends by the producer will return this exception.' + retriable = False + + +class ReassignmentInProgressError(BrokerResponseError): + errno = 60 + message = 'REASSIGNMENT_IN_PROGRESS' + description = 'A partition reassignment is in progress.' + retriable = False + + +class DelegationTokenAuthDisabledError(BrokerResponseError): + errno = 61 + message = 'DELEGATION_TOKEN_AUTH_DISABLED' + description = 'Delegation Token feature is not enabled.' + retriable = False + + +class DelegationTokenNotFoundError(BrokerResponseError): + errno = 62 + message = 'DELEGATION_TOKEN_NOT_FOUND' + description = 'Delegation Token is not found on server.' + retriable = False + + +class DelegationTokenOwnerMismatchError(BrokerResponseError): + errno = 63 + message = 'DELEGATION_TOKEN_OWNER_MISMATCH' + description = 'Specified Principal is not valid Owner/Renewer.' + retriable = False + + +class DelegationTokenRequestNotAllowedError(BrokerResponseError): + errno = 64 + message = 'DELEGATION_TOKEN_REQUEST_NOT_ALLOWED' + description = 'Delegation Token requests are not allowed on PLAINTEXT/1-way SSL channels and on delegation token authenticated channels.' + retriable = False + + +class DelegationTokenAuthorizationFailedError(AuthorizationError): + errno = 65 + message = 'DELEGATION_TOKEN_AUTHORIZATION_FAILED' + description = 'Delegation Token authorization failed.' + retriable = False + + +class DelegationTokenExpiredError(BrokerResponseError): + errno = 66 + message = 'DELEGATION_TOKEN_EXPIRED' + description = 'Delegation Token is expired.' + retriable = False + + +class InvalidPrincipalTypeError(BrokerResponseError): + errno = 67 + message = 'INVALID_PRINCIPAL_TYPE' + description = 'Supplied principalType is not supported.' + retriable = False class NonEmptyGroupError(BrokerResponseError): errno = 68 message = 'NON_EMPTY_GROUP' description = 'The group is not empty.' 
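Each broker error class now declares its retriable flag explicitly (and invalid_metadata where a metadata refresh is warranted), so retry decisions can be made from the class alone. An illustrative helper:

import kafka.errors as Errors

def should_retry(error_code):
    """Illustrative only: decide whether to retry purely from the error-class attributes."""
    error_type = Errors.for_code(error_code)
    return getattr(error_type, 'retriable', False)

assert should_retry(14)        # CoordinatorLoadInProgressError -> retriable
assert should_retry(56)        # KafkaStorageError -> retriable (and invalid_metadata)
assert not should_retry(58)    # SaslAuthenticationFailedError -> not retriable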
+ retriable = False class GroupIdNotFoundError(BrokerResponseError): errno = 69 message = 'GROUP_ID_NOT_FOUND' description = 'The group id does not exist.' + retriable = False -class KafkaUnavailableError(KafkaError): - pass +class FetchSessionIdNotFoundError(BrokerResponseError): + errno = 70 + message = 'FETCH_SESSION_ID_NOT_FOUND' + description = 'The fetch session ID was not found.' + retriable = True -class KafkaTimeoutError(KafkaError): - pass +class InvalidFetchSessionEpochError(BrokerResponseError): + errno = 71 + message = 'INVALID_FETCH_SESSION_EPOCH' + description = 'The fetch session epoch is invalid.' + retriable = True + +class ListenerNotFoundError(BrokerResponseError): + errno = 72 + message = 'LISTENER_NOT_FOUND' + description = 'There is no listener on the leader broker that matches the listener on which metadata request was processed.' + retriable = True + invalid_metadata = True -class FailedPayloadsError(KafkaError): - def __init__(self, payload, *args): - super(FailedPayloadsError, self).__init__(*args) - self.payload = payload +class TopicDeletionDisabledError(BrokerResponseError): + errno = 73 + message = 'TOPIC_DELETION_DISABLED' + description = 'Topic deletion is disabled.' + retriable = False -class KafkaConnectionError(KafkaError): + +class FencedLeaderEpochError(BrokerResponseError): + errno = 74 + message = 'FENCED_LEADER_EPOCH' + description = 'The leader epoch in the request is older than the epoch on the broker.' retriable = True invalid_metadata = True -class ProtocolError(KafkaError): - pass +class UnknownLeaderEpochError(BrokerResponseError): + errno = 75 + message = 'UNKNOWN_LEADER_EPOCH' + description = 'The leader epoch in the request is newer than the epoch on the broker.' + retriable = True + invalid_metadata = True -class UnsupportedCodecError(KafkaError): - pass +class UnsupportedCompressionTypeError(BrokerResponseError): + errno = 76 + message = 'UNSUPPORTED_COMPRESSION_TYPE' + description = 'The requesting client does not support the compression type of given partition.' + retriable = False -class KafkaConfigurationError(KafkaError): - pass +class StaleBrokerEpochError(BrokerResponseError): + errno = 77 + message = 'STALE_BROKER_EPOCH' + description = 'Broker epoch has changed.' + retriable = False -class QuotaViolationError(KafkaError): - pass +class OffsetNotAvailableError(BrokerResponseError): + errno = 78 + message = 'OFFSET_NOT_AVAILABLE' + description = 'The leader high watermark has not caught up from a recent leader election so the offsets cannot be guaranteed to be monotonically increasing.' + retriable = True -class AsyncProducerQueueFull(KafkaError): - def __init__(self, failed_msgs, *args): - super(AsyncProducerQueueFull, self).__init__(*args) - self.failed_msgs = failed_msgs +class MemberIdRequiredError(BrokerResponseError): + errno = 79 + message = 'MEMBER_ID_REQUIRED' + description = 'The group member needs to have a valid member id before actually entering a consumer group.' + retriable = False -def _iter_broker_errors(): - for name, obj in inspect.getmembers(sys.modules[__name__]): - if inspect.isclass(obj) and issubclass(obj, BrokerResponseError) and obj != BrokerResponseError: - yield obj +class PreferredLeaderNotAvailableError(BrokerResponseError): + errno = 80 + message = 'PREFERRED_LEADER_NOT_AVAILABLE' + description = 'The preferred leader was not available.' 
+ retriable = True + invalid_metadata = True -kafka_errors = dict([(x.errno, x) for x in _iter_broker_errors()]) +class GroupMaxSizeReachedError(BrokerResponseError): + errno = 81 + message = 'GROUP_MAX_SIZE_REACHED' + description = 'The consumer group has reached its max size.' + retriable = False -def for_code(error_code): - return kafka_errors.get(error_code, UnknownError) +class FencedInstanceIdError(BrokerResponseError): + errno = 82 + message = 'FENCED_INSTANCE_ID' + description = 'The broker rejected this static consumer since another consumer with the same group.instance.id has registered with a different member.id.' + retriable = False + + +class EligibleLeadersNotAvailableError(BrokerResponseError): + errno = 83 + message = 'ELIGIBLE_LEADERS_NOT_AVAILABLE' + description = 'Eligible topic partition leaders are not available.' + retriable = True + invalid_metadata = True + + +class ElectionNotNeededError(BrokerResponseError): + errno = 84 + message = 'ELECTION_NOT_NEEDED' + description = 'Leader election not needed for topic partition.' + retriable = True + invalid_metadata = True + + +class NoReassignmentInProgressError(BrokerResponseError): + errno = 85 + message = 'NO_REASSIGNMENT_IN_PROGRESS' + description = 'No partition reassignment is in progress.' + retriable = False + + +class GroupSubscribedToTopicError(BrokerResponseError): + errno = 86 + message = 'GROUP_SUBSCRIBED_TO_TOPIC' + description = 'Deleting offsets of a topic is forbidden while the consumer group is actively subscribed to it.' + retriable = False + + +class InvalidRecordError(BrokerResponseError): + errno = 87 + message = 'INVALID_RECORD' + description = 'This record has failed the validation on broker and hence will be rejected.' + retriable = False + + +class UnstableOffsetCommitError(BrokerResponseError): + errno = 88 + message = 'UNSTABLE_OFFSET_COMMIT' + description = 'There are unstable offsets that need to be cleared.' + retriable = True + + +class ThrottlingQuotaExceededError(BrokerResponseError): + errno = 89 + message = 'THROTTLING_QUOTA_EXCEEDED' + description = 'The throttling quota has been exceeded.' + retriable = True + + +class ProducerFencedError(BrokerResponseError): + errno = 90 + message = 'PRODUCER_FENCED' + description = 'There is a newer producer with the same transactionalId which fences the current one.' + retriable = False + + +class ResourceNotFoundError(BrokerResponseError): + errno = 91 + message = 'RESOURCE_NOT_FOUND' + description = 'A request illegally referred to a resource that does not exist.' + retriable = False + + +class DuplicateResourceError(BrokerResponseError): + errno = 92 + message = 'DUPLICATE_RESOURCE' + description = 'A request illegally referred to the same resource twice.' + retriable = False + + +class UnacceptableCredentialError(BrokerResponseError): + errno = 93 + message = 'UNACCEPTABLE_CREDENTIAL' + description = 'Requested credential would not meet criteria for acceptability.' + retriable = False + + +class InconsistentVoterSetError(BrokerResponseError): + errno = 94 + message = 'INCONSISTENT_VOTER_SET' + description = 'Indicates that the either the sender or recipient of a voter-only request is not one of the expected voters.' + retriable = False + + +class InvalidUpdateVersionError(BrokerResponseError): + errno = 95 + message = 'INVALID_UPDATE_VERSION' + description = 'The given update version was invalid.' 
+ retriable = False + + +class FeatureUpdateFailedError(BrokerResponseError): + errno = 96 + message = 'FEATURE_UPDATE_FAILED' + description = 'Unable to update finalized features due to an unexpected server error.' + retriable = False + + +class PrincipalDeserializationFailureError(BrokerResponseError): + errno = 97 + message = 'PRINCIPAL_DESERIALIZATION_FAILURE' + description = 'Request principal deserialization failed during forwarding. This indicates an internal error on the broker cluster security setup.' + retriable = False + + +class SnapshotNotFoundError(BrokerResponseError): + errno = 98 + message = 'SNAPSHOT_NOT_FOUND' + description = 'Requested snapshot was not found.' + retriable = False + + +class PositionOutOfRangeError(BrokerResponseError): + errno = 99 + message = 'POSITION_OUT_OF_RANGE' + description = 'Requested position is not greater than or equal to zero, and less than the size of the snapshot.' + retriable = False + + +class UnknownTopicIdError(BrokerResponseError): + errno = 100 + message = 'UNKNOWN_TOPIC_ID' + description = 'This server does not host this topic ID.' + retriable = True + invalid_metadata = True + + +class DuplicateBrokerRegistrationError(BrokerResponseError): + errno = 101 + message = 'DUPLICATE_BROKER_REGISTRATION' + description = 'This broker ID is already in use.' + retriable = False + + +class BrokerIdNotRegisteredError(BrokerResponseError): + errno = 102 + message = 'BROKER_ID_NOT_REGISTERED' + description = 'The given broker ID was not registered.' + retriable = False + + +class InconsistentTopicIdError(BrokerResponseError): + errno = 103 + message = 'INCONSISTENT_TOPIC_ID' + description = 'The log\'s topic ID did not match the topic ID in the request.' + retriable = True + invalid_metadata = True -def check_error(response): - if isinstance(response, Exception): - raise response - if response.error: - error_class = kafka_errors.get(response.error, UnknownError) - raise error_class(response) +class InconsistentClusterIdError(BrokerResponseError): + errno = 104 + message = 'INCONSISTENT_CLUSTER_ID' + description = 'The clusterId in the request does not match that found on the server.' + retriable = False + + +class TransactionalIdNotFoundError(BrokerResponseError): + errno = 105 + message = 'TRANSACTIONAL_ID_NOT_FOUND' + description = 'The transactionalId could not be found.' + retriable = False + + +class FetchSessionTopicIdError(BrokerResponseError): + errno = 106 + message = 'FETCH_SESSION_TOPIC_ID_ERROR' + description = 'The fetch session encountered inconsistent topic ID usage.' + retriable = True + + +class IneligibleReplicaError(BrokerResponseError): + errno = 107 + message = 'INELIGIBLE_REPLICA' + description = 'The new ISR contains at least one ineligible replica.' + retriable = False + + +class NewLeaderElectedError(BrokerResponseError): + errno = 108 + message = 'NEW_LEADER_ELECTED' + description = 'The AlterPartition request successfully updated the partition state but the leader has changed.' + retriable = False + + +class OffsetMovedToTieredStorageError(BrokerResponseError): + errno = 109 + message = 'OFFSET_MOVED_TO_TIERED_STORAGE' + description = 'The requested offset is moved to tiered storage.' + retriable = False + + +class FencedMemberEpochError(BrokerResponseError): + errno = 110 + message = 'FENCED_MEMBER_EPOCH' + description = 'The member epoch is fenced by the group coordinator. The member must abandon all its partitions and rejoin.' 
+ retriable = False + + +class UnreleasedInstanceIdError(BrokerResponseError): + errno = 111 + message = 'UNRELEASED_INSTANCE_ID' + description = 'The instance ID is still used by another member in the consumer group. That member must leave first.' + retriable = False + + +class UnsupportedAssignorError(BrokerResponseError): + errno = 112 + message = 'UNSUPPORTED_ASSIGNOR' + description = 'The assignor or its version range is not supported by the consumer group.' + retriable = False + + +class StaleMemberEpochError(BrokerResponseError): + errno = 113 + message = 'STALE_MEMBER_EPOCH' + description = 'The member epoch is stale. The member must retry after receiving its updated member epoch via the ConsumerGroupHeartbeat API.' + retriable = False + + +class MismatchedEndpointTypeError(BrokerResponseError): + errno = 114 + message = 'MISMATCHED_ENDPOINT_TYPE' + description = 'The request was sent to an endpoint of the wrong type.' + retriable = False + + +class UnsupportedEndpointTypeError(BrokerResponseError): + errno = 115 + message = 'UNSUPPORTED_ENDPOINT_TYPE' + description = 'This endpoint type is not supported yet.' + retriable = False + + +class UnknownControllerIdError(BrokerResponseError): + errno = 116 + message = 'UNKNOWN_CONTROLLER_ID' + description = 'This controller ID is not known.' + retriable = False + +class UnknownSubscriptionIdError(BrokerResponseError): + errno = 117 + message = 'UNKNOWN_SUBSCRIPTION_ID' + description = 'Client sent a push telemetry request with an invalid or outdated subscription ID.' + retriable = False + + +class TelemetryTooLargeError(BrokerResponseError): + errno = 118 + message = 'TELEMETRY_TOO_LARGE' + description = 'Client sent a push telemetry request larger than the maximum size the broker will accept.' + retriable = False + + +class InvalidRegistrationError(BrokerResponseError): + errno = 119 + message = 'INVALID_REGISTRATION' + description = 'The controller has considered the broker registration to be invalid.' + retriable = False + + +class TransactionAbortableError(BrokerResponseError): + errno = 120 + message = 'TRANSACTION_ABORTABLE' + description = 'The server encountered an error with the transaction. The client can abort the transaction to continue using this transactional ID.' + retriable = False + + +class InvalidRecordStateError(BrokerResponseError): + errno = 121 + message = 'INVALID_RECORD_STATE' + description = 'The record state is invalid. The acknowledgement of delivery could not be completed.' + retriable = False + + +class ShareSessionNotFoundError(BrokerResponseError): + errno = 122 + message = 'SHARE_SESSION_NOT_FOUND' + description = 'The share session was not found.' + retriable = True + + +class InvalidShareSessionEpochError(BrokerResponseError): + errno = 123 + message = 'INVALID_SHARE_SESSION_EPOCH' + description = 'The share session epoch is invalid.' + retriable = True -RETRY_BACKOFF_ERROR_TYPES = ( - KafkaUnavailableError, LeaderNotAvailableError, - KafkaConnectionError, FailedPayloadsError -) + +class FencedStateEpochError(BrokerResponseError): + errno = 124 + message = 'FENCED_STATE_EPOCH' + description = 'The share coordinator rejected the request because the share-group state epoch did not match.' + retriable = False + + +class InvalidVoterKeyError(BrokerResponseError): + errno = 125 + message = 'INVALID_VOTER_KEY' + description = 'The voter key doesn\'t match the receiving replica\'s key.' 
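The error table now extends through errno 127, and for_code() (redefined at the end of this module in this change) no longer collapses unknown codes into a bare UnknownError: it synthesizes a subclass that preserves the original errno. A short illustration:

import kafka.errors as Errors

err_cls = Errors.for_code(9999)                  # a code this client does not know about
print(err_cls.__name__)                          # 'UnrecognizedBrokerError'
print(issubclass(err_cls, Errors.UnknownError))  # True -- still treated as unknown
print(err_cls.errno)                             # 9999 -- the broker's code is preserved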
+ retriable = False -RETRY_REFRESH_ERROR_TYPES = ( - NotLeaderForPartitionError, UnknownTopicOrPartitionError, - LeaderNotAvailableError, KafkaConnectionError -) +class DuplicateVoterError(BrokerResponseError): + errno = 126 + message = 'DUPLICATE_VOTER' + description = 'The voter is already part of the set of voters.' + retriable = False + +class VoterNotFoundError(BrokerResponseError): + errno = 127 + message = 'VOTER_NOT_FOUND' + description = 'The voter is not part of the set of voters.' + retriable = False -RETRY_ERROR_TYPES = RETRY_BACKOFF_ERROR_TYPES + RETRY_REFRESH_ERROR_TYPES + +def _iter_broker_errors(): + for name, obj in inspect.getmembers(sys.modules[__name__]): + if inspect.isclass(obj) and issubclass(obj, BrokerResponseError) and obj != BrokerResponseError: + yield obj + + +kafka_errors = dict([(x.errno, x) for x in _iter_broker_errors()]) + + +def for_code(error_code): + if error_code in kafka_errors: + return kafka_errors[error_code] + else: + # The broker error code was not found in our list. This can happen when connecting + # to a newer broker (with new error codes), or simply because our error list is + # not complete. + # + # To avoid dropping the error code, create a dynamic error class w/ errno override. + return type('UnrecognizedBrokerError', (UnknownError,), {'errno': error_code}) diff --git a/kafka/future.py b/kafka/future.py index d0f3c6658..2af061ee7 100644 --- a/kafka/future.py +++ b/kafka/future.py @@ -2,6 +2,7 @@ import functools import logging +import threading log = logging.getLogger(__name__) @@ -15,6 +16,7 @@ def __init__(self): self.exception = None self._callbacks = [] self._errbacks = [] + self._lock = threading.Lock() def succeeded(self): return self.is_done and not bool(self.exception) @@ -30,37 +32,46 @@ def retriable(self): def success(self, value): assert not self.is_done, 'Future is already complete' - self.value = value - self.is_done = True + with self._lock: + self.value = value + self.is_done = True if self._callbacks: self._call_backs('callback', self._callbacks, self.value) return self def failure(self, e): assert not self.is_done, 'Future is already complete' - self.exception = e if type(e) is not type else e() - assert isinstance(self.exception, BaseException), ( + exception = e if type(e) is not type else e() + assert isinstance(exception, BaseException), ( 'future failed without an exception') - self.is_done = True + with self._lock: + self.exception = exception + self.is_done = True self._call_backs('errback', self._errbacks, self.exception) return self def add_callback(self, f, *args, **kwargs): if args or kwargs: f = functools.partial(f, *args, **kwargs) - if self.is_done and not self.exception: - self._call_backs('callback', [f], self.value) - else: - self._callbacks.append(f) + with self._lock: + if not self.is_done: + self._callbacks.append(f) + elif self.succeeded(): + self._lock.release() + self._call_backs('callback', [f], self.value) + self._lock.acquire() return self def add_errback(self, f, *args, **kwargs): if args or kwargs: f = functools.partial(f, *args, **kwargs) - if self.is_done and self.exception: - self._call_backs('errback', [f], self.exception) - else: - self._errbacks.append(f) + with self._lock: + if not self.is_done: + self._errbacks.append(f) + elif self.failed(): + self._lock.release() + self._call_backs('errback', [f], self.exception) + self._lock.acquire() return self def add_both(self, f, *args, **kwargs): diff --git a/kafka/metrics/compound_stat.py b/kafka/metrics/compound_stat.py index 
ac92480dc..f5b482da2 100644 --- a/kafka/metrics/compound_stat.py +++ b/kafka/metrics/compound_stat.py @@ -3,16 +3,16 @@ import abc from kafka.metrics.stat import AbstractStat +from kafka.vendor.six import add_metaclass +@add_metaclass(abc.ABCMeta) class AbstractCompoundStat(AbstractStat): """ A compound stat is a stat where a single measurement and associated data structure feeds many metrics. This is the example for a histogram which has many associated percentiles. """ - __metaclass__ = abc.ABCMeta - def stats(self): """ Return list of NamedMeasurable @@ -21,6 +21,8 @@ def stats(self): class NamedMeasurable(object): + __slots__ = ('_name', '_stat') + def __init__(self, metric_name, measurable_stat): self._name = metric_name self._stat = measurable_stat diff --git a/kafka/metrics/kafka_metric.py b/kafka/metrics/kafka_metric.py index 9fb8d89f1..fef684850 100644 --- a/kafka/metrics/kafka_metric.py +++ b/kafka/metrics/kafka_metric.py @@ -4,6 +4,8 @@ class KafkaMetric(object): + __slots__ = ('_metric_name', '_measurable', '_config') + # NOTE java constructor takes a lock instance def __init__(self, metric_name, measurable, config): if not metric_name: @@ -33,4 +35,4 @@ def config(self, config): def value(self, time_ms=None): if time_ms is None: time_ms = time.time() * 1000 - return self.measurable.measure(self.config, time_ms) + return self._measurable.measure(self._config, time_ms) diff --git a/kafka/metrics/measurable_stat.py b/kafka/metrics/measurable_stat.py index 4487adf6e..08222b144 100644 --- a/kafka/metrics/measurable_stat.py +++ b/kafka/metrics/measurable_stat.py @@ -4,8 +4,10 @@ from kafka.metrics.measurable import AbstractMeasurable from kafka.metrics.stat import AbstractStat +from kafka.vendor.six import add_metaclass +@add_metaclass(abc.ABCMeta) class AbstractMeasurableStat(AbstractStat, AbstractMeasurable): """ An AbstractMeasurableStat is an AbstractStat that is also @@ -13,4 +15,3 @@ class AbstractMeasurableStat(AbstractStat, AbstractMeasurable): This is the interface used for most of the simple statistics such as Avg, Max, Count, etc. 
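The metrics abstract classes switch from the Python-2-only `__metaclass__ = abc.ABCMeta` attribute (which Python 3 silently ignores) to the vendored six.add_metaclass decorator, which enforces abstractness on both interpreters. The pattern in isolation:

import abc
from kafka.vendor.six import add_metaclass

@add_metaclass(abc.ABCMeta)
class AbstractExample(object):
    """Same shape as the metrics ABCs in this change."""

    @abc.abstractmethod
    def record(self, value):
        pass

class ConcreteExample(AbstractExample):
    def record(self, value):
        return value

ConcreteExample().record(1)   # fine; AbstractExample() raises TypeError on py2 and py3,
                              # whereas the old __metaclass__ attribute was a no-op on py3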
""" - __metaclass__ = abc.ABCMeta diff --git a/kafka/metrics/metric_config.py b/kafka/metrics/metric_config.py index 2e55abfcb..7e5ead1fe 100644 --- a/kafka/metrics/metric_config.py +++ b/kafka/metrics/metric_config.py @@ -5,6 +5,8 @@ class MetricConfig(object): """Configuration values for metrics""" + __slots__ = ('quota', '_samples', 'event_window', 'time_window_ms', 'tags') + def __init__(self, quota=None, samples=2, event_window=sys.maxsize, time_window_ms=30 * 1000, tags=None): """ diff --git a/kafka/metrics/metric_name.py b/kafka/metrics/metric_name.py index b5acd1662..b8ab2a3ad 100644 --- a/kafka/metrics/metric_name.py +++ b/kafka/metrics/metric_name.py @@ -38,6 +38,7 @@ class MetricName(object): # as messages are sent we record the sizes sensor.record(message_size) """ + __slots__ = ('_name', '_group', '_description', '_tags', '_hash') def __init__(self, name, group, description=None, tags=None): """ @@ -93,7 +94,7 @@ def __eq__(self, other): return True if other is None: return False - return (type(self) == type(other) and + return (isinstance(self, type(other)) and self.group == other.group and self.name == other.name and self.tags == other.tags) diff --git a/kafka/metrics/metrics.py b/kafka/metrics/metrics.py index 2c53488ff..41a37db58 100644 --- a/kafka/metrics/metrics.py +++ b/kafka/metrics/metrics.py @@ -55,10 +55,11 @@ def __init__(self, default_config=None, reporters=None, self._reporters = reporters or [] for reporter in self._reporters: reporter.init([]) + self._closed = False if enable_expiration: def expire_loop(): - while True: + while not self._closed: # delay 30 seconds time.sleep(30) self.ExpireSensorTask.run(self) @@ -259,3 +260,4 @@ def close(self): reporter.close() self._metrics.clear() + self._closed = True diff --git a/kafka/metrics/metrics_reporter.py b/kafka/metrics/metrics_reporter.py index d8bd12b3b..8df2e9ea6 100644 --- a/kafka/metrics/metrics_reporter.py +++ b/kafka/metrics/metrics_reporter.py @@ -2,14 +2,15 @@ import abc +from kafka.vendor.six import add_metaclass + +@add_metaclass(abc.ABCMeta) class AbstractMetricsReporter(object): """ An abstract class to allow things to listen as new metrics are created so they can be reported. 
""" - __metaclass__ = abc.ABCMeta - @abc.abstractmethod def init(self, metrics): """ diff --git a/kafka/metrics/quota.py b/kafka/metrics/quota.py index 4d1b0d6cb..36a30c44e 100644 --- a/kafka/metrics/quota.py +++ b/kafka/metrics/quota.py @@ -3,6 +3,8 @@ class Quota(object): """An upper or lower bound for metrics""" + __slots__ = ('_bound', '_upper') + def __init__(self, bound, is_upper): self._bound = bound self._upper = is_upper @@ -34,7 +36,7 @@ def __hash__(self): def __eq__(self, other): if self is other: return True - return (type(self) == type(other) and + return (isinstance(self, type(other)) and self.bound == other.bound and self.is_upper_bound() == other.is_upper_bound()) diff --git a/kafka/metrics/stat.py b/kafka/metrics/stat.py index 9fd2f01ec..8825d2783 100644 --- a/kafka/metrics/stat.py +++ b/kafka/metrics/stat.py @@ -2,14 +2,15 @@ import abc +from kafka.vendor.six import add_metaclass + +@add_metaclass(abc.ABCMeta) class AbstractStat(object): """ An AbstractStat is a quantity such as average, max, etc that is computed off the stream of updates to a sensor """ - __metaclass__ = abc.ABCMeta - @abc.abstractmethod def record(self, config, value, time_ms): """ diff --git a/kafka/metrics/stats/avg.py b/kafka/metrics/stats/avg.py index cfbaec309..906d95573 100644 --- a/kafka/metrics/stats/avg.py +++ b/kafka/metrics/stats/avg.py @@ -7,6 +7,8 @@ class Avg(AbstractSampledStat): """ An AbstractSampledStat that maintains a simple average over its samples. """ + __slots__ = ('_initial_value', '_samples', '_current') + def __init__(self): super(Avg, self).__init__(0.0) diff --git a/kafka/metrics/stats/count.py b/kafka/metrics/stats/count.py index 6e0a2d545..6cd6d2abe 100644 --- a/kafka/metrics/stats/count.py +++ b/kafka/metrics/stats/count.py @@ -7,6 +7,8 @@ class Count(AbstractSampledStat): """ An AbstractSampledStat that maintains a simple count of what it has seen. 
""" + __slots__ = ('_initial_value', '_samples', '_current') + def __init__(self): super(Count, self).__init__(0.0) diff --git a/kafka/metrics/stats/histogram.py b/kafka/metrics/stats/histogram.py index ecc6c9db4..2c8afbfb3 100644 --- a/kafka/metrics/stats/histogram.py +++ b/kafka/metrics/stats/histogram.py @@ -4,6 +4,8 @@ class Histogram(object): + __slots__ = ('_hist', '_count', '_bin_scheme') + def __init__(self, bin_scheme): self._hist = [0.0] * bin_scheme.bins self._count = 0.0 @@ -40,6 +42,8 @@ def __str__(self): return '{%s}' % ','.join(values) class ConstantBinScheme(object): + __slots__ = ('_min', '_max', '_bins', '_bucket_width') + def __init__(self, bins, min_val, max_val): if bins < 2: raise ValueError('Must have at least 2 bins.') @@ -69,6 +73,8 @@ def to_bin(self, x): return int(((x - self._min) / self._bucket_width) + 1) class LinearBinScheme(object): + __slots__ = ('_bins', '_max', '_scale') + def __init__(self, num_bins, max_val): self._bins = num_bins self._max = max_val diff --git a/kafka/metrics/stats/max_stat.py b/kafka/metrics/stats/max_stat.py index 08aebddfd..9c5eeb6fd 100644 --- a/kafka/metrics/stats/max_stat.py +++ b/kafka/metrics/stats/max_stat.py @@ -5,6 +5,8 @@ class Max(AbstractSampledStat): """An AbstractSampledStat that gives the max over its samples.""" + __slots__ = ('_initial_value', '_samples', '_current') + def __init__(self): super(Max, self).__init__(float('-inf')) diff --git a/kafka/metrics/stats/min_stat.py b/kafka/metrics/stats/min_stat.py index 072106d8a..6bebe57e0 100644 --- a/kafka/metrics/stats/min_stat.py +++ b/kafka/metrics/stats/min_stat.py @@ -7,6 +7,8 @@ class Min(AbstractSampledStat): """An AbstractSampledStat that gives the min over its samples.""" + __slots__ = ('_initial_value', '_samples', '_current') + def __init__(self): super(Min, self).__init__(float(sys.maxsize)) diff --git a/kafka/metrics/stats/percentile.py b/kafka/metrics/stats/percentile.py index 3a86a84a9..75e64ce5e 100644 --- a/kafka/metrics/stats/percentile.py +++ b/kafka/metrics/stats/percentile.py @@ -2,6 +2,8 @@ class Percentile(object): + __slots__ = ('_metric_name', '_percentile') + def __init__(self, metric_name, percentile): self._metric_name = metric_name self._percentile = float(percentile) diff --git a/kafka/metrics/stats/percentiles.py b/kafka/metrics/stats/percentiles.py index 6d702e80f..c36543ffa 100644 --- a/kafka/metrics/stats/percentiles.py +++ b/kafka/metrics/stats/percentiles.py @@ -13,6 +13,9 @@ class BucketSizing(object): class Percentiles(AbstractSampledStat, AbstractCompoundStat): """A compound stat that reports one or more percentiles""" + __slots__ = ('_initial_value', '_samples', '_current', + '_percentiles', '_buckets', '_bin_scheme') + def __init__(self, size_in_bytes, bucketing, max_val, min_val=0.0, percentiles=None): super(Percentiles, self).__init__(0.0) diff --git a/kafka/metrics/stats/rate.py b/kafka/metrics/stats/rate.py index 68393fbf7..4d0ba0f27 100644 --- a/kafka/metrics/stats/rate.py +++ b/kafka/metrics/stats/rate.py @@ -37,6 +37,8 @@ class Rate(AbstractMeasurableStat): occurrences (e.g. the count of values measured over the time interval) or other such values. 
""" + __slots__ = ('_stat', '_unit') + def __init__(self, time_unit=TimeUnit.SECONDS, sampled_stat=None): self._stat = sampled_stat or SampledTotal() self._unit = time_unit @@ -105,6 +107,7 @@ def convert(self, time_ms): class SampledTotal(AbstractSampledStat): + __slots__ = ('_initial_value', '_samples', '_current') def __init__(self, initial_value=None): if initial_value is not None: raise ValueError('initial_value cannot be set on SampledTotal') diff --git a/kafka/metrics/stats/sampled_stat.py b/kafka/metrics/stats/sampled_stat.py index c41b14bbc..fe8970dbf 100644 --- a/kafka/metrics/stats/sampled_stat.py +++ b/kafka/metrics/stats/sampled_stat.py @@ -3,8 +3,10 @@ import abc from kafka.metrics.measurable_stat import AbstractMeasurableStat +from kafka.vendor.six import add_metaclass +@add_metaclass(abc.ABCMeta) class AbstractSampledStat(AbstractMeasurableStat): """ An AbstractSampledStat records a single scalar value measured over @@ -20,7 +22,7 @@ class AbstractSampledStat(AbstractMeasurableStat): Subclasses of this class define different statistics measured using this basic pattern. """ - __metaclass__ = abc.ABCMeta + __slots__ = ('_initial_value', '_samples', '_current') def __init__(self, initial_value): self._initial_value = initial_value diff --git a/kafka/metrics/stats/sensor.py b/kafka/metrics/stats/sensor.py index 571723f97..9f7ac45f5 100644 --- a/kafka/metrics/stats/sensor.py +++ b/kafka/metrics/stats/sensor.py @@ -15,6 +15,10 @@ class Sensor(object): the `record(double)` api and would maintain a set of metrics about request sizes such as the average or max. """ + __slots__ = ('_lock', '_registry', '_name', '_parents', '_metrics', + '_stats', '_config', '_inactive_sensor_expiration_time_ms', + '_last_record_time') + def __init__(self, registry, name, parents, config, inactive_sensor_expiration_time_seconds): if not name: diff --git a/kafka/metrics/stats/total.py b/kafka/metrics/stats/total.py index 5b3bb87fd..a78e99733 100644 --- a/kafka/metrics/stats/total.py +++ b/kafka/metrics/stats/total.py @@ -5,6 +5,8 @@ class Total(AbstractMeasurableStat): """An un-windowed cumulative total maintained over all time.""" + __slots__ = ('_total') + def __init__(self, value=0.0): self._total = value diff --git a/kafka/oauth/__init__.py b/kafka/oauth/__init__.py deleted file mode 100644 index 8c8349564..000000000 --- a/kafka/oauth/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from __future__ import absolute_import - -from kafka.oauth.abstract import AbstractTokenProvider diff --git a/kafka/oauth/abstract.py b/kafka/oauth/abstract.py deleted file mode 100644 index 8d89ff51d..000000000 --- a/kafka/oauth/abstract.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import absolute_import - -import abc - -# This statement is compatible with both Python 2.7 & 3+ -ABC = abc.ABCMeta('ABC', (object,), {'__slots__': ()}) - -class AbstractTokenProvider(ABC): - """ - A Token Provider must be used for the SASL OAuthBearer protocol. - - The implementation should ensure token reuse so that multiple - calls at connect time do not create multiple tokens. The implementation - should also periodically refresh the token in order to guarantee - that each call returns an unexpired token. A timeout error should - be returned after a short period of inactivity so that the - broker can log debugging info and retry. 
- - Token Providers MUST implement the token() method - """ - - def __init__(self, **config): - pass - - @abc.abstractmethod - def token(self): - """ - Returns a (str) ID/Access Token to be sent to the Kafka - client. - """ - pass - - def extensions(self): - """ - This is an OPTIONAL method that may be implemented. - - Returns a map of key-value pairs that can - be sent with the SASL/OAUTHBEARER initial client request. If - not implemented, the values are ignored. This feature is only available - in Kafka >= 2.1.0. - """ - return {} diff --git a/kafka/producer/buffer.py b/kafka/producer/buffer.py deleted file mode 100644 index 100801700..000000000 --- a/kafka/producer/buffer.py +++ /dev/null @@ -1,115 +0,0 @@ -from __future__ import absolute_import, division - -import collections -import io -import threading -import time - -from kafka.metrics.stats import Rate - -import kafka.errors as Errors - - -class SimpleBufferPool(object): - """A simple pool of BytesIO objects with a weak memory ceiling.""" - def __init__(self, memory, poolable_size, metrics=None, metric_group_prefix='producer-metrics'): - """Create a new buffer pool. - - Arguments: - memory (int): maximum memory that this buffer pool can allocate - poolable_size (int): memory size per buffer to cache in the free - list rather than deallocating - """ - self._poolable_size = poolable_size - self._lock = threading.RLock() - - buffers = int(memory / poolable_size) if poolable_size else 0 - self._free = collections.deque([io.BytesIO() for _ in range(buffers)]) - - self._waiters = collections.deque() - self.wait_time = None - if metrics: - self.wait_time = metrics.sensor('bufferpool-wait-time') - self.wait_time.add(metrics.metric_name( - 'bufferpool-wait-ratio', metric_group_prefix, - 'The fraction of time an appender waits for space allocation.'), - Rate()) - - def allocate(self, size, max_time_to_block_ms): - """ - Allocate a buffer of the given size. This method blocks if there is not - enough memory and the buffer pool is configured with blocking mode. - - Arguments: - size (int): The buffer size to allocate in bytes [ignored] - max_time_to_block_ms (int): The maximum time in milliseconds to - block for buffer memory to be available - - Returns: - io.BytesIO - """ - with self._lock: - # check if we have a free buffer of the right size pooled - if self._free: - return self._free.popleft() - - elif self._poolable_size == 0: - return io.BytesIO() - - else: - # we are out of buffers and will have to block - buf = None - more_memory = threading.Condition(self._lock) - self._waiters.append(more_memory) - # loop over and over until we have a buffer or have reserved - # enough memory to allocate one - while buf is None: - start_wait = time.time() - more_memory.wait(max_time_to_block_ms / 1000.0) - end_wait = time.time() - if self.wait_time: - self.wait_time.record(end_wait - start_wait) - - if self._free: - buf = self._free.popleft() - else: - self._waiters.remove(more_memory) - raise Errors.KafkaTimeoutError( - "Failed to allocate memory within the configured" - " max blocking time") - - # remove the condition for this thread to let the next thread - # in line start getting memory - removed = self._waiters.popleft() - assert removed is more_memory, 'Wrong condition' - - # signal any additional waiters if there is more memory left - # over for them - if self._free and self._waiters: - self._waiters[0].notify() - - # unlock and return the buffer - return buf - - def deallocate(self, buf): - """ - Return buffers to the pool. 
If they are of the poolable size add them - to the free list, otherwise just mark the memory as free. - - Arguments: - buffer_ (io.BytesIO): The buffer to return - """ - with self._lock: - # BytesIO.truncate here makes the pool somewhat pointless - # but we stick with the BufferPool API until migrating to - # bytesarray / memoryview. The buffer we return must not - # expose any prior data on read(). - buf.truncate(0) - self._free.append(buf) - if self._waiters: - self._waiters[0].notify() - - def queued(self): - """The number of threads blocked waiting on memory.""" - with self._lock: - return len(self._waiters) diff --git a/kafka/producer/future.py b/kafka/producer/future.py index 07fa4adb4..f67db0979 100644 --- a/kafka/producer/future.py +++ b/kafka/producer/future.py @@ -38,7 +38,7 @@ def __init__(self, produce_future, relative_offset, timestamp_ms, checksum, seri produce_future.add_errback(self.failure) def _produce_success(self, offset_and_timestamp): - offset, produce_timestamp_ms, log_start_offset = offset_and_timestamp + offset, produce_timestamp_ms = offset_and_timestamp # Unpacking from args tuple is minor speed optimization (relative_offset, timestamp_ms, checksum, @@ -51,7 +51,7 @@ def _produce_success(self, offset_and_timestamp): if offset != -1 and relative_offset is not None: offset += relative_offset tp = self._produce_future.topic_partition - metadata = RecordMetadata(tp[0], tp[1], tp, offset, timestamp_ms, log_start_offset, + metadata = RecordMetadata(tp[0], tp[1], tp, offset, timestamp_ms, checksum, serialized_key_size, serialized_value_size, serialized_header_size) self.success(metadata) @@ -67,5 +67,5 @@ def get(self, timeout=None): RecordMetadata = collections.namedtuple( - 'RecordMetadata', ['topic', 'partition', 'topic_partition', 'offset', 'timestamp', 'log_start_offset', + 'RecordMetadata', ['topic', 'partition', 'topic_partition', 'offset', 'timestamp', 'checksum', 'serialized_key_size', 'serialized_value_size', 'serialized_header_size']) diff --git a/kafka/producer/kafka.py b/kafka/producer/kafka.py index cde26b008..66208bbe1 100644 --- a/kafka/producer/kafka.py +++ b/kafka/producer/kafka.py @@ -1,11 +1,11 @@ -from __future__ import absolute_import +from __future__ import absolute_import, division import atexit import copy import logging import socket import threading -import time +import warnings import weakref from kafka.vendor import six @@ -18,10 +18,12 @@ from kafka.producer.future import FutureRecordMetadata, FutureProduceResult from kafka.producer.record_accumulator import AtomicInteger, RecordAccumulator from kafka.producer.sender import Sender +from kafka.producer.transaction_manager import TransactionManager from kafka.record.default_records import DefaultRecordBatchBuilder from kafka.record.legacy_records import LegacyRecordBatchBuilder from kafka.serializer import Serializer from kafka.structs import TopicPartition +from kafka.util import Timer, ensure_valid_topic_name log = logging.getLogger(__name__) @@ -34,8 +36,8 @@ class KafkaProducer(object): The producer is thread safe and sharing a single producer instance across threads will generally be faster than having multiple instances. 
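The RecordMetadata namedtuple above (now without `log_start_offset`) is what a caller receives after waiting on the future returned by `send()`. A minimal sketch of that round trip, assuming a reachable broker at localhost:9092 and an illustrative topic name:

from kafka import KafkaProducer

# Assumes a broker at localhost:9092; 'example-topic' is illustrative.
producer = KafkaProducer(bootstrap_servers='localhost:9092')

future = producer.send('example-topic', key=b'key', value=b'value')
metadata = future.get(timeout=10)  # blocks until the broker acks, or raises KafkaTimeoutError

# Fields taken from the RecordMetadata namedtuple defined above.
print(metadata.topic, metadata.partition, metadata.offset, metadata.timestamp)

producer.close()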
- The producer consists of a pool of buffer space that holds records that - haven't yet been transmitted to the server as well as a background I/O + The producer consists of a RecordAccumulator which holds records that + haven't yet been transmitted to the server, and a Sender background I/O thread that is responsible for turning these records into requests and transmitting them to the cluster. @@ -71,14 +73,50 @@ class KafkaProducer(object): can lead to fewer, more efficient requests when not under maximal load at the cost of a small amount of latency. - The buffer_memory controls the total amount of memory available to the - producer for buffering. If records are sent faster than they can be - transmitted to the server then this buffer space will be exhausted. When - the buffer space is exhausted additional send calls will block. - The key_serializer and value_serializer instruct how to turn the key and value objects the user provides into bytes. + From Kafka 0.11, the KafkaProducer supports two additional modes: + the idempotent producer and the transactional producer. + The idempotent producer strengthens Kafka's delivery semantics from + at least once to exactly once delivery. In particular, producer retries + will no longer introduce duplicates. The transactional producer allows an + application to send messages to multiple partitions (and topics!) + atomically. + + To enable idempotence, the `enable_idempotence` configuration must be set + to True. If set, the `retries` config will default to `float('inf')` and + the `acks` config will default to 'all'. There are no API changes for the + idempotent producer, so existing applications will not need to be modified + to take advantage of this feature. + + To take advantage of the idempotent producer, it is imperative to avoid + application level re-sends since these cannot be de-duplicated. As such, if + an application enables idempotence, it is recommended to leave the + `retries` config unset, as it will be defaulted to `float('inf')`. + Additionally, if a :meth:`~kafka.KafkaProducer.send` returns an error even + with infinite retries (for instance if the message expires in the buffer + before being sent), then it is recommended to shut down the producer and + check the contents of the last produced message to ensure that it is not + duplicated. Finally, the producer can only guarantee idempotence for + messages sent within a single session. + + To use the transactional producer and the attendant APIs, you must set the + `transactional_id` configuration property. If the `transactional_id` is + set, idempotence is automatically enabled along with the producer configs + which idempotence depends on. Further, topics which are included in + transactions should be configured for durability. In particular, the + `replication.factor` should be at least `3`, and the `min.insync.replicas` + for these topics should be set to 2. Finally, in order for transactional + guarantees to be realized from end-to-end, the consumers must be + configured to read only committed messages as well. + + The purpose of the `transactional_id` is to enable transaction recovery + across multiple sessions of a single producer instance. It would typically + be derived from the shard identifier in a partitioned, stateful, + application. As such, it should be unique to each producer instance running + within a partitioned application. 
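The docstring text above maps onto a fairly small API surface. A sketch of a transactional send, assuming brokers >= 0.11 at localhost:9092 and an illustrative `transactional_id` (setting it enables idempotence automatically, per the configuration handling further down):

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    transactional_id='my-app-shard-0',  # illustrative; should be unique per producer instance
)
producer.init_transactions()  # must be called once before any other transactional method

producer.begin_transaction()
try:
    producer.send('topic-a', value=b'first')
    producer.send('topic-b', value=b'second')
    producer.commit_transaction()
except Exception:
    producer.abort_transaction()
    raise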
+ Keyword Arguments: bootstrap_servers: 'host[:port]' string (or list of 'host[:port]' strings) that the producer should contact to bootstrap initial @@ -96,6 +134,28 @@ class KafkaProducer(object): value_serializer (callable): used to convert user-supplied message values to bytes. If not None, called as f(value), should return bytes. Default: None. + enable_idempotence (bool): When set to True, the producer will ensure + that exactly one copy of each message is written in the stream. + If False, producer retries due to broker failures, etc., may write + duplicates of the retried message in the stream. Default: False. + + Note that enabling idempotence requires + `max_in_flight_requests_per_connection` to be set to 1 and `retries` + cannot be zero. Additionally, `acks` must be set to 'all'. If these + values are left at their defaults, the producer will override the + defaults to be suitable. If the values are set to something + incompatible with the idempotent producer, a KafkaConfigurationError + will be raised. + delivery_timeout_ms (float): An upper bound on the time to report success + or failure after producer.send() returns. This limits the total time + that a record will be delayed prior to sending, the time to await + acknowledgement from the broker (if expected), and the time allowed + for retriable send failures. The producer may report failure to send + a record earlier than this config if either an unrecoverable error is + encountered, the retries have been exhausted, or the record is added + to a batch which reached an earlier delivery expiration deadline. + The value of this config should be greater than or equal to the + sum of (request_timeout_ms + linger_ms). Default: 120000. acks (0, 1, 'all'): The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the durability of records that are sent. The @@ -123,7 +183,7 @@ class KafkaProducer(object): Compression is of full batches of data, so the efficacy of batching will also impact the compression ratio (more batching means better compression). Default: None. - retries (int): Setting a value greater than zero will cause the client + retries (numeric): Setting a value greater than zero will cause the client to resend any record whose send fails with a potentially transient error. Note that this retry is no different than if the client resent the record upon receiving the error. Allowing retries @@ -131,8 +191,12 @@ class KafkaProducer(object): potentially change the ordering of records because if two batches are sent to a single partition, and the first fails and is retried but the second succeeds, then the records in the second batch may - appear first. - Default: 0. + appear first. Note additionally that produce requests will be + failed before the number of retries has been exhausted if the timeout + configured by delivery_timeout_ms expires first before successful + acknowledgement. Users should generally prefer to leave this config + unset and instead use delivery_timeout_ms to control retry behavior. + Default: float('inf') (infinite) batch_size (int): Requests sent to brokers will contain multiple batches, one for each partition with data available to be sent. A small batch size will make batching less common and may reduce @@ -165,12 +229,6 @@ class KafkaProducer(object): messages with the same key are assigned to the same partition. 
When a key is None, the message is delivered to a random partition (filtered to partitions with available leaders only, if possible). - buffer_memory (int): The total bytes of memory the producer should use - to buffer records waiting to be sent to the server. If records are - sent faster than they can be delivered to the server the producer - will block up to max_block_ms, raising an exception on timeout. - In the current implementation, this setting is an approximation. - Default: 33554432 (32MB) connections_max_idle_ms: Close idle connections after the number of milliseconds specified by this config. The broker closes idle connections after connections.max.idle.ms, so this avoids hitting @@ -188,6 +246,9 @@ class KafkaProducer(object): This setting will limit the number of record batches the producer will send in a single request to avoid sending huge requests. Default: 1048576. + allow_auto_create_topics (bool): Enable/disable auto topic creation + on metadata request. Only available with api_version >= (0, 11). + Default: True metadata_max_age_ms (int): The period of time in milliseconds after which we force a refresh of metadata even if we haven't seen any partition leadership changes to proactively discover any new @@ -216,7 +277,7 @@ class KafkaProducer(object): reconnection attempts will continue periodically with this fixed rate. To avoid connection storms, a randomization factor of 0.2 will be applied to the backoff resulting in a random range between - 20% below and 20% above the computed value. Default: 1000. + 20% below and 20% above the computed value. Default: 30000. max_in_flight_requests_per_connection (int): Requests are pipelined to kafka brokers up to this number of maximum requests per broker connection. Note that if this setting is set to be greater @@ -233,7 +294,7 @@ class KafkaProducer(object): should verify that the certificate matches the brokers hostname. default: true. ssl_cafile (str): optional filename of ca file to use in certificate - veriication. default: none. + verification. default: none. ssl_certfile (str): optional filename of file in pem format containing the client certificate, as well as any ca certificates needed to establish the certificate's authenticity. default: none. @@ -252,14 +313,28 @@ class KafkaProducer(object): or other configuration forbids use of all the specified ciphers), an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers api_version (tuple): Specify which Kafka API version to use. If set to - None, the client will attempt to infer the broker version by probing - various APIs. Example: (0, 10, 2). Default: None + None, the client will attempt to determine the broker version via + ApiVersionsRequest API or, for brokers earlier than 0.10, probing + various known APIs. Dynamic version checking is performed eagerly + during __init__ and can raise NoBrokersAvailableError if no connection + was made before timeout (see api_version_auto_timeout_ms below). + Different versions enable different functionality. + + Examples: + (3, 9) most recent broker release, enable all supported features + (0, 11) enables message format v2 (internal) + (0, 10, 0) enables sasl authentication and message format v1 + (0, 8, 0) enables basic functionality only + + Default: None api_version_auto_timeout_ms (int): number of milliseconds to throw a timeout exception from the constructor when checking the broker api version. Only applies if api_version set to None. + Default: 2000 metric_reporters (list): A list of classes to use as metrics reporters. 
Implementing the AbstractMetricsReporter interface allows plugging in classes that will be notified of new metric creation. Default: [] + metrics_enabled (bool): Whether to track metrics on this instance. Default True. metrics_num_samples (int): The number of samples maintained to compute metrics. Default: 2 metrics_sample_window_ms (int): The maximum age in milliseconds of @@ -274,33 +349,42 @@ class KafkaProducer(object): Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication. Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms. + sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with + sasl mechanism handshake. If provided, sasl_kerberos_service_name and + sasl_kerberos_domain name are ignored. Default: None. sasl_kerberos_service_name (str): Service name to include in GSSAPI sasl mechanism handshake. Default: 'kafka' sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI sasl mechanism handshake. Default: one of bootstrap servers - sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider - instance. (See kafka.oauth.abstract). Default: None + sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer + token provider instance. Default: None + socks5_proxy (str): Socks5 proxy URL. Default: None + kafka_client (callable): Custom class / callable for creating KafkaClient instances Note: Configuration parameters are described in more detail at - https://kafka.apache.org/0100/configuration.html#producerconfigs + https://kafka.apache.org/0100/documentation/#producerconfigs """ DEFAULT_CONFIG = { 'bootstrap_servers': 'localhost', 'client_id': None, 'key_serializer': None, 'value_serializer': None, + 'enable_idempotence': False, + 'transactional_id': None, + 'transaction_timeout_ms': 60000, + 'delivery_timeout_ms': 120000, 'acks': 1, 'bootstrap_topics_filter': set(), 'compression_type': None, - 'retries': 0, + 'retries': float('inf'), 'batch_size': 16384, 'linger_ms': 0, 'partitioner': DefaultPartitioner(), - 'buffer_memory': 33554432, 'connections_max_idle_ms': 9 * 60 * 1000, 'max_block_ms': 60000, 'max_request_size': 1048576, + 'allow_auto_create_topics': True, 'metadata_max_age_ms': 300000, 'retry_backoff_ms': 100, 'request_timeout_ms': 30000, @@ -310,7 +394,7 @@ class KafkaProducer(object): 'sock_chunk_bytes': 4096, # undocumented experimental option 'sock_chunk_buffer_count': 1000, # undocumented experimental option 'reconnect_backoff_ms': 50, - 'reconnect_backoff_max_ms': 1000, + 'reconnect_backoff_max_ms': 30000, 'max_in_flight_requests_per_connection': 5, 'security_protocol': 'PLAINTEXT', 'ssl_context': None, @@ -324,17 +408,23 @@ class KafkaProducer(object): 'api_version': None, 'api_version_auto_timeout_ms': 2000, 'metric_reporters': [], + 'metrics_enabled': True, 'metrics_num_samples': 2, 'metrics_sample_window_ms': 30000, 'selector': selectors.DefaultSelector, 'sasl_mechanism': None, 'sasl_plain_username': None, 'sasl_plain_password': None, + 'sasl_kerberos_name': None, 'sasl_kerberos_service_name': 'kafka', 'sasl_kerberos_domain_name': None, - 'sasl_oauth_token_provider': None + 'sasl_oauth_token_provider': None, + 'socks5_proxy': None, + 'kafka_client': KafkaClient, } + DEPRECATED_CONFIGS = ('buffer_memory',) + _COMPRESSORS = { 'gzip': (has_gzip, LegacyRecordBatchBuilder.CODEC_GZIP), 'snappy': (has_snappy, LegacyRecordBatchBuilder.CODEC_SNAPPY), @@ -344,12 +434,17 @@ class KafkaProducer(object): } 
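For reference, a hedged example of overriding a few of the documented defaults above; the values are illustrative rather than recommendations, and `buffer_memory` is omitted since it now only appears in `DEPRECATED_CONFIGS`:

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers=['broker1:9092', 'broker2:9092'],  # illustrative hosts
    acks='all',
    compression_type='gzip',
    linger_ms=5,
    delivery_timeout_ms=120000,  # should be >= request_timeout_ms + linger_ms
    metrics_enabled=False,       # skips creating the internal Metrics registry
)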
def __init__(self, **configs): - log.debug("Starting the Kafka producer") # trace self.config = copy.copy(self.DEFAULT_CONFIG) + user_provided_configs = set(configs.keys()) for key in self.config: if key in configs: self.config[key] = configs.pop(key) + for key in self.DEPRECATED_CONFIGS: + if key in configs: + configs.pop(key) + warnings.warn('Deprecated Producer config: %s' % (key,), DeprecationWarning) + # Only check for extra config keys in top-level class assert not configs, 'Unrecognized configs: %s' % (configs,) @@ -367,30 +462,35 @@ def __init__(self, **configs): self.config['api_version'] = None else: self.config['api_version'] = tuple(map(int, deprecated.split('.'))) - log.warning('use api_version=%s [tuple] -- "%s" as str is deprecated', - str(self.config['api_version']), deprecated) + log.warning('%s: use api_version=%s [tuple] -- "%s" as str is deprecated', + str(self), str(self.config['api_version']), deprecated) + + log.debug("%s: Starting Kafka producer", str(self)) # Configure metrics - metrics_tags = {'client-id': self.config['client_id']} - metric_config = MetricConfig(samples=self.config['metrics_num_samples'], - time_window_ms=self.config['metrics_sample_window_ms'], - tags=metrics_tags) - reporters = [reporter() for reporter in self.config['metric_reporters']] - self._metrics = Metrics(metric_config, reporters) + if self.config['metrics_enabled']: + metrics_tags = {'client-id': self.config['client_id']} + metric_config = MetricConfig(samples=self.config['metrics_num_samples'], + time_window_ms=self.config['metrics_sample_window_ms'], + tags=metrics_tags) + reporters = [reporter() for reporter in self.config['metric_reporters']] + self._metrics = Metrics(metric_config, reporters) + else: + self._metrics = None - client = KafkaClient(metrics=self._metrics, metric_group_prefix='producer', - wakeup_timeout_ms=self.config['max_block_ms'], - **self.config) + client = self.config['kafka_client']( + metrics=self._metrics, metric_group_prefix='producer', + wakeup_timeout_ms=self.config['max_block_ms'], + **self.config) - # Get auto-discovered version from client if necessary - if self.config['api_version'] is None: - self.config['api_version'] = client.config['api_version'] + # Get auto-discovered / normalized version from client + self.config['api_version'] = client.config['api_version'] if self.config['compression_type'] == 'lz4': assert self.config['api_version'] >= (0, 8, 2), 'LZ4 Requires >= Kafka 0.8.2 Brokers' if self.config['compression_type'] == 'zstd': - assert self.config['api_version'] >= (2, 1, 0), 'Zstd Requires >= Kafka 2.1.0 Brokers' + assert self.config['api_version'] >= (2, 1), 'Zstd Requires >= Kafka 2.1 Brokers' # Check compression_type for library support ct = self.config['compression_type'] @@ -401,12 +501,58 @@ def __init__(self, **configs): assert checker(), "Libraries for {} compression codec not found".format(ct) self.config['compression_attrs'] = compression_attrs - message_version = self._max_usable_produce_magic() - self._accumulator = RecordAccumulator(message_version=message_version, metrics=self._metrics, **self.config) self._metadata = client.cluster + self._transaction_manager = None + self._init_transactions_result = None + if 'enable_idempotence' in user_provided_configs and not self.config['enable_idempotence'] and self.config['transactional_id']: + raise Errors.KafkaConfigurationError("Cannot set transactional_id without enable_idempotence.") + + if self.config['transactional_id']: + self.config['enable_idempotence'] = True + + if 
self.config['enable_idempotence']: + assert self.config['api_version'] >= (0, 11), "Transactional/Idempotent producer requires >= Kafka 0.11 Brokers" + + self._transaction_manager = TransactionManager( + transactional_id=self.config['transactional_id'], + transaction_timeout_ms=self.config['transaction_timeout_ms'], + retry_backoff_ms=self.config['retry_backoff_ms'], + api_version=self.config['api_version'], + metadata=self._metadata, + ) + if self._transaction_manager.is_transactional(): + log.info("%s: Instantiated a transactional producer.", str(self)) + else: + log.info("%s: Instantiated an idempotent producer.", str(self)) + + if self.config['retries'] == 0: + raise Errors.KafkaConfigurationError("Must set 'retries' to non-zero when using the idempotent producer.") + + if 'max_in_flight_requests_per_connection' not in user_provided_configs: + log.info("%s: Overriding the default 'max_in_flight_requests_per_connection' to 1 since idempontence is enabled.", str(self)) + self.config['max_in_flight_requests_per_connection'] = 1 + elif self.config['max_in_flight_requests_per_connection'] != 1: + raise Errors.KafkaConfigurationError("Must set 'max_in_flight_requests_per_connection' to 1 in order" + " to use the idempotent producer." + " Otherwise we cannot guarantee idempotence.") + + if 'acks' not in user_provided_configs: + log.info("%s: Overriding the default 'acks' config to 'all' since idempotence is enabled", str(self)) + self.config['acks'] = -1 + elif self.config['acks'] != -1: + raise Errors.KafkaConfigurationError("Must set 'acks' config to 'all' in order to use the idempotent" + " producer. Otherwise we cannot guarantee idempotence") + + message_version = self.max_usable_produce_magic(self.config['api_version']) + self._accumulator = RecordAccumulator( + transaction_manager=self._transaction_manager, + message_version=message_version, + **self.config) guarantee_message_order = bool(self.config['max_in_flight_requests_per_connection'] == 1) self._sender = Sender(client, self._metadata, - self._accumulator, self._metrics, + self._accumulator, + metrics=self._metrics, + transaction_manager=self._transaction_manager, guarantee_message_order=guarantee_message_order, **self.config) self._sender.daemon = True @@ -415,7 +561,7 @@ def __init__(self, **configs): self._cleanup = self._cleanup_factory() atexit.register(self._cleanup) - log.debug("Kafka producer started") + log.debug("%s: Kafka producer started", str(self)) def bootstrap_connected(self): """Return True if the bootstrap is connected.""" @@ -426,7 +572,7 @@ def _cleanup_factory(self): _self = weakref.proxy(self) def wrapper(): try: - _self.close(timeout=0) + _self.close(timeout=0, null_logger=True) except (ReferenceError, AttributeError): pass return wrapper @@ -449,28 +595,28 @@ def _unregister_cleanup(self): self._cleanup = None def __del__(self): - # Disable logger during destruction to avoid touching dangling references - class NullLogger(object): - def __getattr__(self, name): - return lambda *args: None - - global log - log = NullLogger() + self.close(timeout=1, null_logger=True) - self.close() - - def close(self, timeout=None): + def close(self, timeout=None, null_logger=False): """Close this producer. Arguments: timeout (float, optional): timeout in seconds to wait for completion. 
""" + if null_logger: + # Disable logger during destruction to avoid touching dangling references + class NullLogger(object): + def __getattr__(self, name): + return lambda *args: None + + global log + log = NullLogger() # drop our atexit handler now to avoid leaks self._unregister_cleanup() if not hasattr(self, '_closed') or self._closed: - log.info('Kafka producer closed') + log.info('%s: Kafka producer closed', str(self)) return if timeout is None: # threading.TIMEOUT_MAX is available in Python3.3+ @@ -480,15 +626,16 @@ def close(self, timeout=None): else: assert timeout >= 0 - log.info("Closing the Kafka producer with %s secs timeout.", timeout) + log.info("%s: Closing the Kafka producer with %s secs timeout.", str(self), timeout) + self.flush(timeout) invoked_from_callback = bool(threading.current_thread() is self._sender) if timeout > 0: if invoked_from_callback: - log.warning("Overriding close timeout %s secs to 0 in order to" + log.warning("%s: Overriding close timeout %s secs to 0 in order to" " prevent useless blocking due to self-join. This" " means you have incorrectly invoked close with a" " non-zero timeout from the producer call-back.", - timeout) + str(self), timeout) else: # Try to close gracefully. if self._sender is not None: @@ -496,12 +643,13 @@ def close(self, timeout=None): self._sender.join(timeout) if self._sender is not None and self._sender.is_alive(): - log.info("Proceeding to force close the producer since pending" + log.info("%s: Proceeding to force close the producer since pending" " requests could not be completed within timeout %s.", - timeout) + str(self), timeout) self._sender.force_close() - self._metrics.close() + if self._metrics: + self._metrics.close() try: self.config['key_serializer'].close() except AttributeError: @@ -511,23 +659,23 @@ def close(self, timeout=None): except AttributeError: pass self._closed = True - log.debug("The Kafka producer has closed.") + log.debug("%s: The Kafka producer has closed.", str(self)) def partitions_for(self, topic): """Returns set of all known partitions for the topic.""" - max_wait = self.config['max_block_ms'] / 1000.0 - return self._wait_on_metadata(topic, max_wait) + return self._wait_on_metadata(topic, self.config['max_block_ms']) - def _max_usable_produce_magic(self): - if self.config['api_version'] >= (0, 11): + @classmethod + def max_usable_produce_magic(cls, api_version): + if api_version >= (0, 11): return 2 - elif self.config['api_version'] >= (0, 10): + elif api_version >= (0, 10, 0): return 1 else: return 0 def _estimate_size_in_bytes(self, key, value, headers=[]): - magic = self._max_usable_produce_magic() + magic = self.max_usable_produce_magic(self.config['api_version']) if magic == 2: return DefaultRecordBatchBuilder.estimate_size_in_bytes( key, value, headers) @@ -535,6 +683,114 @@ def _estimate_size_in_bytes(self, key, value, headers=[]): return LegacyRecordBatchBuilder.estimate_size_in_bytes( magic, self.config['compression_type'], key, value) + def init_transactions(self): + """ + Needs to be called before any other methods when the transactional.id is set in the configuration. + + This method does the following: + 1. Ensures any transactions initiated by previous instances of the producer with the same + transactional_id are completed. If the previous instance had failed with a transaction in + progress, it will be aborted. If the last transaction had begun completion, + but not yet finished, this method awaits its completion. + 2. 
Gets the internal producer id and epoch, used in all future transactional + messages issued by the producer. + + Note that this method will raise KafkaTimeoutError if the transactional state cannot + be initialized before expiration of `max_block_ms`. + + Retrying after a KafkaTimeoutError will continue to wait for the prior request to succeed or fail. + Retrying after any other exception will start a new initialization attempt. + Retrying after a successful initialization will do nothing. + + Raises: + IllegalStateError: if no transactional_id has been configured + AuthorizationError: fatal error indicating that the configured + transactional_id is not authorized. + KafkaError: if the producer has encountered a previous fatal error or for any other unexpected error + KafkaTimeoutError: if the time taken for initialize the transaction has surpassed `max.block.ms`. + """ + if not self._transaction_manager: + raise Errors.IllegalStateError("Cannot call init_transactions without setting a transactional_id.") + if self._init_transactions_result is None: + self._init_transactions_result = self._transaction_manager.initialize_transactions() + self._sender.wakeup() + + try: + if not self._init_transactions_result.wait(timeout_ms=self.config['max_block_ms']): + raise Errors.KafkaTimeoutError("Timeout expired while initializing transactional state in %s ms." % (self.config['max_block_ms'],)) + finally: + if self._init_transactions_result.failed: + self._init_transactions_result = None + + def begin_transaction(self): + """ Should be called before the start of each new transaction. + + Note that prior to the first invocation of this method, + you must invoke `init_transactions()` exactly one time. + + Raises: + ProducerFencedError if another producer is with the same + transactional_id is active. + """ + # Set the transactional bit in the producer. + if not self._transaction_manager: + raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions") + self._transaction_manager.begin_transaction() + + def send_offsets_to_transaction(self, offsets, consumer_group_id): + """ + Sends a list of consumed offsets to the consumer group coordinator, and also marks + those offsets as part of the current transaction. These offsets will be considered + consumed only if the transaction is committed successfully. + + This method should be used when you need to batch consumed and produced messages + together, typically in a consume-transform-produce pattern. + + Arguments: + offsets ({TopicPartition: OffsetAndMetadata}): map of topic-partition -> offsets to commit + as part of current transaction. + consumer_group_id (str): Name of consumer group for offsets commit. + + Raises: + IllegalStateError: if no transactional_id, or transaction has not been started. + ProducerFencedError: fatal error indicating another producer with the same transactional_id is active. + UnsupportedVersionError: fatal error indicating the broker does not support transactions (i.e. if < 0.11). + UnsupportedForMessageFormatError: fatal error indicating the message format used for the offsets + topic on the broker does not support transactions. + AuthorizationError: fatal error indicating that the configured transactional_id is not authorized. 
+ KafkaErro:r if the producer has encountered a previous fatal or abortable error, or for any + other unexpected error + """ + if not self._transaction_manager: + raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions") + result = self._transaction_manager.send_offsets_to_transaction(offsets, consumer_group_id) + self._sender.wakeup() + result.wait() + + def commit_transaction(self): + """ Commits the ongoing transaction. + + Raises: ProducerFencedError if another producer with the same + transactional_id is active. + """ + if not self._transaction_manager: + raise Errors.IllegalStateError("Cannot commit transaction since transactions are not enabled") + result = self._transaction_manager.begin_commit() + self._sender.wakeup() + result.wait() + + def abort_transaction(self): + """ Aborts the ongoing transaction. + + Raises: ProducerFencedError if another producer with the same + transactional_id is active. + """ + if not self._transaction_manager: + raise Errors.IllegalStateError("Cannot abort transaction since transactions are not enabled.") + result = self._transaction_manager.begin_abort() + self._sender.wakeup() + result.wait() + def send(self, topic, value=None, key=None, headers=None, partition=None, timestamp_ms=None): """Publish a message to a topic. @@ -567,44 +823,58 @@ def send(self, topic, value=None, key=None, headers=None, partition=None, timest Raises: KafkaTimeoutError: if unable to fetch topic metadata, or unable to obtain memory buffer prior to configured max_block_ms + TypeError: if topic is not a string + ValueError: if topic is invalid: must be chars (a-zA-Z0-9._-), and less than 250 length + AssertionError: if KafkaProducer is closed, or key and value are both None """ + assert not self._closed, 'KafkaProducer already closed!' assert value is not None or self.config['api_version'] >= (0, 8, 1), ( 'Null messages require kafka >= 0.8.1') assert not (value is None and key is None), 'Need at least one: key or value' + ensure_valid_topic_name(topic) key_bytes = value_bytes = None + timer = Timer(self.config['max_block_ms'], "Failed to assign partition for message in max_block_ms.") try: - self._wait_on_metadata(topic, self.config['max_block_ms'] / 1000.0) - - key_bytes = self._serialize( - self.config['key_serializer'], - topic, key) - value_bytes = self._serialize( - self.config['value_serializer'], - topic, value) - assert type(key_bytes) in (bytes, bytearray, memoryview, type(None)) - assert type(value_bytes) in (bytes, bytearray, memoryview, type(None)) - - partition = self._partition(topic, partition, key, value, - key_bytes, value_bytes) + assigned_partition = None + while assigned_partition is None and not timer.expired: + self._wait_on_metadata(topic, timer.timeout_ms) + + key_bytes = self._serialize( + self.config['key_serializer'], + topic, key) + value_bytes = self._serialize( + self.config['value_serializer'], + topic, value) + assert type(key_bytes) in (bytes, bytearray, memoryview, type(None)) + assert type(value_bytes) in (bytes, bytearray, memoryview, type(None)) + + assigned_partition = self._partition(topic, partition, key, value, + key_bytes, value_bytes) + if assigned_partition is None: + raise Errors.KafkaTimeoutError("Failed to assign partition for message after %s secs." 
% timer.elapsed_ms / 1000) + else: + partition = assigned_partition if headers is None: headers = [] - assert type(headers) == list - assert all(type(item) == tuple and len(item) == 2 and type(item[0]) == str and type(item[1]) == bytes for item in headers) + assert isinstance(headers, list) + assert all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str) and isinstance(item[1], bytes) for item in headers) message_size = self._estimate_size_in_bytes(key_bytes, value_bytes, headers) self._ensure_valid_record_size(message_size) tp = TopicPartition(topic, partition) - log.debug("Sending (key=%r value=%r headers=%r) to %s", key, value, headers, tp) + log.debug("%s: Sending (key=%r value=%r headers=%r) to %s", str(self), key, value, headers, tp) + + if self._transaction_manager and self._transaction_manager.is_transactional(): + self._transaction_manager.maybe_add_partition_to_transaction(tp) + result = self._accumulator.append(tp, timestamp_ms, - key_bytes, value_bytes, headers, - self.config['max_block_ms'], - estimated_size=message_size) + key_bytes, value_bytes, headers) future, batch_is_full, new_batch_created = result if batch_is_full or new_batch_created: - log.debug("Waking up the sender since %s is either full or" - " getting a new batch", tp) + log.debug("%s: Waking up the sender since %s is either full or" + " getting a new batch", str(self), tp) self._sender.wakeup() return future @@ -612,7 +882,7 @@ def send(self, topic, value=None, key=None, headers=None, partition=None, timest # for API exceptions return them in the future, # for other exceptions raise directly except Errors.BrokerResponseError as e: - log.debug("Exception occurred during message send: %s", e) + log.error("%s: Exception occurred during message send: %s", str(self), e) return FutureRecordMetadata( FutureProduceResult(TopicPartition(topic, partition)), -1, None, None, @@ -643,7 +913,7 @@ def flush(self, timeout=None): KafkaTimeoutError: failure to flush buffered records within the provided timeout """ - log.debug("Flushing accumulated records in producer.") # trace + log.debug("%s: Flushing accumulated records in producer.", str(self)) self._accumulator.begin_flush() self._sender.wakeup() self._accumulator.await_flush_completion(timeout=timeout) @@ -655,13 +925,8 @@ def _ensure_valid_record_size(self, size): "The message is %d bytes when serialized which is larger than" " the maximum request size you have configured with the" " max_request_size configuration" % (size,)) - if size > self.config['buffer_memory']: - raise Errors.MessageSizeTooLargeError( - "The message is %d bytes when serialized which is larger than" - " the total memory buffer you have configured with the" - " buffer_memory configuration." % (size,)) - def _wait_on_metadata(self, topic, max_wait): + def _wait_on_metadata(self, topic, max_wait_ms): """ Wait for cluster metadata including partitions for the given topic to be available. @@ -679,32 +944,29 @@ def _wait_on_metadata(self, topic, max_wait): """ # add topic to metadata topic list if it is not there already. self._sender.add_topic(topic) - begin = time.time() - elapsed = 0.0 + timer = Timer(max_wait_ms, "Failed to update metadata after %.1f secs." 
% (max_wait_ms * 1000,)) metadata_event = None while True: partitions = self._metadata.partitions_for_topic(topic) if partitions is not None: return partitions - + timer.maybe_raise() if not metadata_event: metadata_event = threading.Event() - log.debug("Requesting metadata update for topic %s", topic) - + log.debug("%s: Requesting metadata update for topic %s", str(self), topic) metadata_event.clear() future = self._metadata.request_update() future.add_both(lambda e, *args: e.set(), metadata_event) self._sender.wakeup() - metadata_event.wait(max_wait - elapsed) - elapsed = time.time() - begin + metadata_event.wait(timer.timeout_ms / 1000) if not metadata_event.is_set(): raise Errors.KafkaTimeoutError( - "Failed to update metadata after %.1f secs." % (max_wait,)) + "Failed to update metadata after %.1f secs." % (max_wait_ms * 1000,)) elif topic in self._metadata.unauthorized_topics: - raise Errors.TopicAuthorizationFailedError(topic) + raise Errors.TopicAuthorizationFailedError(set([topic])) else: - log.debug("_wait_on_metadata woke after %s secs.", elapsed) + log.debug("%s: _wait_on_metadata woke after %s secs.", str(self), timer.elapsed_ms / 1000) def _serialize(self, f, topic, data): if not f: @@ -715,16 +977,18 @@ def _serialize(self, f, topic, data): def _partition(self, topic, partition, key, value, serialized_key, serialized_value): + all_partitions = self._metadata.partitions_for_topic(topic) + available = self._metadata.available_partitions_for_topic(topic) + if all_partitions is None or available is None: + return None if partition is not None: assert partition >= 0 - assert partition in self._metadata.partitions_for_topic(topic), 'Unrecognized partition' + assert partition in all_partitions, 'Unrecognized partition' return partition - all_partitions = sorted(self._metadata.partitions_for_topic(topic)) - available = list(self._metadata.available_partitions_for_topic(topic)) return self.config['partitioner'](serialized_key, - all_partitions, - available) + sorted(all_partitions), + list(available)) def metrics(self, raw=False): """Get metrics on producer performance. @@ -736,6 +1000,8 @@ def metrics(self, raw=False): This is an unstable interface. It may change in future releases without warning. 
""" + if not self._metrics: + return if raw: return self._metrics.metrics.copy() @@ -747,3 +1013,6 @@ def metrics(self, raw=False): metrics[k.group][k.name] = {} metrics[k.group][k.name] = v.value() return metrics + + def __str__(self): + return "" % (self.config['client_id'], self.config['transactional_id']) diff --git a/kafka/producer/record_accumulator.py b/kafka/producer/record_accumulator.py index a2aa0e8ec..1c250ee40 100644 --- a/kafka/producer/record_accumulator.py +++ b/kafka/producer/record_accumulator.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import absolute_import, division import collections import copy @@ -6,8 +6,14 @@ import threading import time +try: + # enum in stdlib as of py3.4 + from enum import IntEnum # pylint: disable=import-error +except ImportError: + # vendored backport module + from kafka.vendor.enum34 import IntEnum + import kafka.errors as Errors -from kafka.producer.buffer import SimpleBufferPool from kafka.producer.future import FutureRecordMetadata, FutureProduceResult from kafka.record.memory_records import MemoryRecordsBuilder from kafka.structs import TopicPartition @@ -35,10 +41,16 @@ def get(self): return self._val +class FinalState(IntEnum): + ABORTED = 0 + FAILED = 1 + SUCCEEDED = 2 + + class ProducerBatch(object): - def __init__(self, tp, records, buffer): + def __init__(self, tp, records, now=None): + now = time.time() if now is None else now self.max_record_size = 0 - now = time.time() self.created = now self.drained = None self.attempts = 0 @@ -48,81 +60,120 @@ def __init__(self, tp, records, buffer): self.topic_partition = tp self.produce_future = FutureProduceResult(tp) self._retry = False - self._buffer = buffer # We only save it, we don't write to it + self._final_state = None + + @property + def final_state(self): + return self._final_state @property def record_count(self): return self.records.next_offset() - def try_append(self, timestamp_ms, key, value, headers): + @property + def producer_id(self): + return self.records.producer_id if self.records else None + + @property + def producer_epoch(self): + return self.records.producer_epoch if self.records else None + + @property + def has_sequence(self): + return self.records.has_sequence if self.records else False + + def try_append(self, timestamp_ms, key, value, headers, now=None): metadata = self.records.append(timestamp_ms, key, value, headers) if metadata is None: return None + now = time.time() if now is None else now self.max_record_size = max(self.max_record_size, metadata.size) - self.last_append = time.time() - future = FutureRecordMetadata(self.produce_future, metadata.offset, - metadata.timestamp, metadata.crc, - len(key) if key is not None else -1, - len(value) if value is not None else -1, - sum(len(h_key.encode("utf-8")) + len(h_val) for h_key, h_val in headers) if headers else -1) + self.last_append = now + future = FutureRecordMetadata( + self.produce_future, + metadata.offset, + metadata.timestamp, + metadata.crc, + len(key) if key is not None else -1, + len(value) if value is not None else -1, + sum(len(h_key.encode("utf-8")) + len(h_val) for h_key, h_val in headers) if headers else -1) return future - def done(self, base_offset=None, timestamp_ms=None, exception=None, log_start_offset=None, global_error=None): - level = logging.DEBUG if exception is None else logging.WARNING - log.log(level, "Produced messages to topic-partition %s with base offset" - " %s log start offset %s and error %s.", self.topic_partition, base_offset, - 
log_start_offset, global_error) # trace - if self.produce_future.is_done: - log.warning('Batch is already closed -- ignoring batch.done()') - return - elif exception is None: - self.produce_future.success((base_offset, timestamp_ms, log_start_offset)) - else: - self.produce_future.failure(exception) - - def maybe_expire(self, request_timeout_ms, retry_backoff_ms, linger_ms, is_full): - """Expire batches if metadata is not available - - A batch whose metadata is not available should be expired if one - of the following is true: + def abort(self, exception): + """Abort the batch and complete the future and callbacks.""" + if self._final_state is not None: + raise Errors.IllegalStateError("Batch has already been completed in final state: %s" % self._final_state) + self._final_state = FinalState.ABORTED - * the batch is not in retry AND request timeout has elapsed after - it is ready (full or linger.ms has reached). + log.debug("Aborting batch for partition %s: %s", self.topic_partition, exception) + self._complete_future(-1, -1, exception) - * the batch is in retry AND request timeout has elapsed after the - backoff period ended. + def done(self, base_offset=None, timestamp_ms=None, exception=None): """ - now = time.time() - since_append = now - self.last_append - since_ready = now - (self.created + linger_ms / 1000.0) - since_backoff = now - (self.last_attempt + retry_backoff_ms / 1000.0) - timeout = request_timeout_ms / 1000.0 - - error = None - if not self.in_retry() and is_full and timeout < since_append: - error = "%d seconds have passed since last append" % (since_append,) - elif not self.in_retry() and timeout < since_ready: - error = "%d seconds have passed since batch creation plus linger time" % (since_ready,) - elif self.in_retry() and timeout < since_backoff: - error = "%d seconds have passed since last attempt plus backoff time" % (since_backoff,) - - if error: - self.records.close() - self.done(-1, None, Errors.KafkaTimeoutError( - "Batch for %s containing %s record(s) expired: %s" % ( - self.topic_partition, self.records.next_offset(), error))) + Finalize the state of a batch. Final state, once set, is immutable. This function may be called + once or twice on a batch. It may be called twice if + 1. An inflight batch expires before a response from the broker is received. The batch's final + state is set to FAILED. But it could succeed on the broker and second time around batch.done() may + try to set SUCCEEDED final state. + + 2. If a transaction abortion happens or if the producer is closed forcefully, the final state is + ABORTED but again it could succeed if broker responds with a success. + + Attempted transitions from [FAILED | ABORTED] --> SUCCEEDED are logged. + Attempted transitions from one failure state to the same or a different failed state are ignored. + Attempted transitions from SUCCEEDED to the same or a failed state throw an exception. 
+ """ + final_state = FinalState.SUCCEEDED if exception is None else FinalState.FAILED + if self._final_state is None: + self._final_state = final_state + if final_state is FinalState.SUCCEEDED: + log.debug("Successfully produced messages to %s with base offset %s", self.topic_partition, base_offset) + else: + log.warning("Failed to produce messages to topic-partition %s with base offset %s: %s", + self.topic_partition, base_offset, exception) + self._complete_future(base_offset, timestamp_ms, exception) return True + + elif self._final_state is not FinalState.SUCCEEDED: + if final_state is FinalState.SUCCEEDED: + # Log if a previously unsuccessful batch succeeded later on. + log.debug("ProduceResponse returned %s for %s after batch with base offset %s had already been %s.", + final_state, self.topic_partition, base_offset, self._final_state) + else: + # FAILED --> FAILED and ABORTED --> FAILED transitions are ignored. + log.debug("Ignored state transition %s -> %s for %s batch with base offset %s", + self._final_state, final_state, self.topic_partition, base_offset) + else: + # A SUCCESSFUL batch must not attempt another state change. + raise Errors.IllegalStateError("A %s batch must not attempt another state change to %s" % (self._final_state, final_state)) return False + def _complete_future(self, base_offset, timestamp_ms, exception): + if self.produce_future.is_done: + raise Errors.IllegalStateError('Batch is already closed!') + elif exception is None: + self.produce_future.success((base_offset, timestamp_ms)) + else: + self.produce_future.failure(exception) + + def has_reached_delivery_timeout(self, delivery_timeout_ms, now=None): + now = time.time() if now is None else now + return delivery_timeout_ms / 1000 <= now - self.created + def in_retry(self): return self._retry - def set_retry(self): + def retry(self, now=None): + now = time.time() if now is None else now self._retry = True + self.attempts += 1 + self.last_attempt = now + self.last_append = now - def buffer(self): - return self._buffer + @property + def is_done(self): + return self.produce_future.is_done def __str__(self): return 'ProducerBatch(topic_partition=%s, record_count=%d)' % ( @@ -143,12 +194,6 @@ class RecordAccumulator(object): A small batch size will make batching less common and may reduce throughput (a batch size of zero will disable batching entirely). Default: 16384 - buffer_memory (int): The total bytes of memory the producer should use - to buffer records waiting to be sent to the server. If records are - sent faster than they can be delivered to the server the producer - will block up to max_block_ms, raising an exception on timeout. - In the current implementation, this setting is an approximation. - Default: 33554432 (32MB) compression_attrs (int): The compression type for all data generated by the producer. Valid values are gzip(1), snappy(2), lz4(3), or none(0). @@ -156,7 +201,7 @@ class RecordAccumulator(object): will also impact the compression ratio (more batching means better compression). Default: None. linger_ms (int): An artificial delay time to add before declaring a - messageset (that isn't full) ready for sending. This allows + record batch (that isn't full) ready for sending. This allows time for more records to arrive. Setting a non-zero linger_ms will trade off some latency for potentially better throughput due to more batching (and hence fewer, larger requests). @@ -166,14 +211,14 @@ class RecordAccumulator(object): all retries in a short period of time. 
Default: 100 """ DEFAULT_CONFIG = { - 'buffer_memory': 33554432, 'batch_size': 16384, 'compression_attrs': 0, 'linger_ms': 0, + 'request_timeout_ms': 30000, + 'delivery_timeout_ms': 120000, 'retry_backoff_ms': 100, - 'message_version': 0, - 'metrics': None, - 'metric_group_prefix': 'producer-metrics', + 'transaction_manager': None, + 'message_version': 2, } def __init__(self, **configs): @@ -183,22 +228,37 @@ def __init__(self, **configs): self.config[key] = configs.pop(key) self._closed = False + self._transaction_manager = self.config['transaction_manager'] self._flushes_in_progress = AtomicInteger() self._appends_in_progress = AtomicInteger() self._batches = collections.defaultdict(collections.deque) # TopicPartition: [ProducerBatch] self._tp_locks = {None: threading.Lock()} # TopicPartition: Lock, plus a lock to add entries - self._free = SimpleBufferPool(self.config['buffer_memory'], - self.config['batch_size'], - metrics=self.config['metrics'], - metric_group_prefix=self.config['metric_group_prefix']) self._incomplete = IncompleteProducerBatches() # The following variables should only be accessed by the sender thread, # so we don't need to protect them w/ locking. self.muted = set() self._drain_index = 0 + self._next_batch_expiry_time_ms = float('inf') - def append(self, tp, timestamp_ms, key, value, headers, max_time_to_block_ms, - estimated_size=0): + if self.config['delivery_timeout_ms'] < self.config['linger_ms'] + self.config['request_timeout_ms']: + raise Errors.KafkaConfigurationError("Must set delivery_timeout_ms higher than linger_ms + request_timeout_ms") + + @property + def delivery_timeout_ms(self): + return self.config['delivery_timeout_ms'] + + @property + def next_expiry_time_ms(self): + return self._next_batch_expiry_time_ms + + def _tp_lock(self, tp): + if tp not in self._tp_locks: + with self._tp_locks[None]: + if tp not in self._tp_locks: + self._tp_locks[tp] = threading.Lock() + return self._tp_locks[tp] + + def append(self, tp, timestamp_ms, key, value, headers, now=None): """Add a record to the accumulator, return the append result. The append result will contain the future metadata, and flag for @@ -211,59 +271,53 @@ def append(self, tp, timestamp_ms, key, value, headers, max_time_to_block_ms, key (bytes): The key for the record value (bytes): The value for the record headers (List[Tuple[str, bytes]]): The header fields for the record - max_time_to_block_ms (int): The maximum time in milliseconds to - block for buffer memory to be available Returns: tuple: (future, batch_is_full, new_batch_created) """ assert isinstance(tp, TopicPartition), 'not TopicPartition' assert not self._closed, 'RecordAccumulator is closed' + now = time.time() if now is None else now # We keep track of the number of appending thread to make sure we do # not miss batches in abortIncompleteBatches(). 
self._appends_in_progress.increment() try: - if tp not in self._tp_locks: - with self._tp_locks[None]: - if tp not in self._tp_locks: - self._tp_locks[tp] = threading.Lock() - - with self._tp_locks[tp]: + with self._tp_lock(tp): # check if we have an in-progress batch dq = self._batches[tp] if dq: last = dq[-1] - future = last.try_append(timestamp_ms, key, value, headers) + future = last.try_append(timestamp_ms, key, value, headers, now=now) if future is not None: batch_is_full = len(dq) > 1 or last.records.is_full() return future, batch_is_full, False - size = max(self.config['batch_size'], estimated_size) - log.debug("Allocating a new %d byte message buffer for %s", size, tp) # trace - buf = self._free.allocate(size, max_time_to_block_ms) - with self._tp_locks[tp]: + with self._tp_lock(tp): # Need to check if producer is closed again after grabbing the # dequeue lock. assert not self._closed, 'RecordAccumulator is closed' if dq: last = dq[-1] - future = last.try_append(timestamp_ms, key, value, headers) + future = last.try_append(timestamp_ms, key, value, headers, now=now) if future is not None: # Somebody else found us a batch, return the one we # waited for! Hopefully this doesn't happen often... - self._free.deallocate(buf) batch_is_full = len(dq) > 1 or last.records.is_full() return future, batch_is_full, False + if self._transaction_manager and self.config['message_version'] < 2: + raise Errors.UnsupportedVersionError("Attempting to use idempotence with a broker which" + " does not support the required message format (v2)." + " The broker must be version 0.11 or later.") records = MemoryRecordsBuilder( self.config['message_version'], self.config['compression_attrs'], self.config['batch_size'] ) - batch = ProducerBatch(tp, records, buf) - future = batch.try_append(timestamp_ms, key, value, headers) + batch = ProducerBatch(tp, records, now=now) + future = batch.try_append(timestamp_ms, key, value, headers, now=now) if not future: raise Exception() @@ -274,79 +328,40 @@ def append(self, tp, timestamp_ms, key, value, headers, max_time_to_block_ms, finally: self._appends_in_progress.decrement() - def abort_expired_batches(self, request_timeout_ms, cluster): - """Abort the batches that have been sitting in RecordAccumulator for - more than the configured request_timeout due to metadata being - unavailable. + def maybe_update_next_batch_expiry_time(self, batch): + self._next_batch_expiry_time_ms = min(self._next_batch_expiry_time_ms, batch.created * 1000 + self.delivery_timeout_ms) - Arguments: - request_timeout_ms (int): milliseconds to timeout - cluster (ClusterMetadata): current metadata for kafka cluster - - Returns: - list of ProducerBatch that were expired - """ + def expired_batches(self, now=None): + """Get a list of batches which have been sitting in the accumulator too long and need to be expired.""" expired_batches = [] - to_remove = [] - count = 0 for tp in list(self._batches.keys()): - assert tp in self._tp_locks, 'TopicPartition not in locks dict' - - # We only check if the batch should be expired if the partition - # does not have a batch in flight. This is to avoid the later - # batches get expired when an earlier batch is still in progress. - # This protection only takes effect when user sets - # max.in.flight.request.per.connection=1. Otherwise the expiration - # order is not guranteed. 
- if tp in self.muted: - continue - - with self._tp_locks[tp]: + with self._tp_lock(tp): # iterate over the batches and expire them if they have stayed # in accumulator for more than request_timeout_ms dq = self._batches[tp] - for batch in dq: - is_full = bool(bool(batch != dq[-1]) or batch.records.is_full()) - # check if the batch is expired - if batch.maybe_expire(request_timeout_ms, - self.config['retry_backoff_ms'], - self.config['linger_ms'], - is_full): + while dq: + batch = dq[0] + if batch.has_reached_delivery_timeout(self.delivery_timeout_ms, now=now): + dq.popleft() + batch.records.close() expired_batches.append(batch) - to_remove.append(batch) - count += 1 - self.deallocate(batch) else: # Stop at the first batch that has not expired. + self.maybe_update_next_batch_expiry_time(batch) break - - # Python does not allow us to mutate the dq during iteration - # Assuming expired batches are infrequent, this is better than - # creating a new copy of the deque for iteration on every loop - if to_remove: - for batch in to_remove: - dq.remove(batch) - to_remove = [] - - if expired_batches: - log.warning("Expired %d batches in accumulator", count) # trace - return expired_batches - def reenqueue(self, batch): - """Re-enqueue the given record batch in the accumulator to retry.""" - now = time.time() - batch.attempts += 1 - batch.last_attempt = now - batch.last_append = now - batch.set_retry() - assert batch.topic_partition in self._tp_locks, 'TopicPartition not in locks dict' - assert batch.topic_partition in self._batches, 'TopicPartition not in batches' - dq = self._batches[batch.topic_partition] - with self._tp_locks[batch.topic_partition]: + def reenqueue(self, batch, now=None): + """ + Re-enqueue the given record batch in the accumulator. In Sender._complete_batch method, we check + whether the batch has reached delivery_timeout_ms or not. Hence we do not do the delivery timeout check here. 
+ """ + batch.retry(now=now) + with self._tp_lock(batch.topic_partition): + dq = self._batches[batch.topic_partition] dq.appendleft(batch) - def ready(self, cluster): + def ready(self, cluster, now=None): """ Get a list of nodes whose partitions are ready to be sent, and the earliest time at which any non-sendable partition will be ready; @@ -380,9 +395,8 @@ def ready(self, cluster): ready_nodes = set() next_ready_check = 9999999.99 unknown_leaders_exist = False - now = time.time() + now = time.time() if now is None else now - exhausted = bool(self._free.queued() > 0) # several threads are accessing self._batches -- to simplify # concurrent access, we iterate over a snapshot of partitions # and lock each partition separately as needed @@ -397,22 +411,22 @@ def ready(self, cluster): elif tp in self.muted: continue - with self._tp_locks[tp]: + with self._tp_lock(tp): dq = self._batches[tp] if not dq: continue batch = dq[0] - retry_backoff = self.config['retry_backoff_ms'] / 1000.0 - linger = self.config['linger_ms'] / 1000.0 - backing_off = bool(batch.attempts > 0 and - batch.last_attempt + retry_backoff > now) + retry_backoff = self.config['retry_backoff_ms'] / 1000 + linger = self.config['linger_ms'] / 1000 + backing_off = bool(batch.attempts > 0 + and (batch.last_attempt + retry_backoff) > now) waited_time = now - batch.last_attempt time_to_wait = retry_backoff if backing_off else linger time_left = max(time_to_wait - waited_time, 0) full = bool(len(dq) > 1 or batch.records.is_full()) expired = bool(waited_time >= time_to_wait) - sendable = (full or expired or exhausted or self._closed or + sendable = (full or expired or self._closed or self._flush_in_progress()) if sendable and not backing_off: @@ -427,16 +441,98 @@ def ready(self, cluster): return ready_nodes, next_ready_check, unknown_leaders_exist - def has_unsent(self): - """Return whether there is any unsent record in the accumulator.""" + def has_undrained(self): + """Check whether there are any batches which haven't been drained""" for tp in list(self._batches.keys()): - with self._tp_locks[tp]: + with self._tp_lock(tp): dq = self._batches[tp] if len(dq): return True return False - def drain(self, cluster, nodes, max_size): + def _should_stop_drain_batches_for_partition(self, first, tp): + if self._transaction_manager: + if not self._transaction_manager.is_send_to_partition_allowed(tp): + return True + if not self._transaction_manager.producer_id_and_epoch.is_valid: + # we cannot send the batch until we have refreshed the PID + log.debug("Waiting to send ready batches because transaction producer id is not valid") + return True + return False + + def drain_batches_for_one_node(self, cluster, node_id, max_size, now=None): + now = time.time() if now is None else now + size = 0 + ready = [] + partitions = list(cluster.partitions_for_broker(node_id)) + if not partitions: + return ready + # to make starvation less likely this loop doesn't start at 0 + self._drain_index %= len(partitions) + start = None + while start != self._drain_index: + tp = partitions[self._drain_index] + if start is None: + start = self._drain_index + self._drain_index += 1 + self._drain_index %= len(partitions) + + # Only proceed if the partition has no in-flight batches. 
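The linger/backoff arithmetic that ready() applies to the first batch of each partition can be exercised on plain values; a standalone sketch mirroring the expressions above (seconds-based, not the library's API):

from __future__ import division

import time


def batch_wait_state(attempts, last_attempt, linger_ms, retry_backoff_ms, now=None):
    """Mirror of the timing decision in RecordAccumulator.ready()."""
    now = time.time() if now is None else now
    retry_backoff = retry_backoff_ms / 1000
    linger = linger_ms / 1000
    backing_off = bool(attempts > 0 and (last_attempt + retry_backoff) > now)
    waited_time = now - last_attempt
    time_to_wait = retry_backoff if backing_off else linger
    expired = bool(waited_time >= time_to_wait)
    time_left = max(time_to_wait - waited_time, 0)
    return backing_off, expired, time_left


# A batch appended ~50ms ago with linger_ms=100 has roughly 50ms left before it is sendable:
print(batch_wait_state(0, time.time() - 0.05, 100, 100))
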
+ if tp in self.muted: + continue + + if tp not in self._batches: + continue + + with self._tp_lock(tp): + dq = self._batches[tp] + if len(dq) == 0: + continue + first = dq[0] + backoff = bool(first.attempts > 0 and + first.last_attempt + self.config['retry_backoff_ms'] / 1000 > now) + # Only drain the batch if it is not during backoff + if backoff: + continue + + if (size + first.records.size_in_bytes() > max_size + and len(ready) > 0): + # there is a rare case that a single batch + # size is larger than the request size due + # to compression; in this case we will + # still eventually send this batch in a + # single request + break + else: + if self._should_stop_drain_batches_for_partition(first, tp): + break + + batch = dq.popleft() + if self._transaction_manager and not batch.in_retry(): + # If the batch is in retry, then we should not change the pid and + # sequence number, since this may introduce duplicates. In particular, + # the previous attempt may actually have been accepted, and if we change + # the pid and sequence here, this attempt will also be accepted, causing + # a duplicate. + sequence_number = self._transaction_manager.sequence_number(batch.topic_partition) + log.debug("Dest: %s: %s producer_id=%s epoch=%s sequence=%s", + node_id, batch.topic_partition, + self._transaction_manager.producer_id_and_epoch.producer_id, + self._transaction_manager.producer_id_and_epoch.epoch, + sequence_number) + batch.records.set_producer_state( + self._transaction_manager.producer_id_and_epoch.producer_id, + self._transaction_manager.producer_id_and_epoch.epoch, + sequence_number, + self._transaction_manager.is_transactional() + ) + batch.records.close() + size += batch.records.size_in_bytes() + ready.append(batch) + batch.drained = now + return ready + + def drain(self, cluster, nodes, max_size, now=None): """ Drain all the data for the given nodes and collate them into a list of batches that will fit within the specified size on a per-node basis. 
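The drain path is being split so that each node is handled by drain_batches_for_one_node() (shown above). Its size-capping rule, stop before exceeding max_size but always take at least one batch even if compression made it oversized, can be illustrated on raw byte counts:

def take_batches_up_to(batch_sizes, max_size):
    """Greedy size cap mirroring drain_batches_for_one_node: stop before the
    request would exceed max_size, except that a single batch larger than
    max_size is still taken on its own."""
    taken, size = [], 0
    for nbytes in batch_sizes:
        if size + nbytes > max_size and taken:
            break
        taken.append(nbytes)
        size += nbytes
    return taken, size


print(take_batches_up_to([400, 300, 500], 1000))  # ([400, 300], 700)
print(take_batches_up_to([1500], 1000))           # ([1500], 1500), oversized batch sent alone
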
@@ -454,57 +550,15 @@ def drain(self, cluster, nodes, max_size): if not nodes: return {} - now = time.time() + now = time.time() if now is None else now batches = {} for node_id in nodes: - size = 0 - partitions = list(cluster.partitions_for_broker(node_id)) - ready = [] - # to make starvation less likely this loop doesn't start at 0 - self._drain_index %= len(partitions) - start = self._drain_index - while True: - tp = partitions[self._drain_index] - if tp in self._batches and tp not in self.muted: - with self._tp_locks[tp]: - dq = self._batches[tp] - if dq: - first = dq[0] - backoff = ( - bool(first.attempts > 0) and - bool(first.last_attempt + - self.config['retry_backoff_ms'] / 1000.0 - > now) - ) - # Only drain the batch if it is not during backoff - if not backoff: - if (size + first.records.size_in_bytes() > max_size - and len(ready) > 0): - # there is a rare case that a single batch - # size is larger than the request size due - # to compression; in this case we will - # still eventually send this batch in a - # single request - break - else: - batch = dq.popleft() - batch.records.close() - size += batch.records.size_in_bytes() - ready.append(batch) - batch.drained = now - - self._drain_index += 1 - self._drain_index %= len(partitions) - if start == self._drain_index: - break - - batches[node_id] = ready + batches[node_id] = self.drain_batches_for_one_node(cluster, node_id, max_size, now=now) return batches def deallocate(self, batch): """Deallocate the record batch.""" self._incomplete.remove(batch) - self._free.deallocate(batch.buffer()) def _flush_in_progress(self): """Are there any threads currently waiting on a flush?""" @@ -535,6 +589,10 @@ def await_flush_completion(self, timeout=None): finally: self._flushes_in_progress.decrement() + @property + def has_incomplete(self): + return bool(self._incomplete) + def abort_incomplete_batches(self): """ This function is only called when sender is closed forcefully. It will fail all the @@ -544,27 +602,41 @@ def abort_incomplete_batches(self): # 1. Avoid losing batches. # 2. Free up memory in case appending threads are blocked on buffer full. # This is a tight loop but should be able to get through very quickly. + error = Errors.IllegalStateError("Producer is closed forcefully.") while True: - self._abort_batches() + self._abort_batches(error) if not self._appends_in_progress.get(): break # After this point, no thread will append any messages because they will see the close # flag set. We need to do the last abort after no thread was appending in case the there was a new # batch appended by the last appending thread. 
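The abort loop described in the comments above, abort repeatedly while appenders are active and then abort once more after the last appender has observed the closed flag, can be captured as a small standalone pattern; a sketch with the counter check and abort step passed in as callables:

def abort_until_quiescent(appends_in_progress, abort_once):
    """Shutdown pattern used by abort_incomplete_batches: keep aborting while
    appender threads are still running, then do one final abort so a batch
    created by a racing append cannot be leaked."""
    while True:
        abort_once()
        if not appends_in_progress():
            break
    abort_once()


aborted = []
abort_until_quiescent(lambda: False, lambda: aborted.append(object()))
print(len(aborted))  # 2: one pass through the loop plus the final abort
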
- self._abort_batches() + self._abort_batches(error) self._batches.clear() - def _abort_batches(self): + def _abort_batches(self, error): """Go through incomplete batches and abort them.""" - error = Errors.IllegalStateError("Producer is closed forcefully.") for batch in self._incomplete.all(): tp = batch.topic_partition # Close the batch before aborting - with self._tp_locks[tp]: + with self._tp_lock(tp): batch.records.close() - batch.done(exception=error) + self._batches[tp].remove(batch) + batch.abort(error) self.deallocate(batch) + def abort_undrained_batches(self, error): + for batch in self._incomplete.all(): + tp = batch.topic_partition + with self._tp_lock(tp): + aborted = False + if not batch.is_done: + aborted = True + batch.records.close() + self._batches[tp].remove(batch) + if aborted: + batch.abort(error) + self.deallocate(batch) + def close(self): """Close this accumulator and force all the record buffers to be drained.""" self._closed = True @@ -579,12 +651,21 @@ def __init__(self): def add(self, batch): with self._lock: - return self._incomplete.add(batch) + self._incomplete.add(batch) def remove(self, batch): with self._lock: - return self._incomplete.remove(batch) + try: + self._incomplete.remove(batch) + except KeyError: + pass def all(self): with self._lock: return list(self._incomplete) + + def __bool__(self): + return bool(self._incomplete) + + + __nonzero__ = __bool__ diff --git a/kafka/producer/sender.py b/kafka/producer/sender.py index 35688d3f1..4a88b2f7a 100644 --- a/kafka/producer/sender.py +++ b/kafka/producer/sender.py @@ -2,6 +2,7 @@ import collections import copy +import heapq import logging import threading import time @@ -11,6 +12,8 @@ from kafka import errors as Errors from kafka.metrics.measurable import AnonMeasurable from kafka.metrics.stats import Avg, Max, Rate +from kafka.producer.transaction_manager import ProducerIdAndEpoch +from kafka.protocol.init_producer_id import InitProducerIdRequest from kafka.protocol.produce import ProduceRequest from kafka.structs import TopicPartition from kafka.version import __version__ @@ -27,14 +30,18 @@ class Sender(threading.Thread): DEFAULT_CONFIG = { 'max_request_size': 1048576, 'acks': 1, - 'retries': 0, + 'retries': float('inf'), 'request_timeout_ms': 30000, + 'retry_backoff_ms': 100, + 'metrics': None, 'guarantee_message_order': False, + 'transaction_manager': None, + 'transactional_id': None, + 'transaction_timeout_ms': 60000, 'client_id': 'kafka-python-' + __version__, - 'api_version': (0, 8, 0), } - def __init__(self, client, metadata, accumulator, metrics, **configs): + def __init__(self, client, metadata, accumulator, **configs): super(Sender, self).__init__() self.config = copy.copy(self.DEFAULT_CONFIG) for key in self.config: @@ -48,32 +55,75 @@ def __init__(self, client, metadata, accumulator, metrics, **configs): self._running = True self._force_close = False self._topics_to_add = set() - self._sensors = SenderMetrics(metrics, self._client, self._metadata) + if self.config['metrics']: + self._sensors = SenderMetrics(self.config['metrics'], self._client, self._metadata) + else: + self._sensors = None + self._transaction_manager = self.config['transaction_manager'] + # A per-partition queue of batches ordered by creation time for tracking the in-flight batches + self._in_flight_batches = collections.defaultdict(list) + + def _maybe_remove_from_inflight_batches(self, batch): + try: + queue = self._in_flight_batches[batch.topic_partition] + except KeyError: + return + try: + idx = 
queue.index((batch.created, batch)) + except ValueError: + return + # https://stackoverflow.com/questions/10162679/python-delete-element-from-heap + queue[idx] = queue[-1] + queue.pop() + heapq.heapify(queue) + + def _get_expired_inflight_batches(self): + """Get the in-flight batches that has reached delivery timeout.""" + expired_batches = [] + to_remove = [] + for tp, queue in six.iteritems(self._in_flight_batches): + while queue: + _created_at, batch = queue[0] + if batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms): + heapq.heappop(queue) + if batch.final_state is None: + expired_batches.append(batch) + else: + raise Errors.IllegalStateError("%s batch created at %s gets unexpected final state %s" % (batch.topic_partition, batch.created, batch.final_state)) + else: + self._accumulator.maybe_update_next_batch_expiry_time(batch) + break + else: + # Avoid mutating in_flight_batches during iteration + to_remove.append(tp) + for tp in to_remove: + del self._in_flight_batches[tp] + return expired_batches def run(self): """The main run loop for the sender thread.""" - log.debug("Starting Kafka producer I/O thread.") + log.debug("%s: Starting Kafka producer I/O thread.", str(self)) # main loop, runs until close is called while self._running: try: self.run_once() except Exception: - log.exception("Uncaught error in kafka producer I/O thread") + log.exception("%s: Uncaught error in kafka producer I/O thread", str(self)) - log.debug("Beginning shutdown of Kafka producer I/O thread, sending" - " remaining records.") + log.debug("%s: Beginning shutdown of Kafka producer I/O thread, sending" + " remaining records.", str(self)) # okay we stopped accepting requests but there may still be # requests in the accumulator or waiting for acknowledgment, # wait until these are completed. while (not self._force_close - and (self._accumulator.has_unsent() + and (self._accumulator.has_undrained() or self._client.in_flight_request_count() > 0)): try: self.run_once() except Exception: - log.exception("Uncaught error in kafka producer I/O thread") + log.exception("%s: Uncaught error in kafka producer I/O thread", str(self)) if self._force_close: # We need to fail all the incomplete batches and wake up the @@ -83,15 +133,46 @@ def run(self): try: self._client.close() except Exception: - log.exception("Failed to close network client") + log.exception("%s: Failed to close network client", str(self)) - log.debug("Shutdown of Kafka producer I/O thread has completed.") + log.debug("%s: Shutdown of Kafka producer I/O thread has completed.", str(self)) def run_once(self): """Run a single iteration of sending.""" while self._topics_to_add: self._client.add_topic(self._topics_to_add.pop()) + if self._transaction_manager: + try: + if not self._transaction_manager.is_transactional(): + # this is an idempotent producer, so make sure we have a producer id + self._maybe_wait_for_producer_id() + elif self._transaction_manager.has_in_flight_transactional_request() or self._maybe_send_transactional_request(): + # as long as there are outstanding transactional requests, we simply wait for them to return + self._client.poll(timeout_ms=self.config['retry_backoff_ms']) + return + + # do not continue sending if the transaction manager is in a failed state or if there + # is no producer id (for the idempotent case). 
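_maybe_remove_from_inflight_batches() above deletes an arbitrary entry from a heapq list with the swap-and-heapify trick; the same idea in isolation:

import heapq


def remove_from_heap(heap, entry):
    """Remove an arbitrary entry from a heapq list in O(n): overwrite it with
    the last element, pop, and restore the heap invariant."""
    try:
        idx = heap.index(entry)
    except ValueError:
        return
    heap[idx] = heap[-1]
    heap.pop()
    heapq.heapify(heap)


heap = [(1, 'a'), (3, 'c'), (2, 'b')]
heapq.heapify(heap)
remove_from_heap(heap, (3, 'c'))
print(heap[0])  # (1, 'a') is still the smallest entry
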
+ if self._transaction_manager.has_fatal_error() or not self._transaction_manager.has_producer_id(): + last_error = self._transaction_manager.last_error + if last_error is not None: + self._maybe_abort_batches(last_error) + self._client.poll(timeout_ms=self.config['retry_backoff_ms']) + return + elif self._transaction_manager.has_abortable_error(): + self._accumulator.abort_undrained_batches(self._transaction_manager.last_error) + + except Errors.SaslAuthenticationFailedError as e: + # This is already logged as error, but propagated here to perform any clean ups. + log.debug("%s: Authentication exception while processing transactional request: %s", str(self), e) + self._transaction_manager.authentication_failed(e) + + poll_timeout_ms = self._send_producer_data() + self._client.poll(timeout_ms=poll_timeout_ms) + + def _send_producer_data(self, now=None): + now = time.time() if now is None else now # get the list of partitions with data ready to send result = self._accumulator.ready(self._metadata) ready_nodes, next_ready_check_delay, unknown_leaders_exist = result @@ -99,65 +180,158 @@ def run_once(self): # if there are any partitions whose leaders are not known yet, force # metadata update if unknown_leaders_exist: - log.debug('Unknown leaders exist, requesting metadata update') + log.debug('%s: Unknown leaders exist, requesting metadata update', str(self)) self._metadata.request_update() # remove any nodes we aren't ready to send to - not_ready_timeout = float('inf') + not_ready_timeout_ms = float('inf') for node in list(ready_nodes): if not self._client.is_ready(node): - log.debug('Node %s not ready; delaying produce of accumulated batch', node) + node_delay_ms = self._client.connection_delay(node) + log.debug('%s: Node %s not ready; delaying produce of accumulated batch (%f ms)', str(self), node, node_delay_ms) self._client.maybe_connect(node, wakeup=False) ready_nodes.remove(node) - not_ready_timeout = min(not_ready_timeout, - self._client.connection_delay(node)) + not_ready_timeout_ms = min(not_ready_timeout_ms, node_delay_ms) # create produce requests batches_by_node = self._accumulator.drain( self._metadata, ready_nodes, self.config['max_request_size']) + for batch_list in six.itervalues(batches_by_node): + for batch in batch_list: + item = (batch.created, batch) + queue = self._in_flight_batches[batch.topic_partition] + heapq.heappush(queue, item) + if self.config['guarantee_message_order']: # Mute all the partitions drained for batch_list in six.itervalues(batches_by_node): for batch in batch_list: self._accumulator.muted.add(batch.topic_partition) - expired_batches = self._accumulator.abort_expired_batches( - self.config['request_timeout_ms'], self._metadata) + expired_batches = self._accumulator.expired_batches() + expired_batches.extend(self._get_expired_inflight_batches()) + + if expired_batches: + log.debug("%s: Expired %s batches in accumulator", str(self), len(expired_batches)) + + # Reset the producer_id if an expired batch has previously been sent to the broker. + # See the documentation of `TransactionState.reset_producer_id` to understand why + # we need to reset the producer id here. 
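Both expiry paths above (queued batches in the accumulator and drained batches tracked in the in-flight heaps) hinge on ProducerBatch.has_reached_delivery_timeout(), whose body is not shown in this excerpt. Its assumed semantics, consistent with maybe_update_next_batch_expiry_time() computing batch.created * 1000 + delivery_timeout_ms, would be roughly:

import time


def has_reached_delivery_timeout(batch_created, delivery_timeout_ms, now=None):
    """Assumed check: a batch expires once delivery_timeout_ms has elapsed
    since it was created, whether it is still queued or already in flight."""
    now = time.time() if now is None else now
    return (now - batch_created) * 1000 >= delivery_timeout_ms


print(has_reached_delivery_timeout(time.time() - 130, 120000))  # True after ~130s
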
+ if self._transaction_manager and any([batch.in_retry() for batch in expired_batches]): + needs_transaction_state_reset = True + else: + needs_transaction_state_reset = False + for expired_batch in expired_batches: - self._sensors.record_errors(expired_batch.topic_partition.topic, expired_batch.record_count) + error = Errors.KafkaTimeoutError( + "Expiring %d record(s) for %s: %s ms has passed since batch creation" % ( + expired_batch.record_count, expired_batch.topic_partition, + int((time.time() - expired_batch.created) * 1000))) + self._fail_batch(expired_batch, error, base_offset=-1) + + if self._sensors: + self._sensors.update_produce_request_metrics(batches_by_node) + + if needs_transaction_state_reset: + self._transaction_manager.reset_producer_id() + return 0 - self._sensors.update_produce_request_metrics(batches_by_node) requests = self._create_produce_requests(batches_by_node) # If we have any nodes that are ready to send + have sendable data, # poll with 0 timeout so this can immediately loop and try sending more - # data. Otherwise, the timeout is determined by nodes that have - # partitions with data that isn't yet sendable (e.g. lingering, backing - # off). Note that this specifically does not include nodes with + # data. Otherwise, the timeout will be the smaller value between next + # batch expiry time, and the delay time for checking data availability. + # Note that the nodes may have data that isn't yet sendable due to + # lingering, backing off, etc. This specifically does not include nodes with # sendable data that aren't ready to send since they would cause busy # looping. - poll_timeout_ms = min(next_ready_check_delay * 1000, not_ready_timeout) + poll_timeout_ms = min(next_ready_check_delay * 1000, + not_ready_timeout_ms, + self._accumulator.next_expiry_time_ms - now * 1000) + if poll_timeout_ms < 0: + poll_timeout_ms = 0 + if ready_nodes: - log.debug("Nodes with data ready to send: %s", ready_nodes) # trace - log.debug("Created %d produce requests: %s", len(requests), requests) # trace + log.debug("%s: Nodes with data ready to send: %s", str(self), ready_nodes) # trace + log.debug("%s: Created %d produce requests: %s", str(self), len(requests), requests) # trace + # if some partitions are already ready to be sent, the select time + # would be 0; otherwise if some partition already has some data + # accumulated but not ready yet, the select time will be the time + # difference between now and its linger expiry time; otherwise the + # select time will be the time difference between now and the + # metadata expiry time poll_timeout_ms = 0 for node_id, request in six.iteritems(requests): batches = batches_by_node[node_id] - log.debug('Sending Produce Request: %r', request) + log.debug('%s: Sending Produce Request: %r', str(self), request) (self._client.send(node_id, request, wakeup=False) .add_callback( self._handle_produce_response, node_id, time.time(), batches) .add_errback( self._failed_produce, batches, node_id)) + return poll_timeout_ms + + def _maybe_send_transactional_request(self): + if self._transaction_manager.is_completing() and self._accumulator.has_incomplete: + if self._transaction_manager.is_aborting(): + self._accumulator.abort_undrained_batches(Errors.KafkaError("Failing batch since transaction was aborted")) + # There may still be requests left which are being retried. Since we do not know whether they had + # been successfully appended to the broker log, we must resend them until their final status is clear. 
+ # If they had been appended and we did not receive the error, then our sequence number would no longer + # be correct which would lead to an OutOfSequenceNumberError. + if not self._accumulator.flush_in_progress(): + self._accumulator.begin_flush() + + next_request_handler = self._transaction_manager.next_request_handler(self._accumulator.has_incomplete) + if next_request_handler is None: + return False + + log.debug("%s: Sending transactional request %s", str(self), next_request_handler.request) + while not self._force_close: + target_node = None + try: + if next_request_handler.needs_coordinator(): + target_node = self._transaction_manager.coordinator(next_request_handler.coordinator_type) + if target_node is None: + self._transaction_manager.lookup_coordinator_for_request(next_request_handler) + break + elif not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']): + self._transaction_manager.lookup_coordinator_for_request(next_request_handler) + target_node = None + break + else: + target_node = self._client.least_loaded_node() + if target_node is not None and not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']): + target_node = None + + if target_node is not None: + if next_request_handler.is_retry: + time.sleep(self.config['retry_backoff_ms'] / 1000) + txn_correlation_id = self._transaction_manager.next_in_flight_request_correlation_id() + future = self._client.send(target_node, next_request_handler.request) + future.add_both(next_request_handler.on_complete, txn_correlation_id) + return True + + except Exception as e: + log.warn("%s: Got an exception when trying to find a node to send a transactional request to. Going to back off and retry: %s", str(self), e) + if next_request_handler.needs_coordinator(): + self._transaction_manager.lookup_coordinator_for_request(next_request_handler) + break + + time.sleep(self.config['retry_backoff_ms'] / 1000) + self._metadata.request_update() - # if some partitions are already ready to be sent, the select time - # would be 0; otherwise if some partition already has some data - # accumulated but not ready yet, the select time will be the time - # difference between now and its linger expiry time; otherwise the - # select time will be the time difference between now and the - # metadata expiry time - self._client.poll(timeout_ms=poll_timeout_ms) + if target_node is None: + self._transaction_manager.retry(next_request_handler) + + return True + + def _maybe_abort_batches(self, exc): + if self._accumulator.has_incomplete: + log.error("%s: Aborting producer batches due to fatal error: %s", str(self), exc) + self._accumulator.abort_batches(exc) def initiate_close(self): """Start closing the sender (won't complete until all data is sent).""" @@ -180,82 +354,164 @@ def add_topic(self, topic): self._topics_to_add.add(topic) self.wakeup() + def _maybe_wait_for_producer_id(self): + while not self._transaction_manager.has_producer_id(): + try: + node_id = self._client.least_loaded_node() + if node_id is None or not self._client.await_ready(node_id): + log.debug("%s, Could not find an available broker to send InitProducerIdRequest to." 
+ + " Will back off and try again.", str(self)) + time.sleep(self._client.least_loaded_node_refresh_ms() / 1000) + continue + version = self._client.api_version(InitProducerIdRequest, max_version=1) + request = InitProducerIdRequest[version]( + transactional_id=self.config['transactional_id'], + transaction_timeout_ms=self.config['transaction_timeout_ms'], + ) + response = self._client.send_and_receive(node_id, request) + error_type = Errors.for_code(response.error_code) + if error_type is Errors.NoError: + self._transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch)) + break + elif getattr(error_type, 'retriable', False): + log.debug("%s: Retriable error from InitProducerId response: %s", str(self), error_type.__name__) + if getattr(error_type, 'invalid_metadata', False): + self._metadata.request_update() + else: + self._transaction_manager.transition_to_fatal_error(error_type()) + break + except Errors.KafkaConnectionError: + log.debug("%s: Broker %s disconnected while awaiting InitProducerId response", str(self), node_id) + except Errors.RequestTimedOutError: + log.debug("%s: InitProducerId request to node %s timed out", str(self), node_id) + log.debug("%s: Retry InitProducerIdRequest in %sms.", str(self), self.config['retry_backoff_ms']) + time.sleep(self.config['retry_backoff_ms'] / 1000) + def _failed_produce(self, batches, node_id, error): - log.debug("Error sending produce request to node %d: %s", node_id, error) # trace + log.error("%s: Error sending produce request to node %d: %s", str(self), node_id, error) # trace for batch in batches: - self._complete_batch(batch, error, -1, None) + self._complete_batch(batch, error, -1) def _handle_produce_response(self, node_id, send_time, batches, response): """Handle a produce response.""" # if we have a response, parse it - log.debug('Parsing produce response: %r', response) + log.debug('%s: Parsing produce response: %r', str(self), response) if response: batches_by_partition = dict([(batch.topic_partition, batch) for batch in batches]) for topic, partitions in response.topics: for partition_info in partitions: - global_error = None - log_start_offset = None if response.API_VERSION < 2: partition, error_code, offset = partition_info ts = None elif 2 <= response.API_VERSION <= 4: partition, error_code, offset, ts = partition_info elif 5 <= response.API_VERSION <= 7: - partition, error_code, offset, ts, log_start_offset = partition_info + partition, error_code, offset, ts, _log_start_offset = partition_info else: - # the ignored parameter is record_error of type list[(batch_index: int, error_message: str)] - partition, error_code, offset, ts, log_start_offset, _, global_error = partition_info + # Currently unused / TODO: KIP-467 + partition, error_code, offset, ts, _log_start_offset, _record_errors, _global_error = partition_info tp = TopicPartition(topic, partition) error = Errors.for_code(error_code) batch = batches_by_partition[tp] - self._complete_batch(batch, error, offset, ts, log_start_offset, global_error) - - if response.API_VERSION > 0: - self._sensors.record_throttle_time(response.throttle_time_ms, node=node_id) + self._complete_batch(batch, error, offset, timestamp_ms=ts) else: # this is the acks = 0 case, just complete all requests for batch in batches: - self._complete_batch(batch, None, -1, None) + self._complete_batch(batch, None, -1) + + def _fail_batch(self, batch, exception, base_offset=None, timestamp_ms=None): + exception = exception if type(exception) is not type 
else exception() + if self._transaction_manager: + if isinstance(exception, Errors.OutOfOrderSequenceNumberError) and \ + not self._transaction_manager.is_transactional() and \ + self._transaction_manager.has_producer_id(batch.producer_id): + log.error("%s: The broker received an out of order sequence number for topic-partition %s" + " at offset %s. This indicates data loss on the broker, and should be investigated.", + str(self), batch.topic_partition, base_offset) + + # Reset the transaction state since we have hit an irrecoverable exception and cannot make any guarantees + # about the previously committed message. Note that this will discard the producer id and sequence + # numbers for all existing partitions. + self._transaction_manager.reset_producer_id() + elif isinstance(exception, (Errors.ClusterAuthorizationFailedError, + Errors.TransactionalIdAuthorizationFailedError, + Errors.ProducerFencedError, + Errors.InvalidTxnStateError)): + self._transaction_manager.transition_to_fatal_error(exception) + elif self._transaction_manager.is_transactional(): + self._transaction_manager.transition_to_abortable_error(exception) + + if self._sensors: + self._sensors.record_errors(batch.topic_partition.topic, batch.record_count) + + if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms, exception=exception): + self._maybe_remove_from_inflight_batches(batch) + self._accumulator.deallocate(batch) - def _complete_batch(self, batch, error, base_offset, timestamp_ms=None, log_start_offset=None, global_error=None): + def _complete_batch(self, batch, error, base_offset, timestamp_ms=None): """Complete or retry the given batch of records. Arguments: - batch (RecordBatch): The record batch + batch (ProducerBatch): The record batch error (Exception): The error (or None if none) base_offset (int): The base offset assigned to the records if successful timestamp_ms (int, optional): The timestamp returned by the broker for this batch - log_start_offset (int): The start offset of the log at the time this produce response was created - global_error (str): The summarising error message """ # Standardize no-error to None if error is Errors.NoError: error = None - if error is not None and self._can_retry(batch, error): - # retry - log.warning("Got error produce response on topic-partition %s," - " retrying (%d attempts left). Error: %s", - batch.topic_partition, - self.config['retries'] - batch.attempts - 1, - global_error or error) - self._accumulator.reenqueue(batch) - self._sensors.record_retries(batch.topic_partition.topic, batch.record_count) - else: - if error is Errors.TopicAuthorizationFailedError: - error = error(batch.topic_partition.topic) + if error is not None: + if self._can_retry(batch, error): + # retry + log.warning("%s: Got error produce response on topic-partition %s," + " retrying (%s attempts left). Error: %s", + str(self), batch.topic_partition, + self.config['retries'] - batch.attempts - 1, + error) + + # If idempotence is enabled only retry the request if the batch matches our current producer id and epoch + if not self._transaction_manager or self._transaction_manager.producer_id_and_epoch.match(batch): + log.debug("%s: Retrying batch to topic-partition %s. 
Sequence number: %s", + str(self), batch.topic_partition, + self._transaction_manager.sequence_number(batch.topic_partition) if self._transaction_manager else None) + self._accumulator.reenqueue(batch) + self._maybe_remove_from_inflight_batches(batch) + if self._sensors: + self._sensors.record_retries(batch.topic_partition.topic, batch.record_count) + else: + log.warning("%s: Attempted to retry sending a batch but the producer id/epoch changed from %s/%s to %s/%s. This batch will be dropped", + str(self), batch.producer_id, batch.producer_epoch, + self._transaction_manager.producer_id_and_epoch.producer_id, + self._transaction_manager.producer_id_and_epoch.epoch) + self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms) + else: + if error is Errors.TopicAuthorizationFailedError: + error = error(batch.topic_partition.topic) + + # tell the user the result of their request + self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms) + + if error is Errors.UnknownTopicOrPartitionError: + log.warning("%s: Received unknown topic or partition error in produce request on partition %s." + " The topic/partition may not exist or the user may not have Describe access to it", + str(self), batch.topic_partition) + + if getattr(error, 'invalid_metadata', False): + self._metadata.request_update() - # tell the user the result of their request - batch.done(base_offset, timestamp_ms, error, log_start_offset, global_error) - self._accumulator.deallocate(batch) - if error is not None: - self._sensors.record_errors(batch.topic_partition.topic, batch.record_count) + else: + if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms): + self._maybe_remove_from_inflight_batches(batch) + self._accumulator.deallocate(batch) - if getattr(error, 'invalid_metadata', False): - self._metadata.request_update() + if self._transaction_manager and self._transaction_manager.producer_id_and_epoch.match(batch): + self._transaction_manager.increment_sequence_number(batch.topic_partition, batch.record_count) + log.debug("%s: Incremented sequence number for topic-partition %s to %s", str(self), batch.topic_partition, + self._transaction_manager.sequence_number(batch.topic_partition)) # Unmute the completed partition. if self.config['guarantee_message_order']: @@ -266,8 +522,10 @@ def _can_retry(self, batch, error): We can retry a send if the error is transient and the number of attempts taken is fewer than the maximum allowed """ - return (batch.attempts < self.config['retries'] - and getattr(error, 'retriable', False)) + return (not batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms) and + batch.attempts < self.config['retries'] and + batch.final_state is None and + getattr(error, 'retriable', False)) def _create_produce_requests(self, collated): """ @@ -275,23 +533,24 @@ def _create_produce_requests(self, collated): per-node basis. 
Arguments: - collated: {node_id: [RecordBatch]} + collated: {node_id: [ProducerBatch]} Returns: - dict: {node_id: ProduceRequest} (version depends on api_version) + dict: {node_id: ProduceRequest} (version depends on client api_versions) """ requests = {} for node_id, batches in six.iteritems(collated): - requests[node_id] = self._produce_request( - node_id, self.config['acks'], - self.config['request_timeout_ms'], batches) + if batches: + requests[node_id] = self._produce_request( + node_id, self.config['acks'], + self.config['request_timeout_ms'], batches) return requests def _produce_request(self, node_id, acks, timeout, batches): """Create a produce request from the given record batches. Returns: - ProduceRequest (version depends on api_version) + ProduceRequest (version depends on client api_versions) """ produce_records_by_partition = collections.defaultdict(dict) for batch in batches: @@ -301,32 +560,26 @@ def _produce_request(self, node_id, acks, timeout, batches): buf = batch.records.buffer() produce_records_by_partition[topic][partition] = buf - kwargs = {} - if self.config['api_version'] >= (2, 1): - version = 7 - elif self.config['api_version'] >= (2, 0): - version = 6 - elif self.config['api_version'] >= (1, 1): - version = 5 - elif self.config['api_version'] >= (1, 0): - version = 4 - elif self.config['api_version'] >= (0, 11): - version = 3 - kwargs = dict(transactional_id=None) - elif self.config['api_version'] >= (0, 10): - version = 2 - elif self.config['api_version'] == (0, 9): - version = 1 + version = self._client.api_version(ProduceRequest, max_version=7) + topic_partition_data = [ + (topic, list(partition_info.items())) + for topic, partition_info in six.iteritems(produce_records_by_partition)] + transactional_id = self._transaction_manager.transactional_id if self._transaction_manager else None + if version >= 3: + return ProduceRequest[version]( + transactional_id=transactional_id, + required_acks=acks, + timeout=timeout, + topics=topic_partition_data, + ) else: - version = 0 - return ProduceRequest[version]( - required_acks=acks, - timeout=timeout, - topics=[(topic, list(partition_info.items())) - for topic, partition_info - in six.iteritems(produce_records_by_partition)], - **kwargs - ) + if transactional_id is not None: + log.warning('%s: Broker does not support ProduceRequest v3+, required for transactional_id', str(self)) + return ProduceRequest[version]( + required_acks=acks, + timeout=timeout, + topics=topic_partition_data, + ) def wakeup(self): """Wake up the selector associated with this send thread.""" @@ -335,6 +588,9 @@ def wakeup(self): def bootstrap_connected(self): return self._client.bootstrap_connected() + def __str__(self): + return "" % (self.config['client_id'], self.config['transactional_id']) + class SenderMetrics(object): @@ -367,15 +623,6 @@ def __init__(self, metrics, client, metadata): sensor_name=sensor_name, description='The maximum time in ms record batches spent in the record accumulator.') - sensor_name = 'produce-throttle-time' - self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name) - self.add_metric('produce-throttle-time-avg', Avg(), - sensor_name=sensor_name, - description='The average throttle time in ms') - self.add_metric('produce-throttle-time-max', Max(), - sensor_name=sensor_name, - description='The maximum throttle time in ms') - sensor_name = 'records-per-request' self.records_per_request_sensor = self.metrics.sensor(sensor_name) self.add_metric('record-send-rate', Rate(), @@ -498,8 +745,9 @@ def 
update_produce_request_metrics(self, batches_map): records += batch.record_count total_bytes += batch.records.size_in_bytes() - self.records_per_request_sensor.record(records) - self.byte_rate_sensor.record(total_bytes) + if node_batch: + self.records_per_request_sensor.record(records) + self.byte_rate_sensor.record(total_bytes) def record_retries(self, topic, count): self.retry_sensor.record(count) @@ -512,6 +760,3 @@ def record_errors(self, topic, count): sensor = self.metrics.get_sensor('topic.' + topic + '.record-errors') if sensor: sensor.record(count) - - def record_throttle_time(self, throttle_time_ms, node=None): - self.produce_throttle_time_sensor.record(throttle_time_ms) diff --git a/kafka/producer/transaction_manager.py b/kafka/producer/transaction_manager.py new file mode 100644 index 000000000..7302eb00e --- /dev/null +++ b/kafka/producer/transaction_manager.py @@ -0,0 +1,981 @@ +from __future__ import absolute_import, division + +import abc +import collections +import heapq +import logging +import threading + +from kafka.vendor import six + +try: + # enum in stdlib as of py3.4 + from enum import IntEnum # pylint: disable=import-error +except ImportError: + # vendored backport module + from kafka.vendor.enum34 import IntEnum + +import kafka.errors as Errors +from kafka.protocol.add_offsets_to_txn import AddOffsetsToTxnRequest +from kafka.protocol.add_partitions_to_txn import AddPartitionsToTxnRequest +from kafka.protocol.end_txn import EndTxnRequest +from kafka.protocol.find_coordinator import FindCoordinatorRequest +from kafka.protocol.init_producer_id import InitProducerIdRequest +from kafka.protocol.txn_offset_commit import TxnOffsetCommitRequest +from kafka.structs import TopicPartition + + +log = logging.getLogger(__name__) + + +NO_PRODUCER_ID = -1 +NO_PRODUCER_EPOCH = -1 +NO_SEQUENCE = -1 + + +class ProducerIdAndEpoch(object): + __slots__ = ('producer_id', 'epoch') + + def __init__(self, producer_id, epoch): + self.producer_id = producer_id + self.epoch = epoch + + @property + def is_valid(self): + return NO_PRODUCER_ID < self.producer_id + + def match(self, batch): + return self.producer_id == batch.producer_id and self.epoch == batch.producer_epoch + + def __eq__(self, other): + return isinstance(other, ProducerIdAndEpoch) and self.producer_id == other.producer_id and self.epoch == other.epoch + + def __str__(self): + return "ProducerIdAndEpoch(producer_id={}, epoch={})".format(self.producer_id, self.epoch) + + +class TransactionState(IntEnum): + UNINITIALIZED = 0 + INITIALIZING = 1 + READY = 2 + IN_TRANSACTION = 3 + COMMITTING_TRANSACTION = 4 + ABORTING_TRANSACTION = 5 + ABORTABLE_ERROR = 6 + FATAL_ERROR = 7 + + @classmethod + def is_transition_valid(cls, source, target): + if target == cls.INITIALIZING: + return source == cls.UNINITIALIZED + elif target == cls.READY: + return source in (cls.INITIALIZING, cls.COMMITTING_TRANSACTION, cls.ABORTING_TRANSACTION) + elif target == cls.IN_TRANSACTION: + return source == cls.READY + elif target == cls.COMMITTING_TRANSACTION: + return source == cls.IN_TRANSACTION + elif target == cls.ABORTING_TRANSACTION: + return source in (cls.IN_TRANSACTION, cls.ABORTABLE_ERROR) + elif target == cls.ABORTABLE_ERROR: + return source in (cls.IN_TRANSACTION, cls.COMMITTING_TRANSACTION, cls.ABORTABLE_ERROR) + elif target == cls.UNINITIALIZED: + # Disallow transitions to UNITIALIZED + return False + elif target == cls.FATAL_ERROR: + # We can transition to FATAL_ERROR unconditionally. 
+ # FATAL_ERROR is never a valid starting state for any transition. So the only option is to close the + # producer or do purely non transactional requests. + return True + + +class Priority(IntEnum): + # We use the priority to determine the order in which requests need to be sent out. For instance, if we have + # a pending FindCoordinator request, that must always go first. Next, If we need a producer id, that must go second. + # The endTxn request must always go last. + FIND_COORDINATOR = 0 + INIT_PRODUCER_ID = 1 + ADD_PARTITIONS_OR_OFFSETS = 2 + END_TXN = 3 + + +class TransactionManager(object): + """ + A class which maintains state for transactions. Also keeps the state necessary to ensure idempotent production. + """ + NO_INFLIGHT_REQUEST_CORRELATION_ID = -1 + # The retry_backoff_ms is overridden to the following value if the first AddPartitions receives a + # CONCURRENT_TRANSACTIONS error. + ADD_PARTITIONS_RETRY_BACKOFF_MS = 20 + + def __init__(self, transactional_id=None, transaction_timeout_ms=0, retry_backoff_ms=100, api_version=(0, 11), metadata=None): + self._api_version = api_version + self._metadata = metadata + + self._sequence_numbers = collections.defaultdict(lambda: 0) + + self.transactional_id = transactional_id + self.transaction_timeout_ms = transaction_timeout_ms + self._transaction_coordinator = None + self._consumer_group_coordinator = None + self._new_partitions_in_transaction = set() + self._pending_partitions_in_transaction = set() + self._partitions_in_transaction = set() + self._pending_txn_offset_commits = dict() + + self._current_state = TransactionState.UNINITIALIZED + self._last_error = None + self.producer_id_and_epoch = ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH) + + self._transaction_started = False + + self._pending_requests = [] # priority queue via heapq + self._pending_requests_sort_id = 0 + self._in_flight_request_correlation_id = self.NO_INFLIGHT_REQUEST_CORRELATION_ID + + # This is used by the TxnRequestHandlers to control how long to back off before a given request is retried. + # For instance, this value is lowered by the AddPartitionsToTxnHandler when it receives a CONCURRENT_TRANSACTIONS + # error for the first AddPartitionsRequest in a transaction. 
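The transition table in TransactionState.is_transition_valid() above can be probed directly; a few representative checks (assuming the module path introduced by this patch):

from kafka.producer.transaction_manager import TransactionState

# A transaction may only be started from READY, i.e. after init completes:
print(TransactionState.is_transition_valid(TransactionState.READY,
                                           TransactionState.IN_TRANSACTION))    # True
print(TransactionState.is_transition_valid(TransactionState.UNINITIALIZED,
                                           TransactionState.IN_TRANSACTION))    # False
# FATAL_ERROR is reachable from any state:
print(TransactionState.is_transition_valid(TransactionState.COMMITTING_TRANSACTION,
                                           TransactionState.FATAL_ERROR))       # True
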
+ self.retry_backoff_ms = retry_backoff_ms + self._lock = threading.Condition() + + def initialize_transactions(self): + with self._lock: + self._ensure_transactional() + self._transition_to(TransactionState.INITIALIZING) + self.set_producer_id_and_epoch(ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH)) + self._sequence_numbers.clear() + handler = InitProducerIdHandler(self, self.transaction_timeout_ms) + self._enqueue_request(handler) + return handler.result + + def begin_transaction(self): + with self._lock: + self._ensure_transactional() + self._maybe_fail_with_error() + self._transition_to(TransactionState.IN_TRANSACTION) + + def begin_commit(self): + with self._lock: + self._ensure_transactional() + self._maybe_fail_with_error() + self._transition_to(TransactionState.COMMITTING_TRANSACTION) + return self._begin_completing_transaction(True) + + def begin_abort(self): + with self._lock: + self._ensure_transactional() + if self._current_state != TransactionState.ABORTABLE_ERROR: + self._maybe_fail_with_error() + self._transition_to(TransactionState.ABORTING_TRANSACTION) + + # We're aborting the transaction, so there should be no need to add new partitions + self._new_partitions_in_transaction.clear() + return self._begin_completing_transaction(False) + + def _begin_completing_transaction(self, committed): + if self._new_partitions_in_transaction: + self._enqueue_request(self._add_partitions_to_transaction_handler()) + handler = EndTxnHandler(self, committed) + self._enqueue_request(handler) + return handler.result + + def send_offsets_to_transaction(self, offsets, consumer_group_id): + with self._lock: + self._ensure_transactional() + self._maybe_fail_with_error() + if self._current_state != TransactionState.IN_TRANSACTION: + raise Errors.KafkaError("Cannot send offsets to transaction because the producer is not in an active transaction") + + log.debug("Begin adding offsets %s for consumer group %s to transaction", offsets, consumer_group_id) + handler = AddOffsetsToTxnHandler(self, consumer_group_id, offsets) + self._enqueue_request(handler) + return handler.result + + def maybe_add_partition_to_transaction(self, topic_partition): + with self._lock: + self._fail_if_not_ready_for_send() + + if self.is_partition_added(topic_partition) or self.is_partition_pending_add(topic_partition): + return + + log.debug("Begin adding new partition %s to transaction", topic_partition) + self._new_partitions_in_transaction.add(topic_partition) + + def _fail_if_not_ready_for_send(self): + with self._lock: + if self.has_error(): + raise Errors.KafkaError( + "Cannot perform send because at least one previous transactional or" + " idempotent request has failed with errors.", self._last_error) + + if self.is_transactional(): + if not self.has_producer_id(): + raise Errors.IllegalStateError( + "Cannot perform a 'send' before completing a call to init_transactions" + " when transactions are enabled.") + + if self._current_state != TransactionState.IN_TRANSACTION: + raise Errors.IllegalStateError("Cannot call send in state %s" % (self._current_state.name,)) + + def is_send_to_partition_allowed(self, tp): + with self._lock: + if self.has_fatal_error(): + return False + return not self.is_transactional() or tp in self._partitions_in_transaction + + def has_producer_id(self, producer_id=None): + if producer_id is None: + return self.producer_id_and_epoch.is_valid + else: + return self.producer_id_and_epoch.producer_id == producer_id + + def is_transactional(self): + return self.transactional_id is not None + 
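The public surface shown above (initialize_transactions, begin_transaction, maybe_add_partition_to_transaction, begin_commit / begin_abort) enforces the state machine strictly. In normal use KafkaProducer drives these calls, but the ordering can be seen with the manager alone; a small illustration, assuming the module path from this patch, where the returned result only completes once the Sender thread has exchanged the corresponding requests with the broker:

import kafka.errors as Errors
from kafka.producer.transaction_manager import TransactionManager

tm = TransactionManager(transactional_id='my-txn-id', transaction_timeout_ms=60000)

try:
    tm.begin_transaction()             # rejected: initialize_transactions() has not run yet
except Errors.KafkaError as e:
    print('rejected:', e)

result = tm.initialize_transactions()  # enqueues InitProducerId for the Sender thread to send
print(result.is_done)                  # False until the broker response arrives
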
+ def has_partitions_to_add(self): + with self._lock: + return bool(self._new_partitions_in_transaction) or bool(self._pending_partitions_in_transaction) + + def is_completing(self): + with self._lock: + return self._current_state in ( + TransactionState.COMMITTING_TRANSACTION, + TransactionState.ABORTING_TRANSACTION) + + @property + def last_error(self): + return self._last_error + + def has_error(self): + with self._lock: + return self._current_state in ( + TransactionState.ABORTABLE_ERROR, + TransactionState.FATAL_ERROR) + + def is_aborting(self): + with self._lock: + return self._current_state == TransactionState.ABORTING_TRANSACTION + + def transition_to_abortable_error(self, exc): + with self._lock: + if self._current_state == TransactionState.ABORTING_TRANSACTION: + log.debug("Skipping transition to abortable error state since the transaction is already being " + " aborted. Underlying exception: %s", exc) + return + self._transition_to(TransactionState.ABORTABLE_ERROR, error=exc) + + def transition_to_fatal_error(self, exc): + with self._lock: + self._transition_to(TransactionState.FATAL_ERROR, error=exc) + + def is_partition_added(self, partition): + with self._lock: + return partition in self._partitions_in_transaction + + def is_partition_pending_add(self, partition): + return partition in self._new_partitions_in_transaction or partition in self._pending_partitions_in_transaction + + def has_producer_id_and_epoch(self, producer_id, producer_epoch): + return ( + self.producer_id_and_epoch.producer_id == producer_id and + self.producer_id_and_epoch.epoch == producer_epoch + ) + + def set_producer_id_and_epoch(self, producer_id_and_epoch): + if not isinstance(producer_id_and_epoch, ProducerIdAndEpoch): + raise TypeError("ProducerAndIdEpoch type required") + log.info("ProducerId set to %s with epoch %s", + producer_id_and_epoch.producer_id, producer_id_and_epoch.epoch) + self.producer_id_and_epoch = producer_id_and_epoch + + def reset_producer_id(self): + """ + This method is used when the producer needs to reset its internal state because of an irrecoverable exception + from the broker. + + We need to reset the producer id and associated state when we have sent a batch to the broker, but we either get + a non-retriable exception or we run out of retries, or the batch expired in the producer queue after it was already + sent to the broker. + + In all of these cases, we don't know whether batch was actually committed on the broker, and hence whether the + sequence number was actually updated. If we don't reset the producer state, we risk the chance that all future + messages will return an OutOfOrderSequenceNumberError. + + Note that we can't reset the producer state for the transactional producer as this would mean bumping the epoch + for the same producer id. This might involve aborting the ongoing transaction during the initProducerIdRequest, + and the user would not have any way of knowing this happened. So for the transactional producer, + it's best to return the produce error to the user and let them abort the transaction and close the producer explicitly. + """ + with self._lock: + if self.is_transactional(): + raise Errors.IllegalStateError( + "Cannot reset producer state for a transactional producer." 
+ " You must either abort the ongoing transaction or" + " reinitialize the transactional producer instead") + self.set_producer_id_and_epoch(ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH)) + self._sequence_numbers.clear() + + def sequence_number(self, tp): + with self._lock: + return self._sequence_numbers[tp] + + def increment_sequence_number(self, tp, increment): + with self._lock: + if tp not in self._sequence_numbers: + raise Errors.IllegalStateError("Attempt to increment sequence number for a partition with no current sequence.") + # Sequence number wraps at java max int + base = self._sequence_numbers[tp] + if base > (2147483647 - increment): + self._sequence_numbers[tp] = increment - (2147483647 - base) - 1 + else: + self._sequence_numbers[tp] += increment + + def next_request_handler(self, has_incomplete_batches): + with self._lock: + if self._new_partitions_in_transaction: + self._enqueue_request(self._add_partitions_to_transaction_handler()) + + if not self._pending_requests: + return None + + _, _, next_request_handler = self._pending_requests[0] + # Do not send the EndTxn until all batches have been flushed + if isinstance(next_request_handler, EndTxnHandler) and has_incomplete_batches: + return None + + heapq.heappop(self._pending_requests) + if self._maybe_terminate_request_with_error(next_request_handler): + log.debug("Not sending transactional request %s because we are in an error state", + next_request_handler.request) + return None + + if isinstance(next_request_handler, EndTxnHandler) and not self._transaction_started: + next_request_handler.result.done() + if self._current_state != TransactionState.FATAL_ERROR: + log.debug("Not sending EndTxn for completed transaction since no partitions" + " or offsets were successfully added") + self._complete_transaction() + try: + _, _, next_request_handler = heapq.heappop(self._pending_requests) + except IndexError: + next_request_handler = None + + if next_request_handler: + log.debug("Request %s dequeued for sending", next_request_handler.request) + + return next_request_handler + + def retry(self, request): + with self._lock: + request.set_retry() + self._enqueue_request(request) + + def authentication_failed(self, exc): + with self._lock: + for _, _, request in self._pending_requests: + request.fatal_error(exc) + + def coordinator(self, coord_type): + if coord_type == 'group': + return self._consumer_group_coordinator + elif coord_type == 'transaction': + return self._transaction_coordinator + else: + raise Errors.IllegalStateError("Received an invalid coordinator type: %s" % (coord_type,)) + + def lookup_coordinator_for_request(self, request): + self._lookup_coordinator(request.coordinator_type, request.coordinator_key) + + def next_in_flight_request_correlation_id(self): + self._in_flight_request_correlation_id += 1 + return self._in_flight_request_correlation_id + + def clear_in_flight_transactional_request_correlation_id(self): + self._in_flight_request_correlation_id = self.NO_INFLIGHT_REQUEST_CORRELATION_ID + + def has_in_flight_transactional_request(self): + return self._in_flight_request_correlation_id != self.NO_INFLIGHT_REQUEST_CORRELATION_ID + + def has_fatal_error(self): + return self._current_state == TransactionState.FATAL_ERROR + + def has_abortable_error(self): + return self._current_state == TransactionState.ABORTABLE_ERROR + + # visible for testing + def _test_transaction_contains_partition(self, tp): + with self._lock: + return tp in self._partitions_in_transaction + + # visible for testing + def 
_test_has_pending_offset_commits(self): + return bool(self._pending_txn_offset_commits) + + # visible for testing + def _test_has_ongoing_transaction(self): + with self._lock: + # transactions are considered ongoing once started until completion or a fatal error + return self._current_state == TransactionState.IN_TRANSACTION or self.is_completing() or self.has_abortable_error() + + # visible for testing + def _test_is_ready(self): + with self._lock: + return self.is_transactional() and self._current_state == TransactionState.READY + + def _transition_to(self, target, error=None): + with self._lock: + if not self._current_state.is_transition_valid(self._current_state, target): + raise Errors.KafkaError("TransactionalId %s: Invalid transition attempted from state %s to state %s" % ( + self.transactional_id, self._current_state.name, target.name)) + + if target in (TransactionState.FATAL_ERROR, TransactionState.ABORTABLE_ERROR): + if error is None: + raise Errors.IllegalArgumentError("Cannot transition to %s with an None exception" % (target.name,)) + self._last_error = error + else: + self._last_error = None + + if self._last_error is not None: + log.debug("Transition from state %s to error state %s (%s)", self._current_state.name, target.name, self._last_error) + else: + log.debug("Transition from state %s to %s", self._current_state, target) + self._current_state = target + + def _ensure_transactional(self): + if not self.is_transactional(): + raise Errors.IllegalStateError("Transactional method invoked on a non-transactional producer.") + + def _maybe_fail_with_error(self): + if self.has_error(): + raise Errors.KafkaError("Cannot execute transactional method because we are in an error state: %s" % (self._last_error,)) + + def _maybe_terminate_request_with_error(self, request_handler): + if self.has_error(): + if self.has_abortable_error() and isinstance(request_handler, FindCoordinatorHandler): + # No harm letting the FindCoordinator request go through if we're expecting to abort + return False + request_handler.fail(self._last_error) + return True + return False + + def _next_pending_requests_sort_id(self): + self._pending_requests_sort_id += 1 + return self._pending_requests_sort_id + + def _enqueue_request(self, request_handler): + log.debug("Enqueuing transactional request %s", request_handler.request) + heapq.heappush( + self._pending_requests, + ( + request_handler.priority, # keep lowest priority at head of queue + self._next_pending_requests_sort_id(), # break ties + request_handler + ) + ) + + def _lookup_coordinator(self, coord_type, coord_key): + with self._lock: + if coord_type == 'group': + self._consumer_group_coordinator = None + elif coord_type == 'transaction': + self._transaction_coordinator = None + else: + raise Errors.IllegalStateError("Invalid coordinator type: %s" % (coord_type,)) + self._enqueue_request(FindCoordinatorHandler(self, coord_type, coord_key)) + + def _complete_transaction(self): + with self._lock: + self._transition_to(TransactionState.READY) + self._transaction_started = False + self._new_partitions_in_transaction.clear() + self._pending_partitions_in_transaction.clear() + self._partitions_in_transaction.clear() + + def _add_partitions_to_transaction_handler(self): + with self._lock: + self._pending_partitions_in_transaction.update(self._new_partitions_in_transaction) + self._new_partitions_in_transaction.clear() + return AddPartitionsToTxnHandler(self, self._pending_partitions_in_transaction) + + +class TransactionalRequestResult(object): + def 
__init__(self): + self._latch = threading.Event() + self._error = None + + def done(self, error=None): + self._error = error + self._latch.set() + + def wait(self, timeout_ms=None): + timeout = timeout_ms / 1000 if timeout_ms is not None else None + success = self._latch.wait(timeout) + if self._error: + raise self._error + return success + + @property + def is_done(self): + return self._latch.is_set() + + @property + def succeeded(self): + return self._latch.is_set() and self._error is None + + @property + def failed(self): + return self._latch.is_set() and self._error is not None + + @property + def exception(self): + return self._error + + +@six.add_metaclass(abc.ABCMeta) +class TxnRequestHandler(object): + def __init__(self, transaction_manager, result=None): + self.transaction_manager = transaction_manager + self.retry_backoff_ms = transaction_manager.retry_backoff_ms + self.request = None + self._result = result or TransactionalRequestResult() + self._is_retry = False + + @property + def transactional_id(self): + return self.transaction_manager.transactional_id + + @property + def producer_id(self): + return self.transaction_manager.producer_id_and_epoch.producer_id + + @property + def producer_epoch(self): + return self.transaction_manager.producer_id_and_epoch.epoch + + def fatal_error(self, exc): + self.transaction_manager._transition_to_fatal_error(exc) + self._result.done(error=exc) + + def abortable_error(self, exc): + self.transaction_manager._transition_to_abortable_error(exc) + self._result.done(error=exc) + + def fail(self, exc): + self._result.done(error=exc) + + def reenqueue(self): + with self.transaction_manager._lock: + self._is_retry = True + self.transaction_manager._enqueue_request(self) + + def on_complete(self, correlation_id, response_or_exc): + if correlation_id != self.transaction_manager._in_flight_request_correlation_id: + self.fatal_error(RuntimeError("Detected more than one in-flight transactional request.")) + else: + self.transaction_manager.clear_in_flight_transactional_request_correlation_id() + if isinstance(response_or_exc, Errors.KafkaConnectionError): + log.debug("Disconnected from node. 
Will retry.") + if self.needs_coordinator(): + self.transaction_manager._lookup_coordinator(self.coordinator_type, self.coordinator_key) + self.reenqueue() + elif isinstance(response_or_exc, Errors.UnsupportedVersionError): + self.fatal_error(response_or_exc) + elif not isinstance(response_or_exc, (Exception, type(None))): + log.debug("Received transactional response %s for request %s", response_or_exc, self.request) + with self.transaction_manager._lock: + self.handle_response(response_or_exc) + else: + self.fatal_error(Errors.KafkaError("Could not execute transactional request for unknown reasons: %s" % response_or_exc)) + + def needs_coordinator(self): + return self.coordinator_type is not None + + @property + def result(self): + return self._result + + @property + def coordinator_type(self): + return 'transaction' + + @property + def coordinator_key(self): + return self.transaction_manager.transactional_id + + def set_retry(self): + self._is_retry = True + + @property + def is_retry(self): + return self._is_retry + + @abc.abstractmethod + def handle_response(self, response): + pass + + @abc.abstractproperty + def priority(self): + pass + + +class InitProducerIdHandler(TxnRequestHandler): + def __init__(self, transaction_manager, transaction_timeout_ms): + super(InitProducerIdHandler, self).__init__(transaction_manager) + + if transaction_manager._api_version >= (2, 0): + version = 1 + else: + version = 0 + self.request = InitProducerIdRequest[version]( + transactional_id=self.transactional_id, + transaction_timeout_ms=transaction_timeout_ms) + + @property + def priority(self): + return Priority.INIT_PRODUCER_ID + + def handle_response(self, response): + error = Errors.for_code(response.error_code) + + if error is Errors.NoError: + self.transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch)) + self.transaction_manager._transition_to(TransactionState.READY) + self._result.done() + elif error in (Errors.NotCoordinatorError, Errors.CoordinatorNotAvailableError): + self.transaction_manager._lookup_coordinator('transaction', self.transactional_id) + self.reenqueue() + elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError): + self.reenqueue() + elif error is Errors.TransactionalIdAuthorizationFailedError: + self.fatal_error(error()) + else: + self.fatal_error(Errors.KafkaError("Unexpected error in InitProducerIdResponse: %s" % (error()))) + +class AddPartitionsToTxnHandler(TxnRequestHandler): + def __init__(self, transaction_manager, topic_partitions): + super(AddPartitionsToTxnHandler, self).__init__(transaction_manager) + + if transaction_manager._api_version >= (2, 7): + version = 2 + elif transaction_manager._api_version >= (2, 0): + version = 1 + else: + version = 0 + topic_data = collections.defaultdict(list) + for tp in topic_partitions: + topic_data[tp.topic].append(tp.partition) + self.request = AddPartitionsToTxnRequest[version]( + transactional_id=self.transactional_id, + producer_id=self.producer_id, + producer_epoch=self.producer_epoch, + topics=list(topic_data.items())) + + @property + def priority(self): + return Priority.ADD_PARTITIONS_OR_OFFSETS + + def handle_response(self, response): + has_partition_errors = False + unauthorized_topics = set() + self.retry_backoff_ms = self.transaction_manager.retry_backoff_ms + + results = {TopicPartition(topic, partition): Errors.for_code(error_code) + for topic, partition_data in response.results + for partition, error_code in partition_data} + + 
for tp, error in six.iteritems(results): + if error is Errors.NoError: + continue + elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError): + self.transaction_manager._lookup_coordinator('transaction', self.transactional_id) + self.reenqueue() + return + elif error is Errors.ConcurrentTransactionsError: + self.maybe_override_retry_backoff_ms() + self.reenqueue() + return + elif error in (Errors.CoordinatorLoadInProgressError, Errors.UnknownTopicOrPartitionError): + self.reenqueue() + return + elif error is Errors.InvalidProducerEpochError: + self.fatal_error(error()) + return + elif error is Errors.TransactionalIdAuthorizationFailedError: + self.fatal_error(error()) + return + elif error in (Errors.InvalidProducerIdMappingError, Errors.InvalidTxnStateError): + self.fatal_error(Errors.KafkaError(error())) + return + elif error is Errors.TopicAuthorizationFailedError: + unauthorized_topics.add(tp.topic) + elif error is Errors.OperationNotAttemptedError: + log.debug("Did not attempt to add partition %s to transaction because other partitions in the" + " batch had errors.", tp) + has_partition_errors = True + else: + log.error("Could not add partition %s due to unexpected error %s", tp, error()) + has_partition_errors = True + + partitions = set(results) + + # Remove the partitions from the pending set regardless of the result. We use the presence + # of partitions in the pending set to know when it is not safe to send batches. However, if + # the partitions failed to be added and we enter an error state, we expect the batches to be + # aborted anyway. In this case, we must be able to continue sending the batches which are in + # retry for partitions that were successfully added. + self.transaction_manager._pending_partitions_in_transaction -= partitions + + if unauthorized_topics: + self.abortable_error(Errors.TopicAuthorizationFailedError(unauthorized_topics)) + elif has_partition_errors: + self.abortable_error(Errors.KafkaError("Could not add partitions to transaction due to errors: %s" % (results))) + else: + log.debug("Successfully added partitions %s to transaction", partitions) + self.transaction_manager._partitions_in_transaction.update(partitions) + self.transaction_manager._transaction_started = True + self._result.done() + + def maybe_override_retry_backoff_ms(self): + # We only want to reduce the backoff when retrying the first AddPartition which errored out due to a + # CONCURRENT_TRANSACTIONS error since this means that the previous transaction is still completing and + # we don't want to wait too long before trying to start the new one. 
+ # + # This is only a temporary fix, the long term solution is being tracked in + # https://issues.apache.org/jira/browse/KAFKA-5482 + if not self.transaction_manager._partitions_in_transaction: + self.retry_backoff_ms = min(self.transaction_manager.ADD_PARTITIONS_RETRY_BACKOFF_MS, self.retry_backoff_ms) + + +class FindCoordinatorHandler(TxnRequestHandler): + def __init__(self, transaction_manager, coord_type, coord_key): + super(FindCoordinatorHandler, self).__init__(transaction_manager) + + self._coord_type = coord_type + self._coord_key = coord_key + if transaction_manager._api_version >= (2, 0): + version = 2 + else: + version = 1 + if coord_type == 'group': + coord_type_int8 = 0 + elif coord_type == 'transaction': + coord_type_int8 = 1 + else: + raise ValueError("Unrecognized coordinator type: %s" % (coord_type,)) + self.request = FindCoordinatorRequest[version]( + coordinator_key=coord_key, + coordinator_type=coord_type_int8, + ) + + @property + def priority(self): + return Priority.FIND_COORDINATOR + + @property + def coordinator_type(self): + return None + + @property + def coordinator_key(self): + return None + + def handle_response(self, response): + error = Errors.for_code(response.error_code) + + if error is Errors.NoError: + coordinator_id = self.transaction_manager._metadata.add_coordinator( + response, self._coord_type, self._coord_key) + if self._coord_type == 'group': + self.transaction_manager._consumer_group_coordinator = coordinator_id + elif self._coord_type == 'transaction': + self.transaction_manager._transaction_coordinator = coordinator_id + self._result.done() + elif error is Errors.CoordinatorNotAvailableError: + self.reenqueue() + elif error is Errors.TransactionalIdAuthorizationFailedError: + self.fatal_error(error()) + elif error is Errors.GroupAuthorizationFailedError: + self.abortable_error(error(self._coord_key)) + else: + self.fatal_error(Errors.KafkaError( + "Could not find a coordinator with type %s with key %s due to" + " unexpected error: %s" % (self._coord_type, self._coord_key, error()))) + + +class EndTxnHandler(TxnRequestHandler): + def __init__(self, transaction_manager, committed): + super(EndTxnHandler, self).__init__(transaction_manager) + + if self.transaction_manager._api_version >= (2, 7): + version = 2 + elif self.transaction_manager._api_version >= (2, 0): + version = 1 + else: + version = 0 + self.request = EndTxnRequest[version]( + transactional_id=self.transactional_id, + producer_id=self.producer_id, + producer_epoch=self.producer_epoch, + committed=committed) + + @property + def priority(self): + return Priority.END_TXN + + def handle_response(self, response): + error = Errors.for_code(response.error_code) + + if error is Errors.NoError: + self.transaction_manager._complete_transaction() + self._result.done() + elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError): + self.transaction_manager._lookup_coordinator('transaction', self.transactional_id) + self.reenqueue() + elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError): + self.reenqueue() + elif error is Errors.InvalidProducerEpochError: + self.fatal_error(error()) + elif error is Errors.TransactionalIdAuthorizationFailedError: + self.fatal_error(error()) + elif error is Errors.InvalidTxnStateError: + self.fatal_error(error()) + else: + self.fatal_error(Errors.KafkaError("Unhandled error in EndTxnResponse: %s" % (error()))) + + +class AddOffsetsToTxnHandler(TxnRequestHandler): + def __init__(self, transaction_manager, 
consumer_group_id, offsets): + super(AddOffsetsToTxnHandler, self).__init__(transaction_manager) + + self.consumer_group_id = consumer_group_id + self.offsets = offsets + if self.transaction_manager._api_version >= (2, 7): + version = 2 + elif self.transaction_manager._api_version >= (2, 0): + version = 1 + else: + version = 0 + self.request = AddOffsetsToTxnRequest[version]( + transactional_id=self.transactional_id, + producer_id=self.producer_id, + producer_epoch=self.producer_epoch, + group_id=consumer_group_id) + + @property + def priority(self): + return Priority.ADD_PARTITIONS_OR_OFFSETS + + def handle_response(self, response): + error = Errors.for_code(response.error_code) + + if error is Errors.NoError: + log.debug("Successfully added partition for consumer group %s to transaction", self.consumer_group_id) + + # note the result is not completed until the TxnOffsetCommit returns + for tp, offset in six.iteritems(self.offsets): + self.transaction_manager._pending_txn_offset_commits[tp] = offset + handler = TxnOffsetCommitHandler(self.transaction_manager, self.consumer_group_id, + self.transaction_manager._pending_txn_offset_commits, self._result) + self.transaction_manager._enqueue_request(handler) + self.transaction_manager._transaction_started = True + elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError): + self.transaction_manager._lookup_coordinator('transaction', self.transactional_id) + self.reenqueue() + elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError): + self.reenqueue() + elif error is Errors.InvalidProducerEpochError: + self.fatal_error(error()) + elif error is Errors.TransactionalIdAuthorizationFailedError: + self.fatal_error(error()) + elif error is Errors.GroupAuthorizationFailedError: + self.abortable_error(error(self.consumer_group_id)) + else: + self.fatal_error(Errors.KafkaError("Unexpected error in AddOffsetsToTxnResponse: %s" % (error()))) + + +class TxnOffsetCommitHandler(TxnRequestHandler): + def __init__(self, transaction_manager, consumer_group_id, offsets, result): + super(TxnOffsetCommitHandler, self).__init__(transaction_manager, result=result) + + self.consumer_group_id = consumer_group_id + self.offsets = offsets + self.request = self._build_request() + + def _build_request(self): + if self.transaction_manager._api_version >= (2, 1): + version = 2 + elif self.transaction_manager._api_version >= (2, 0): + version = 1 + else: + version = 0 + + topic_data = collections.defaultdict(list) + for tp, offset in six.iteritems(self.offsets): + if version >= 2: + partition_data = (tp.partition, offset.offset, offset.leader_epoch, offset.metadata) + else: + partition_data = (tp.partition, offset.offset, offset.metadata) + topic_data[tp.topic].append(partition_data) + + return TxnOffsetCommitRequest[version]( + transactional_id=self.transactional_id, + group_id=self.consumer_group_id, + producer_id=self.producer_id, + producer_epoch=self.producer_epoch, + topics=list(topic_data.items())) + + @property + def priority(self): + return Priority.ADD_PARTITIONS_OR_OFFSETS + + @property + def coordinator_type(self): + return 'group' + + @property + def coordinator_key(self): + return self.consumer_group_id + + def handle_response(self, response): + lookup_coordinator = False + retriable_failure = False + + errors = {TopicPartition(topic, partition): Errors.for_code(error_code) + for topic, partition_data in response.topics + for partition, error_code in partition_data} + + for tp, error in 
six.iteritems(errors):
+            if error is Errors.NoError:
+                log.debug("Successfully added offsets for %s from consumer group %s to transaction.",
+                          tp, self.consumer_group_id)
+                del self.transaction_manager._pending_txn_offset_commits[tp]
+            elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError, Errors.RequestTimedOutError):
+                retriable_failure = True
+                lookup_coordinator = True
+            elif error is Errors.UnknownTopicOrPartitionError:
+                retriable_failure = True
+            elif error is Errors.GroupAuthorizationFailedError:
+                self.abortable_error(error(self.consumer_group_id))
+                return
+            elif error in (Errors.TransactionalIdAuthorizationFailedError,
+                           Errors.InvalidProducerEpochError,
+                           Errors.UnsupportedForMessageFormatError):
+                self.fatal_error(error())
+                return
+            else:
+                self.fatal_error(Errors.KafkaError("Unexpected error in TxnOffsetCommitResponse: %s" % (error())))
+                return
+
+        if lookup_coordinator:
+            self.transaction_manager._lookup_coordinator('group', self.consumer_group_id)
+
+        if not retriable_failure:
+            # all attempted partitions were either successful, or there was a fatal failure.
+            # either way, we are not retrying, so complete the request.
+            self.result.done()
+
+        # retry the commits which failed with a retriable error.
+        elif self.transaction_manager._pending_txn_offset_commits:
+            self.offsets = self.transaction_manager._pending_txn_offset_commits
+            self.request = self._build_request()
+            self.reenqueue()
diff --git a/kafka/protocol/__init__.py b/kafka/protocol/__init__.py
index e739b5cb1..025447f99 100644
--- a/kafka/protocol/__init__.py
+++ b/kafka/protocol/__init__.py
@@ -43,5 +43,7 @@
     40: 'ExpireDelegationToken',
     41: 'DescribeDelegationToken',
     42: 'DeleteGroups',
+    45: 'AlterPartitionReassignments',
+    46: 'ListPartitionReassignments',
     48: 'DescribeClientQuotas',
 }
diff --git a/kafka/protocol/abstract.py b/kafka/protocol/abstract.py
index 2de65c4bb..7ce5fc18f 100644
--- a/kafka/protocol/abstract.py
+++ b/kafka/protocol/abstract.py
@@ -2,10 +2,11 @@
 
 import abc
 
+from kafka.vendor.six import add_metaclass
 
-class AbstractType(object):
-    __metaclass__ = abc.ABCMeta
 
+@add_metaclass(abc.ABCMeta)
+class AbstractType(object):
     @abc.abstractmethod
     def encode(cls, value): # pylint: disable=no-self-argument
         pass
diff --git a/kafka/protocol/add_offsets_to_txn.py b/kafka/protocol/add_offsets_to_txn.py
new file mode 100644
index 000000000..fa2509330
--- /dev/null
+++ b/kafka/protocol/add_offsets_to_txn.py
@@ -0,0 +1,59 @@
+from __future__ import absolute_import
+
+from kafka.protocol.api import Request, Response
+from kafka.protocol.types import Int16, Int32, Int64, Schema, String
+
+
+class AddOffsetsToTxnResponse_v0(Response):
+    API_KEY = 25
+    API_VERSION = 0
+    SCHEMA = Schema(
+        ('throttle_time_ms', Int32),
+        ('error_code', Int16),
+    )
+
+
+class AddOffsetsToTxnResponse_v1(Response):
+    API_KEY = 25
+    API_VERSION = 1
+    SCHEMA = AddOffsetsToTxnResponse_v0.SCHEMA
+
+
+class AddOffsetsToTxnResponse_v2(Response):
+    API_KEY = 25
+    API_VERSION = 2
+    SCHEMA = AddOffsetsToTxnResponse_v1.SCHEMA
+
+
+class AddOffsetsToTxnRequest_v0(Request):
+    API_KEY = 25
+    API_VERSION = 0
+    RESPONSE_TYPE = AddOffsetsToTxnResponse_v0
+    SCHEMA = Schema(
+        ('transactional_id', String('utf-8')),
+        ('producer_id', Int64),
+        ('producer_epoch', Int16),
+        ('group_id', String('utf-8')),
+    )
+
+
+class AddOffsetsToTxnRequest_v1(Request):
+    API_KEY = 25
+    API_VERSION = 1
+    RESPONSE_TYPE = AddOffsetsToTxnResponse_v1
+    SCHEMA = AddOffsetsToTxnRequest_v0.SCHEMA
+
+
+class AddOffsetsToTxnRequest_v2(Request):
+    API_KEY 
= 25 + API_VERSION = 2 + RESPONSE_TYPE = AddOffsetsToTxnResponse_v2 + SCHEMA = AddOffsetsToTxnRequest_v1.SCHEMA + + +AddOffsetsToTxnRequest = [ + AddOffsetsToTxnRequest_v0, AddOffsetsToTxnRequest_v1, AddOffsetsToTxnRequest_v2, +] +AddOffsetsToTxnResponse = [ + AddOffsetsToTxnResponse_v0, AddOffsetsToTxnResponse_v1, AddOffsetsToTxnResponse_v2, +] diff --git a/kafka/protocol/add_partitions_to_txn.py b/kafka/protocol/add_partitions_to_txn.py new file mode 100644 index 000000000..fdf28f4ae --- /dev/null +++ b/kafka/protocol/add_partitions_to_txn.py @@ -0,0 +1,63 @@ +from __future__ import absolute_import + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Array, Int16, Int32, Int64, Schema, String + + +class AddPartitionsToTxnResponse_v0(Response): + API_KEY = 24 + API_VERSION = 0 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('results', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('error_code', Int16)))))) + + +class AddPartitionsToTxnResponse_v1(Response): + API_KEY = 24 + API_VERSION = 1 + SCHEMA = AddPartitionsToTxnResponse_v0.SCHEMA + + +class AddPartitionsToTxnResponse_v2(Response): + API_KEY = 24 + API_VERSION = 2 + SCHEMA = AddPartitionsToTxnResponse_v1.SCHEMA + + +class AddPartitionsToTxnRequest_v0(Request): + API_KEY = 24 + API_VERSION = 0 + RESPONSE_TYPE = AddPartitionsToTxnResponse_v0 + SCHEMA = Schema( + ('transactional_id', String('utf-8')), + ('producer_id', Int64), + ('producer_epoch', Int16), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array(Int32))))) + + +class AddPartitionsToTxnRequest_v1(Request): + API_KEY = 24 + API_VERSION = 1 + RESPONSE_TYPE = AddPartitionsToTxnResponse_v1 + SCHEMA = AddPartitionsToTxnRequest_v0.SCHEMA + + +class AddPartitionsToTxnRequest_v2(Request): + API_KEY = 24 + API_VERSION = 2 + RESPONSE_TYPE = AddPartitionsToTxnResponse_v2 + SCHEMA = AddPartitionsToTxnRequest_v1.SCHEMA + + +AddPartitionsToTxnRequest = [ + AddPartitionsToTxnRequest_v0, AddPartitionsToTxnRequest_v1, AddPartitionsToTxnRequest_v2, +] +AddPartitionsToTxnResponse = [ + AddPartitionsToTxnResponse_v0, AddPartitionsToTxnResponse_v1, AddPartitionsToTxnResponse_v2, +] diff --git a/kafka/protocol/admin.py b/kafka/protocol/admin.py index 63a3327a6..255166801 100644 --- a/kafka/protocol/admin.py +++ b/kafka/protocol/admin.py @@ -1,67 +1,14 @@ from __future__ import absolute_import -from kafka.protocol.api import Request, Response -from kafka.protocol.types import Array, Boolean, Bytes, Int8, Int16, Int32, Int64, Schema, String, Float64 - - -class ApiVersionResponse_v0(Response): - API_KEY = 18 - API_VERSION = 0 - SCHEMA = Schema( - ('error_code', Int16), - ('api_versions', Array( - ('api_key', Int16), - ('min_version', Int16), - ('max_version', Int16))) - ) - - -class ApiVersionResponse_v1(Response): - API_KEY = 18 - API_VERSION = 1 - SCHEMA = Schema( - ('error_code', Int16), - ('api_versions', Array( - ('api_key', Int16), - ('min_version', Int16), - ('max_version', Int16))), - ('throttle_time_ms', Int32) - ) - - -class ApiVersionResponse_v2(Response): - API_KEY = 18 - API_VERSION = 2 - SCHEMA = ApiVersionResponse_v1.SCHEMA - - -class ApiVersionRequest_v0(Request): - API_KEY = 18 - API_VERSION = 0 - RESPONSE_TYPE = ApiVersionResponse_v0 - SCHEMA = Schema() +# enum in stdlib as of py3.4 +try: + from enum import IntEnum # pylint: disable=import-error +except ImportError: + # vendored backport module + from kafka.vendor.enum34 import IntEnum - -class ApiVersionRequest_v1(Request): - API_KEY 
= 18 - API_VERSION = 1 - RESPONSE_TYPE = ApiVersionResponse_v1 - SCHEMA = ApiVersionRequest_v0.SCHEMA - - -class ApiVersionRequest_v2(Request): - API_KEY = 18 - API_VERSION = 2 - RESPONSE_TYPE = ApiVersionResponse_v1 - SCHEMA = ApiVersionRequest_v0.SCHEMA - - -ApiVersionRequest = [ - ApiVersionRequest_v0, ApiVersionRequest_v1, ApiVersionRequest_v2, -] -ApiVersionResponse = [ - ApiVersionResponse_v0, ApiVersionResponse_v1, ApiVersionResponse_v2, -] +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Array, Boolean, Bytes, Int8, Int16, Int32, Int64, Schema, String, Float64, CompactString, CompactArray, TaggedFields class CreateTopicsResponse_v0(Response): @@ -239,6 +186,38 @@ class DeleteTopicsRequest_v3(Request): ] +class DeleteRecordsResponse_v0(Response): + API_KEY = 21 + API_VERSION = 0 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('topics', Array( + ('name', String('utf-8')), + ('partitions', Array( + ('partition_index', Int32), + ('low_watermark', Int64), + ('error_code', Int16))))), + ) + + +class DeleteRecordsRequest_v0(Request): + API_KEY = 21 + API_VERSION = 0 + RESPONSE_TYPE = DeleteRecordsResponse_v0 + SCHEMA = Schema( + ('topics', Array( + ('name', String('utf-8')), + ('partitions', Array( + ('partition_index', Int32), + ('offset', Int64))))), + ('timeout_ms', Int32) + ) + + +DeleteRecordsResponse = [DeleteRecordsResponse_v0] +DeleteRecordsRequest = [DeleteRecordsRequest_v0] + + class ListGroupsResponse_v0(Response): API_KEY = 16 API_VERSION = 0 @@ -406,41 +385,6 @@ class DescribeGroupsRequest_v3(Request): ] -class SaslHandShakeResponse_v0(Response): - API_KEY = 17 - API_VERSION = 0 - SCHEMA = Schema( - ('error_code', Int16), - ('enabled_mechanisms', Array(String('utf-8'))) - ) - - -class SaslHandShakeResponse_v1(Response): - API_KEY = 17 - API_VERSION = 1 - SCHEMA = SaslHandShakeResponse_v0.SCHEMA - - -class SaslHandShakeRequest_v0(Request): - API_KEY = 17 - API_VERSION = 0 - RESPONSE_TYPE = SaslHandShakeResponse_v0 - SCHEMA = Schema( - ('mechanism', String('utf-8')) - ) - - -class SaslHandShakeRequest_v1(Request): - API_KEY = 17 - API_VERSION = 1 - RESPONSE_TYPE = SaslHandShakeResponse_v1 - SCHEMA = SaslHandShakeRequest_v0.SCHEMA - - -SaslHandShakeRequest = [SaslHandShakeRequest_v0, SaslHandShakeRequest_v1] -SaslHandShakeResponse = [SaslHandShakeResponse_v0, SaslHandShakeResponse_v1] - - class DescribeAclsResponse_v0(Response): API_KEY = 29 API_VERSION = 0 @@ -523,8 +467,8 @@ class DescribeAclsRequest_v2(Request): SCHEMA = DescribeAclsRequest_v1.SCHEMA -DescribeAclsRequest = [DescribeAclsRequest_v0, DescribeAclsRequest_v1] -DescribeAclsResponse = [DescribeAclsResponse_v0, DescribeAclsResponse_v1] +DescribeAclsRequest = [DescribeAclsRequest_v0, DescribeAclsRequest_v1, DescribeAclsRequest_v2] +DescribeAclsResponse = [DescribeAclsResponse_v0, DescribeAclsResponse_v1, DescribeAclsResponse_v2] class CreateAclsResponse_v0(Response): API_KEY = 30 @@ -719,7 +663,7 @@ class DescribeConfigsResponse_v1(Response): ('config_names', String('utf-8')), ('config_value', String('utf-8')), ('read_only', Boolean), - ('is_default', Boolean), + ('config_source', Int8), ('is_sensitive', Boolean), ('config_synonyms', Array( ('config_name', String('utf-8')), @@ -790,6 +734,47 @@ class DescribeConfigsRequest_v2(Request): ] +class DescribeLogDirsResponse_v0(Response): + API_KEY = 35 + API_VERSION = 0 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('log_dirs', Array( + ('error_code', Int16), + ('log_dir', String('utf-8')), + ('topics', Array( + ('name', 
String('utf-8')), + ('partitions', Array( + ('partition_index', Int32), + ('partition_size', Int64), + ('offset_lag', Int64), + ('is_future_key', Boolean) + )) + )) + )) + ) + + +class DescribeLogDirsRequest_v0(Request): + API_KEY = 35 + API_VERSION = 0 + RESPONSE_TYPE = DescribeLogDirsResponse_v0 + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Int32) + )) + ) + + +DescribeLogDirsResponse = [ + DescribeLogDirsResponse_v0, +] +DescribeLogDirsRequest = [ + DescribeLogDirsRequest_v0, +] + + class SaslAuthenticateResponse_v0(Response): API_KEY = 36 API_VERSION = 0 @@ -925,7 +910,7 @@ class DeleteGroupsRequest_v1(Request): ] -class DescribeClientQuotasResponse_v0(Request): +class DescribeClientQuotasResponse_v0(Response): API_KEY = 48 API_VERSION = 0 SCHEMA = Schema( @@ -963,3 +948,168 @@ class DescribeClientQuotasRequest_v0(Request): DescribeClientQuotasResponse = [ DescribeClientQuotasResponse_v0, ] + + +class AlterPartitionReassignmentsResponse_v0(Response): + API_KEY = 45 + API_VERSION = 0 + SCHEMA = Schema( + ("throttle_time_ms", Int32), + ("error_code", Int16), + ("error_message", CompactString("utf-8")), + ("responses", CompactArray( + ("name", CompactString("utf-8")), + ("partitions", CompactArray( + ("partition_index", Int32), + ("error_code", Int16), + ("error_message", CompactString("utf-8")), + ("tags", TaggedFields) + )), + ("tags", TaggedFields) + )), + ("tags", TaggedFields) + ) + FLEXIBLE_VERSION = True + + +class AlterPartitionReassignmentsRequest_v0(Request): + FLEXIBLE_VERSION = True + API_KEY = 45 + API_VERSION = 0 + RESPONSE_TYPE = AlterPartitionReassignmentsResponse_v0 + SCHEMA = Schema( + ("timeout_ms", Int32), + ("topics", CompactArray( + ("name", CompactString("utf-8")), + ("partitions", CompactArray( + ("partition_index", Int32), + ("replicas", CompactArray(Int32)), + ("tags", TaggedFields) + )), + ("tags", TaggedFields) + )), + ("tags", TaggedFields) + ) + + +AlterPartitionReassignmentsRequest = [AlterPartitionReassignmentsRequest_v0] + +AlterPartitionReassignmentsResponse = [AlterPartitionReassignmentsResponse_v0] + + +class ListPartitionReassignmentsResponse_v0(Response): + API_KEY = 46 + API_VERSION = 0 + SCHEMA = Schema( + ("throttle_time_ms", Int32), + ("error_code", Int16), + ("error_message", CompactString("utf-8")), + ("topics", CompactArray( + ("name", CompactString("utf-8")), + ("partitions", CompactArray( + ("partition_index", Int32), + ("replicas", CompactArray(Int32)), + ("adding_replicas", CompactArray(Int32)), + ("removing_replicas", CompactArray(Int32)), + ("tags", TaggedFields) + )), + ("tags", TaggedFields) + )), + ("tags", TaggedFields) + ) + FLEXIBLE_VERSION = True + + +class ListPartitionReassignmentsRequest_v0(Request): + FLEXIBLE_VERSION = True + API_KEY = 46 + API_VERSION = 0 + RESPONSE_TYPE = ListPartitionReassignmentsResponse_v0 + SCHEMA = Schema( + ("timeout_ms", Int32), + ("topics", CompactArray( + ("name", CompactString("utf-8")), + ("partition_index", CompactArray(Int32)), + ("tags", TaggedFields) + )), + ("tags", TaggedFields) + ) + + +ListPartitionReassignmentsRequest = [ListPartitionReassignmentsRequest_v0] + +ListPartitionReassignmentsResponse = [ListPartitionReassignmentsResponse_v0] + + +class ElectLeadersResponse_v0(Response): + API_KEY = 43 + API_VERSION = 1 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('error_code', Int16), + ('replication_election_results', Array( + ('topic', String('utf-8')), + ('partition_result', Array( + ('partition_id', Int32), + ('error_code', Int16), + 
('error_message', String('utf-8')) + )) + )) + ) + + +class ElectLeadersRequest_v0(Request): + API_KEY = 43 + API_VERSION = 1 + RESPONSE_TYPE = ElectLeadersResponse_v0 + SCHEMA = Schema( + ('election_type', Int8), + ('topic_partitions', Array( + ('topic', String('utf-8')), + ('partition_ids', Array(Int32)) + )), + ('timeout', Int32), + ) + + +class ElectLeadersResponse_v1(Response): + API_KEY = 43 + API_VERSION = 1 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('error_code', Int16), + ('replication_election_results', Array( + ('topic', String('utf-8')), + ('partition_result', Array( + ('partition_id', Int32), + ('error_code', Int16), + ('error_message', String('utf-8')) + )) + )) + ) + + +class ElectLeadersRequest_v1(Request): + API_KEY = 43 + API_VERSION = 1 + RESPONSE_TYPE = ElectLeadersResponse_v1 + SCHEMA = Schema( + ('election_type', Int8), + ('topic_partitions', Array( + ('topic', String('utf-8')), + ('partition_ids', Array(Int32)) + )), + ('timeout', Int32), + ) + + +class ElectionType(IntEnum): + """ Leader election type + """ + + PREFERRED = 0, + UNCLEAN = 1 + + +ElectLeadersRequest = [ElectLeadersRequest_v0, ElectLeadersRequest_v1] +ElectLeadersResponse = [ElectLeadersResponse_v0, ElectLeadersResponse_v1] diff --git a/kafka/protocol/api.py b/kafka/protocol/api.py index 64276fc17..9cd5767c1 100644 --- a/kafka/protocol/api.py +++ b/kafka/protocol/api.py @@ -3,7 +3,9 @@ import abc from kafka.protocol.struct import Struct -from kafka.protocol.types import Int16, Int32, String, Schema, Array +from kafka.protocol.types import Int16, Int32, String, Schema, Array, TaggedFields + +from kafka.vendor.six import add_metaclass class RequestHeader(Struct): @@ -20,8 +22,38 @@ def __init__(self, request, correlation_id=0, client_id='kafka-python'): ) +class RequestHeaderV2(Struct): + # Flexible response / request headers end in field buffer + SCHEMA = Schema( + ('api_key', Int16), + ('api_version', Int16), + ('correlation_id', Int32), + ('client_id', String('utf-8')), + ('tags', TaggedFields), + ) + + def __init__(self, request, correlation_id=0, client_id='kafka-python', tags=None): + super(RequestHeaderV2, self).__init__( + request.API_KEY, request.API_VERSION, correlation_id, client_id, tags or {} + ) + + +class ResponseHeader(Struct): + SCHEMA = Schema( + ('correlation_id', Int32), + ) + + +class ResponseHeaderV2(Struct): + SCHEMA = Schema( + ('correlation_id', Int32), + ('tags', TaggedFields), + ) + + +@add_metaclass(abc.ABCMeta) class Request(Struct): - __metaclass__ = abc.ABCMeta + FLEXIBLE_VERSION = False @abc.abstractproperty def API_KEY(self): @@ -50,9 +82,15 @@ def expect_response(self): def to_object(self): return _to_object(self.SCHEMA, self) + def build_header(self, correlation_id, client_id): + if self.FLEXIBLE_VERSION: + return RequestHeaderV2(self, correlation_id=correlation_id, client_id=client_id) + return RequestHeader(self, correlation_id=correlation_id, client_id=client_id) + +@add_metaclass(abc.ABCMeta) class Response(Struct): - __metaclass__ = abc.ABCMeta + FLEXIBLE_VERSION = False @abc.abstractproperty def API_KEY(self): @@ -72,6 +110,12 @@ def SCHEMA(self): def to_object(self): return _to_object(self.SCHEMA, self) + @classmethod + def parse_header(cls, read_buffer): + if cls.FLEXIBLE_VERSION: + return ResponseHeaderV2.decode(read_buffer) + return ResponseHeader.decode(read_buffer) + def _to_object(schema, data): obj = {} diff --git a/kafka/protocol/api_versions.py b/kafka/protocol/api_versions.py new file mode 100644 index 000000000..e7cedd954 --- /dev/null +++ 
b/kafka/protocol/api_versions.py @@ -0,0 +1,134 @@ +from __future__ import absolute_import + +from io import BytesIO + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Array, CompactArray, CompactString, Int16, Int32, Schema, TaggedFields + + +class BaseApiVersionsResponse(Response): + API_KEY = 18 + API_VERSION = 0 + SCHEMA = Schema( + ('error_code', Int16), + ('api_versions', Array( + ('api_key', Int16), + ('min_version', Int16), + ('max_version', Int16))) + ) + + @classmethod + def decode(cls, data): + if isinstance(data, bytes): + data = BytesIO(data) + # Check error_code, decode as v0 if any error + curr = data.tell() + err = Int16.decode(data) + data.seek(curr) + if err != 0: + return ApiVersionsResponse_v0.decode(data) + return super(BaseApiVersionsResponse, cls).decode(data) + + +class ApiVersionsResponse_v0(Response): + API_KEY = 18 + API_VERSION = 0 + SCHEMA = Schema( + ('error_code', Int16), + ('api_versions', Array( + ('api_key', Int16), + ('min_version', Int16), + ('max_version', Int16))) + ) + + +class ApiVersionsResponse_v1(BaseApiVersionsResponse): + API_KEY = 18 + API_VERSION = 1 + SCHEMA = Schema( + ('error_code', Int16), + ('api_versions', Array( + ('api_key', Int16), + ('min_version', Int16), + ('max_version', Int16))), + ('throttle_time_ms', Int32) + ) + + +class ApiVersionsResponse_v2(BaseApiVersionsResponse): + API_KEY = 18 + API_VERSION = 2 + SCHEMA = ApiVersionsResponse_v1.SCHEMA + + +class ApiVersionsResponse_v3(BaseApiVersionsResponse): + API_KEY = 18 + API_VERSION = 3 + SCHEMA = Schema( + ('error_code', Int16), + ('api_versions', CompactArray( + ('api_key', Int16), + ('min_version', Int16), + ('max_version', Int16), + ('_tagged_fields', TaggedFields))), + ('throttle_time_ms', Int32), + ('_tagged_fields', TaggedFields) + ) + # Note: ApiVersions Response does not send FLEXIBLE_VERSION header! 
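The error-code fallback in BaseApiVersionsResponse.decode above is easiest to see with a small round trip: a broker that does not understand a newer ApiVersionsRequest answers with error_code 35 (UNSUPPORTED_VERSION), and because the fallback parses any non-zero-error reply with the v0 schema, the client can still read the response and retry with an older request version. A minimal sketch, illustrative only and not part of the patched module, assuming only the classes defined in this file:

# Illustrative sketch -- fakes a broker rejection and decodes it with the
# newest (flexible) response class to show the v0 fallback path.
from kafka.protocol.api_versions import ApiVersionsResponse_v0, ApiVersionsResponse_v3

# Pretend these are the wire bytes of a broker error reply (35 = UNSUPPORTED_VERSION).
raw = ApiVersionsResponse_v0(error_code=35, api_versions=[]).encode()

# Decoding with the newer class still succeeds: the non-zero error_code
# short-circuits to the v0 schema instead of attempting the compact v3 layout.
parsed = ApiVersionsResponse_v3.decode(raw)
assert isinstance(parsed, ApiVersionsResponse_v0)
assert parsed.error_code == 35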
+ + +class ApiVersionsResponse_v4(BaseApiVersionsResponse): + API_KEY = 18 + API_VERSION = 4 + SCHEMA = ApiVersionsResponse_v3.SCHEMA + + +class ApiVersionsRequest_v0(Request): + API_KEY = 18 + API_VERSION = 0 + RESPONSE_TYPE = ApiVersionsResponse_v0 + SCHEMA = Schema() + + +class ApiVersionsRequest_v1(Request): + API_KEY = 18 + API_VERSION = 1 + RESPONSE_TYPE = ApiVersionsResponse_v1 + SCHEMA = ApiVersionsRequest_v0.SCHEMA + + +class ApiVersionsRequest_v2(Request): + API_KEY = 18 + API_VERSION = 2 + RESPONSE_TYPE = ApiVersionsResponse_v2 + SCHEMA = ApiVersionsRequest_v1.SCHEMA + + +class ApiVersionsRequest_v3(Request): + API_KEY = 18 + API_VERSION = 3 + RESPONSE_TYPE = ApiVersionsResponse_v3 + SCHEMA = Schema( + ('client_software_name', CompactString('utf-8')), + ('client_software_version', CompactString('utf-8')), + ('_tagged_fields', TaggedFields) + ) + FLEXIBLE_VERSION = True + + +class ApiVersionsRequest_v4(Request): + API_KEY = 18 + API_VERSION = 4 + RESPONSE_TYPE = ApiVersionsResponse_v4 + SCHEMA = ApiVersionsRequest_v3.SCHEMA + FLEXIBLE_VERSION = True + + +ApiVersionsRequest = [ + ApiVersionsRequest_v0, ApiVersionsRequest_v1, ApiVersionsRequest_v2, + ApiVersionsRequest_v3, ApiVersionsRequest_v4, +] +ApiVersionsResponse = [ + ApiVersionsResponse_v0, ApiVersionsResponse_v1, ApiVersionsResponse_v2, + ApiVersionsResponse_v3, ApiVersionsResponse_v4, +] diff --git a/kafka/protocol/broker_api_versions.py b/kafka/protocol/broker_api_versions.py new file mode 100644 index 000000000..af142d07c --- /dev/null +++ b/kafka/protocol/broker_api_versions.py @@ -0,0 +1,68 @@ +BROKER_API_VERSIONS = { + # api_versions responses prior to (0, 10) are synthesized for compatibility + (0, 8, 0): {0: (0, 0), 1: (0, 0), 2: (0, 0), 3: (0, 0)}, + # adds offset commit + fetch + (0, 8, 1): {0: (0, 0), 1: (0, 0), 2: (0, 0), 3: (0, 0), 8: (0, 0), 9: (0, 0)}, + # adds find coordinator + (0, 8, 2): {0: (0, 0), 1: (0, 0), 2: (0, 0), 3: (0, 0), 8: (0, 1), 9: (0, 1), 10: (0, 0)}, + # adds group management (join/sync/leave/heartbeat) + (0, 9): {0: (0, 1), 1: (0, 1), 2: (0, 0), 3: (0, 0), 8: (0, 2), 9: (0, 1), 10: (0, 0), 11: (0, 0), 12: (0, 0), 13: (0, 0), 14: (0, 0), 15: (0, 0), 16: (0, 0)}, + # adds message format v1, sasl, and api versions api + (0, 10, 0): {0: (0, 2), 1: (0, 2), 2: (0, 0), 3: (0, 1), 4: (0, 0), 5: (0, 0), 6: (0, 2), 7: (1, 1), 8: (0, 2), 9: (0, 1), 10: (0, 0), 11: (0, 0), 12: (0, 0), 13: (0, 0), 14: (0, 0), 15: (0, 0), 16: (0, 0), 17: (0, 0), 18: (0, 0)}, + + # All data below is copied from brokers via api_versions_response (see make servers/*/api_versions) + # adds admin apis create/delete topics, and bumps fetch/listoffsets/metadata/joingroup + (0, 10, 1): {0: (0, 2), 1: (0, 3), 2: (0, 1), 3: (0, 2), 4: (0, 0), 5: (0, 0), 6: (0, 2), 7: (1, 1), 8: (0, 2), 9: (0, 1), 10: (0, 0), 11: (0, 1), 12: (0, 0), 13: (0, 0), 14: (0, 0), 15: (0, 0), 16: (0, 0), 17: (0, 0), 18: (0, 0), 19: (0, 0), 20: (0, 0)}, + + # bumps offsetfetch/create-topics + (0, 10, 2): {0: (0, 2), 1: (0, 3), 2: (0, 1), 3: (0, 2), 4: (0, 0), 5: (0, 0), 6: (0, 3), 7: (1, 1), 8: (0, 2), 9: (0, 2), 10: (0, 0), 11: (0, 1), 12: (0, 0), 13: (0, 0), 14: (0, 0), 15: (0, 0), 16: (0, 0), 17: (0, 0), 18: (0, 0), 19: (0, 1), 20: (0, 0)}, + + # Adds message format v2, and more admin apis (describe/create/delete acls, describe/alter configs, etc) + (0, 11): {0: (0, 3), 1: (0, 5), 2: (0, 2), 3: (0, 4), 4: (0, 0), 5: (0, 0), 6: (0, 3), 7: (1, 1), 8: (0, 3), 9: (0, 3), 10: (0, 1), 11: (0, 2), 12: (0, 1), 13: (0, 1), 14: (0, 1), 15: (0, 1), 16: (0, 1), 
17: (0, 0), 18: (0, 1), 19: (0, 2), 20: (0, 1), 21: (0, 0), 22: (0, 0), 23: (0, 0), 24: (0, 0), 25: (0, 0), 26: (0, 0), 27: (0, 0), 28: (0, 0), 29: (0, 0), 30: (0, 0), 31: (0, 0), 32: (0, 0), 33: (0, 0)}, + + # Adds Sasl Authenticate, and additional admin apis (describe/alter log dirs, etc) + (1, 0): {0: (0, 5), 1: (0, 6), 2: (0, 2), 3: (0, 5), 4: (0, 1), 5: (0, 0), 6: (0, 4), 7: (0, 1), 8: (0, 3), 9: (0, 3), 10: (0, 1), 11: (0, 2), 12: (0, 1), 13: (0, 1), 14: (0, 1), 15: (0, 1), 16: (0, 1), 17: (0, 1), 18: (0, 1), 19: (0, 2), 20: (0, 1), 21: (0, 0), 22: (0, 0), 23: (0, 0), 24: (0, 0), 25: (0, 0), 26: (0, 0), 27: (0, 0), 28: (0, 0), 29: (0, 0), 30: (0, 0), 31: (0, 0), 32: (0, 0), 33: (0, 0), 34: (0, 0), 35: (0, 0), 36: (0, 0), 37: (0, 0)}, + + (1, 1): {0: (0, 5), 1: (0, 7), 2: (0, 2), 3: (0, 5), 4: (0, 1), 5: (0, 0), 6: (0, 4), 7: (0, 1), 8: (0, 3), 9: (0, 3), 10: (0, 1), 11: (0, 2), 12: (0, 1), 13: (0, 1), 14: (0, 1), 15: (0, 1), 16: (0, 1), 17: (0, 1), 18: (0, 1), 19: (0, 2), 20: (0, 1), 21: (0, 0), 22: (0, 0), 23: (0, 0), 24: (0, 0), 25: (0, 0), 26: (0, 0), 27: (0, 0), 28: (0, 0), 29: (0, 0), 30: (0, 0), 31: (0, 0), 32: (0, 1), 33: (0, 0), 34: (0, 0), 35: (0, 0), 36: (0, 0), 37: (0, 0), 38: (0, 0), 39: (0, 0), 40: (0, 0), 41: (0, 0), 42: (0, 0)}, + + (2, 0): {0: (0, 6), 1: (0, 8), 2: (0, 3), 3: (0, 6), 4: (0, 1), 5: (0, 0), 6: (0, 4), 7: (0, 1), 8: (0, 4), 9: (0, 4), 10: (0, 2), 11: (0, 3), 12: (0, 2), 13: (0, 2), 14: (0, 2), 15: (0, 2), 16: (0, 2), 17: (0, 1), 18: (0, 2), 19: (0, 3), 20: (0, 2), 21: (0, 1), 22: (0, 1), 23: (0, 1), 24: (0, 1), 25: (0, 1), 26: (0, 1), 27: (0, 0), 28: (0, 1), 29: (0, 1), 30: (0, 1), 31: (0, 1), 32: (0, 2), 33: (0, 1), 34: (0, 1), 35: (0, 1), 36: (0, 0), 37: (0, 1), 38: (0, 1), 39: (0, 1), 40: (0, 1), 41: (0, 1), 42: (0, 1)}, + + (2, 1): {0: (0, 7), 1: (0, 10), 2: (0, 4), 3: (0, 7), 4: (0, 1), 5: (0, 0), 6: (0, 4), 7: (0, 1), 8: (0, 6), 9: (0, 5), 10: (0, 2), 11: (0, 3), 12: (0, 2), 13: (0, 2), 14: (0, 2), 15: (0, 2), 16: (0, 2), 17: (0, 1), 18: (0, 2), 19: (0, 3), 20: (0, 3), 21: (0, 1), 22: (0, 1), 23: (0, 2), 24: (0, 1), 25: (0, 1), 26: (0, 1), 27: (0, 0), 28: (0, 2), 29: (0, 1), 30: (0, 1), 31: (0, 1), 32: (0, 2), 33: (0, 1), 34: (0, 1), 35: (0, 1), 36: (0, 0), 37: (0, 1), 38: (0, 1), 39: (0, 1), 40: (0, 1), 41: (0, 1), 42: (0, 1)}, + + (2, 2): {0: (0, 7), 1: (0, 10), 2: (0, 5), 3: (0, 7), 4: (0, 2), 5: (0, 1), 6: (0, 5), 7: (0, 2), 8: (0, 6), 9: (0, 5), 10: (0, 2), 11: (0, 4), 12: (0, 2), 13: (0, 2), 14: (0, 2), 15: (0, 2), 16: (0, 2), 17: (0, 1), 18: (0, 2), 19: (0, 3), 20: (0, 3), 21: (0, 1), 22: (0, 1), 23: (0, 2), 24: (0, 1), 25: (0, 1), 26: (0, 1), 27: (0, 0), 28: (0, 2), 29: (0, 1), 30: (0, 1), 31: (0, 1), 32: (0, 2), 33: (0, 1), 34: (0, 1), 35: (0, 1), 36: (0, 1), 37: (0, 1), 38: (0, 1), 39: (0, 1), 40: (0, 1), 41: (0, 1), 42: (0, 1), 43: (0, 0)}, + + (2, 3): {0: (0, 7), 1: (0, 11), 2: (0, 5), 3: (0, 8), 4: (0, 2), 5: (0, 1), 6: (0, 5), 7: (0, 2), 8: (0, 7), 9: (0, 5), 10: (0, 2), 11: (0, 5), 12: (0, 3), 13: (0, 2), 14: (0, 3), 15: (0, 3), 16: (0, 2), 17: (0, 1), 18: (0, 2), 19: (0, 3), 20: (0, 3), 21: (0, 1), 22: (0, 1), 23: (0, 3), 24: (0, 1), 25: (0, 1), 26: (0, 1), 27: (0, 0), 28: (0, 2), 29: (0, 1), 30: (0, 1), 31: (0, 1), 32: (0, 2), 33: (0, 1), 34: (0, 1), 35: (0, 1), 36: (0, 1), 37: (0, 1), 38: (0, 1), 39: (0, 1), 40: (0, 1), 41: (0, 1), 42: (0, 1), 43: (0, 0), 44: (0, 0)}, + + (2, 4): {0: (0, 8), 1: (0, 11), 2: (0, 5), 3: (0, 9), 4: (0, 4), 5: (0, 2), 6: (0, 6), 7: (0, 3), 8: (0, 8), 9: (0, 6), 10: (0, 3), 11: (0, 6), 12: (0, 4), 13: 
(0, 4), 14: (0, 4), 15: (0, 5), 16: (0, 3), 17: (0, 1), 18: (0, 3), 19: (0, 5), 20: (0, 4), 21: (0, 1), 22: (0, 2), 23: (0, 3), 24: (0, 1), 25: (0, 1), 26: (0, 1), 27: (0, 0), 28: (0, 2), 29: (0, 1), 30: (0, 1), 31: (0, 1), 32: (0, 2), 33: (0, 1), 34: (0, 1), 35: (0, 1), 36: (0, 1), 37: (0, 1), 38: (0, 2), 39: (0, 1), 40: (0, 1), 41: (0, 1), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0)}, + + (2, 5): {0: (0, 8), 1: (0, 11), 2: (0, 5), 3: (0, 9), 4: (0, 4), 5: (0, 2), 6: (0, 6), 7: (0, 3), 8: (0, 8), 9: (0, 7), 10: (0, 3), 11: (0, 7), 12: (0, 4), 13: (0, 4), 14: (0, 5), 15: (0, 5), 16: (0, 3), 17: (0, 1), 18: (0, 3), 19: (0, 5), 20: (0, 4), 21: (0, 1), 22: (0, 3), 23: (0, 3), 24: (0, 1), 25: (0, 1), 26: (0, 1), 27: (0, 0), 28: (0, 3), 29: (0, 2), 30: (0, 2), 31: (0, 2), 32: (0, 2), 33: (0, 1), 34: (0, 1), 35: (0, 1), 36: (0, 2), 37: (0, 2), 38: (0, 2), 39: (0, 2), 40: (0, 2), 41: (0, 2), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0)}, + + (2, 6): {0: (0, 8), 1: (0, 11), 2: (0, 5), 3: (0, 9), 4: (0, 4), 5: (0, 3), 6: (0, 6), 7: (0, 3), 8: (0, 8), 9: (0, 7), 10: (0, 3), 11: (0, 7), 12: (0, 4), 13: (0, 4), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 5), 20: (0, 4), 21: (0, 2), 22: (0, 3), 23: (0, 3), 24: (0, 1), 25: (0, 1), 26: (0, 1), 27: (0, 0), 28: (0, 3), 29: (0, 2), 30: (0, 2), 31: (0, 2), 32: (0, 3), 33: (0, 1), 34: (0, 1), 35: (0, 2), 36: (0, 2), 37: (0, 2), 38: (0, 2), 39: (0, 2), 40: (0, 2), 41: (0, 2), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 0), 49: (0, 0)}, + + (2, 7): {0: (0, 8), 1: (0, 12), 2: (0, 5), 3: (0, 9), 4: (0, 4), 5: (0, 3), 6: (0, 6), 7: (0, 3), 8: (0, 8), 9: (0, 7), 10: (0, 3), 11: (0, 7), 12: (0, 4), 13: (0, 4), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 6), 20: (0, 5), 21: (0, 2), 22: (0, 4), 23: (0, 3), 24: (0, 2), 25: (0, 2), 26: (0, 2), 27: (0, 0), 28: (0, 3), 29: (0, 2), 30: (0, 2), 31: (0, 2), 32: (0, 3), 33: (0, 1), 34: (0, 1), 35: (0, 2), 36: (0, 2), 37: (0, 3), 38: (0, 2), 39: (0, 2), 40: (0, 2), 41: (0, 2), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 0), 49: (0, 0), 50: (0, 0), 51: (0, 0), 56: (0, 0), 57: (0, 0)}, + + (2, 8): {0: (0, 9), 1: (0, 12), 2: (0, 6), 3: (0, 11), 4: (0, 5), 5: (0, 3), 6: (0, 7), 7: (0, 3), 8: (0, 8), 9: (0, 7), 10: (0, 3), 11: (0, 7), 12: (0, 4), 13: (0, 4), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 3), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 2), 30: (0, 2), 31: (0, 2), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 2), 36: (0, 2), 37: (0, 3), 38: (0, 2), 39: (0, 2), 40: (0, 2), 41: (0, 2), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 0), 57: (0, 0), 60: (0, 0), 61: (0, 0)}, + + (3, 0): {0: (0, 9), 1: (0, 12), 2: (0, 7), 3: (0, 11), 4: (0, 5), 5: (0, 3), 6: (0, 7), 7: (0, 3), 8: (0, 8), 9: (0, 8), 10: (0, 4), 11: (0, 7), 12: (0, 4), 13: (0, 4), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 3), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 2), 30: (0, 2), 31: (0, 2), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 2), 36: (0, 2), 37: (0, 3), 38: (0, 2), 39: (0, 2), 40: (0, 2), 41: (0, 2), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: 
(0, 0), 56: (0, 0), 57: (0, 0), 60: (0, 0), 61: (0, 0), 65: (0, 0), 66: (0, 0), 67: (0, 0)}, + + (3, 1): {0: (0, 9), 1: (0, 13), 2: (0, 7), 3: (0, 12), 4: (0, 5), 5: (0, 3), 6: (0, 7), 7: (0, 3), 8: (0, 8), 9: (0, 8), 10: (0, 4), 11: (0, 7), 12: (0, 4), 13: (0, 4), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 3), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 2), 30: (0, 2), 31: (0, 2), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 2), 36: (0, 2), 37: (0, 3), 38: (0, 2), 39: (0, 2), 40: (0, 2), 41: (0, 2), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 0), 57: (0, 0), 60: (0, 0), 61: (0, 0), 65: (0, 0), 66: (0, 0), 67: (0, 0)}, + + (3, 2): {0: (0, 9), 1: (0, 13), 2: (0, 7), 3: (0, 12), 4: (0, 6), 5: (0, 3), 6: (0, 7), 7: (0, 3), 8: (0, 8), 9: (0, 8), 10: (0, 4), 11: (0, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 3), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 2), 30: (0, 2), 31: (0, 2), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 3), 36: (0, 2), 37: (0, 3), 38: (0, 2), 39: (0, 2), 40: (0, 2), 41: (0, 2), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 1), 57: (0, 0), 60: (0, 0), 61: (0, 0), 65: (0, 0), 66: (0, 0), 67: (0, 0)}, + + (3, 3): {0: (0, 9), 1: (0, 13), 2: (0, 7), 3: (0, 12), 4: (0, 6), 5: (0, 3), 6: (0, 7), 7: (0, 3), 8: (0, 8), 9: (0, 8), 10: (0, 4), 11: (0, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 3), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 3), 30: (0, 3), 31: (0, 3), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 4), 36: (0, 2), 37: (0, 3), 38: (0, 3), 39: (0, 2), 40: (0, 2), 41: (0, 3), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 2), 57: (0, 1), 60: (0, 0), 61: (0, 0), 65: (0, 0), 66: (0, 0), 67: (0, 0)}, + + (3, 4): {0: (0, 9), 1: (0, 13), 2: (0, 7), 3: (0, 12), 4: (0, 7), 5: (0, 4), 6: (0, 8), 7: (0, 3), 8: (0, 8), 9: (0, 8), 10: (0, 4), 11: (0, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 3), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 3), 30: (0, 3), 31: (0, 3), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 4), 36: (0, 2), 37: (0, 3), 38: (0, 3), 39: (0, 2), 40: (0, 2), 41: (0, 3), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 2), 57: (0, 1), 58: (0, 0), 60: (0, 0), 61: (0, 0), 65: (0, 0), 66: (0, 0), 67: (0, 0)}, + + (3, 5): {0: (0, 9), 1: (0, 15), 2: (0, 8), 3: (0, 12), 4: (0, 7), 5: (0, 4), 6: (0, 8), 7: (0, 3), 8: (0, 8), 9: (0, 8), 10: (0, 4), 11: (0, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 3), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 3), 30: (0, 3), 31: (0, 3), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 4), 36: (0, 2), 37: (0, 3), 38: (0, 3), 39: (0, 2), 40: (0, 2), 41: (0, 3), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 
49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 3), 57: (0, 1), 58: (0, 0), 60: (0, 0), 61: (0, 0), 65: (0, 0), 66: (0, 0), 67: (0, 0)}, + + (3, 6): {0: (0, 9), 1: (0, 15), 2: (0, 8), 3: (0, 12), 4: (0, 7), 5: (0, 4), 6: (0, 8), 7: (0, 3), 8: (0, 8), 9: (0, 8), 10: (0, 4), 11: (0, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 4), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 3), 30: (0, 3), 31: (0, 3), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 4), 36: (0, 2), 37: (0, 3), 38: (0, 3), 39: (0, 2), 40: (0, 2), 41: (0, 3), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 3), 57: (0, 1), 58: (0, 0), 60: (0, 0), 61: (0, 0), 65: (0, 0), 66: (0, 0), 67: (0, 0)}, + + (3, 7): {0: (0, 10), 1: (0, 16), 2: (0, 8), 3: (0, 12), 4: (0, 7), 5: (0, 4), 6: (0, 8), 7: (0, 3), 8: (0, 9), 9: (0, 9), 10: (0, 4), 11: (0, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 5), 16: (0, 4), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 4), 23: (0, 4), 24: (0, 4), 25: (0, 3), 26: (0, 3), 27: (0, 1), 28: (0, 3), 29: (0, 3), 30: (0, 3), 31: (0, 3), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 4), 36: (0, 2), 37: (0, 3), 38: (0, 3), 39: (0, 2), 40: (0, 2), 41: (0, 3), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 3), 57: (0, 1), 58: (0, 0), 60: (0, 1), 61: (0, 0), 65: (0, 0), 66: (0, 0), 67: (0, 0), 68: (0, 0)}, + + (3, 8): {0: (0, 11), 1: (0, 16), 2: (0, 8), 3: (0, 12), 4: (0, 7), 5: (0, 4), 6: (0, 8), 7: (0, 3), 8: (0, 9), 9: (0, 9), 10: (0, 5), 11: (0, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 5), 16: (0, 5), 17: (0, 1), 18: (0, 3), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 5), 23: (0, 4), 24: (0, 5), 25: (0, 4), 26: (0, 4), 27: (0, 1), 28: (0, 4), 29: (0, 3), 30: (0, 3), 31: (0, 3), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 4), 36: (0, 2), 37: (0, 3), 38: (0, 3), 39: (0, 2), 40: (0, 2), 41: (0, 3), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 3), 57: (0, 1), 58: (0, 0), 60: (0, 1), 61: (0, 0), 65: (0, 0), 66: (0, 1), 67: (0, 0), 68: (0, 0), 69: (0, 0)}, + + (3, 9): {0: (0, 11), 1: (0, 17), 2: (0, 9), 3: (0, 12), 4: (0, 7), 5: (0, 4), 6: (0, 8), 7: (0, 3), 8: (0, 9), 9: (0, 9), 10: (0, 6), 11: (0, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 5), 16: (0, 5), 17: (0, 1), 18: (0, 4), 19: (0, 7), 20: (0, 6), 21: (0, 2), 22: (0, 5), 23: (0, 4), 24: (0, 5), 25: (0, 4), 26: (0, 4), 27: (0, 1), 28: (0, 4), 29: (0, 3), 30: (0, 3), 31: (0, 3), 32: (0, 4), 33: (0, 2), 34: (0, 2), 35: (0, 4), 36: (0, 2), 37: (0, 3), 38: (0, 3), 39: (0, 2), 40: (0, 2), 41: (0, 3), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 56: (0, 3), 57: (0, 1), 58: (0, 0), 60: (0, 1), 61: (0, 0), 65: (0, 0), 66: (0, 1), 67: (0, 0), 68: (0, 0), 69: (0, 0)}, + + (4, 0): {0: (0, 12), 1: (4, 17), 2: (1, 10), 3: (0, 13), 8: (2, 9), 9: (1, 9), 10: (0, 6), 11: (2, 9), 12: (0, 4), 13: (0, 5), 14: (0, 5), 15: (0, 6), 16: (0, 5), 17: (0, 1), 18: (0, 4), 19: (2, 7), 20: (1, 6), 21: (0, 2), 22: (0, 5), 23: (2, 4), 24: (0, 5), 25: (0, 4), 26: (0, 5), 27: (1, 1), 28: (0, 5), 29: (1, 3), 30: (1, 3), 31: (1, 3), 32: (1, 4), 33: (0, 2), 34: (1, 2), 35: (1, 4), 36: (0, 2), 37: (0, 3), 38: (1, 3), 39: (1, 2), 40: (1, 2), 
41: (1, 3), 42: (0, 2), 43: (0, 2), 44: (0, 1), 45: (0, 0), 46: (0, 0), 47: (0, 0), 48: (0, 1), 49: (0, 1), 50: (0, 0), 51: (0, 0), 55: (0, 2), 57: (0, 2), 60: (0, 2), 61: (0, 0), 64: (0, 0), 65: (0, 0), 66: (0, 1), 68: (0, 1), 69: (0, 1), 74: (0, 0), 75: (0, 0), 80: (0, 0), 81: (0, 0)}, + +} diff --git a/kafka/protocol/commit.py b/kafka/protocol/commit.py index 31fc23707..a0439e7ef 100644 --- a/kafka/protocol/commit.py +++ b/kafka/protocol/commit.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from kafka.protocol.api import Request, Response -from kafka.protocol.types import Array, Int8, Int16, Int32, Int64, Schema, String +from kafka.protocol.types import Array, Int16, Int32, Int64, Schema, String class OffsetCommitResponse_v0(Response): @@ -41,6 +41,24 @@ class OffsetCommitResponse_v3(Response): ) +class OffsetCommitResponse_v4(Response): + API_KEY = 8 + API_VERSION = 4 + SCHEMA = OffsetCommitResponse_v3.SCHEMA + + +class OffsetCommitResponse_v5(Response): + API_KEY = 8 + API_VERSION = 5 + SCHEMA = OffsetCommitResponse_v4.SCHEMA + + +class OffsetCommitResponse_v6(Response): + API_KEY = 8 + API_VERSION = 6 + SCHEMA = OffsetCommitResponse_v5.SCHEMA + + class OffsetCommitRequest_v0(Request): API_KEY = 8 API_VERSION = 0 # Zookeeper-backed storage @@ -76,13 +94,13 @@ class OffsetCommitRequest_v1(Request): class OffsetCommitRequest_v2(Request): API_KEY = 8 - API_VERSION = 2 # added retention_time, dropped timestamp + API_VERSION = 2 RESPONSE_TYPE = OffsetCommitResponse_v2 SCHEMA = Schema( ('consumer_group', String('utf-8')), ('consumer_group_generation_id', Int32), ('consumer_id', String('utf-8')), - ('retention_time', Int64), + ('retention_time', Int64), # added retention_time, dropped timestamp ('topics', Array( ('topic', String('utf-8')), ('partitions', Array( @@ -90,7 +108,6 @@ class OffsetCommitRequest_v2(Request): ('offset', Int64), ('metadata', String('utf-8')))))) ) - DEFAULT_GENERATION_ID = -1 DEFAULT_RETENTION_TIME = -1 @@ -99,15 +116,63 @@ class OffsetCommitRequest_v3(Request): API_VERSION = 3 RESPONSE_TYPE = OffsetCommitResponse_v3 SCHEMA = OffsetCommitRequest_v2.SCHEMA + DEFAULT_RETENTION_TIME = -1 + + +class OffsetCommitRequest_v4(Request): + API_KEY = 8 + API_VERSION = 4 + RESPONSE_TYPE = OffsetCommitResponse_v4 + SCHEMA = OffsetCommitRequest_v3.SCHEMA + DEFAULT_RETENTION_TIME = -1 + + +class OffsetCommitRequest_v5(Request): + API_KEY = 8 + API_VERSION = 5 # drops retention_time + RESPONSE_TYPE = OffsetCommitResponse_v5 + SCHEMA = Schema( + ('consumer_group', String('utf-8')), + ('consumer_group_generation_id', Int32), + ('consumer_id', String('utf-8')), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('metadata', String('utf-8')))))) + ) + + +class OffsetCommitRequest_v6(Request): + API_KEY = 8 + API_VERSION = 6 + RESPONSE_TYPE = OffsetCommitResponse_v6 + SCHEMA = Schema( + ('consumer_group', String('utf-8')), + ('consumer_group_generation_id', Int32), + ('consumer_id', String('utf-8')), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('leader_epoch', Int32), # added for fencing / kip-320. 
default -1 + ('metadata', String('utf-8')))))) + ) OffsetCommitRequest = [ OffsetCommitRequest_v0, OffsetCommitRequest_v1, - OffsetCommitRequest_v2, OffsetCommitRequest_v3 + OffsetCommitRequest_v2, OffsetCommitRequest_v3, + OffsetCommitRequest_v4, OffsetCommitRequest_v5, + OffsetCommitRequest_v6, ] OffsetCommitResponse = [ OffsetCommitResponse_v0, OffsetCommitResponse_v1, - OffsetCommitResponse_v2, OffsetCommitResponse_v3 + OffsetCommitResponse_v2, OffsetCommitResponse_v3, + OffsetCommitResponse_v4, OffsetCommitResponse_v5, + OffsetCommitResponse_v6, ] @@ -163,6 +228,29 @@ class OffsetFetchResponse_v3(Response): ) +class OffsetFetchResponse_v4(Response): + API_KEY = 9 + API_VERSION = 4 + SCHEMA = OffsetFetchResponse_v3.SCHEMA + + +class OffsetFetchResponse_v5(Response): + API_KEY = 9 + API_VERSION = 5 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('leader_epoch', Int32), + ('metadata', String('utf-8')), + ('error_code', Int16))))), + ('error_code', Int16) + ) + + class OffsetFetchRequest_v0(Request): API_KEY = 9 API_VERSION = 0 # zookeeper-backed storage @@ -199,57 +287,27 @@ class OffsetFetchRequest_v3(Request): SCHEMA = OffsetFetchRequest_v2.SCHEMA +class OffsetFetchRequest_v4(Request): + API_KEY = 9 + API_VERSION = 4 + RESPONSE_TYPE = OffsetFetchResponse_v4 + SCHEMA = OffsetFetchRequest_v3.SCHEMA + + +class OffsetFetchRequest_v5(Request): + API_KEY = 9 + API_VERSION = 5 + RESPONSE_TYPE = OffsetFetchResponse_v5 + SCHEMA = OffsetFetchRequest_v4.SCHEMA + + OffsetFetchRequest = [ OffsetFetchRequest_v0, OffsetFetchRequest_v1, OffsetFetchRequest_v2, OffsetFetchRequest_v3, + OffsetFetchRequest_v4, OffsetFetchRequest_v5, ] OffsetFetchResponse = [ OffsetFetchResponse_v0, OffsetFetchResponse_v1, OffsetFetchResponse_v2, OffsetFetchResponse_v3, + OffsetFetchResponse_v4, OffsetFetchResponse_v5, ] - - -class GroupCoordinatorResponse_v0(Response): - API_KEY = 10 - API_VERSION = 0 - SCHEMA = Schema( - ('error_code', Int16), - ('coordinator_id', Int32), - ('host', String('utf-8')), - ('port', Int32) - ) - - -class GroupCoordinatorResponse_v1(Response): - API_KEY = 10 - API_VERSION = 1 - SCHEMA = Schema( - ('error_code', Int16), - ('error_message', String('utf-8')), - ('coordinator_id', Int32), - ('host', String('utf-8')), - ('port', Int32) - ) - - -class GroupCoordinatorRequest_v0(Request): - API_KEY = 10 - API_VERSION = 0 - RESPONSE_TYPE = GroupCoordinatorResponse_v0 - SCHEMA = Schema( - ('consumer_group', String('utf-8')) - ) - - -class GroupCoordinatorRequest_v1(Request): - API_KEY = 10 - API_VERSION = 1 - RESPONSE_TYPE = GroupCoordinatorResponse_v1 - SCHEMA = Schema( - ('coordinator_key', String('utf-8')), - ('coordinator_type', Int8) - ) - - -GroupCoordinatorRequest = [GroupCoordinatorRequest_v0, GroupCoordinatorRequest_v1] -GroupCoordinatorResponse = [GroupCoordinatorResponse_v0, GroupCoordinatorResponse_v1] diff --git a/kafka/protocol/end_txn.py b/kafka/protocol/end_txn.py new file mode 100644 index 000000000..96d6cc514 --- /dev/null +++ b/kafka/protocol/end_txn.py @@ -0,0 +1,58 @@ +from __future__ import absolute_import + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Boolean, Int16, Int32, Int64, Schema, String + + +class EndTxnResponse_v0(Response): + API_KEY = 26 + API_VERSION = 0 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('error_code', Int16), + ) + + +class EndTxnResponse_v1(Response): + API_KEY = 26 + API_VERSION = 1 + 
SCHEMA = EndTxnResponse_v0.SCHEMA + + +class EndTxnResponse_v2(Response): + API_KEY = 26 + API_VERSION = 2 + SCHEMA = EndTxnResponse_v1.SCHEMA + + +class EndTxnRequest_v0(Request): + API_KEY = 26 + API_VERSION = 0 + RESPONSE_TYPE = EndTxnResponse_v0 + SCHEMA = Schema( + ('transactional_id', String('utf-8')), + ('producer_id', Int64), + ('producer_epoch', Int16), + ('committed', Boolean)) + + +class EndTxnRequest_v1(Request): + API_KEY = 26 + API_VERSION = 1 + RESPONSE_TYPE = EndTxnResponse_v1 + SCHEMA = EndTxnRequest_v0.SCHEMA + + +class EndTxnRequest_v2(Request): + API_KEY = 26 + API_VERSION = 2 + RESPONSE_TYPE = EndTxnResponse_v2 + SCHEMA = EndTxnRequest_v1.SCHEMA + + +EndTxnRequest = [ + EndTxnRequest_v0, EndTxnRequest_v1, EndTxnRequest_v2, +] +EndTxnResponse = [ + EndTxnResponse_v0, EndTxnResponse_v1, EndTxnResponse_v2, +] diff --git a/kafka/protocol/fetch.py b/kafka/protocol/fetch.py index f367848ce..036a37eb8 100644 --- a/kafka/protocol/fetch.py +++ b/kafka/protocol/fetch.py @@ -1,9 +1,15 @@ from __future__ import absolute_import +import collections + from kafka.protocol.api import Request, Response from kafka.protocol.types import Array, Int8, Int16, Int32, Int64, Schema, String, Bytes +AbortedTransaction = collections.namedtuple("AbortedTransaction", + ["producer_id", "first_offset"]) + + class FetchResponse_v0(Response): API_KEY = 1 API_VERSION = 0 @@ -14,7 +20,7 @@ class FetchResponse_v0(Response): ('partition', Int32), ('error_code', Int16), ('highwater_offset', Int64), - ('message_set', Bytes))))) + ('records', Bytes))))) ) @@ -29,7 +35,7 @@ class FetchResponse_v1(Response): ('partition', Int32), ('error_code', Int16), ('highwater_offset', Int64), - ('message_set', Bytes))))) + ('records', Bytes))))) ) @@ -46,6 +52,7 @@ class FetchResponse_v3(Response): class FetchResponse_v4(Response): + # Adds message format v2 API_KEY = 1 API_VERSION = 4 SCHEMA = Schema( @@ -60,7 +67,7 @@ class FetchResponse_v4(Response): ('aborted_transactions', Array( ('producer_id', Int64), ('first_offset', Int64))), - ('message_set', Bytes))))) + ('records', Bytes))))) ) @@ -80,7 +87,7 @@ class FetchResponse_v5(Response): ('aborted_transactions', Array( ('producer_id', Int64), ('first_offset', Int64))), - ('message_set', Bytes))))) + ('records', Bytes))))) ) @@ -115,7 +122,7 @@ class FetchResponse_v7(Response): ('aborted_transactions', Array( ('producer_id', Int64), ('first_offset', Int64))), - ('message_set', Bytes))))) + ('records', Bytes))))) ) @@ -156,7 +163,7 @@ class FetchResponse_v11(Response): ('producer_id', Int64), ('first_offset', Int64))), ('preferred_read_replica', Int32), - ('message_set', Bytes))))) + ('records', Bytes))))) ) @@ -211,6 +218,7 @@ class FetchRequest_v3(Request): class FetchRequest_v4(Request): # Adds isolation_level field + # Adds message format v2 API_KEY = 1 API_VERSION = 4 RESPONSE_TYPE = FetchResponse_v4 @@ -264,7 +272,7 @@ class FetchRequest_v6(Request): class FetchRequest_v7(Request): """ - Add incremental fetch requests + Add incremental fetch requests (see KIP-227) """ API_KEY = 1 API_VERSION = 7 @@ -285,7 +293,7 @@ class FetchRequest_v7(Request): ('log_start_offset', Int64), ('max_bytes', Int32))))), ('forgotten_topics_data', Array( - ('topic', String), + ('topic', String('utf-8')), ('partitions', Array(Int32)) )), ) @@ -325,7 +333,7 @@ class FetchRequest_v9(Request): ('log_start_offset', Int64), ('max_bytes', Int32))))), ('forgotten_topics_data', Array( - ('topic', String), + ('topic', String('utf-8')), ('partitions', Array(Int32)), )), ) @@ -365,7 +373,7 @@ class 
FetchRequest_v11(Request): ('log_start_offset', Int64), ('max_bytes', Int32))))), ('forgotten_topics_data', Array( - ('topic', String), + ('topic', String('utf-8')), ('partitions', Array(Int32)) )), ('rack_id', String('utf-8')), diff --git a/kafka/protocol/find_coordinator.py b/kafka/protocol/find_coordinator.py new file mode 100644 index 000000000..be5b45ded --- /dev/null +++ b/kafka/protocol/find_coordinator.py @@ -0,0 +1,64 @@ +from __future__ import absolute_import + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Int8, Int16, Int32, Schema, String + + +class FindCoordinatorResponse_v0(Response): + API_KEY = 10 + API_VERSION = 0 + SCHEMA = Schema( + ('error_code', Int16), + ('coordinator_id', Int32), + ('host', String('utf-8')), + ('port', Int32) + ) + + +class FindCoordinatorResponse_v1(Response): + API_KEY = 10 + API_VERSION = 1 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('error_code', Int16), + ('error_message', String('utf-8')), + ('coordinator_id', Int32), + ('host', String('utf-8')), + ('port', Int32) + ) + + +class FindCoordinatorResponse_v2(Response): + API_KEY = 10 + API_VERSION = 2 + SCHEMA = FindCoordinatorResponse_v1.SCHEMA + + +class FindCoordinatorRequest_v0(Request): + API_KEY = 10 + API_VERSION = 0 + RESPONSE_TYPE = FindCoordinatorResponse_v0 + SCHEMA = Schema( + ('consumer_group', String('utf-8')) + ) + + +class FindCoordinatorRequest_v1(Request): + API_KEY = 10 + API_VERSION = 1 + RESPONSE_TYPE = FindCoordinatorResponse_v1 + SCHEMA = Schema( + ('coordinator_key', String('utf-8')), + ('coordinator_type', Int8) # 0: consumer, 1: transaction + ) + + +class FindCoordinatorRequest_v2(Request): + API_KEY = 10 + API_VERSION = 2 + RESPONSE_TYPE = FindCoordinatorResponse_v2 + SCHEMA = FindCoordinatorRequest_v1.SCHEMA + + +FindCoordinatorRequest = [FindCoordinatorRequest_v0, FindCoordinatorRequest_v1, FindCoordinatorRequest_v2] +FindCoordinatorResponse = [FindCoordinatorResponse_v0, FindCoordinatorResponse_v1, FindCoordinatorResponse_v2] diff --git a/kafka/protocol/group.py b/kafka/protocol/group.py index bcb96553b..74e19c94b 100644 --- a/kafka/protocol/group.py +++ b/kafka/protocol/group.py @@ -5,6 +5,10 @@ from kafka.protocol.types import Array, Bytes, Int16, Int32, Schema, String +DEFAULT_GENERATION_ID = -1 +UNKNOWN_MEMBER_ID = '' + + class JoinGroupResponse_v0(Response): API_KEY = 11 API_VERSION = 0 @@ -42,6 +46,18 @@ class JoinGroupResponse_v2(Response): ) +class JoinGroupResponse_v3(Response): + API_KEY = 11 + API_VERSION = 3 + SCHEMA = JoinGroupResponse_v2.SCHEMA + + +class JoinGroupResponse_v4(Response): + API_KEY = 11 + API_VERSION = 4 + SCHEMA = JoinGroupResponse_v3.SCHEMA + + class JoinGroupRequest_v0(Request): API_KEY = 11 API_VERSION = 0 @@ -55,7 +71,6 @@ class JoinGroupRequest_v0(Request): ('protocol_name', String('utf-8')), ('protocol_metadata', Bytes))) ) - UNKNOWN_MEMBER_ID = '' class JoinGroupRequest_v1(Request): @@ -72,7 +87,6 @@ class JoinGroupRequest_v1(Request): ('protocol_name', String('utf-8')), ('protocol_metadata', Bytes))) ) - UNKNOWN_MEMBER_ID = '' class JoinGroupRequest_v2(Request): @@ -80,14 +94,29 @@ class JoinGroupRequest_v2(Request): API_VERSION = 2 RESPONSE_TYPE = JoinGroupResponse_v2 SCHEMA = JoinGroupRequest_v1.SCHEMA - UNKNOWN_MEMBER_ID = '' + + +class JoinGroupRequest_v3(Request): + API_KEY = 11 + API_VERSION = 3 + RESPONSE_TYPE = JoinGroupResponse_v3 + SCHEMA = JoinGroupRequest_v2.SCHEMA + + +class JoinGroupRequest_v4(Request): + API_KEY = 11 + API_VERSION = 4 + RESPONSE_TYPE = 
JoinGroupResponse_v4 + SCHEMA = JoinGroupRequest_v3.SCHEMA JoinGroupRequest = [ - JoinGroupRequest_v0, JoinGroupRequest_v1, JoinGroupRequest_v2 + JoinGroupRequest_v0, JoinGroupRequest_v1, JoinGroupRequest_v2, + JoinGroupRequest_v3, JoinGroupRequest_v4, ] JoinGroupResponse = [ - JoinGroupResponse_v0, JoinGroupResponse_v1, JoinGroupResponse_v2 + JoinGroupResponse_v0, JoinGroupResponse_v1, JoinGroupResponse_v2, + JoinGroupResponse_v3, JoinGroupResponse_v4, ] @@ -118,6 +147,12 @@ class SyncGroupResponse_v1(Response): ) +class SyncGroupResponse_v2(Response): + API_KEY = 14 + API_VERSION = 2 + SCHEMA = SyncGroupResponse_v1.SCHEMA + + class SyncGroupRequest_v0(Request): API_KEY = 14 API_VERSION = 0 @@ -139,8 +174,15 @@ class SyncGroupRequest_v1(Request): SCHEMA = SyncGroupRequest_v0.SCHEMA -SyncGroupRequest = [SyncGroupRequest_v0, SyncGroupRequest_v1] -SyncGroupResponse = [SyncGroupResponse_v0, SyncGroupResponse_v1] +class SyncGroupRequest_v2(Request): + API_KEY = 14 + API_VERSION = 2 + RESPONSE_TYPE = SyncGroupResponse_v2 + SCHEMA = SyncGroupRequest_v1.SCHEMA + + +SyncGroupRequest = [SyncGroupRequest_v0, SyncGroupRequest_v1, SyncGroupRequest_v2] +SyncGroupResponse = [SyncGroupResponse_v0, SyncGroupResponse_v1, SyncGroupResponse_v2] class MemberAssignment(Struct): @@ -170,6 +212,12 @@ class HeartbeatResponse_v1(Response): ) +class HeartbeatResponse_v2(Response): + API_KEY = 12 + API_VERSION = 2 + SCHEMA = HeartbeatResponse_v1.SCHEMA + + class HeartbeatRequest_v0(Request): API_KEY = 12 API_VERSION = 0 @@ -188,8 +236,15 @@ class HeartbeatRequest_v1(Request): SCHEMA = HeartbeatRequest_v0.SCHEMA -HeartbeatRequest = [HeartbeatRequest_v0, HeartbeatRequest_v1] -HeartbeatResponse = [HeartbeatResponse_v0, HeartbeatResponse_v1] +class HeartbeatRequest_v2(Request): + API_KEY = 12 + API_VERSION = 2 + RESPONSE_TYPE = HeartbeatResponse_v2 + SCHEMA = HeartbeatRequest_v1.SCHEMA + + +HeartbeatRequest = [HeartbeatRequest_v0, HeartbeatRequest_v1, HeartbeatRequest_v2] +HeartbeatResponse = [HeartbeatResponse_v0, HeartbeatResponse_v1, HeartbeatResponse_v2] class LeaveGroupResponse_v0(Response): @@ -209,6 +264,12 @@ class LeaveGroupResponse_v1(Response): ) +class LeaveGroupResponse_v2(Response): + API_KEY = 13 + API_VERSION = 2 + SCHEMA = LeaveGroupResponse_v1.SCHEMA + + class LeaveGroupRequest_v0(Request): API_KEY = 13 API_VERSION = 0 @@ -226,5 +287,12 @@ class LeaveGroupRequest_v1(Request): SCHEMA = LeaveGroupRequest_v0.SCHEMA -LeaveGroupRequest = [LeaveGroupRequest_v0, LeaveGroupRequest_v1] -LeaveGroupResponse = [LeaveGroupResponse_v0, LeaveGroupResponse_v1] +class LeaveGroupRequest_v2(Request): + API_KEY = 13 + API_VERSION = 2 + RESPONSE_TYPE = LeaveGroupResponse_v2 + SCHEMA = LeaveGroupRequest_v1.SCHEMA + + +LeaveGroupRequest = [LeaveGroupRequest_v0, LeaveGroupRequest_v1, LeaveGroupRequest_v2] +LeaveGroupResponse = [LeaveGroupResponse_v0, LeaveGroupResponse_v1, LeaveGroupResponse_v2] diff --git a/kafka/protocol/init_producer_id.py b/kafka/protocol/init_producer_id.py new file mode 100644 index 000000000..8426fe00b --- /dev/null +++ b/kafka/protocol/init_producer_id.py @@ -0,0 +1,46 @@ +from __future__ import absolute_import + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Int16, Int32, Int64, Schema, String + + +class InitProducerIdResponse_v0(Response): + API_KEY = 22 + API_VERSION = 0 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('error_code', Int16), + ('producer_id', Int64), + ('producer_epoch', Int16), + ) + + +class InitProducerIdResponse_v1(Response): + API_KEY = 
22 + API_VERSION = 1 + SCHEMA = InitProducerIdResponse_v0.SCHEMA + + +class InitProducerIdRequest_v0(Request): + API_KEY = 22 + API_VERSION = 0 + RESPONSE_TYPE = InitProducerIdResponse_v0 + SCHEMA = Schema( + ('transactional_id', String('utf-8')), + ('transaction_timeout_ms', Int32), + ) + + +class InitProducerIdRequest_v1(Request): + API_KEY = 22 + API_VERSION = 1 + RESPONSE_TYPE = InitProducerIdResponse_v1 + SCHEMA = InitProducerIdRequest_v0.SCHEMA + + +InitProducerIdRequest = [ + InitProducerIdRequest_v0, InitProducerIdRequest_v1, +] +InitProducerIdResponse = [ + InitProducerIdResponse_v0, InitProducerIdResponse_v1, +] diff --git a/kafka/protocol/offset.py b/kafka/protocol/list_offsets.py similarity index 73% rename from kafka/protocol/offset.py rename to kafka/protocol/list_offsets.py index 1ed382b0d..2e36dd660 100644 --- a/kafka/protocol/offset.py +++ b/kafka/protocol/list_offsets.py @@ -12,7 +12,7 @@ class OffsetResetStrategy(object): NONE = 0 -class OffsetResponse_v0(Response): +class ListOffsetsResponse_v0(Response): API_KEY = 2 API_VERSION = 0 SCHEMA = Schema( @@ -24,7 +24,7 @@ class OffsetResponse_v0(Response): ('offsets', Array(Int64)))))) ) -class OffsetResponse_v1(Response): +class ListOffsetsResponse_v1(Response): API_KEY = 2 API_VERSION = 1 SCHEMA = Schema( @@ -38,7 +38,7 @@ class OffsetResponse_v1(Response): ) -class OffsetResponse_v2(Response): +class ListOffsetsResponse_v2(Response): API_KEY = 2 API_VERSION = 2 SCHEMA = Schema( @@ -53,16 +53,16 @@ class OffsetResponse_v2(Response): ) -class OffsetResponse_v3(Response): +class ListOffsetsResponse_v3(Response): """ on quota violation, brokers send out responses before throttling """ API_KEY = 2 API_VERSION = 3 - SCHEMA = OffsetResponse_v2.SCHEMA + SCHEMA = ListOffsetsResponse_v2.SCHEMA -class OffsetResponse_v4(Response): +class ListOffsetsResponse_v4(Response): """ Add leader_epoch to response """ @@ -81,19 +81,19 @@ class OffsetResponse_v4(Response): ) -class OffsetResponse_v5(Response): +class ListOffsetsResponse_v5(Response): """ adds a new error code, OFFSET_NOT_AVAILABLE """ API_KEY = 2 API_VERSION = 5 - SCHEMA = OffsetResponse_v4.SCHEMA + SCHEMA = ListOffsetsResponse_v4.SCHEMA -class OffsetRequest_v0(Request): +class ListOffsetsRequest_v0(Request): API_KEY = 2 API_VERSION = 0 - RESPONSE_TYPE = OffsetResponse_v0 + RESPONSE_TYPE = ListOffsetsResponse_v0 SCHEMA = Schema( ('replica_id', Int32), ('topics', Array( @@ -107,10 +107,10 @@ class OffsetRequest_v0(Request): 'replica_id': -1 } -class OffsetRequest_v1(Request): +class ListOffsetsRequest_v1(Request): API_KEY = 2 API_VERSION = 1 - RESPONSE_TYPE = OffsetResponse_v1 + RESPONSE_TYPE = ListOffsetsResponse_v1 SCHEMA = Schema( ('replica_id', Int32), ('topics', Array( @@ -124,10 +124,10 @@ class OffsetRequest_v1(Request): } -class OffsetRequest_v2(Request): +class ListOffsetsRequest_v2(Request): API_KEY = 2 API_VERSION = 2 - RESPONSE_TYPE = OffsetResponse_v2 + RESPONSE_TYPE = ListOffsetsResponse_v2 SCHEMA = Schema( ('replica_id', Int32), ('isolation_level', Int8), # <- added isolation_level @@ -142,23 +142,23 @@ class OffsetRequest_v2(Request): } -class OffsetRequest_v3(Request): +class ListOffsetsRequest_v3(Request): API_KEY = 2 API_VERSION = 3 - RESPONSE_TYPE = OffsetResponse_v3 - SCHEMA = OffsetRequest_v2.SCHEMA + RESPONSE_TYPE = ListOffsetsResponse_v3 + SCHEMA = ListOffsetsRequest_v2.SCHEMA DEFAULTS = { 'replica_id': -1 } -class OffsetRequest_v4(Request): +class ListOffsetsRequest_v4(Request): """ Add current_leader_epoch to request """ API_KEY = 2 API_VERSION = 4 - 
RESPONSE_TYPE = OffsetResponse_v4 + RESPONSE_TYPE = ListOffsetsResponse_v4 SCHEMA = Schema( ('replica_id', Int32), ('isolation_level', Int8), # <- added isolation_level @@ -166,7 +166,7 @@ class OffsetRequest_v4(Request): ('topic', String('utf-8')), ('partitions', Array( ('partition', Int32), - ('current_leader_epoch', Int64), + ('current_leader_epoch', Int32), ('timestamp', Int64))))) ) DEFAULTS = { @@ -174,21 +174,21 @@ class OffsetRequest_v4(Request): } -class OffsetRequest_v5(Request): +class ListOffsetsRequest_v5(Request): API_KEY = 2 API_VERSION = 5 - RESPONSE_TYPE = OffsetResponse_v5 - SCHEMA = OffsetRequest_v4.SCHEMA + RESPONSE_TYPE = ListOffsetsResponse_v5 + SCHEMA = ListOffsetsRequest_v4.SCHEMA DEFAULTS = { 'replica_id': -1 } -OffsetRequest = [ - OffsetRequest_v0, OffsetRequest_v1, OffsetRequest_v2, - OffsetRequest_v3, OffsetRequest_v4, OffsetRequest_v5, +ListOffsetsRequest = [ + ListOffsetsRequest_v0, ListOffsetsRequest_v1, ListOffsetsRequest_v2, + ListOffsetsRequest_v3, ListOffsetsRequest_v4, ListOffsetsRequest_v5, ] -OffsetResponse = [ - OffsetResponse_v0, OffsetResponse_v1, OffsetResponse_v2, - OffsetResponse_v3, OffsetResponse_v4, OffsetResponse_v5, +ListOffsetsResponse = [ + ListOffsetsResponse_v0, ListOffsetsResponse_v1, ListOffsetsResponse_v2, + ListOffsetsResponse_v3, ListOffsetsResponse_v4, ListOffsetsResponse_v5, ] diff --git a/kafka/protocol/metadata.py b/kafka/protocol/metadata.py index 414e5b84a..bb22ba997 100644 --- a/kafka/protocol/metadata.py +++ b/kafka/protocol/metadata.py @@ -128,6 +128,42 @@ class MetadataResponse_v5(Response): ) +class MetadataResponse_v6(Response): + """Metadata Request/Response v6 is the same as v5, + but on quota violation, brokers send out responses before throttling.""" + API_KEY = 3 + API_VERSION = 6 + SCHEMA = MetadataResponse_v5.SCHEMA + + +class MetadataResponse_v7(Response): + """v7 adds per-partition leader_epoch field""" + API_KEY = 3 + API_VERSION = 7 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('brokers', Array( + ('node_id', Int32), + ('host', String('utf-8')), + ('port', Int32), + ('rack', String('utf-8')))), + ('cluster_id', String('utf-8')), + ('controller_id', Int32), + ('topics', Array( + ('error_code', Int16), + ('topic', String('utf-8')), + ('is_internal', Boolean), + ('partitions', Array( + ('error_code', Int16), + ('partition', Int32), + ('leader', Int32), + ('leader_epoch', Int32), + ('replicas', Array(Int32)), + ('isr', Array(Int32)), + ('offline_replicas', Array(Int32)))))) + ) + + class MetadataRequest_v0(Request): API_KEY = 3 API_VERSION = 0 @@ -135,7 +171,8 @@ class MetadataRequest_v0(Request): SCHEMA = Schema( ('topics', Array(String('utf-8'))) ) - ALL_TOPICS = None # Empty Array (len 0) for topics returns all topics + ALL_TOPICS = [] # Empty Array (len 0) for topics returns all topics + NO_TOPICS = [] # v0 does not support a 'no topics' request, so we'll just ask for ALL class MetadataRequest_v1(Request): @@ -143,8 +180,8 @@ class MetadataRequest_v1(Request): API_VERSION = 1 RESPONSE_TYPE = MetadataResponse_v1 SCHEMA = MetadataRequest_v0.SCHEMA - ALL_TOPICS = -1 # Null Array (len -1) for topics returns all topics - NO_TOPICS = None # Empty array (len 0) for topics returns no topics + ALL_TOPICS = None # Null Array (len -1) for topics returns all topics + NO_TOPICS = [] # Empty array (len 0) for topics returns no topics class MetadataRequest_v2(Request): @@ -152,8 +189,8 @@ class MetadataRequest_v2(Request): API_VERSION = 2 RESPONSE_TYPE = MetadataResponse_v2 SCHEMA = MetadataRequest_v1.SCHEMA - 
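# Illustrative sketch: after this change the sentinels match the wire protocol --
# a null topics array (None) asks for all topics, an empty array ([]) asks for no
# topics on v1+, and v0 (which has no "no topics" form) maps both to [].
from kafka.protocol.metadata import MetadataRequest

all_topics = MetadataRequest[1](topics=MetadataRequest[1].ALL_TOPICS)  # None -> null array
no_topics = MetadataRequest[1](topics=MetadataRequest[1].NO_TOPICS)    # []   -> empty array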
ALL_TOPICS = -1 # Null Array (len -1) for topics returns all topics - NO_TOPICS = None # Empty array (len 0) for topics returns no topics + ALL_TOPICS = None + NO_TOPICS = [] class MetadataRequest_v3(Request): @@ -161,8 +198,8 @@ class MetadataRequest_v3(Request): API_VERSION = 3 RESPONSE_TYPE = MetadataResponse_v3 SCHEMA = MetadataRequest_v1.SCHEMA - ALL_TOPICS = -1 # Null Array (len -1) for topics returns all topics - NO_TOPICS = None # Empty array (len 0) for topics returns no topics + ALL_TOPICS = None + NO_TOPICS = [] class MetadataRequest_v4(Request): @@ -173,8 +210,8 @@ class MetadataRequest_v4(Request): ('topics', Array(String('utf-8'))), ('allow_auto_topic_creation', Boolean) ) - ALL_TOPICS = -1 # Null Array (len -1) for topics returns all topics - NO_TOPICS = None # Empty array (len 0) for topics returns no topics + ALL_TOPICS = None + NO_TOPICS = [] class MetadataRequest_v5(Request): @@ -186,15 +223,35 @@ class MetadataRequest_v5(Request): API_VERSION = 5 RESPONSE_TYPE = MetadataResponse_v5 SCHEMA = MetadataRequest_v4.SCHEMA - ALL_TOPICS = -1 # Null Array (len -1) for topics returns all topics - NO_TOPICS = None # Empty array (len 0) for topics returns no topics + ALL_TOPICS = None + NO_TOPICS = [] + + +class MetadataRequest_v6(Request): + API_KEY = 3 + API_VERSION = 6 + RESPONSE_TYPE = MetadataResponse_v6 + SCHEMA = MetadataRequest_v5.SCHEMA + ALL_TOPICS = None + NO_TOPICS = [] + + +class MetadataRequest_v7(Request): + API_KEY = 3 + API_VERSION = 7 + RESPONSE_TYPE = MetadataResponse_v7 + SCHEMA = MetadataRequest_v6.SCHEMA + ALL_TOPICS = None + NO_TOPICS = [] MetadataRequest = [ MetadataRequest_v0, MetadataRequest_v1, MetadataRequest_v2, - MetadataRequest_v3, MetadataRequest_v4, MetadataRequest_v5 + MetadataRequest_v3, MetadataRequest_v4, MetadataRequest_v5, + MetadataRequest_v6, MetadataRequest_v7, ] MetadataResponse = [ MetadataResponse_v0, MetadataResponse_v1, MetadataResponse_v2, - MetadataResponse_v3, MetadataResponse_v4, MetadataResponse_v5 + MetadataResponse_v3, MetadataResponse_v4, MetadataResponse_v5, + MetadataResponse_v6, MetadataResponse_v7, ] diff --git a/kafka/protocol/offset_for_leader_epoch.py b/kafka/protocol/offset_for_leader_epoch.py new file mode 100644 index 000000000..8465588a3 --- /dev/null +++ b/kafka/protocol/offset_for_leader_epoch.py @@ -0,0 +1,140 @@ +from __future__ import absolute_import + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Array, CompactArray, CompactString, Int16, Int32, Int64, Schema, String, TaggedFields + + +class OffsetForLeaderEpochResponse_v0(Response): + API_KEY = 23 + API_VERSION = 0 + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('error_code', Int16), + ('partition', Int32), + ('end_offset', Int64)))))) + + +class OffsetForLeaderEpochResponse_v1(Response): + API_KEY = 23 + API_VERSION = 1 + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('error_code', Int16), + ('partition', Int32), + ('leader_epoch', Int32), + ('end_offset', Int64)))))) + + +class OffsetForLeaderEpochResponse_v2(Response): + API_KEY = 23 + API_VERSION = 2 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('error_code', Int16), + ('partition', Int32), + ('leader_epoch', Int32), + ('end_offset', Int64)))))) + + +class OffsetForLeaderEpochResponse_v3(Response): + API_KEY = 23 + API_VERSION = 3 + SCHEMA = OffsetForLeaderEpochResponse_v2.SCHEMA + + +class 
OffsetForLeaderEpochResponse_v4(Response): + API_KEY = 23 + API_VERSION = 4 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('topics', CompactArray( + ('topic', CompactString('utf-8')), + ('partitions', CompactArray( + ('error_code', Int16), + ('partition', Int32), + ('leader_epoch', Int32), + ('end_offset', Int64), + ('tags', TaggedFields))), + ('tags', TaggedFields))), + ('tags', TaggedFields)) + + +class OffsetForLeaderEpochRequest_v0(Request): + API_KEY = 23 + API_VERSION = 0 + RESPONSE_TYPE = OffsetForLeaderEpochResponse_v0 + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('leader_epoch', Int32)))))) + + +class OffsetForLeaderEpochRequest_v1(Request): + API_KEY = 23 + API_VERSION = 1 + RESPONSE_TYPE = OffsetForLeaderEpochResponse_v1 + SCHEMA = OffsetForLeaderEpochRequest_v0.SCHEMA + + +class OffsetForLeaderEpochRequest_v2(Request): + API_KEY = 23 + API_VERSION = 2 + RESPONSE_TYPE = OffsetForLeaderEpochResponse_v2 + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('current_leader_epoch', Int32), + ('leader_epoch', Int32)))))) + + +class OffsetForLeaderEpochRequest_v3(Request): + API_KEY = 23 + API_VERSION = 3 + RESPONSE_TYPE = OffsetForLeaderEpochResponse_v3 + SCHEMA = Schema( + ('replica_id', Int32), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('current_leader_epoch', Int32), + ('leader_epoch', Int32)))))) + + +class OffsetForLeaderEpochRequest_v4(Request): + API_KEY = 23 + API_VERSION = 4 + RESPONSE_TYPE = OffsetForLeaderEpochResponse_v4 + SCHEMA = Schema( + ('replica_id', Int32), + ('topics', CompactArray( + ('topic', CompactString('utf-8')), + ('partitions', CompactArray( + ('partition', Int32), + ('current_leader_epoch', Int32), + ('leader_epoch', Int32), + ('tags', TaggedFields))), + ('tags', TaggedFields))), + ('tags', TaggedFields)) + +OffsetForLeaderEpochRequest = [ + OffsetForLeaderEpochRequest_v0, OffsetForLeaderEpochRequest_v1, + OffsetForLeaderEpochRequest_v2, OffsetForLeaderEpochRequest_v3, + OffsetForLeaderEpochRequest_v4, +] +OffsetForLeaderEpochResponse = [ + OffsetForLeaderEpochResponse_v0, OffsetForLeaderEpochResponse_v1, + OffsetForLeaderEpochResponse_v2, OffsetForLeaderEpochResponse_v3, + OffsetForLeaderEpochResponse_v4, +] diff --git a/kafka/protocol/parser.py b/kafka/protocol/parser.py index cfee0466d..4bc427330 100644 --- a/kafka/protocol/parser.py +++ b/kafka/protocol/parser.py @@ -4,10 +4,9 @@ import logging import kafka.errors as Errors -from kafka.protocol.api import RequestHeader -from kafka.protocol.commit import GroupCoordinatorResponse +from kafka.protocol.find_coordinator import FindCoordinatorResponse from kafka.protocol.frame import KafkaBytes -from kafka.protocol.types import Int32 +from kafka.protocol.types import Int32, TaggedFields from kafka.version import __version__ log = logging.getLogger(__name__) @@ -59,9 +58,8 @@ def send_request(self, request, correlation_id=None): log.debug('Sending request %s', request) if correlation_id is None: correlation_id = self._next_correlation_id() - header = RequestHeader(request, - correlation_id=correlation_id, - client_id=self._client_id) + + header = request.build_header(correlation_id=correlation_id, client_id=self._client_id) message = b''.join([header.encode(), request.encode()]) size = Int32.encode(len(message)) data = size + message @@ -135,21 +133,17 @@ def receive_bytes(self, data): return responses def 
_process_response(self, read_buffer): - recv_correlation_id = Int32.decode(read_buffer) - log.debug('Received correlation id: %d', recv_correlation_id) - if not self.in_flight_requests: - raise Errors.CorrelationIdError( - 'No in-flight-request found for server response' - ' with correlation ID %d' - % (recv_correlation_id,)) - + raise Errors.CorrelationIdError('No in-flight-request found for server response') (correlation_id, request) = self.in_flight_requests.popleft() - + response_type = request.RESPONSE_TYPE + response_header = response_type.parse_header(read_buffer) + recv_correlation_id = response_header.correlation_id + log.debug('Received correlation id: %d', recv_correlation_id) # 0.8.2 quirk if (recv_correlation_id == 0 and correlation_id != 0 and - request.RESPONSE_TYPE is GroupCoordinatorResponse[0] and + response_type is FindCoordinatorResponse[0] and (self._api_version == (0, 8, 2) or self._api_version is None)): log.warning('Kafka 0.8.2 quirk -- GroupCoordinatorResponse' ' Correlation ID does not match request. This' @@ -163,15 +157,15 @@ def _process_response(self, read_buffer): % (correlation_id, recv_correlation_id)) # decode response - log.debug('Processing response %s', request.RESPONSE_TYPE.__name__) + log.debug('Processing response %s', response_type.__name__) try: - response = request.RESPONSE_TYPE.decode(read_buffer) + response = response_type.decode(read_buffer) except ValueError: read_buffer.seek(0) buf = read_buffer.read() log.error('Response %d [ResponseType: %s Request: %s]:' ' Unable to decode %d-byte buffer: %r', - correlation_id, request.RESPONSE_TYPE, + correlation_id, response_type, request, len(buf), buf) raise Errors.KafkaProtocolError('Unable to decode response') diff --git a/kafka/protocol/produce.py b/kafka/protocol/produce.py index 9b3f6bf55..3076a2810 100644 --- a/kafka/protocol/produce.py +++ b/kafka/protocol/produce.py @@ -47,6 +47,7 @@ class ProduceResponse_v2(Response): class ProduceResponse_v3(Response): + # Adds support for message format v2 API_KEY = 0 API_VERSION = 3 SCHEMA = ProduceResponse_v2.SCHEMA @@ -141,7 +142,7 @@ class ProduceRequest_v0(ProduceRequest): ('topic', String('utf-8')), ('partitions', Array( ('partition', Int32), - ('messages', Bytes))))) + ('records', Bytes))))) ) @@ -158,6 +159,7 @@ class ProduceRequest_v2(ProduceRequest): class ProduceRequest_v3(ProduceRequest): + # Adds support for message format v2 API_VERSION = 3 RESPONSE_TYPE = ProduceResponse_v3 SCHEMA = Schema( @@ -168,7 +170,7 @@ class ProduceRequest_v3(ProduceRequest): ('topic', String('utf-8')), ('partitions', Array( ('partition', Int32), - ('messages', Bytes))))) + ('records', Bytes))))) ) diff --git a/kafka/protocol/sasl_authenticate.py b/kafka/protocol/sasl_authenticate.py new file mode 100644 index 000000000..a2b9b1988 --- /dev/null +++ b/kafka/protocol/sasl_authenticate.py @@ -0,0 +1,42 @@ +from __future__ import absolute_import + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Bytes, Int16, Int64, Schema, String + + +class SaslAuthenticateResponse_v0(Response): + API_KEY = 36 + API_VERSION = 0 + SCHEMA = Schema( + ('error_code', Int16), + ('error_message', String('utf-8')), + ('auth_bytes', Bytes)) + + +class SaslAuthenticateResponse_v1(Response): + API_KEY = 36 + API_VERSION = 1 + SCHEMA = Schema( + ('error_code', Int16), + ('error_message', String('utf-8')), + ('auth_bytes', Bytes), + ('session_lifetime_ms', Int64)) + + +class SaslAuthenticateRequest_v0(Request): + API_KEY = 36 + API_VERSION = 0 + RESPONSE_TYPE = 
SaslAuthenticateResponse_v0 + SCHEMA = Schema( + ('auth_bytes', Bytes)) + + +class SaslAuthenticateRequest_v1(Request): + API_KEY = 36 + API_VERSION = 1 + RESPONSE_TYPE = SaslAuthenticateResponse_v1 + SCHEMA = SaslAuthenticateRequest_v0.SCHEMA + + +SaslAuthenticateRequest = [SaslAuthenticateRequest_v0, SaslAuthenticateRequest_v1] +SaslAuthenticateResponse = [SaslAuthenticateResponse_v0, SaslAuthenticateResponse_v1] diff --git a/kafka/protocol/sasl_handshake.py b/kafka/protocol/sasl_handshake.py new file mode 100644 index 000000000..e91c856ca --- /dev/null +++ b/kafka/protocol/sasl_handshake.py @@ -0,0 +1,39 @@ +from __future__ import absolute_import + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Array, Int16, Schema, String + + +class SaslHandshakeResponse_v0(Response): + API_KEY = 17 + API_VERSION = 0 + SCHEMA = Schema( + ('error_code', Int16), + ('enabled_mechanisms', Array(String('utf-8'))) + ) + + +class SaslHandshakeResponse_v1(Response): + API_KEY = 17 + API_VERSION = 1 + SCHEMA = SaslHandshakeResponse_v0.SCHEMA + + +class SaslHandshakeRequest_v0(Request): + API_KEY = 17 + API_VERSION = 0 + RESPONSE_TYPE = SaslHandshakeResponse_v0 + SCHEMA = Schema( + ('mechanism', String('utf-8')) + ) + + +class SaslHandshakeRequest_v1(Request): + API_KEY = 17 + API_VERSION = 1 + RESPONSE_TYPE = SaslHandshakeResponse_v1 + SCHEMA = SaslHandshakeRequest_v0.SCHEMA + + +SaslHandshakeRequest = [SaslHandshakeRequest_v0, SaslHandshakeRequest_v1] +SaslHandshakeResponse = [SaslHandshakeResponse_v0, SaslHandshakeResponse_v1] diff --git a/kafka/protocol/txn_offset_commit.py b/kafka/protocol/txn_offset_commit.py new file mode 100644 index 000000000..df1b1bd1e --- /dev/null +++ b/kafka/protocol/txn_offset_commit.py @@ -0,0 +1,78 @@ +from __future__ import absolute_import + +from kafka.protocol.api import Request, Response +from kafka.protocol.types import Array, Int16, Int32, Int64, Schema, String + + +class TxnOffsetCommitResponse_v0(Response): + API_KEY = 28 + API_VERSION = 0 + SCHEMA = Schema( + ('throttle_time_ms', Int32), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('error_code', Int16)))))) + + +class TxnOffsetCommitResponse_v1(Response): + API_KEY = 28 + API_VERSION = 1 + SCHEMA = TxnOffsetCommitResponse_v0.SCHEMA + + +class TxnOffsetCommitResponse_v2(Response): + API_KEY = 28 + API_VERSION = 2 + SCHEMA = TxnOffsetCommitResponse_v1.SCHEMA + + +class TxnOffsetCommitRequest_v0(Request): + API_KEY = 28 + API_VERSION = 0 + RESPONSE_TYPE = TxnOffsetCommitResponse_v0 + SCHEMA = Schema( + ('transactional_id', String('utf-8')), + ('group_id', String('utf-8')), + ('producer_id', Int64), + ('producer_epoch', Int16), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('metadata', String('utf-8'))))))) + + +class TxnOffsetCommitRequest_v1(Request): + API_KEY = 28 + API_VERSION = 1 + RESPONSE_TYPE = TxnOffsetCommitResponse_v1 + SCHEMA = TxnOffsetCommitRequest_v0.SCHEMA + + +class TxnOffsetCommitRequest_v2(Request): + API_KEY = 28 + API_VERSION = 2 + RESPONSE_TYPE = TxnOffsetCommitResponse_v2 + SCHEMA = Schema( + ('transactional_id', String('utf-8')), + ('group_id', String('utf-8')), + ('producer_id', Int64), + ('producer_epoch', Int16), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('leader_epoch', Int32), + ('metadata', String('utf-8'))))))) + + +TxnOffsetCommitRequest = [ + 
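# Illustrative sketch of the two-step SASL flow the sasl_handshake / sasl_authenticate
# schemas above model: first a handshake naming the mechanism, then one or more
# authenticate round trips carrying the opaque SASL tokens. The PLAIN token layout
# follows RFC 4616; the credentials are placeholders.
from kafka.protocol.sasl_handshake import SaslHandshakeRequest
from kafka.protocol.sasl_authenticate import SaslAuthenticateRequest

handshake = SaslHandshakeRequest[1](mechanism='PLAIN')
token = b'\0' + b'alice' + b'\0' + b'secret'          # [authzid] NUL authcid NUL passwd
authenticate = SaslAuthenticateRequest[1](auth_bytes=token)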
TxnOffsetCommitRequest_v0, TxnOffsetCommitRequest_v1, TxnOffsetCommitRequest_v2, +] +TxnOffsetCommitResponse = [ + TxnOffsetCommitResponse_v0, TxnOffsetCommitResponse_v1, TxnOffsetCommitResponse_v2, +] diff --git a/kafka/protocol/types.py b/kafka/protocol/types.py index 2fde24fcc..0e3685d73 100644 --- a/kafka/protocol/types.py +++ b/kafka/protocol/types.py @@ -210,3 +210,156 @@ def repr(self, list_of_items): if list_of_items is None: return 'NULL' return '[' + ', '.join([self.array_of.repr(item) for item in list_of_items]) + ']' + + +class UnsignedVarInt32(AbstractType): + @classmethod + def decode(cls, data): + value, i = 0, 0 + while True: + b, = struct.unpack('B', data.read(1)) + if not (b & 0x80): + break + value |= (b & 0x7f) << i + i += 7 + if i > 28: + raise ValueError('Invalid value {}'.format(value)) + value |= b << i + return value + + @classmethod + def encode(cls, value): + value &= 0xffffffff + ret = b'' + while (value & 0xffffff80) != 0: + b = (value & 0x7f) | 0x80 + ret += struct.pack('B', b) + value >>= 7 + ret += struct.pack('B', value) + return ret + + +class VarInt32(AbstractType): + @classmethod + def decode(cls, data): + value = UnsignedVarInt32.decode(data) + return (value >> 1) ^ -(value & 1) + + @classmethod + def encode(cls, value): + # bring it in line with the java binary repr + value &= 0xffffffff + return UnsignedVarInt32.encode((value << 1) ^ (value >> 31)) + + +class VarInt64(AbstractType): + @classmethod + def decode(cls, data): + value, i = 0, 0 + while True: + b = data.read(1) + if not (b & 0x80): + break + value |= (b & 0x7f) << i + i += 7 + if i > 63: + raise ValueError('Invalid value {}'.format(value)) + value |= b << i + return (value >> 1) ^ -(value & 1) + + @classmethod + def encode(cls, value): + # bring it in line with the java binary repr + value &= 0xffffffffffffffff + v = (value << 1) ^ (value >> 63) + ret = b'' + while (v & 0xffffffffffffff80) != 0: + b = (value & 0x7f) | 0x80 + ret += struct.pack('B', b) + v >>= 7 + ret += struct.pack('B', v) + return ret + + +class CompactString(String): + def decode(self, data): + length = UnsignedVarInt32.decode(data) - 1 + if length < 0: + return None + value = data.read(length) + if len(value) != length: + raise ValueError('Buffer underrun decoding string') + return value.decode(self.encoding) + + def encode(self, value): + if value is None: + return UnsignedVarInt32.encode(0) + value = str(value).encode(self.encoding) + return UnsignedVarInt32.encode(len(value) + 1) + value + + +class TaggedFields(AbstractType): + @classmethod + def decode(cls, data): + num_fields = UnsignedVarInt32.decode(data) + ret = {} + if not num_fields: + return ret + prev_tag = -1 + for i in range(num_fields): + tag = UnsignedVarInt32.decode(data) + if tag <= prev_tag: + raise ValueError('Invalid or out-of-order tag {}'.format(tag)) + prev_tag = tag + size = UnsignedVarInt32.decode(data) + val = data.read(size) + ret[tag] = val + return ret + + @classmethod + def encode(cls, value): + ret = UnsignedVarInt32.encode(len(value)) + for k, v in value.items(): + # do we allow for other data types ?? 
It could get complicated really fast + assert isinstance(v, bytes), 'Value {} is not a byte array'.format(v) + assert isinstance(k, int) and k > 0, 'Key {} is not a positive integer'.format(k) + ret += UnsignedVarInt32.encode(k) + ret += v + return ret + + +class CompactBytes(AbstractType): + @classmethod + def decode(cls, data): + length = UnsignedVarInt32.decode(data) - 1 + if length < 0: + return None + value = data.read(length) + if len(value) != length: + raise ValueError('Buffer underrun decoding Bytes') + return value + + @classmethod + def encode(cls, value): + if value is None: + return UnsignedVarInt32.encode(0) + else: + return UnsignedVarInt32.encode(len(value) + 1) + value + + +class CompactArray(Array): + + def encode(self, items): + if items is None: + return UnsignedVarInt32.encode(0) + return b''.join( + [UnsignedVarInt32.encode(len(items) + 1)] + + [self.array_of.encode(item) for item in items] + ) + + def decode(self, data): + length = UnsignedVarInt32.decode(data) - 1 + if length == -1: + return None + return [self.array_of.decode(data) for _ in range(length)] + diff --git a/kafka/record/_crc32c.py b/kafka/record/_crc32c.py index ecff48f5e..9b51ad8a9 100644 --- a/kafka/record/_crc32c.py +++ b/kafka/record/_crc32c.py @@ -105,7 +105,7 @@ def crc_update(crc, data): Returns: 32-bit updated CRC-32C as long. """ - if type(data) != array.array or data.itemsize != 1: + if not isinstance(data, array.array) or data.itemsize != 1: buf = array.array("B", data) else: buf = data diff --git a/kafka/record/abc.py b/kafka/record/abc.py index d5c172aaa..c78f0da69 100644 --- a/kafka/record/abc.py +++ b/kafka/record/abc.py @@ -1,11 +1,19 @@ from __future__ import absolute_import + import abc +from kafka.vendor.six import add_metaclass + +@add_metaclass(abc.ABCMeta) class ABCRecord(object): - __metaclass__ = abc.ABCMeta __slots__ = () + @abc.abstractproperty + def size_in_bytes(self): + """ Number of total bytes in record + """ + @abc.abstractproperty def offset(self): """ Absolute offset of record @@ -37,6 +45,11 @@ def checksum(self): be the checksum for v0 and v1 and None for v2 and above. """ + @abc.abstractmethod + def validate_crc(self): + """ Return True if v0/v1 record matches checksum. noop/True for v2 records + """ + @abc.abstractproperty def headers(self): """ If supported by version list of key-value tuples, or empty list if @@ -44,8 +57,8 @@ def headers(self): """ +@add_metaclass(abc.ABCMeta) class ABCRecordBatchBuilder(object): - __metaclass__ = abc.ABCMeta __slots__ = () @abc.abstractmethod @@ -84,11 +97,11 @@ def build(self): """ +@add_metaclass(abc.ABCMeta) class ABCRecordBatch(object): - """ For v2 incapsulates a RecordBatch, for v0/v1 a single (maybe + """ For v2 encapsulates a RecordBatch, for v0/v1 a single (maybe compressed) message. """ - __metaclass__ = abc.ABCMeta __slots__ = () @abc.abstractmethod @@ -97,9 +110,24 @@ def __iter__(self): if needed. """ + @abc.abstractproperty + def base_offset(self): + """ Return base offset for batch + """ + + @abc.abstractproperty + def size_in_bytes(self): + """ Return size of batch in bytes (includes header overhead) + """ + + @abc.abstractproperty + def magic(self): + """ Return magic value (0, 1, 2) for batch. 
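# Standalone sketch of the zigzag varint scheme behind the new UnsignedVarInt32 /
# VarInt32 types and the varint-encoded fields of the v2 record format; illustrative
# only, the library itself uses kafka.protocol.types and kafka.record.util for this.
import io
import struct

def encode_zigzag_varint32(value):
    v = ((value << 1) ^ (value >> 31)) & 0xffffffff    # zigzag keeps small negatives small
    out = b''
    while v & ~0x7f:
        out += struct.pack('B', (v & 0x7f) | 0x80)     # 7 data bits plus continuation bit
        v >>= 7
    return out + struct.pack('B', v)

def decode_zigzag_varint32(buf):
    value, shift = 0, 0
    while True:
        b, = struct.unpack('B', buf.read(1))
        value |= (b & 0x7f) << shift
        if not b & 0x80:                               # high bit clear -> last byte
            break
        shift += 7
    return (value >> 1) ^ -(value & 1)                 # undo zigzag

assert decode_zigzag_varint32(io.BytesIO(encode_zigzag_varint32(-300))) == -300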
+ """ + +@add_metaclass(abc.ABCMeta) class ABCRecords(object): - __metaclass__ = abc.ABCMeta __slots__ = () @abc.abstractmethod diff --git a/kafka/record/default_records.py b/kafka/record/default_records.py index a098c42a9..a3b9cd5d8 100644 --- a/kafka/record/default_records.py +++ b/kafka/record/default_records.py @@ -60,7 +60,7 @@ from kafka.record.util import ( decode_varint, encode_varint, calc_crc32c, size_of_varint ) -from kafka.errors import CorruptRecordException, UnsupportedCodecError +from kafka.errors import CorruptRecordError, UnsupportedCodecError from kafka.codec import ( gzip_encode, snappy_encode, lz4_encode, zstd_encode, gzip_decode, snappy_decode, lz4_decode, zstd_decode @@ -104,6 +104,9 @@ class DefaultRecordBase(object): LOG_APPEND_TIME = 1 CREATE_TIME = 0 + NO_PRODUCER_ID = -1 + NO_SEQUENCE = -1 + MAX_INT = 2147483647 def _assert_has_codec(self, compression_type): if compression_type == self.CODEC_GZIP: @@ -114,6 +117,8 @@ def _assert_has_codec(self, compression_type): checker, name = codecs.has_lz4, "lz4" elif compression_type == self.CODEC_ZSTD: checker, name = codecs.has_zstd, "zstd" + else: + raise UnsupportedCodecError("Unrecognized compression type: %s" % (compression_type,)) if not checker(): raise UnsupportedCodecError( "Libraries for {} compression codec not found".format(name)) @@ -136,6 +141,14 @@ def __init__(self, buffer): def base_offset(self): return self._header_data[0] + @property + def size_in_bytes(self): + return self._header_data[1] + self.AFTER_LEN_OFFSET + + @property + def leader_epoch(self): + return self._header_data[2] + @property def magic(self): return self._header_data[3] @@ -152,6 +165,14 @@ def attributes(self): def last_offset_delta(self): return self._header_data[6] + @property + def last_offset(self): + return self.base_offset + self.last_offset_delta + + @property + def next_offset(self): + return self.last_offset + 1 + @property def compression_type(self): return self.attributes & self.CODEC_MASK @@ -176,6 +197,40 @@ def first_timestamp(self): def max_timestamp(self): return self._header_data[8] + @property + def producer_id(self): + return self._header_data[9] + + def has_producer_id(self): + return self.producer_id > self.NO_PRODUCER_ID + + @property + def producer_epoch(self): + return self._header_data[10] + + @property + def base_sequence(self): + return self._header_data[11] + + @property + def has_sequence(self): + return self._header_data[11] != -1 # NO_SEQUENCE + + @property + def last_sequence(self): + if self.base_sequence == self.NO_SEQUENCE: + return self.NO_SEQUENCE + return self._increment_sequence(self.base_sequence, self.last_offset_delta) + + def _increment_sequence(self, base, increment): + if base > (self.MAX_INT - increment): + return increment - (self.MAX_INT - base) - 1 + return base + increment + + @property + def records_count(self): + return self._header_data[12] + def _maybe_uncompress(self): if not self._decompressed: compression_type = self.compression_type @@ -239,14 +294,14 @@ def _read_msg( header_count, pos = decode_varint(buffer, pos) if header_count < 0: - raise CorruptRecordException("Found invalid number of record " + raise CorruptRecordError("Found invalid number of record " "headers {}".format(header_count)) headers = [] while header_count: # Header key is of type String, that can't be None h_key_len, pos = decode_varint(buffer, pos) if h_key_len < 0: - raise CorruptRecordException( + raise CorruptRecordError( "Invalid negative header key size {}".format(h_key_len)) h_key = buffer[pos: pos + 
h_key_len].decode("utf-8") pos += h_key_len @@ -264,13 +319,17 @@ def _read_msg( # validate whether we have read all header bytes in the current record if pos - start_pos != length: - raise CorruptRecordException( + raise CorruptRecordError( "Invalid record size: expected to read {} bytes in record " "payload, but instead read {}".format(length, pos - start_pos)) self._pos = pos - return DefaultRecord( - offset, timestamp, self.timestamp_type, key, value, headers) + if self.is_control_batch: + return ControlRecord( + length, offset, timestamp, self.timestamp_type, key, value, headers) + else: + return DefaultRecord( + length, offset, timestamp, self.timestamp_type, key, value, headers) def __iter__(self): self._maybe_uncompress() @@ -279,14 +338,14 @@ def __iter__(self): def __next__(self): if self._next_record_index >= self._num_records: if self._pos != len(self._buffer): - raise CorruptRecordException( + raise CorruptRecordError( "{} unconsumed bytes after all records consumed".format( len(self._buffer) - self._pos)) raise StopIteration try: msg = self._read_msg() except (ValueError, IndexError) as err: - raise CorruptRecordException( + raise CorruptRecordError( "Found invalid record structure: {!r}".format(err)) else: self._next_record_index += 1 @@ -303,13 +362,25 @@ def validate_crc(self): verify_crc = calc_crc32c(data_view.tobytes()) return crc == verify_crc + def __str__(self): + return ( + "DefaultRecordBatch(magic={}, base_offset={}, last_offset_delta={}," + " first_timestamp={}, max_timestamp={}," + " is_transactional={}, producer_id={}, producer_epoch={}, base_sequence={}," + " records_count={})".format( + self.magic, self.base_offset, self.last_offset_delta, + self.first_timestamp, self.max_timestamp, + self.is_transactional, self.producer_id, self.producer_epoch, self.base_sequence, + self.records_count)) + class DefaultRecord(ABCRecord): - __slots__ = ("_offset", "_timestamp", "_timestamp_type", "_key", "_value", + __slots__ = ("_size_in_bytes", "_offset", "_timestamp", "_timestamp_type", "_key", "_value", "_headers") - def __init__(self, offset, timestamp, timestamp_type, key, value, headers): + def __init__(self, size_in_bytes, offset, timestamp, timestamp_type, key, value, headers): + self._size_in_bytes = size_in_bytes self._offset = offset self._timestamp = timestamp self._timestamp_type = timestamp_type @@ -317,6 +388,10 @@ def __init__(self, offset, timestamp, timestamp_type, key, value, headers): self._value = value self._headers = headers + @property + def size_in_bytes(self): + return self._size_in_bytes + @property def offset(self): return self._offset @@ -353,6 +428,9 @@ def headers(self): def checksum(self): return None + def validate_crc(self): + return True + def __repr__(self): return ( "DefaultRecord(offset={!r}, timestamp={!r}, timestamp_type={!r}," @@ -362,6 +440,45 @@ def __repr__(self): ) +class ControlRecord(DefaultRecord): + __slots__ = ("_size_in_bytes", "_offset", "_timestamp", "_timestamp_type", "_key", "_value", + "_headers", "_version", "_type") + + KEY_STRUCT = struct.Struct( + ">h" # Current Version => Int16 + "h" # Type => Int16 (0 indicates an abort marker, 1 indicates a commit) + ) + + def __init__(self, size_in_bytes, offset, timestamp, timestamp_type, key, value, headers): + super(ControlRecord, self).__init__(size_in_bytes, offset, timestamp, timestamp_type, key, value, headers) + (self._version, self._type) = self.KEY_STRUCT.unpack(self._key) + + # see https://kafka.apache.org/documentation/#controlbatch + @property + def version(self): + 
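# Illustrative: control record keys are the fixed 4-byte struct declared above
# (version Int16, type Int16), where type 0 marks an aborted transaction and
# type 1 a committed one.
import struct

version, marker_type = struct.unpack('>hh', b'\x00\x00\x00\x01')
assert (version, marker_type) == (0, 1)    # a commit marker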
return self._version + + @property + def type(self): + return self._type + + @property + def abort(self): + return self._type == 0 + + @property + def commit(self): + return self._type == 1 + + def __repr__(self): + return ( + "ControlRecord(offset={!r}, timestamp={!r}, timestamp_type={!r}," + " version={!r}, type={!r} <{!s}>)".format( + self._offset, self._timestamp, self._timestamp_type, + self._version, self._type, "abort" if self.abort else "commit") + ) + + class DefaultRecordBatchBuilder(DefaultRecordBase, ABCRecordBatchBuilder): # excluding key, value and headers: @@ -393,6 +510,23 @@ def __init__( self._buffer = bytearray(self.HEADER_STRUCT.size) + def set_producer_state(self, producer_id, producer_epoch, base_sequence, is_transactional): + assert not is_transactional or producer_id != -1, "Cannot write transactional messages without a valid producer ID" + assert producer_id == -1 or producer_epoch != -1, "Invalid negative producer epoch" + assert producer_id == -1 or base_sequence != -1, "Invalid negative sequence number" + self._producer_id = producer_id + self._producer_epoch = producer_epoch + self._base_sequence = base_sequence + self._is_transactional = is_transactional + + @property + def producer_id(self): + return self._producer_id + + @property + def producer_epoch(self): + return self._producer_epoch + def _get_attributes(self, include_compression_type=True): attrs = 0 if include_compression_type: @@ -501,8 +635,8 @@ def write_header(self, use_compression_type=True): 0, # CRC will be set below, as we need a filled buffer for it self._get_attributes(use_compression_type), self._last_offset, - self._first_timestamp, - self._max_timestamp, + self._first_timestamp or 0, + self._max_timestamp or 0, self._producer_id, self._producer_epoch, self._base_sequence, @@ -547,14 +681,15 @@ def size(self): """ return len(self._buffer) - def size_in_bytes(self, offset, timestamp, key, value, headers): - if self._first_timestamp is not None: - timestamp_delta = timestamp - self._first_timestamp - else: - timestamp_delta = 0 + @classmethod + def header_size_in_bytes(self): + return self.HEADER_STRUCT.size + + @classmethod + def size_in_bytes(self, offset_delta, timestamp_delta, key, value, headers): size_of_body = ( 1 + # Attrs - size_of_varint(offset) + + size_of_varint(offset_delta) + size_of_varint(timestamp_delta) + self.size_of(key, value, headers) ) @@ -597,6 +732,17 @@ def estimate_size_in_bytes(cls, key, value, headers): cls.size_of(key, value, headers) ) + def __str__(self): + return ( + "DefaultRecordBatchBuilder(magic={}, base_offset={}, last_offset_delta={}," + " first_timestamp={}, max_timestamp={}," + " is_transactional={}, producer_id={}, producer_epoch={}, base_sequence={}," + " records_count={})".format( + self._magic, 0, self._last_offset, + self._first_timestamp or 0, self._max_timestamp or 0, + self._is_transactional, self._producer_id, self._producer_epoch, self._base_sequence, + self._num_records)) + class DefaultRecordMetadata(object): diff --git a/kafka/record/legacy_records.py b/kafka/record/legacy_records.py index e2ee5490c..f085978f0 100644 --- a/kafka/record/legacy_records.py +++ b/kafka/record/legacy_records.py @@ -52,7 +52,7 @@ gzip_decode, snappy_decode, lz4_decode, lz4_decode_old_kafka, ) import kafka.codec as codecs -from kafka.errors import CorruptRecordException, UnsupportedCodecError +from kafka.errors import CorruptRecordError, UnsupportedCodecError class LegacyRecordBase(object): @@ -129,7 +129,7 @@ def _assert_has_codec(self, compression_type): class 
LegacyRecordBatch(ABCRecordBatch, LegacyRecordBase): - __slots__ = ("_buffer", "_magic", "_offset", "_crc", "_timestamp", + __slots__ = ("_buffer", "_magic", "_offset", "_length", "_crc", "_timestamp", "_attributes", "_decompressed") def __init__(self, buffer, magic): @@ -141,11 +141,20 @@ def __init__(self, buffer, magic): assert magic == magic_ self._offset = offset + self._length = length self._crc = crc self._timestamp = timestamp self._attributes = attrs self._decompressed = False + @property + def base_offset(self): + return self._offset + + @property + def size_in_bytes(self): + return self._length + self.LOG_OVERHEAD + @property def timestamp_type(self): """0 for CreateTime; 1 for LogAppendTime; None if unsupported. @@ -164,6 +173,10 @@ def timestamp_type(self): def compression_type(self): return self._attributes & self.CODEC_MASK + @property + def magic(self): + return self._magic + def validate_crc(self): crc = calc_crc32(self._buffer[self.MAGIC_OFFSET:]) return self._crc == crc @@ -178,7 +191,7 @@ def _decompress(self, key_offset): value_size = struct.unpack_from(">i", self._buffer, pos)[0] pos += self.VALUE_LENGTH if value_size == -1: - raise CorruptRecordException("Value of compressed message is None") + raise CorruptRecordError("Value of compressed message is None") else: data = self._buffer[pos:pos + value_size] @@ -232,6 +245,9 @@ def _read_key_value(self, pos): value = self._buffer[pos:pos + value_size].tobytes() return key, value + def _crc_bytes(self, msg_pos, length): + return self._buffer[msg_pos + self.MAGIC_OFFSET:msg_pos + self.LOG_OVERHEAD + length] + def __iter__(self): if self._magic == 1: key_offset = self.KEY_OFFSET_V1 @@ -255,7 +271,7 @@ def __iter__(self): absolute_base_offset = -1 for header, msg_pos in headers: - offset, _, crc, _, attrs, timestamp = header + offset, length, crc, _, attrs, timestamp = header # There should only ever be a single layer of compression assert not attrs & self.CODEC_MASK, ( 'MessageSet at offset %d appears double-compressed. 
This ' @@ -263,7 +279,7 @@ def __iter__(self): # When magic value is greater than 0, the timestamp # of a compressed message depends on the - # typestamp type of the wrapper message: + # timestamp type of the wrapper message: if timestamp_type == self.LOG_APPEND_TIME: timestamp = self._timestamp @@ -271,28 +287,36 @@ def __iter__(self): offset += absolute_base_offset key, value = self._read_key_value(msg_pos + key_offset) + crc_bytes = self._crc_bytes(msg_pos, length) yield LegacyRecord( - offset, timestamp, timestamp_type, - key, value, crc) + self._magic, offset, timestamp, timestamp_type, + key, value, crc, crc_bytes) else: key, value = self._read_key_value(key_offset) + crc_bytes = self._crc_bytes(0, len(self._buffer) - self.LOG_OVERHEAD) yield LegacyRecord( - self._offset, self._timestamp, timestamp_type, - key, value, self._crc) + self._magic, self._offset, self._timestamp, timestamp_type, + key, value, self._crc, crc_bytes) class LegacyRecord(ABCRecord): - __slots__ = ("_offset", "_timestamp", "_timestamp_type", "_key", "_value", - "_crc") + __slots__ = ("_magic", "_offset", "_timestamp", "_timestamp_type", "_key", "_value", + "_crc", "_crc_bytes") - def __init__(self, offset, timestamp, timestamp_type, key, value, crc): + def __init__(self, magic, offset, timestamp, timestamp_type, key, value, crc, crc_bytes): + self._magic = magic self._offset = offset self._timestamp = timestamp self._timestamp_type = timestamp_type self._key = key self._value = value self._crc = crc + self._crc_bytes = crc_bytes + + @property + def magic(self): + return self._magic @property def offset(self): @@ -330,11 +354,19 @@ def headers(self): def checksum(self): return self._crc + def validate_crc(self): + crc = calc_crc32(self._crc_bytes) + return self._crc == crc + + @property + def size_in_bytes(self): + return LegacyRecordBatchBuilder.estimate_size_in_bytes(self._magic, None, self._key, self._value) + def __repr__(self): return ( - "LegacyRecord(offset={!r}, timestamp={!r}, timestamp_type={!r}," + "LegacyRecord(magic={!r} offset={!r}, timestamp={!r}, timestamp_type={!r}," " key={!r}, value={!r}, crc={!r})".format( - self._offset, self._timestamp, self._timestamp_type, + self._magic, self._offset, self._timestamp, self._timestamp_type, self._key, self._value, self._crc) ) diff --git a/kafka/record/memory_records.py b/kafka/record/memory_records.py index fc2ef2d6b..9df733059 100644 --- a/kafka/record/memory_records.py +++ b/kafka/record/memory_records.py @@ -22,7 +22,7 @@ import struct -from kafka.errors import CorruptRecordException +from kafka.errors import CorruptRecordError, IllegalStateError, UnsupportedVersionError from kafka.record.abc import ABCRecords from kafka.record.legacy_records import LegacyRecordBatch, LegacyRecordBatchBuilder from kafka.record.default_records import DefaultRecordBatch, DefaultRecordBatchBuilder @@ -99,7 +99,7 @@ def next_batch(self, _min_slice=MIN_SLICE, if next_slice is None: return None if len(next_slice) < _min_slice: - raise CorruptRecordException( + raise CorruptRecordError( "Record size is less than the minimum record overhead " "({})".format(_min_slice - self.LOG_OVERHEAD)) self._cache_next() @@ -109,31 +109,56 @@ def next_batch(self, _min_slice=MIN_SLICE, else: return DefaultRecordBatch(next_slice) + def __iter__(self): + return self + + def __next__(self): + if not self.has_next(): + raise StopIteration + return self.next_batch() + + next = __next__ + class MemoryRecordsBuilder(object): __slots__ = ("_builder", "_batch_size", "_buffer", "_next_offset", 
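# Illustrative usage sketch of the MemoryRecords iteration support added above and the
# MemoryRecordsBuilder that follows, with made-up timestamp/key/value: build one v2
# batch, close it, then walk the resulting MemoryRecords batch by batch.
from kafka.record.memory_records import MemoryRecordsBuilder

builder = MemoryRecordsBuilder(magic=2, compression_type=0, batch_size=16384)
builder.append(timestamp=1691000000000, key=b'k', value=b'v', headers=[])
builder.close()                        # seals the batch; for magic=2 captures producer id/epoch
records = builder.records()            # MemoryRecords over the built buffer
for batch in records:                  # uses the new __iter__/__next__ on MemoryRecords
    for record in batch:
        print(record.offset, record.value)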
"_closed", - "_bytes_written") + "_magic", "_bytes_written", "_producer_id", "_producer_epoch") - def __init__(self, magic, compression_type, batch_size): + def __init__(self, magic, compression_type, batch_size, offset=0, + transactional=False, producer_id=-1, producer_epoch=-1, base_sequence=-1): assert magic in [0, 1, 2], "Not supported magic" assert compression_type in [0, 1, 2, 3, 4], "Not valid compression type" if magic >= 2: + assert not transactional or producer_id != -1, "Cannot write transactional messages without a valid producer ID" + assert producer_id == -1 or producer_epoch != -1, "Invalid negative producer epoch" + assert producer_id == -1 or base_sequence != -1, "Invalid negative sequence number used" + self._builder = DefaultRecordBatchBuilder( magic=magic, compression_type=compression_type, - is_transactional=False, producer_id=-1, producer_epoch=-1, - base_sequence=-1, batch_size=batch_size) + is_transactional=transactional, producer_id=producer_id, + producer_epoch=producer_epoch, base_sequence=base_sequence, + batch_size=batch_size) + self._producer_id = producer_id + self._producer_epoch = producer_epoch else: + assert not transactional and producer_id == -1, "Idempotent messages are not supported for magic %s" % (magic,) self._builder = LegacyRecordBatchBuilder( magic=magic, compression_type=compression_type, batch_size=batch_size) + self._producer_id = None self._batch_size = batch_size self._buffer = None - self._next_offset = 0 + self._next_offset = offset self._closed = False + self._magic = magic self._bytes_written = 0 + def skip(self, offsets_to_skip): + # Exposed for testing compacted records + self._next_offset += offsets_to_skip + def append(self, timestamp, key, value, headers=[]): """ Append a message to the buffer. @@ -151,6 +176,30 @@ def append(self, timestamp, key, value, headers=[]): self._next_offset += 1 return metadata + def set_producer_state(self, producer_id, producer_epoch, base_sequence, is_transactional): + if self._magic < 2: + raise UnsupportedVersionError('Producer State requires Message format v2+') + elif self._closed: + # Sequence numbers are assigned when the batch is closed while the accumulator is being drained. + # If the resulting ProduceRequest to the partition leader failed for a retriable error, the batch will + # be re queued. In this case, we should not attempt to set the state again, since changing the pid and sequence + # once a batch has been sent to the broker risks introducing duplicates. + raise IllegalStateError("Trying to set producer state of an already closed batch. 
This indicates a bug on the client.") + self._builder.set_producer_state(producer_id, producer_epoch, base_sequence, is_transactional) + self._producer_id = producer_id + + @property + def producer_id(self): + return self._producer_id + + @property + def producer_epoch(self): + return self._producer_epoch + + def records(self): + assert self._closed + return MemoryRecords(self._buffer) + def close(self): # This method may be called multiple times on the same batch # i.e., on retries @@ -160,6 +209,9 @@ def close(self): if not self._closed: self._bytes_written = self._builder.size() self._buffer = bytes(self._builder.build()) + if self._magic == 2: + self._producer_id = self._builder.producer_id + self._producer_epoch = self._builder.producer_epoch self._builder = None self._closed = True diff --git a/kafka/sasl/__init__.py b/kafka/sasl/__init__.py new file mode 100644 index 000000000..90f05e733 --- /dev/null +++ b/kafka/sasl/__init__.py @@ -0,0 +1,34 @@ +from __future__ import absolute_import + +import platform + +from kafka.sasl.gssapi import SaslMechanismGSSAPI +from kafka.sasl.msk import SaslMechanismAwsMskIam +from kafka.sasl.oauth import SaslMechanismOAuth +from kafka.sasl.plain import SaslMechanismPlain +from kafka.sasl.scram import SaslMechanismScram +from kafka.sasl.sspi import SaslMechanismSSPI + + +SASL_MECHANISMS = {} + + +def register_sasl_mechanism(name, klass, overwrite=False): + if not overwrite and name in SASL_MECHANISMS: + raise ValueError('Sasl mechanism %s already defined!' % name) + SASL_MECHANISMS[name] = klass + + +def get_sasl_mechanism(name): + return SASL_MECHANISMS[name] + + +register_sasl_mechanism('AWS_MSK_IAM', SaslMechanismAwsMskIam) +if platform.system() == 'Windows': + register_sasl_mechanism('GSSAPI', SaslMechanismSSPI) +else: + register_sasl_mechanism('GSSAPI', SaslMechanismGSSAPI) +register_sasl_mechanism('OAUTHBEARER', SaslMechanismOAuth) +register_sasl_mechanism('PLAIN', SaslMechanismPlain) +register_sasl_mechanism('SCRAM-SHA-256', SaslMechanismScram) +register_sasl_mechanism('SCRAM-SHA-512', SaslMechanismScram) diff --git a/kafka/sasl/abc.py b/kafka/sasl/abc.py new file mode 100644 index 000000000..0577888a9 --- /dev/null +++ b/kafka/sasl/abc.py @@ -0,0 +1,33 @@ +from __future__ import absolute_import + +import abc + +from kafka.vendor.six import add_metaclass + + +@add_metaclass(abc.ABCMeta) +class SaslMechanism(object): + @abc.abstractmethod + def __init__(self, **config): + pass + + @abc.abstractmethod + def auth_bytes(self): + pass + + @abc.abstractmethod + def receive(self, auth_bytes): + pass + + @abc.abstractmethod + def is_done(self): + pass + + @abc.abstractmethod + def is_authenticated(self): + pass + + def auth_details(self): + if not self.is_authenticated: + raise RuntimeError('Not authenticated yet!') + return 'Authenticated via SASL' diff --git a/kafka/sasl/gssapi.py b/kafka/sasl/gssapi.py new file mode 100644 index 000000000..be84269da --- /dev/null +++ b/kafka/sasl/gssapi.py @@ -0,0 +1,87 @@ +from __future__ import absolute_import + +# needed for SASL_GSSAPI authentication: +try: + import gssapi + from gssapi.raw.misc import GSSError +except (ImportError, OSError): + #no gssapi available, will disable gssapi mechanism + gssapi = None + GSSError = None + +from kafka.sasl.abc import SaslMechanism + + +class SaslMechanismGSSAPI(SaslMechanism): + # Establish security context and negotiate protection level + # For reference RFC 2222, section 7.2.1 + + SASL_QOP_AUTH = 1 + SASL_QOP_AUTH_INT = 2 + SASL_QOP_AUTH_CONF = 4 + + def __init__(self, 
**config): + assert gssapi is not None, 'GSSAPI lib not available' + if 'sasl_kerberos_name' not in config and 'sasl_kerberos_service_name' not in config: + raise ValueError('sasl_kerberos_service_name or sasl_kerberos_name required for GSSAPI sasl configuration') + self._is_done = False + self._is_authenticated = False + if config.get('sasl_kerberos_name', None) is not None: + self.auth_id = str(config['sasl_kerberos_name']) + else: + kerberos_domain_name = config.get('sasl_kerberos_domain_name', '') or config.get('host', '') + self.auth_id = config['sasl_kerberos_service_name'] + '@' + kerberos_domain_name + if isinstance(config.get('sasl_kerberos_name', None), gssapi.Name): + self.gssapi_name = config['sasl_kerberos_name'] + else: + self.gssapi_name = gssapi.Name(self.auth_id, name_type=gssapi.NameType.hostbased_service).canonicalize(gssapi.MechType.kerberos) + self._client_ctx = gssapi.SecurityContext(name=self.gssapi_name, usage='initiate') + self._next_token = self._client_ctx.step(None) + + def auth_bytes(self): + # GSSAPI Auth does not have a final broker->client message + # so mark is_done after the final auth_bytes are provided + # in practice we'll still receive a response when using SaslAuthenticate + # but not when using the prior unframed approach. + if self._client_ctx.complete: + self._is_done = True + self._is_authenticated = True + return self._next_token or b'' + + def receive(self, auth_bytes): + if not self._client_ctx.complete: + # The server will send a token back. Processing of this token either + # establishes a security context, or it needs further token exchange. + # The gssapi will be able to identify the needed next step. + self._next_token = self._client_ctx.step(auth_bytes) + elif self._is_done: + # The final step of gssapi is send, so we do not expect any additional bytes + # however, allow an empty message to support SaslAuthenticate response + if auth_bytes != b'': + raise ValueError("Unexpected receive auth_bytes after sasl/gssapi completion") + else: + # unwraps message containing supported protection levels and msg size + msg = self._client_ctx.unwrap(auth_bytes).message + # Kafka currently doesn't support integrity or confidentiality security layers, so we + # simply set QoP to 'auth' only (first octet). 
We reuse the max message size proposed
+            # by the server
+            client_flags = self.SASL_QOP_AUTH
+            server_flags = msg[0]
+            message_parts = [
+                bytes(client_flags & server_flags),
+                msg[:1],
+                self.auth_id.encode('utf-8'),
+            ]
+            # add authorization identity to the response, and GSS-wrap
+            self._next_token = self._client_ctx.wrap(b''.join(message_parts), False).message
+
+    def is_done(self):
+        return self._is_done
+
+    def is_authenticated(self):
+        return self._is_authenticated
+
+    def auth_details(self):
+        if not self.is_authenticated:
+            raise RuntimeError('Not authenticated yet!')
+        return 'Authenticated as %s to %s via SASL / GSSAPI' % (self._client_ctx.initiator_name, self._client_ctx.target_name)
diff --git a/kafka/sasl/msk.py b/kafka/sasl/msk.py
new file mode 100644
index 000000000..db56b4801
--- /dev/null
+++ b/kafka/sasl/msk.py
@@ -0,0 +1,233 @@
+from __future__ import absolute_import
+
+import datetime
+import hashlib
+import hmac
+import json
+import string
+
+# needed for AWS_MSK_IAM authentication:
+try:
+    from botocore.session import Session as BotoSession
+except ImportError:
+    # no botocore available, will disable AWS_MSK_IAM mechanism
+    BotoSession = None
+
+from kafka.sasl.abc import SaslMechanism
+from kafka.vendor.six.moves import urllib
+
+
+class SaslMechanismAwsMskIam(SaslMechanism):
+    def __init__(self, **config):
+        assert BotoSession is not None, 'AWS_MSK_IAM requires the "botocore" package'
+        assert config.get('security_protocol', '') == 'SASL_SSL', 'AWS_MSK_IAM requires SASL_SSL'
+        assert 'host' in config, 'AWS_MSK_IAM requires host configuration'
+        self.host = config['host']
+        self._auth = None
+        self._is_done = False
+        self._is_authenticated = False
+
+    def auth_bytes(self):
+        session = BotoSession()
+        credentials = session.get_credentials().get_frozen_credentials()
+        client = AwsMskIamClient(
+            host=self.host,
+            access_key=credentials.access_key,
+            secret_key=credentials.secret_key,
+            region=session.get_config_variable('region'),
+            token=credentials.token,
+        )
+        return client.first_message()
+
+    def receive(self, auth_bytes):
+        self._is_done = True
+        self._is_authenticated = auth_bytes != b''
+        self._auth = auth_bytes.decode('utf-8')
+
+    def is_done(self):
+        return self._is_done
+
+    def is_authenticated(self):
+        return self._is_authenticated
+
+    def auth_details(self):
+        if not self.is_authenticated:
+            raise RuntimeError('Not authenticated yet!')
+        return 'Authenticated via SASL / AWS_MSK_IAM %s' % (self._auth,)
+
+
+class AwsMskIamClient:
+    UNRESERVED_CHARS = string.ascii_letters + string.digits + '-._~'
+
+    def __init__(self, host, access_key, secret_key, region, token=None):
+        """
+        Arguments:
+            host (str): The hostname of the broker.
+            access_key (str): An AWS_ACCESS_KEY_ID.
+            secret_key (str): An AWS_SECRET_ACCESS_KEY.
+            region (str): An AWS_REGION.
+            token (Optional[str]): An AWS_SESSION_TOKEN if using temporary
+                credentials.
+ """ + self.algorithm = 'AWS4-HMAC-SHA256' + self.expires = '900' + self.hashfunc = hashlib.sha256 + self.headers = [ + ('host', host) + ] + self.version = '2020_10_22' + + self.service = 'kafka-cluster' + self.action = '{}:Connect'.format(self.service) + + now = datetime.datetime.utcnow() + self.datestamp = now.strftime('%Y%m%d') + self.timestamp = now.strftime('%Y%m%dT%H%M%SZ') + + self.host = host + self.access_key = access_key + self.secret_key = secret_key + self.region = region + self.token = token + + @property + def _credential(self): + return '{0.access_key}/{0._scope}'.format(self) + + @property + def _scope(self): + return '{0.datestamp}/{0.region}/{0.service}/aws4_request'.format(self) + + @property + def _signed_headers(self): + """ + Returns (str): + An alphabetically sorted, semicolon-delimited list of lowercase + request header names. + """ + return ';'.join(sorted(k.lower() for k, _ in self.headers)) + + @property + def _canonical_headers(self): + """ + Returns (str): + A newline-delited list of header names and values. + Header names are lowercased. + """ + return '\n'.join(map(':'.join, self.headers)) + '\n' + + @property + def _canonical_request(self): + """ + Returns (str): + An AWS Signature Version 4 canonical request in the format: + \n + \n + \n + \n + \n + + """ + # The hashed_payload is always an empty string for MSK. + hashed_payload = self.hashfunc(b'').hexdigest() + return '\n'.join(( + 'GET', + '/', + self._canonical_querystring, + self._canonical_headers, + self._signed_headers, + hashed_payload, + )) + + @property + def _canonical_querystring(self): + """ + Returns (str): + A '&'-separated list of URI-encoded key/value pairs. + """ + params = [] + params.append(('Action', self.action)) + params.append(('X-Amz-Algorithm', self.algorithm)) + params.append(('X-Amz-Credential', self._credential)) + params.append(('X-Amz-Date', self.timestamp)) + params.append(('X-Amz-Expires', self.expires)) + if self.token: + params.append(('X-Amz-Security-Token', self.token)) + params.append(('X-Amz-SignedHeaders', self._signed_headers)) + + return '&'.join(self._uriencode(k) + '=' + self._uriencode(v) for k, v in params) + + @property + def _signing_key(self): + """ + Returns (bytes): + An AWS Signature V4 signing key generated from the secret_key, date, + region, service, and request type. + """ + key = self._hmac(('AWS4' + self.secret_key).encode('utf-8'), self.datestamp) + key = self._hmac(key, self.region) + key = self._hmac(key, self.service) + key = self._hmac(key, 'aws4_request') + return key + + @property + def _signing_str(self): + """ + Returns (str): + A string used to sign the AWS Signature V4 payload in the format: + \n + \n + \n + + """ + canonical_request_hash = self.hashfunc(self._canonical_request.encode('utf-8')).hexdigest() + return '\n'.join((self.algorithm, self.timestamp, self._scope, canonical_request_hash)) + + def _uriencode(self, msg): + """ + Arguments: + msg (str): A string to URI-encode. + + Returns (str): + The URI-encoded version of the provided msg, following the encoding + rules specified: https://github.com/aws/aws-msk-iam-auth#uriencode + """ + return urllib.parse.quote(msg, safe=self.UNRESERVED_CHARS) + + def _hmac(self, key, msg): + """ + Arguments: + key (bytes): A key to use for the HMAC digest. + msg (str): A value to include in the HMAC digest. + Returns (bytes): + An HMAC digest of the given key and msg. 
+ """ + return hmac.new(key, msg.encode('utf-8'), digestmod=self.hashfunc).digest() + + def first_message(self): + """ + Returns (bytes): + An encoded JSON authentication payload that can be sent to the + broker. + """ + signature = hmac.new( + self._signing_key, + self._signing_str.encode('utf-8'), + digestmod=self.hashfunc, + ).hexdigest() + msg = { + 'version': self.version, + 'host': self.host, + 'user-agent': 'kafka-python', + 'action': self.action, + 'x-amz-algorithm': self.algorithm, + 'x-amz-credential': self._credential, + 'x-amz-date': self.timestamp, + 'x-amz-signedheaders': self._signed_headers, + 'x-amz-expires': self.expires, + 'x-amz-signature': signature, + } + if self.token: + msg['x-amz-security-token'] = self.token + + return json.dumps(msg, separators=(',', ':')).encode('utf-8') diff --git a/kafka/sasl/oauth.py b/kafka/sasl/oauth.py new file mode 100644 index 000000000..f1e959cb6 --- /dev/null +++ b/kafka/sasl/oauth.py @@ -0,0 +1,100 @@ +from __future__ import absolute_import + +import abc +import logging + +from kafka.sasl.abc import SaslMechanism + + +log = logging.getLogger(__name__) + + +class SaslMechanismOAuth(SaslMechanism): + + def __init__(self, **config): + assert 'sasl_oauth_token_provider' in config, 'sasl_oauth_token_provider required for OAUTHBEARER sasl' + assert isinstance(config['sasl_oauth_token_provider'], AbstractTokenProvider), \ + 'sasl_oauth_token_provider must implement kafka.sasl.oauth.AbstractTokenProvider' + self.token_provider = config['sasl_oauth_token_provider'] + self._error = None + self._is_done = False + self._is_authenticated = False + + def auth_bytes(self): + if self._error: + # Server should respond to this with SaslAuthenticate failure, which ends the auth process + return self._error + token = self.token_provider.token() + extensions = self._token_extensions() + return "n,,\x01auth=Bearer {}{}\x01\x01".format(token, extensions).encode('utf-8') + + def receive(self, auth_bytes): + if auth_bytes != b'': + error = auth_bytes.decode('utf-8') + log.debug("Sending x01 response to server after receiving SASL OAuth error: %s", error) + self._error = b'\x01' + else: + self._is_done = True + self._is_authenticated = True + + def is_done(self): + return self._is_done + + def is_authenticated(self): + return self._is_authenticated + + def _token_extensions(self): + """ + Return a string representation of the OPTIONAL key-value pairs that can be sent with an OAUTHBEARER + initial request. + """ + # Builds up a string separated by \x01 via a dict of key value pairs + extensions = self.token_provider.extensions() + msg = '\x01'.join(['{}={}'.format(k, v) for k, v in extensions.items()]) + return '\x01' + msg if msg else '' + + def auth_details(self): + if not self.is_authenticated: + raise RuntimeError('Not authenticated yet!') + return 'Authenticated via SASL / OAuth' + +# This statement is compatible with both Python 2.7 & 3+ +ABC = abc.ABCMeta('ABC', (object,), {'__slots__': ()}) + +class AbstractTokenProvider(ABC): + """ + A Token Provider must be used for the SASL OAuthBearer protocol. + + The implementation should ensure token reuse so that multiple + calls at connect time do not create multiple tokens. The implementation + should also periodically refresh the token in order to guarantee + that each call returns an unexpired token. A timeout error should + be returned after a short period of inactivity so that the + broker can log debugging info and retry. 
+ + Token Providers MUST implement the token() method + """ + + def __init__(self, **config): + pass + + @abc.abstractmethod + def token(self): + """ + Returns a (str) ID/Access Token to be sent to the Kafka + client. + """ + pass + + def extensions(self): + """ + This is an OPTIONAL method that may be implemented. + + Returns a map of key-value pairs that can + be sent with the SASL/OAUTHBEARER initial client request. If + not implemented, the values are ignored. This feature is only available + in Kafka >= 2.1.0. + + All returned keys and values should be type str + """ + return {} diff --git a/kafka/sasl/plain.py b/kafka/sasl/plain.py new file mode 100644 index 000000000..81443f5fe --- /dev/null +++ b/kafka/sasl/plain.py @@ -0,0 +1,41 @@ +from __future__ import absolute_import + +import logging + +from kafka.sasl.abc import SaslMechanism + + +log = logging.getLogger(__name__) + + +class SaslMechanismPlain(SaslMechanism): + + def __init__(self, **config): + if config.get('security_protocol', '') == 'SASL_PLAINTEXT': + log.warning('Sending username and password in the clear') + assert 'sasl_plain_username' in config, 'sasl_plain_username required for PLAIN sasl' + assert 'sasl_plain_password' in config, 'sasl_plain_password required for PLAIN sasl' + + self.username = config['sasl_plain_username'] + self.password = config['sasl_plain_password'] + self._is_done = False + self._is_authenticated = False + + def auth_bytes(self): + # Send PLAIN credentials per RFC-4616 + return bytes('\0'.join([self.username, self.username, self.password]).encode('utf-8')) + + def receive(self, auth_bytes): + self._is_done = True + self._is_authenticated = auth_bytes == b'' + + def is_done(self): + return self._is_done + + def is_authenticated(self): + return self._is_authenticated + + def auth_details(self): + if not self.is_authenticated: + raise RuntimeError('Not authenticated yet!') + return 'Authenticated as %s via SASL / Plain' % self.username diff --git a/kafka/sasl/scram.py b/kafka/sasl/scram.py new file mode 100644 index 000000000..d8cd071a7 --- /dev/null +++ b/kafka/sasl/scram.py @@ -0,0 +1,133 @@ +from __future__ import absolute_import + +import base64 +import hashlib +import hmac +import logging +import uuid + + +from kafka.sasl.abc import SaslMechanism +from kafka.vendor import six + + +log = logging.getLogger(__name__) + + +if six.PY2: + def xor_bytes(left, right): + return bytearray(ord(lb) ^ ord(rb) for lb, rb in zip(left, right)) +else: + def xor_bytes(left, right): + return bytes(lb ^ rb for lb, rb in zip(left, right)) + + +class SaslMechanismScram(SaslMechanism): + def __init__(self, **config): + assert 'sasl_plain_username' in config, 'sasl_plain_username required for SCRAM sasl' + assert 'sasl_plain_password' in config, 'sasl_plain_password required for SCRAM sasl' + assert config.get('sasl_mechanism', '') in ScramClient.MECHANISMS, 'Unrecognized SCRAM mechanism' + if config.get('security_protocol', '') == 'SASL_PLAINTEXT': + log.warning('Exchanging credentials in the clear during Sasl Authentication') + + self.username = config['sasl_plain_username'] + self.mechanism = config['sasl_mechanism'] + self._scram_client = ScramClient( + config['sasl_plain_username'], + config['sasl_plain_password'], + config['sasl_mechanism'] + ) + self._state = 0 + + def auth_bytes(self): + if self._state == 0: + return self._scram_client.first_message() + elif self._state == 1: + return self._scram_client.final_message() + else: + raise ValueError('No auth_bytes for state: %s' % self._state) + + def 
receive(self, auth_bytes): + if self._state == 0: + self._scram_client.process_server_first_message(auth_bytes) + elif self._state == 1: + self._scram_client.process_server_final_message(auth_bytes) + else: + raise ValueError('Cannot receive bytes in state: %s' % self._state) + self._state += 1 + return self.is_done() + + def is_done(self): + return self._state == 2 + + def is_authenticated(self): + # receive raises if authentication fails...? + return self._state == 2 + + def auth_details(self): + if not self.is_authenticated: + raise RuntimeError('Not authenticated yet!') + return 'Authenticated as %s via SASL / %s' % (self.username, self.mechanism) + + +class ScramClient: + MECHANISMS = { + 'SCRAM-SHA-256': hashlib.sha256, + 'SCRAM-SHA-512': hashlib.sha512 + } + + def __init__(self, user, password, mechanism): + self.nonce = str(uuid.uuid4()).replace('-', '').encode('utf-8') + self.auth_message = b'' + self.salted_password = None + self.user = user.encode('utf-8') + self.password = password.encode('utf-8') + self.hashfunc = self.MECHANISMS[mechanism] + self.hashname = ''.join(mechanism.lower().split('-')[1:3]) + self.stored_key = None + self.client_key = None + self.client_signature = None + self.client_proof = None + self.server_key = None + self.server_signature = None + + def first_message(self): + client_first_bare = b'n=' + self.user + b',r=' + self.nonce + self.auth_message += client_first_bare + return b'n,,' + client_first_bare + + def process_server_first_message(self, server_first_message): + self.auth_message += b',' + server_first_message + params = dict(pair.split('=', 1) for pair in server_first_message.decode('utf-8').split(',')) + server_nonce = params['r'].encode('utf-8') + if not server_nonce.startswith(self.nonce): + raise ValueError("Server nonce, did not start with client nonce!") + self.nonce = server_nonce + self.auth_message += b',c=biws,r=' + self.nonce + + salt = base64.b64decode(params['s'].encode('utf-8')) + iterations = int(params['i']) + self.create_salted_password(salt, iterations) + + self.client_key = self.hmac(self.salted_password, b'Client Key') + self.stored_key = self.hashfunc(self.client_key).digest() + self.client_signature = self.hmac(self.stored_key, self.auth_message) + self.client_proof = xor_bytes(self.client_key, self.client_signature) + self.server_key = self.hmac(self.salted_password, b'Server Key') + self.server_signature = self.hmac(self.server_key, self.auth_message) + + def hmac(self, key, msg): + return hmac.new(key, msg, digestmod=self.hashfunc).digest() + + def create_salted_password(self, salt, iterations): + self.salted_password = hashlib.pbkdf2_hmac( + self.hashname, self.password, salt, iterations + ) + + def final_message(self): + return b'c=biws,r=' + self.nonce + b',p=' + base64.b64encode(self.client_proof) + + def process_server_final_message(self, server_final_message): + params = dict(pair.split('=', 1) for pair in server_final_message.decode('utf-8').split(',')) + if self.server_signature != base64.b64decode(params['v'].encode('utf-8')): + raise ValueError("Server sent wrong signature!") diff --git a/kafka/sasl/sspi.py b/kafka/sasl/sspi.py new file mode 100644 index 000000000..f4c95d037 --- /dev/null +++ b/kafka/sasl/sspi.py @@ -0,0 +1,111 @@ +from __future__ import absolute_import + +import logging + +# Windows-only +try: + import sspi + import pywintypes + import sspicon + import win32security +except ImportError: + sspi = None + +from kafka.sasl.abc import SaslMechanism + + +log = logging.getLogger(__name__) + + +class 
SaslMechanismSSPI(SaslMechanism): + # Establish security context and negotiate protection level + # For reference see RFC 4752, section 3 + + SASL_QOP_AUTH = 1 + SASL_QOP_AUTH_INT = 2 + SASL_QOP_AUTH_CONF = 4 + + def __init__(self, **config): + assert sspi is not None, 'No GSSAPI lib available (gssapi or sspi)' + if 'sasl_kerberos_name' not in config and 'sasl_kerberos_service_name' not in config: + raise ValueError('sasl_kerberos_service_name or sasl_kerberos_name required for GSSAPI sasl configuration') + self._is_done = False + self._is_authenticated = False + if config.get('sasl_kerberos_name', None) is not None: + self.auth_id = str(config['sasl_kerberos_name']) + else: + kerberos_domain_name = config.get('sasl_kerberos_domain_name', '') or config.get('host', '') + self.auth_id = config['sasl_kerberos_service_name'] + '/' + kerberos_domain_name + scheme = "Kerberos" # Do not try with Negotiate for SASL authentication. Tokens are different. + # https://docs.microsoft.com/en-us/windows/win32/secauthn/context-requirements + flags = ( + sspicon.ISC_REQ_MUTUAL_AUTH | # mutual authentication + sspicon.ISC_REQ_INTEGRITY | # check for integrity + sspicon.ISC_REQ_SEQUENCE_DETECT | # enable out-of-order messages + sspicon.ISC_REQ_CONFIDENTIALITY # request confidentiality + ) + self._client_ctx = sspi.ClientAuth(scheme, targetspn=self.auth_id, scflags=flags) + self._next_token = self._client_ctx.step(None) + + def auth_bytes(self): + # GSSAPI Auth does not have a final broker->client message + # so mark is_done after the final auth_bytes are provided + # in practice we'll still receive a response when using SaslAuthenticate + # but not when using the prior unframed approach. + if self._client_ctx.authenticated: + self._is_done = True + self._is_authenticated = True + return self._next_token or b'' + + def receive(self, auth_bytes): + log.debug("Received token from server (size %s)", len(auth_bytes)) + if not self._client_ctx.authenticated: + # calculate an output token from kafka token (or None on first iteration) + # https://docs.microsoft.com/en-us/windows/win32/api/sspi/nf-sspi-initializesecuritycontexta + # https://docs.microsoft.com/en-us/windows/win32/secauthn/initializesecuritycontext--kerberos + # authorize method will wrap for us our token in sspi structures + error, auth = self._client_ctx.authorize(auth_bytes) + if len(auth) > 0 and len(auth[0].Buffer): + log.debug("Got token from context") + # this buffer must be sent to the server whatever the result is + self._next_token = auth[0].Buffer + else: + log.debug("Got no token, exchange finished") + # seems to be the end of the loop + self._next_token = b'' + elif self._is_done: + # The final step of gssapi is send, so we do not expect any additional bytes + # however, allow an empty message to support SaslAuthenticate response + if auth_bytes != b'': + raise ValueError("Unexpected receive auth_bytes after sasl/gssapi completion") + else: + # Process the security layer negotiation token, sent by the server + # once the security context is established. + + # The following part is required by SASL, but not by classic Kerberos. + # See RFC 4752 + + # unwraps message containing supported protection levels and msg size + msg, _was_encrypted = self._client_ctx.unwrap(auth_bytes) + + # Kafka currently doesn't support integrity or confidentiality security layers, so we + # simply set QoP to 'auth' only (first octet). 
We reuse the max message size proposed + # by the server + client_flags = self.SASL_QOP_AUTH + server_flags = msg[0] + message_parts = [ + bytes(client_flags & server_flags), + msg[:1], + self.auth_id.encode('utf-8'), + ] + # add authorization identity to the response, and GSS-wrap + self._next_token = self._client_ctx.wrap(b''.join(message_parts), False) + + def is_done(self): + return self._is_done + + def is_authenticated(self): + return self._is_authenticated + + def auth_details(self): + return 'Authenticated as %s to %s via SASL / SSPI/GSSAPI \\o/' % (self._client_ctx.initiator_name, self._client_ctx.service_name) diff --git a/kafka/scram.py b/kafka/scram.py deleted file mode 100644 index 7f003750c..000000000 --- a/kafka/scram.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import absolute_import - -import base64 -import hashlib -import hmac -import uuid - -from kafka.vendor import six - - -if six.PY2: - def xor_bytes(left, right): - return bytearray(ord(lb) ^ ord(rb) for lb, rb in zip(left, right)) -else: - def xor_bytes(left, right): - return bytes(lb ^ rb for lb, rb in zip(left, right)) - - -class ScramClient: - MECHANISMS = { - 'SCRAM-SHA-256': hashlib.sha256, - 'SCRAM-SHA-512': hashlib.sha512 - } - - def __init__(self, user, password, mechanism): - self.nonce = str(uuid.uuid4()).replace('-', '') - self.auth_message = '' - self.salted_password = None - self.user = user - self.password = password.encode('utf-8') - self.hashfunc = self.MECHANISMS[mechanism] - self.hashname = ''.join(mechanism.lower().split('-')[1:3]) - self.stored_key = None - self.client_key = None - self.client_signature = None - self.client_proof = None - self.server_key = None - self.server_signature = None - - def first_message(self): - client_first_bare = 'n={},r={}'.format(self.user, self.nonce) - self.auth_message += client_first_bare - return 'n,,' + client_first_bare - - def process_server_first_message(self, server_first_message): - self.auth_message += ',' + server_first_message - params = dict(pair.split('=', 1) for pair in server_first_message.split(',')) - server_nonce = params['r'] - if not server_nonce.startswith(self.nonce): - raise ValueError("Server nonce, did not start with client nonce!") - self.nonce = server_nonce - self.auth_message += ',c=biws,r=' + self.nonce - - salt = base64.b64decode(params['s'].encode('utf-8')) - iterations = int(params['i']) - self.create_salted_password(salt, iterations) - - self.client_key = self.hmac(self.salted_password, b'Client Key') - self.stored_key = self.hashfunc(self.client_key).digest() - self.client_signature = self.hmac(self.stored_key, self.auth_message.encode('utf-8')) - self.client_proof = xor_bytes(self.client_key, self.client_signature) - self.server_key = self.hmac(self.salted_password, b'Server Key') - self.server_signature = self.hmac(self.server_key, self.auth_message.encode('utf-8')) - - def hmac(self, key, msg): - return hmac.new(key, msg, digestmod=self.hashfunc).digest() - - def create_salted_password(self, salt, iterations): - self.salted_password = hashlib.pbkdf2_hmac( - self.hashname, self.password, salt, iterations - ) - - def final_message(self): - return 'c=biws,r={},p={}'.format(self.nonce, base64.b64encode(self.client_proof).decode('utf-8')) - - def process_server_final_message(self, server_final_message): - params = dict(pair.split('=', 1) for pair in server_final_message.split(',')) - if self.server_signature != base64.b64decode(params['v'].encode('utf-8')): - raise ValueError("Server sent wrong signature!") - - diff --git 
a/kafka/socks5_wrapper.py b/kafka/socks5_wrapper.py new file mode 100644 index 000000000..18bea7c8d --- /dev/null +++ b/kafka/socks5_wrapper.py @@ -0,0 +1,248 @@ +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +import errno +import logging +import random +import socket +import struct + +log = logging.getLogger(__name__) + + +class ProxyConnectionStates: + DISCONNECTED = '' + CONNECTING = '' + NEGOTIATE_PROPOSE = '' + NEGOTIATING = '' + AUTHENTICATING = '' + REQUEST_SUBMIT = '' + REQUESTING = '' + READ_ADDRESS = '' + COMPLETE = '' + + +class Socks5Wrapper: + """Socks5 proxy wrapper + + Manages connection through socks5 proxy with support for username/password + authentication. + """ + + def __init__(self, proxy_url, afi): + self._buffer_in = b'' + self._buffer_out = b'' + self._proxy_url = urlparse(proxy_url) + self._sock = None + self._state = ProxyConnectionStates.DISCONNECTED + self._target_afi = socket.AF_UNSPEC + + proxy_addrs = self.dns_lookup(self._proxy_url.hostname, self._proxy_url.port, afi) + # TODO raise error on lookup failure + self._proxy_addr = random.choice(proxy_addrs) + + @classmethod + def is_inet_4_or_6(cls, gai): + """Given a getaddrinfo struct, return True iff ipv4 or ipv6""" + return gai[0] in (socket.AF_INET, socket.AF_INET6) + + @classmethod + def dns_lookup(cls, host, port, afi=socket.AF_UNSPEC): + """Returns a list of getaddrinfo structs, optionally filtered to an afi (ipv4 / ipv6)""" + # XXX: all DNS functions in Python are blocking. If we really + # want to be non-blocking here, we need to use a 3rd-party + # library like python-adns, or move resolution onto its + # own thread. This will be subject to the default libc + # name resolution timeout (5s on most Linux boxes) + try: + return list(filter(cls.is_inet_4_or_6, + socket.getaddrinfo(host, port, afi, + socket.SOCK_STREAM))) + except socket.gaierror as ex: + log.warning("DNS lookup failed for proxy %s:%d, %r", host, port, ex) + return [] + + def socket(self, family, sock_type): + """Open and record a socket. + + Returns the actual underlying socket + object to ensure e.g. selects and ssl wrapping works as expected. + """ + self._target_afi = family # Store the address family of the target + afi, _, _, _, _ = self._proxy_addr + self._sock = socket.socket(afi, sock_type) + return self._sock + + def _flush_buf(self): + """Send out all data that is stored in the outgoing buffer. + + It is expected that the caller handles error handling, including non-blocking + as well as connection failure exceptions. + """ + while self._buffer_out: + sent_bytes = self._sock.send(self._buffer_out) + self._buffer_out = self._buffer_out[sent_bytes:] + + def _peek_buf(self, datalen): + """Ensure local inbound buffer has enough data, and return that data without + consuming the local buffer + + It's expected that the caller handles e.g. blocking exceptions""" + while True: + bytes_remaining = datalen - len(self._buffer_in) + if bytes_remaining <= 0: + break + data = self._sock.recv(bytes_remaining) + if not data: + break + self._buffer_in = self._buffer_in + data + + return self._buffer_in[:datalen] + + def _read_buf(self, datalen): + """Read and consume bytes from socket connection + + It's expected that the caller handles e.g. blocking exceptions""" + buf = self._peek_buf(datalen) + if buf: + self._buffer_in = self._buffer_in[len(buf):] + return buf + + def connect_ex(self, addr): + """Runs a state machine through connection to authentication to + proxy connection request. 
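Before walking through the state machine itself, here is a rough usage sketch of the wrapper. The proxy URL and addresses are invented for illustration, a real SOCKS5 proxy must be listening for it to succeed, and the target must already be a resolved IP because connect_ex() uses inet_pton on it.

import socket

from kafka.socks5_wrapper import Socks5Wrapper

# Illustration only: assumes a SOCKS5 proxy on 127.0.0.1:1080 and a broker
# whose address has already been resolved to 203.0.113.10:9092.
proxy = Socks5Wrapper('socks5://user:secret@127.0.0.1:1080', socket.AF_INET)
sock = proxy.socket(socket.AF_INET, socket.SOCK_STREAM)  # real socket, usable with select()/ssl

# With a blocking socket the state machine runs straight through; in
# BrokerConnection the socket is non-blocking and connect_ex() is simply
# re-driven until it stops returning in-progress style error codes.
ret = proxy.connect_ex(('203.0.113.10', 9092))
if ret != 0:
    raise OSError(ret, 'SOCKS5 negotiation failed')
# From here on, reads and writes go through `sock` as if directly connected.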
+ + The somewhat strange setup is to facilitate non-intrusive use from + BrokerConnection state machine. + + This function is called with a socket in non-blocking mode. Both + send and receive calls can return in EWOULDBLOCK/EAGAIN which we + specifically avoid handling here. These are handled in main + BrokerConnection connection loop, which then would retry calls + to this function.""" + + if self._state == ProxyConnectionStates.DISCONNECTED: + self._state = ProxyConnectionStates.CONNECTING + + if self._state == ProxyConnectionStates.CONNECTING: + _, _, _, _, sockaddr = self._proxy_addr + ret = self._sock.connect_ex(sockaddr) + if not ret or ret == errno.EISCONN: + self._state = ProxyConnectionStates.NEGOTIATE_PROPOSE + else: + return ret + + if self._state == ProxyConnectionStates.NEGOTIATE_PROPOSE: + if self._proxy_url.username and self._proxy_url.password: + # Propose username/password + self._buffer_out = b"\x05\x01\x02" + else: + # Propose no auth + self._buffer_out = b"\x05\x01\x00" + self._state = ProxyConnectionStates.NEGOTIATING + + if self._state == ProxyConnectionStates.NEGOTIATING: + self._flush_buf() + buf = self._read_buf(2) + if buf[0:1] != b"\x05": + log.error("Unrecognized SOCKS version") + self._state = ProxyConnectionStates.DISCONNECTED + self._sock.close() + return errno.ECONNREFUSED + + if buf[1:2] == b"\x00": + # No authentication required + self._state = ProxyConnectionStates.REQUEST_SUBMIT + elif buf[1:2] == b"\x02": + # Username/password authentication selected + userlen = len(self._proxy_url.username) + passlen = len(self._proxy_url.password) + self._buffer_out = struct.pack( + "!bb{}sb{}s".format(userlen, passlen), + 1, # version + userlen, + self._proxy_url.username.encode(), + passlen, + self._proxy_url.password.encode(), + ) + self._state = ProxyConnectionStates.AUTHENTICATING + else: + log.error("Unrecognized SOCKS authentication method") + self._state = ProxyConnectionStates.DISCONNECTED + self._sock.close() + return errno.ECONNREFUSED + + if self._state == ProxyConnectionStates.AUTHENTICATING: + self._flush_buf() + buf = self._read_buf(2) + if buf == b"\x01\x00": + # Authentication succesful + self._state = ProxyConnectionStates.REQUEST_SUBMIT + else: + log.error("Socks5 proxy authentication failure") + self._state = ProxyConnectionStates.DISCONNECTED + self._sock.close() + return errno.ECONNREFUSED + + if self._state == ProxyConnectionStates.REQUEST_SUBMIT: + if self._target_afi == socket.AF_INET: + addr_type = 1 + addr_len = 4 + elif self._target_afi == socket.AF_INET6: + addr_type = 4 + addr_len = 16 + else: + log.error("Unknown address family, %r", self._target_afi) + self._state = ProxyConnectionStates.DISCONNECTED + self._sock.close() + return errno.ECONNREFUSED + + self._buffer_out = struct.pack( + "!bbbb{}sh".format(addr_len), + 5, # version + 1, # command: connect + 0, # reserved + addr_type, # 1 for ipv4, 4 for ipv6 address + socket.inet_pton(self._target_afi, addr[0]), # either 4 or 16 bytes of actual address + addr[1], # port + ) + self._state = ProxyConnectionStates.REQUESTING + + if self._state == ProxyConnectionStates.REQUESTING: + self._flush_buf() + buf = self._read_buf(2) + if buf[0:2] == b"\x05\x00": + self._state = ProxyConnectionStates.READ_ADDRESS + else: + log.error("Proxy request failed: %r", buf[1:2]) + self._state = ProxyConnectionStates.DISCONNECTED + self._sock.close() + return errno.ECONNREFUSED + + if self._state == ProxyConnectionStates.READ_ADDRESS: + # we don't really care about the remote endpoint address, but need to 
clear the stream + buf = self._peek_buf(2) + if buf[0:2] == b"\x00\x01": + _ = self._read_buf(2 + 4 + 2) # ipv4 address + port + elif buf[0:2] == b"\x00\x05": + _ = self._read_buf(2 + 16 + 2) # ipv6 address + port + else: + log.error("Unrecognized remote address type %r", buf[1:2]) + self._state = ProxyConnectionStates.DISCONNECTED + self._sock.close() + return errno.ECONNREFUSED + self._state = ProxyConnectionStates.COMPLETE + + if self._state == ProxyConnectionStates.COMPLETE: + return 0 + + # not reached; + # Send and recv will raise socket error on EWOULDBLOCK/EAGAIN that is assumed to be handled by + # the caller. The caller re-enters this state machine from retry logic with timer or via select & family + log.error("Internal error, state %r not handled correctly", self._state) + self._state = ProxyConnectionStates.DISCONNECTED + if self._sock: + self._sock.close() + return errno.ECONNREFUSED diff --git a/kafka/structs.py b/kafka/structs.py index bcb023670..16ba0daac 100644 --- a/kafka/structs.py +++ b/kafka/structs.py @@ -42,7 +42,7 @@ this partition metadata. """ PartitionMetadata = namedtuple("PartitionMetadata", - ["topic", "partition", "leader", "replicas", "isr", "error"]) + ["topic", "partition", "leader", "leader_epoch", "replicas", "isr", "offline_replicas", "error"]) """The Kafka offset commit API @@ -55,10 +55,10 @@ Keyword Arguments: offset (int): The offset to be committed metadata (str): Non-null metadata + leader_epoch (int): The last known epoch from the leader / broker """ OffsetAndMetadata = namedtuple("OffsetAndMetadata", - # TODO add leaderEpoch: OffsetAndMetadata(offset, leaderEpoch, metadata) - ["offset", "metadata"]) + ["offset", "metadata", "leader_epoch"]) """An offset and timestamp tuple @@ -66,9 +66,10 @@ Keyword Arguments: offset (int): An offset timestamp (int): The timestamp associated to the offset + leader_epoch (int): The last known epoch from the leader / broker """ OffsetAndTimestamp = namedtuple("OffsetAndTimestamp", - ["offset", "timestamp"]) + ["offset", "timestamp", "leader_epoch"]) MemberInformation = namedtuple("MemberInformation", ["member_id", "client_id", "client_host", "member_metadata", "member_assignment"]) diff --git a/kafka/util.py b/kafka/util.py index e31d99305..bfb9365ad 100644 --- a/kafka/util.py +++ b/kafka/util.py @@ -1,8 +1,11 @@ -from __future__ import absolute_import +from __future__ import absolute_import, division import binascii +import re +import time import weakref +from kafka.errors import KafkaTimeoutError from kafka.vendor import six @@ -19,7 +22,69 @@ def crc32(data): crc -= TO_SIGNED return crc else: - from binascii import crc32 + from binascii import crc32 # noqa: F401 + + +class Timer: + __slots__ = ('_start_at', '_expire_at', '_timeout_ms', '_error_message') + + def __init__(self, timeout_ms, error_message=None, start_at=None): + self._timeout_ms = timeout_ms + self._start_at = start_at or time.time() + if timeout_ms is not None: + self._expire_at = self._start_at + timeout_ms / 1000 + else: + self._expire_at = float('inf') + self._error_message = error_message + + @property + def expired(self): + return time.time() >= self._expire_at + + @property + def timeout_ms(self): + if self._timeout_ms is None: + return None + elif self._expire_at == float('inf'): + return float('inf') + remaining = self._expire_at - time.time() + if remaining < 0: + return 0 + else: + return int(remaining * 1000) + + @property + def elapsed_ms(self): + return int(1000 * (time.time() - self._start_at)) + + def maybe_raise(self): + if 
self.expired: + raise KafkaTimeoutError(self._error_message) + + def __str__(self): + return "Timer(%s ms remaining)" % (self.timeout_ms) + +# Taken from: https://github.com/apache/kafka/blob/39eb31feaeebfb184d98cc5d94da9148c2319d81/clients/src/main/java/org/apache/kafka/common/internals/Topic.java#L29 +TOPIC_MAX_LENGTH = 249 +TOPIC_LEGAL_CHARS = re.compile('^[a-zA-Z0-9._-]+$') + +def ensure_valid_topic_name(topic): + """ Ensures that the topic name is valid according to the kafka source. """ + + # See Kafka Source: + # https://github.com/apache/kafka/blob/39eb31feaeebfb184d98cc5d94da9148c2319d81/clients/src/main/java/org/apache/kafka/common/internals/Topic.java + if topic is None: + raise TypeError('All topics must not be None') + if not isinstance(topic, six.string_types): + raise TypeError('All topics must be strings') + if len(topic) == 0: + raise ValueError('All topics must be non-empty strings') + if topic == '.' or topic == '..': + raise ValueError('Topic name cannot be "." or ".."') + if len(topic) > TOPIC_MAX_LENGTH: + raise ValueError('Topic name is illegal, it can\'t be longer than {0} characters, topic: "{1}"'.format(TOPIC_MAX_LENGTH, topic)) + if not TOPIC_LEGAL_CHARS.match(topic): + raise ValueError('Topic name "{0}" is illegal, it contains a character other than ASCII alphanumerics, ".", "_" and "-"'.format(topic)) class WeakMethod(object): diff --git a/kafka/vendor/selectors34.py b/kafka/vendor/selectors34.py index ebf5d515e..787490340 100644 --- a/kafka/vendor/selectors34.py +++ b/kafka/vendor/selectors34.py @@ -15,7 +15,11 @@ from __future__ import absolute_import from abc import ABCMeta, abstractmethod -from collections import namedtuple, Mapping +from collections import namedtuple +try: + from collections.abc import Mapping +except ImportError: + from collections import Mapping from errno import EINTR import math import select diff --git a/kafka/vendor/six.py b/kafka/vendor/six.py index 3621a0ab4..319821353 100644 --- a/kafka/vendor/six.py +++ b/kafka/vendor/six.py @@ -1,6 +1,6 @@ # pylint: skip-file -# Copyright (c) 2010-2017 Benjamin Peterson +# Copyright (c) 2010-2020 Benjamin Peterson # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -31,7 +31,7 @@ import types __author__ = "Benjamin Peterson " -__version__ = "1.11.0" +__version__ = "1.16.0" # Useful for very coarse version differentiation. 
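Stepping back briefly: the Timer and ensure_valid_topic_name helpers added to kafka/util.py above are self-contained, so a quick usage sketch follows (the timeout value and topic names are invented for illustration).

import time

from kafka.errors import KafkaTimeoutError
from kafka.util import Timer, ensure_valid_topic_name

# Budget 300ms for some operation and poll until the deadline passes.
timer = Timer(300, error_message='operation timed out')
while not timer.expired:
    time.sleep(0.05)                          # stand-in for real work
print(timer.elapsed_ms, timer.timeout_ms)     # elapsed >= 300, remaining == 0

try:
    timer.maybe_raise()                       # raises once the deadline has passed
except KafkaTimeoutError as exc:
    print('expired:', exc)

ensure_valid_topic_name('my_topic.v1')        # returns None for a legal name
try:
    ensure_valid_topic_name('bad topic!')     # space and '!' are not allowed
except ValueError as exc:
    print(exc)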
@@ -77,6 +77,11 @@ def __len__(self): # https://github.com/dpkp/kafka-python/pull/979#discussion_r100403389 # del X +if PY34: + from importlib.util import spec_from_loader +else: + spec_from_loader = None + def _add_doc(func, doc): """Add documentation to a function.""" @@ -192,6 +197,11 @@ def find_module(self, fullname, path=None): return self return None + def find_spec(self, fullname, path, target=None): + if fullname in self.known_modules: + return spec_from_loader(fullname, self) + return None + def __get_module(self, fullname): try: return self.known_modules[fullname] @@ -229,6 +239,12 @@ def get_code(self, fullname): return None get_source = get_code # same as get_code + def create_module(self, spec): + return self.load_module(spec.name) + + def exec_module(self, module): + pass + _importer = _SixMetaPathImporter(__name__) @@ -253,7 +269,7 @@ class _MovedItems(_LazyModule): MovedAttribute("reduce", "__builtin__", "functools"), MovedAttribute("shlex_quote", "pipes", "shlex", "quote"), MovedAttribute("StringIO", "StringIO", "io"), - MovedAttribute("UserDict", "UserDict", "collections"), + MovedAttribute("UserDict", "UserDict", "collections", "IterableUserDict", "UserDict"), MovedAttribute("UserList", "UserList", "collections"), MovedAttribute("UserString", "UserString", "collections"), MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"), @@ -261,9 +277,11 @@ class _MovedItems(_LazyModule): MovedAttribute("zip_longest", "itertools", "itertools", "izip_longest", "zip_longest"), MovedModule("builtins", "__builtin__"), MovedModule("configparser", "ConfigParser"), + MovedModule("collections_abc", "collections", "collections.abc" if sys.version_info >= (3, 3) else "collections"), MovedModule("copyreg", "copy_reg"), MovedModule("dbm_gnu", "gdbm", "dbm.gnu"), - MovedModule("_dummy_thread", "dummy_thread", "_dummy_thread"), + MovedModule("dbm_ndbm", "dbm", "dbm.ndbm"), + MovedModule("_dummy_thread", "dummy_thread", "_dummy_thread" if sys.version_info < (3, 9) else "_thread"), MovedModule("http_cookiejar", "cookielib", "http.cookiejar"), MovedModule("http_cookies", "Cookie", "http.cookies"), MovedModule("html_entities", "htmlentitydefs", "html.entities"), @@ -643,13 +661,16 @@ def u(s): import io StringIO = io.StringIO BytesIO = io.BytesIO + del io _assertCountEqual = "assertCountEqual" if sys.version_info[1] <= 1: _assertRaisesRegex = "assertRaisesRegexp" _assertRegex = "assertRegexpMatches" + _assertNotRegex = "assertNotRegexpMatches" else: _assertRaisesRegex = "assertRaisesRegex" _assertRegex = "assertRegex" + _assertNotRegex = "assertNotRegex" else: def b(s): return s @@ -671,6 +692,7 @@ def indexbytes(buf, i): _assertCountEqual = "assertItemsEqual" _assertRaisesRegex = "assertRaisesRegexp" _assertRegex = "assertRegexpMatches" + _assertNotRegex = "assertNotRegexpMatches" _add_doc(b, """Byte literal""") _add_doc(u, """Text literal""") @@ -687,6 +709,10 @@ def assertRegex(self, *args, **kwargs): return getattr(self, _assertRegex)(*args, **kwargs) +def assertNotRegex(self, *args, **kwargs): + return getattr(self, _assertNotRegex)(*args, **kwargs) + + if PY3: exec_ = getattr(moves.builtins, "exec") @@ -722,16 +748,7 @@ def exec_(_code_, _globs_=None, _locs_=None): """) -if sys.version_info[:2] == (3, 2): - exec_("""def raise_from(value, from_value): - try: - if from_value is None: - raise value - raise value from from_value - finally: - value = None -""") -elif sys.version_info[:2] > (3, 2): +if sys.version_info[:2] > (3,): exec_("""def raise_from(value, from_value): try: 
raise value from from_value @@ -811,13 +828,33 @@ def print_(*args, **kwargs): _add_doc(reraise, """Reraise an exception.""") if sys.version_info[0:2] < (3, 4): + # This does exactly the same what the :func:`py3:functools.update_wrapper` + # function does on Python versions after 3.2. It sets the ``__wrapped__`` + # attribute on ``wrapper`` object and it doesn't raise an error if any of + # the attributes mentioned in ``assigned`` and ``updated`` are missing on + # ``wrapped`` object. + def _update_wrapper(wrapper, wrapped, + assigned=functools.WRAPPER_ASSIGNMENTS, + updated=functools.WRAPPER_UPDATES): + for attr in assigned: + try: + value = getattr(wrapped, attr) + except AttributeError: + continue + else: + setattr(wrapper, attr, value) + for attr in updated: + getattr(wrapper, attr).update(getattr(wrapped, attr, {})) + wrapper.__wrapped__ = wrapped + return wrapper + _update_wrapper.__doc__ = functools.update_wrapper.__doc__ + def wraps(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS, updated=functools.WRAPPER_UPDATES): - def wrapper(f): - f = functools.wraps(wrapped, assigned, updated)(f) - f.__wrapped__ = wrapped - return f - return wrapper + return functools.partial(_update_wrapper, wrapped=wrapped, + assigned=assigned, updated=updated) + wraps.__doc__ = functools.wraps.__doc__ + else: wraps = functools.wraps @@ -830,7 +867,15 @@ def with_metaclass(meta, *bases): class metaclass(type): def __new__(cls, name, this_bases, d): - return meta(name, bases, d) + if sys.version_info[:2] >= (3, 7): + # This version introduced PEP 560 that requires a bit + # of extra care (we mimic what is done by __build_class__). + resolved_bases = types.resolve_bases(bases) + if resolved_bases is not bases: + d['__orig_bases__'] = bases + else: + resolved_bases = bases + return meta(name, resolved_bases, d) @classmethod def __prepare__(cls, name, this_bases): @@ -850,13 +895,75 @@ def wrapper(cls): orig_vars.pop(slots_var) orig_vars.pop('__dict__', None) orig_vars.pop('__weakref__', None) + if hasattr(cls, '__qualname__'): + orig_vars['__qualname__'] = cls.__qualname__ return metaclass(cls.__name__, cls.__bases__, orig_vars) return wrapper +def ensure_binary(s, encoding='utf-8', errors='strict'): + """Coerce **s** to six.binary_type. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> encoded to `bytes` + - `bytes` -> `bytes` + """ + if isinstance(s, binary_type): + return s + if isinstance(s, text_type): + return s.encode(encoding, errors) + raise TypeError("not expecting type '%s'" % type(s)) + + +def ensure_str(s, encoding='utf-8', errors='strict'): + """Coerce *s* to `str`. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + # Optimization: Fast return for the common case. + if type(s) is str: + return s + if PY2 and isinstance(s, text_type): + return s.encode(encoding, errors) + elif PY3 and isinstance(s, binary_type): + return s.decode(encoding, errors) + elif not isinstance(s, (text_type, binary_type)): + raise TypeError("not expecting type '%s'" % type(s)) + return s + + +def ensure_text(s, encoding='utf-8', errors='strict'): + """Coerce *s* to six.text_type. 
+ + For Python 2: + - `unicode` -> `unicode` + - `str` -> `unicode` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if isinstance(s, binary_type): + return s.decode(encoding, errors) + elif isinstance(s, text_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + def python_2_unicode_compatible(klass): """ - A decorator that defines __unicode__ and __str__ methods under Python 2. + A class decorator that defines __unicode__ and __str__ methods under Python 2. Under Python 3 it does nothing. To support Python 2 and 3 with a single code base, define a __str__ method diff --git a/kafka/vendor/socketpair.py b/kafka/vendor/socketpair.py index b55e629ee..54d908767 100644 --- a/kafka/vendor/socketpair.py +++ b/kafka/vendor/socketpair.py @@ -53,6 +53,23 @@ def socketpair(family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0): raise finally: lsock.close() + + # Authenticating avoids using a connection from something else + # able to connect to {host}:{port} instead of us. + # We expect only AF_INET and AF_INET6 families. + try: + if ( + ssock.getsockname() != csock.getpeername() + or csock.getsockname() != ssock.getpeername() + ): + raise ConnectionError("Unexpected peer connection") + except: + # getsockname() and getpeername() can fail + # if either socket isn't connected. + ssock.close() + csock.close() + raise + return (ssock, csock) socket.socketpair = socketpair diff --git a/kafka/version.py b/kafka/version.py index 06306bd1f..e604ff743 100644 --- a/kafka/version.py +++ b/kafka/version.py @@ -1 +1 @@ -__version__ = '2.0.3-dev' +__version__ = '2.2.5.dev' diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..d575a8959 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "kafka-python" +dynamic = ["version"] +authors = [{name = "Dana Powers", email = "dana.powers@gmail.com"}] +description = "Pure Python client for Apache Kafka" +keywords = ["apache kafka", "kafka"] +readme = "README.rst" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Software Development :: Libraries :: Python Modules", +] +urls = {Homepage = "https://github.com/dpkp/kafka-python"} + +[project.optional-dependencies] +crc32c = ["crc32c"] +lz4 = ["lz4"] +snappy = ["python-snappy"] +zstd = ["zstandard"] +testing = ["pytest", "mock; python_version < '3.3'", "pytest-mock", "pytest-timeout"] +benchmarks = ["pyperf"] + +[tool.setuptools] +include-package-data = false +license-files = [] # workaround for https://github.com/pypa/setuptools/issues/4759 + +[tool.setuptools.packages.find] +exclude = ["test"] 
+namespaces = false + +[tool.distutils.bdist_wheel] +universal = 1 + +[tool.setuptools.dynamic] +version = {attr = "kafka.__version__"} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 000000000..7fcb1f4a8 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +log_format = %(asctime)s.%(msecs)03d %(levelname)-8s %(thread)d:%(threadName)s %(name)-23s %(message)s +log_level = DEBUG +addopts = --durations=10 --timeout=300 diff --git a/requirements-dev.txt b/requirements-dev.txt index 00ad68c22..8de5e28d4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,17 +1,19 @@ -coveralls==2.1.2 -crc32c==2.1 -docker-py==1.10.6 -flake8==3.8.3 -lz4==3.1.0 -mock==4.0.2 -py==1.9.0 -pylint==2.6.0 -pytest==6.0.2 -pytest-cov==2.10.1 -pytest-mock==3.3.1 -pytest-pylint==0.17.0 -python-snappy==0.5.4 -Sphinx==3.2.1 -sphinx-rtd-theme==0.5.0 -tox==3.20.0 -xxhash==2.0.0 +coveralls +crc32c +docker-py +flake8 +lz4 +mock; python_version < '3.3' +py +pylint +pyperf +pytest +pytest-cov +pytest-mock +pytest-pylint +pytest-timeout +python-snappy +Sphinx +sphinx-rtd-theme +xxhash +zstandard diff --git a/servers/0.11.0.0/resources/kafka.properties b/servers/0.11.0.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/0.11.0.0/resources/kafka.properties +++ b/servers/0.11.0.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. 
See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. 
When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/0.11.0.1/resources/kafka.properties b/servers/0.11.0.1/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/0.11.0.1/resources/kafka.properties +++ b/servers/0.11.0.1/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). 
+#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. 
Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/0.11.0.2/resources/kafka.properties b/servers/0.11.0.2/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/0.11.0.2/resources/kafka.properties +++ b/servers/0.11.0.2/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. 
Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. 
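These kafka.properties files are templates: fields such as {broker_id}, {transport}, {host}, {port}, {tmp_dir}, {replicas}, {zk_host}, {zk_port} and {zk_chroot} are substituted by the test harness before a broker is launched. The sketch below shows one way such a template could be rendered with str.format-style substitution; the helper name, output path and parameter values are illustrative only, not the project's actual fixture API, and the full template may define more placeholders than appear in this diff.

# Illustrative sketch: fill the {placeholder} fields of a templated
# kafka.properties file and write out a concrete broker config.
# Function name, paths and values are hypothetical; pass every placeholder
# the template actually defines (only those visible in this diff are shown).
from pathlib import Path

def render_properties(template_path, output_path, **params):
    template = Path(template_path).read_text()
    Path(output_path).write_text(template.format(**params))

render_properties(
    "servers/0.11.0.2/resources/kafka.properties",
    "/tmp/kafka-test/kafka.properties",
    broker_id=0,
    transport="PLAINTEXT",
    host="localhost",
    port=9092,
    tmp_dir="/tmp/kafka-test",
    replicas=1,
    zk_host="localhost",
    zk_port=2181,
    zk_chroot="kafka-python-test",
)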
@@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/0.11.0.3/resources/kafka.properties b/servers/0.11.0.3/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/0.11.0.3/resources/kafka.properties +++ b/servers/0.11.0.3/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. 
+# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. 
Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/1.0.0/resources/kafka.properties b/servers/1.0.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/1.0.0/resources/kafka.properties +++ b/servers/1.0.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. 
The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/1.0.1/resources/kafka.properties b/servers/1.0.1/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/1.0.1/resources/kafka.properties +++ b/servers/1.0.1/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. 
+num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. 
+# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/1.0.2/resources/kafka.properties b/servers/1.0.2/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/1.0.2/resources/kafka.properties +++ b/servers/1.0.2/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. 
See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. 
When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/1.1.0/resources/kafka.properties b/servers/1.1.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/1.1.0/resources/kafka.properties +++ b/servers/1.1.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). 
+#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. 
Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/1.1.1/resources/kafka.properties b/servers/1.1.1/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/1.1.1/resources/kafka.properties +++ b/servers/1.1.1/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. 
Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. 
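These test configs expose a single listener at {transport}://{host}:{port} and set group.initial.rebalance.delay.ms=0 so consumer groups form without the default three-second delay. As a rough illustration of exercising such a broker through kafka-python's public API (the bootstrap address, topic and group id below are placeholder values for a locally running test broker, not values taken from this patch):

from kafka import KafkaProducer, KafkaConsumer

# Produce one message to the test broker (address and topic are example values).
producer = KafkaProducer(bootstrap_servers="localhost:9092")
producer.send("example-topic", b"hello")
producer.flush()

# Consume it back; with group.initial.rebalance.delay.ms=0 the group joins
# without the usual startup delay, which keeps test runs fast.
consumer = KafkaConsumer(
    "example-topic",
    bootstrap_servers="localhost:9092",
    group_id="example-group",
    auto_offset_reset="earliest",
    consumer_timeout_ms=5000,  # stop iterating if no message arrives for 5s
)
for message in consumer:
    print(message.topic, message.partition, message.offset, message.value)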
@@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.0.0/resources/kafka.properties b/servers/2.0.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.0.0/resources/kafka.properties +++ b/servers/2.0.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. 
+# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. 
Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.0.1/resources/kafka.properties b/servers/2.0.1/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.0.1/resources/kafka.properties +++ b/servers/2.0.1/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. 
The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.1.0/resources/kafka.properties b/servers/2.1.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.1.0/resources/kafka.properties +++ b/servers/2.1.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. 
+num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. 
+# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.1.1/resources/kafka.properties b/servers/2.1.1/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.1.1/resources/kafka.properties +++ b/servers/2.1.1/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. 
See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. 
When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.2.1/resources/kafka.properties b/servers/2.2.1/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.2.1/resources/kafka.properties +++ b/servers/2.2.1/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). 
+#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. 
Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.3.0/resources/kafka.properties b/servers/2.3.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.3.0/resources/kafka.properties +++ b/servers/2.3.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. 
Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. 
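The flush-policy comment above notes that the broker-wide defaults can be overridden on a per-topic basis. A hedged illustration of such an override using kafka-python's admin client; the broker address, topic name, and config values here are assumptions for the example, not anything configured by this patch:

```python
# Illustrative only: create a topic whose flush policy overrides the broker
# defaults discussed above. Assumes a broker reachable at localhost:9092.
from kafka.admin import KafkaAdminClient, NewTopic

admin = KafkaAdminClient(bootstrap_servers="localhost:9092")
admin.create_topics([
    NewTopic(
        name="flush-demo",          # hypothetical topic name
        num_partitions=1,
        replication_factor=1,
        topic_configs={
            "flush.messages": "1",  # fsync after every message
            "flush.ms": "1000",     # or at least once per second
        },
    )
])
admin.close()
```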
@@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.4.0/resources/kafka.properties b/servers/2.4.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.4.0/resources/kafka.properties +++ b/servers/2.4.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. 
+# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. 
Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.4.0/resources/zookeeper.properties b/servers/2.4.0/resources/zookeeper.properties index e3fd09742..b146fac9e 100644 --- a/servers/2.4.0/resources/zookeeper.properties +++ b/servers/2.4.0/resources/zookeeper.properties @@ -19,3 +19,4 @@ clientPort={port} clientPortAddress={host} # disable the per-ip limit on the number of connections since this is a non-production config maxClientCnxns=0 +admin.enableServer=false diff --git a/servers/2.5.0/resources/kafka.properties b/servers/2.5.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.5.0/resources/kafka.properties +++ b/servers/2.5.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. 
+num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. 
+# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.5.0/resources/zookeeper.properties b/servers/2.5.0/resources/zookeeper.properties index e3fd09742..b146fac9e 100644 --- a/servers/2.5.0/resources/zookeeper.properties +++ b/servers/2.5.0/resources/zookeeper.properties @@ -19,3 +19,4 @@ clientPort={port} clientPortAddress={host} # disable the per-ip limit on the number of connections since this is a non-production config maxClientCnxns=0 +admin.enableServer=false diff --git a/servers/2.6.0/resources/kafka.properties b/servers/2.6.0/resources/kafka.properties index 5775cfdc4..219023551 100644 --- a/servers/2.6.0/resources/kafka.properties +++ b/servers/2.6.0/resources/kafka.properties @@ -4,14 +4,15 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + # see kafka.server.KafkaConfig for additional details and defaults ############################# Server Basics ############################# @@ -21,6 +22,12 @@ broker.id={broker_id} ############################# Socket Server Settings ############################# +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 listeners={transport}://{host}:{port} security.inter.broker.protocol={transport} @@ -38,22 +45,18 @@ allow.everyone.if.no.acl.found=true # The port the socket server listens on #port=9092 -# Hostname the broker will bind to. If not set, the server will bind to all interfaces -#host.name=localhost - -# Hostname the broker will advertise to producers and consumers. If not set, it uses the -# value for "host.name" if configured. Otherwise, it will use the value returned from -# java.net.InetAddress.getCanonicalHostName(). -#advertised.host.name= +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 -# The port to publish to ZooKeeper for clients to use. If this is not set, -# it will publish the same port that the broker binds to. -#advertised.port= +# Maps listener names to security protocols, the default is for them to be the same. 
See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL -# The number of threads handling network requests +# The number of threads that the server uses for receiving requests from the network and sending responses to the network num.network.threads=3 - -# The number of threads doing disk I/O + +# The number of threads that the server uses for processing requests, which may include disk I/O num.io.threads=8 # The send buffer (SO_SNDBUF) used by the socket server @@ -68,7 +71,7 @@ socket.request.max.bytes=104857600 ############################# Log Basics ############################# -# A comma seperated list of directories under which to store log files +# A comma separated list of directories under which to store log files log.dirs={tmp_dir}/data # The default number of log partitions per topic. More partitions allow greater @@ -81,14 +84,25 @@ default.replication.factor={replicas} replica.lag.time.max.ms=1000 replica.socket.timeout.ms=1000 +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + ############################# Log Flush Policy ############################# # Messages are immediately written to the filesystem but by default we only fsync() to sync -# the OS cache lazily. The following configurations control the flush of data to disk. +# the OS cache lazily. The following configurations control the flush of data to disk. # There are a few important trade-offs here: # 1. Durability: Unflushed data may be lost if you are not using replication. # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. -# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. # The settings below allow one to configure the flush policy to flush data after a period of time or # every N messages (or both). This can be done globally and overridden on a per-topic basis. @@ -105,17 +119,17 @@ replica.socket.timeout.ms=1000 # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens # from the end of the log. -# The minimum age of a log file to be eligible for deletion +# The minimum age of a log file to be eligible for deletion due to age log.retention.hours=168 -# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining -# segments don't drop below log.retention.bytes. +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. #log.retention.bytes=1073741824 # The maximum size of a log segment file. 
When this size is reached a new log segment will be created. log.segment.bytes=1073741824 -# The interval at which log segments are checked to see if they can be deleted according +# The interval at which log segments are checked to see if they can be deleted according # to the retention policies log.retention.check.interval.ms=300000 @@ -145,3 +159,13 @@ zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} zookeeper.connection.timeout.ms=30000 # We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/2.6.0/resources/zookeeper.properties b/servers/2.6.0/resources/zookeeper.properties index e3fd09742..b146fac9e 100644 --- a/servers/2.6.0/resources/zookeeper.properties +++ b/servers/2.6.0/resources/zookeeper.properties @@ -19,3 +19,4 @@ clientPort={port} clientPortAddress={host} # disable the per-ip limit on the number of connections since this is a non-production config maxClientCnxns=0 +admin.enableServer=false diff --git a/servers/4.0.0/resources/kafka.properties b/servers/4.0.0/resources/kafka.properties new file mode 100644 index 000000000..3dba393ba --- /dev/null +++ b/servers/4.0.0/resources/kafka.properties @@ -0,0 +1,161 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################# Server Basics ############################# + +# The role of this server. Setting this puts us in KRaft mode +process.roles=broker,controller + +# The node id associated with this instance's roles +node.id={broker_id} + +# List of controller endpoints used connect to the controller cluster +controller.quorum.bootstrap.servers={controller_bootstrap_host}:{controller_port} + +############################# Socket Server Settings ############################# + +# The address the socket server listens on. +# Combined nodes (i.e. those with `process.roles=broker,controller`) must list the controller listener here at a minimum. 
+# If the broker listener is not defined, the default listener will use a host name that is equal to the value of java.net.InetAddress.getCanonicalHostName(), +# with PLAINTEXT listener name, and port 9092. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 +#listeners=PLAINTEXT://:9092,CONTROLLER://:9093 +listeners={transport}://{host}:{port},CONTROLLER://{host}:{controller_port} + +# Name of listener used for communication between brokers. +inter.broker.listener.name={transport} + +{sasl_config} + +authorizer.class.name=org.apache.kafka.metadata.authorizer.StandardAuthorizer +allow.everyone.if.no.acl.found=true + +# Listener name, hostname and port the broker or the controller will advertise to clients. +# If not set, it uses the value for "listeners". +advertised.listeners={transport}://{host}:{port},CONTROLLER://{host}:{controller_port} + +# A comma-separated list of the names of the listeners used by the controller. +# If no explicit mapping set in `listener.security.protocol.map`, default will be using PLAINTEXT protocol +# This is required if running in KRaft mode. +controller.listener.names=CONTROLLER + +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +listener.security.protocol.map=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL + +# The number of threads that the server uses for receiving requests from the network and sending responses to the network +num.network.threads=3 + +# The number of threads that the server uses for processing requests, which may include disk I/O +num.io.threads=8 + +# The send buffer (SO_SNDBUF) used by the socket server +socket.send.buffer.bytes=102400 + +# The receive buffer (SO_RCVBUF) used by the socket server +socket.receive.buffer.bytes=102400 + +# The maximum size of a request that the socket server will accept (protection against OOM) +socket.request.max.bytes=104857600 + + +############################# Log Basics ############################# + +# A comma separated list of directories under which to store log files +log.dirs={tmp_dir}/kraft-combined-logs + +# The default number of log partitions per topic. More partitions allow greater +# parallelism for consumption, but this will also result in more files across +# the brokers. +num.partitions={partitions} +default.replication.factor={replicas} + +## Short Replica Lag -- Drops failed brokers out of ISR +replica.lag.time.max.ms=1000 +replica.socket.timeout.ms=1000 + +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets", "__share_group_state" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. 
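Because these fixtures start a single broker, the internal-topic replication factor stays at 1, and any topic a test creates explicitly is under the same constraint. A hedged sketch using kafka-python's admin client against such a broker (the bootstrap address and topic name are placeholders):

from kafka.admin import KafkaAdminClient, NewTopic

# Placeholder address; on a single-broker fixture replication_factor must be 1,
# mirroring offsets.topic.replication.factor below.
admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
admin.create_topics([NewTopic(name='example-topic', num_partitions=4, replication_factor=1)])
admin.close()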
+offsets.topic.replication.factor=1 +share.coordinator.state.topic.replication.factor=1 +share.coordinator.state.topic.min.isr=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + +############################# Log Flush Policy ############################# + +# Messages are immediately written to the filesystem but by default we only fsync() to sync +# the OS cache lazily. The following configurations control the flush of data to disk. +# There are a few important trade-offs here: +# 1. Durability: Unflushed data may be lost if you are not using replication. +# 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. +# The settings below allow one to configure the flush policy to flush data after a period of time or +# every N messages (or both). This can be done globally and overridden on a per-topic basis. + +# The number of messages to accept before forcing a flush of data to disk +#log.flush.interval.messages=10000 + +# The maximum amount of time a message can sit in a log before we force a flush +#log.flush.interval.ms=1000 + +############################# Log Retention Policy ############################# + +# The following configurations control the disposal of log segments. The policy can +# be set to delete segments after a period of time, or after a given size has accumulated. +# A segment will be deleted whenever *either* of these criteria are met. Deletion always happens +# from the end of the log. + +# The minimum age of a log file to be eligible for deletion due to age +log.retention.hours=168 + +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. +#log.retention.bytes=1073741824 + +# The maximum size of a log segment file. When this size is reached a new log segment will be created. +log.segment.bytes=1073741824 + +# The interval at which log segments are checked to see if they can be deleted according +# to the retention policies +log.retention.check.interval.ms=300000 + +# By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. +# If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. +log.cleaner.enable=false + +# tune down offset topics to reduce setup time in tests +offsets.commit.timeout.ms=500 +offsets.topic.num.partitions=2 +offsets.topic.replication.factor=1 + +# Allow shorter session timeouts for tests +group.min.session.timeout.ms=1000 + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. 
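On the client side, overriding the delay to 0 means a brand-new consumer group gets its first partition assignment as soon as the initial join completes, instead of waiting out the 3-second default, which keeps integration tests fast. A sketch with kafka-python (topic name, group id, and address are placeholders):

from kafka import KafkaConsumer

# Placeholder names; with group.initial.rebalance.delay.ms=0 the group
# rebalances immediately, so the first poll can return records right away.
consumer = KafkaConsumer('example-topic',
                         bootstrap_servers='localhost:9092',
                         group_id='example-group',
                         auto_offset_reset='earliest',
                         consumer_timeout_ms=5000)
for message in consumer:
    print(message.topic, message.partition, message.offset, message.value)
consumer.close()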
+group.initial.rebalance.delay.ms=0 diff --git a/servers/resources/default/kafka.properties b/servers/resources/default/kafka.properties new file mode 100644 index 000000000..71b20f53e --- /dev/null +++ b/servers/resources/default/kafka.properties @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# see kafka.server.KafkaConfig for additional details and defaults + +############################# Server Basics ############################# + +# The id of the broker. This must be set to a unique integer for each broker. +broker.id={broker_id} + +############################# Socket Server Settings ############################# + +# The address the socket server listens on. It will get the value returned from +# java.net.InetAddress.getCanonicalHostName() if not configured. +# FORMAT: +# listeners = listener_name://host_name:port +# EXAMPLE: +# listeners = PLAINTEXT://your.host.name:9092 +listeners={transport}://{host}:{port} +security.inter.broker.protocol={transport} + +{sasl_config} + +ssl.keystore.location={ssl_dir}/kafka.server.keystore.jks +ssl.keystore.password=foobar +ssl.key.password=foobar +ssl.truststore.location={ssl_dir}/kafka.server.truststore.jks +ssl.truststore.password=foobar + +authorizer.class.name=kafka.security.authorizer.AclAuthorizer +allow.everyone.if.no.acl.found=true + +# The port the socket server listens on +#port=9092 + +# Hostname and port the broker will advertise to producers and consumers. If not set, +# it uses the value for "listeners" if configured. Otherwise, it will use the value +# returned from java.net.InetAddress.getCanonicalHostName(). +#advertised.listeners=PLAINTEXT://your.host.name:9092 + +# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details +#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL + +# The number of threads that the server uses for receiving requests from the network and sending responses to the network +num.network.threads=3 + +# The number of threads that the server uses for processing requests, which may include disk I/O +num.io.threads=8 + +# The send buffer (SO_SNDBUF) used by the socket server +socket.send.buffer.bytes=102400 + +# The receive buffer (SO_RCVBUF) used by the socket server +socket.receive.buffer.bytes=102400 + +# The maximum size of a request that the socket server will accept (protection against OOM) +socket.request.max.bytes=104857600 + + +############################# Log Basics ############################# + +# A comma separated list of directories under which to store log files +log.dirs={tmp_dir}/data + +# The default number of log partitions per topic. 
More partitions allow greater +# parallelism for consumption, but this will also result in more files across +# the brokers. +num.partitions={partitions} +default.replication.factor={replicas} + +## Short Replica Lag -- Drops failed brokers out of ISR +replica.lag.time.max.ms=1000 +replica.socket.timeout.ms=1000 + +# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. +# This value is recommended to be increased for installations with data dirs located in RAID array. +num.recovery.threads.per.data.dir=1 + +############################# Internal Topic Settings ############################# +# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state" +# For anything other than development testing, a value greater than 1 is recommended to ensure availability such as 3. +offsets.topic.replication.factor=1 +transaction.state.log.replication.factor=1 +transaction.state.log.min.isr=1 + +############################# Log Flush Policy ############################# + +# Messages are immediately written to the filesystem but by default we only fsync() to sync +# the OS cache lazily. The following configurations control the flush of data to disk. +# There are a few important trade-offs here: +# 1. Durability: Unflushed data may be lost if you are not using replication. +# 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. +# The settings below allow one to configure the flush policy to flush data after a period of time or +# every N messages (or both). This can be done globally and overridden on a per-topic basis. + +# The number of messages to accept before forcing a flush of data to disk +#log.flush.interval.messages=10000 + +# The maximum amount of time a message can sit in a log before we force a flush +#log.flush.interval.ms=1000 + +############################# Log Retention Policy ############################# + +# The following configurations control the disposal of log segments. The policy can +# be set to delete segments after a period of time, or after a given size has accumulated. +# A segment will be deleted whenever *either* of these criteria are met. Deletion always happens +# from the end of the log. + +# The minimum age of a log file to be eligible for deletion due to age +log.retention.hours=168 + +# A size-based retention policy for logs. Segments are pruned from the log unless the remaining +# segments drop below log.retention.bytes. Functions independently of log.retention.hours. +#log.retention.bytes=1073741824 + +# The maximum size of a log segment file. When this size is reached a new log segment will be created. +log.segment.bytes=1073741824 + +# The interval at which log segments are checked to see if they can be deleted according +# to the retention policies +log.retention.check.interval.ms=300000 + +# By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. +# If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. 
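When the cleaner is enabled, compaction is opted into per topic through the cleanup.policy topic config. A hedged sketch of doing that from kafka-python (topic name and address are placeholders; on these test brokers the cleaner stays disabled, as the next line shows):

from kafka.admin import KafkaAdminClient, ConfigResource, ConfigResourceType

# Placeholder topic/address; this only has an effect on a broker running with
# log.cleaner.enable=true, unlike the test configuration here.
admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
admin.alter_configs([ConfigResource(ConfigResourceType.TOPIC, 'example-topic',
                                    configs={'cleanup.policy': 'compact'})])
admin.close()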
+log.cleaner.enable=false + +# tune down offset topics to reduce setup time in tests +offsets.commit.timeout.ms=500 +offsets.topic.num.partitions=2 +offsets.topic.replication.factor=1 + +# Allow shorter session timeouts for tests +group.min.session.timeout.ms=1000 + + +############################# Zookeeper ############################# + +# Zookeeper connection string (see zookeeper docs for details). +# This is a comma separated host:port pairs, each corresponding to a zk +# server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". +# You can also append an optional chroot string to the urls to specify the +# root directory for all kafka znodes. +zookeeper.connect={zk_host}:{zk_port}/{zk_chroot} + +# Timeout in ms for connecting to zookeeper +zookeeper.connection.timeout.ms=30000 +# We want to expire kafka broker sessions quickly when brokers die b/c we restart them quickly +zookeeper.session.timeout.ms=500 + + +############################# Group Coordinator Settings ############################# + +# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance. +# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms. +# The default value for this is 3 seconds. +# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing. +# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup. +group.initial.rebalance.delay.ms=0 diff --git a/servers/resources/default/kafka_server_jaas.conf b/servers/resources/default/kafka_server_jaas.conf new file mode 100644 index 000000000..18efe4369 --- /dev/null +++ b/servers/resources/default/kafka_server_jaas.conf @@ -0,0 +1,4 @@ +KafkaServer {{ + {jaas_config} +}}; +Client {{}}; \ No newline at end of file diff --git a/servers/resources/default/log4j.properties b/servers/resources/default/log4j.properties new file mode 100644 index 000000000..b0b76aa79 --- /dev/null +++ b/servers/resources/default/log4j.properties @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
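The doubled braces in kafka_server_jaas.conf above are str.format escapes, so the rendered file keeps literal JAAS braces while only {jaas_config} is substituted. A small sketch of that rendering, with an illustrative PLAIN login-module string modeled on _jaas_config() in the fixtures (the username and password here are placeholders, not the fixture's real credentials):

# Placeholder credentials; the fixture builds jaas_config from its broker_user/broker_password.
template = 'KafkaServer {{\n  {jaas_config}\n}};\nClient {{}};'
jaas_config = ('org.apache.kafka.common.security.plain.PlainLoginModule required'
               ' username="user" password="secret" user_user="secret";')
print(template.format(jaas_config=jaas_config))
# -> KafkaServer { org.apache.kafka.common.security.plain.PlainLoginModule required ... };
#    Client {};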
+ +log4j.rootLogger=INFO, stdout, logfile + +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c)%n + +log4j.appender.logfile=org.apache.log4j.FileAppender +log4j.appender.logfile.File=${kafka.logs.dir}/server.log +log4j.appender.logfile.layout=org.apache.log4j.PatternLayout +log4j.appender.logfile.layout.ConversionPattern=[%d] %p %m (%c)%n diff --git a/servers/resources/default/sasl_command.conf b/servers/resources/default/sasl_command.conf new file mode 100644 index 000000000..f4ae7bafa --- /dev/null +++ b/servers/resources/default/sasl_command.conf @@ -0,0 +1,3 @@ +security.protocol={transport} +sasl.mechanism={sasl_mechanism} +sasl.jaas.config={jaas_config} diff --git a/servers/resources/default/zookeeper.properties b/servers/resources/default/zookeeper.properties new file mode 100644 index 000000000..b146fac9e --- /dev/null +++ b/servers/resources/default/zookeeper.properties @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# the directory where the snapshot is stored. 
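sasl_command.conf above is rendered for the Kafka CLI tools; the equivalent settings on the kafka-python side are ordinary client keyword arguments. A hedged sketch (address, mechanism, and credentials are placeholders standing in for the rendered {transport}, {sasl_mechanism}, and JAAS values):

from kafka import KafkaProducer

# Placeholder connection details; a real run would use whatever the fixture rendered.
producer = KafkaProducer(bootstrap_servers='localhost:9092',
                         security_protocol='SASL_PLAINTEXT',
                         sasl_mechanism='SCRAM-SHA-256',
                         sasl_plain_username='user',
                         sasl_plain_password='secret')
producer.send('example-topic', b'hello')
producer.flush()
producer.close()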
+dataDir={tmp_dir} +# the port at which the clients will connect +clientPort={port} +clientPortAddress={host} +# disable the per-ip limit on the number of connections since this is a non-production config +maxClientCnxns=0 +admin.enableServer=false diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 5c6311daf..000000000 --- a/setup.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[bdist_wheel] -universal=1 - -[metadata] -license_file = LICENSE diff --git a/setup.py b/setup.py index fe8a594f3..87b428a4e 100644 --- a/setup.py +++ b/setup.py @@ -1,70 +1,4 @@ -import os -import sys +# See pyproject.toml for project / build configuration +from setuptools import setup -from setuptools import setup, Command, find_packages - -# Pull version from source without importing -# since we can't import something we haven't built yet :) -exec(open('kafka/version.py').read()) - - -class Tox(Command): - - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - @classmethod - def run(cls): - import tox - sys.exit(tox.cmdline([])) - - -test_require = ['tox', 'mock'] - -here = os.path.abspath(os.path.dirname(__file__)) - -with open(os.path.join(here, 'README.rst')) as f: - README = f.read() - -setup( - name="kafka-python", - version=__version__, - - tests_require=test_require, - extras_require={ - "crc32c": ["crc32c"], - "lz4": ["lz4"], - "snappy": ["python-snappy"], - "zstd": ["python-zstandard"], - }, - cmdclass={"test": Tox}, - packages=find_packages(exclude=['test']), - author="Dana Powers", - author_email="dana.powers@gmail.com", - url="https://github.com/dpkp/kafka-python", - license="Apache License 2.0", - description="Pure Python client for Apache Kafka", - long_description=README, - keywords="apache kafka", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: Implementation :: PyPy", - "Topic :: Software Development :: Libraries :: Python Modules", - ] -) +setup() diff --git a/test/conftest.py b/test/conftest.py index 3fa0262fd..b65593a86 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,127 +1,17 @@ from __future__ import absolute_import -import uuid - import pytest -from test.testutil import env_kafka_version, random_string -from test.fixtures import KafkaFixture, ZookeeperFixture - -@pytest.fixture(scope="module") -def zookeeper(): - """Return a Zookeeper fixture""" - zk_instance = ZookeeperFixture.instance() - yield zk_instance - zk_instance.close() - - -@pytest.fixture(scope="module") -def kafka_broker(kafka_broker_factory): - """Return a Kafka broker fixture""" - return kafka_broker_factory()[0] - - -@pytest.fixture(scope="module") -def kafka_broker_factory(zookeeper): - """Return a Kafka broker fixture factory""" - assert env_kafka_version(), 'KAFKA_VERSION must be specified to run integration tests' - - _brokers = [] - def factory(**broker_params): - params = {} if broker_params is None else broker_params.copy() - params.setdefault('partitions', 4) - num_brokers = params.pop('num_brokers', 1) - brokers = 
tuple(KafkaFixture.instance(x, zookeeper, **params) - for x in range(num_brokers)) - _brokers.extend(brokers) - return brokers - - yield factory - - for broker in _brokers: - broker.close() - - -@pytest.fixture -def kafka_client(kafka_broker, request): - """Return a KafkaClient fixture""" - (client,) = kafka_broker.get_clients(cnt=1, client_id='%s_client' % (request.node.name,)) - yield client - client.close() - - -@pytest.fixture -def kafka_consumer(kafka_consumer_factory): - """Return a KafkaConsumer fixture""" - return kafka_consumer_factory() - - -@pytest.fixture -def kafka_consumer_factory(kafka_broker, topic, request): - """Return a KafkaConsumer factory fixture""" - _consumer = [None] - - def factory(**kafka_consumer_params): - params = {} if kafka_consumer_params is None else kafka_consumer_params.copy() - params.setdefault('client_id', 'consumer_%s' % (request.node.name,)) - params.setdefault('auto_offset_reset', 'earliest') - _consumer[0] = next(kafka_broker.get_consumers(cnt=1, topics=[topic], **params)) - return _consumer[0] - - yield factory - - if _consumer[0]: - _consumer[0].close() - - -@pytest.fixture -def kafka_producer(kafka_producer_factory): - """Return a KafkaProducer fixture""" - yield kafka_producer_factory() - @pytest.fixture -def kafka_producer_factory(kafka_broker, request): - """Return a KafkaProduce factory fixture""" - _producer = [None] - - def factory(**kafka_producer_params): - params = {} if kafka_producer_params is None else kafka_producer_params.copy() - params.setdefault('client_id', 'producer_%s' % (request.node.name,)) - _producer[0] = next(kafka_broker.get_producers(cnt=1, **params)) - return _producer[0] +def metrics(): + from kafka.metrics import Metrics - yield factory - - if _producer[0]: - _producer[0].close() - -@pytest.fixture -def kafka_admin_client(kafka_admin_client_factory): - """Return a KafkaAdminClient fixture""" - yield kafka_admin_client_factory() - -@pytest.fixture -def kafka_admin_client_factory(kafka_broker): - """Return a KafkaAdminClient factory fixture""" - _admin_client = [None] - - def factory(**kafka_admin_client_params): - params = {} if kafka_admin_client_params is None else kafka_admin_client_params.copy() - _admin_client[0] = next(kafka_broker.get_admin_clients(cnt=1, **params)) - return _admin_client[0] - - yield factory - - if _admin_client[0]: - _admin_client[0].close() - -@pytest.fixture -def topic(kafka_broker, request): - """Return a topic fixture""" - topic_name = '%s_%s' % (request.node.name, random_string(10)) - kafka_broker.create_topics([topic_name]) - return topic_name + metrics = Metrics() + try: + yield metrics + finally: + metrics.close() @pytest.fixture @@ -137,7 +27,9 @@ def conn(mocker): MetadataResponse[0]( [(0, 'foo', 12), (1, 'bar', 34)], # brokers [])) # topics + conn.connection_delay.return_value = 0 conn.blacked_out.return_value = False + conn.next_ifr_request_timeout_ms.return_value = float('inf') def _set_conn_state(state): conn.state = state return state @@ -151,25 +43,13 @@ def _set_conn_state(state): return conn -@pytest.fixture() -def send_messages(topic, kafka_producer, request): - """A factory that returns a send_messages function with a pre-populated - topic topic / producer.""" - - def _send_messages(number_range, partition=0, topic=topic, producer=kafka_producer, request=request): - """ - messages is typically `range(0,100)` - partition is an int - """ - messages_and_futures = [] # [(message, produce_future),] - for i in number_range: - # request.node.name provides the test name 
(including parametrized values) - encoded_msg = '{}-{}-{}'.format(i, request.node.name, uuid.uuid4()).encode('utf-8') - future = kafka_producer.send(topic, value=encoded_msg, partition=partition) - messages_and_futures.append((encoded_msg, future)) - kafka_producer.flush() - for (msg, f) in messages_and_futures: - assert f.succeeded() - return [msg for (msg, f) in messages_and_futures] +@pytest.fixture +def client(conn, mocker): + from kafka import KafkaClient - return _send_messages + cli = KafkaClient(api_version=(0, 9)) + mocker.patch.object(cli, '_init_connect', return_value=True) + try: + yield cli + finally: + cli._close() diff --git a/test/integration/__init__.py b/test/integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/integration/conftest.py b/test/integration/conftest.py new file mode 100644 index 000000000..8af729296 --- /dev/null +++ b/test/integration/conftest.py @@ -0,0 +1,168 @@ +from __future__ import absolute_import + +import os +import uuid + +import pytest + +from kafka.vendor.six.moves.urllib.parse import urlparse # pylint: disable=E0611,F0401 +from test.testutil import env_kafka_version, random_string +from test.integration.fixtures import KafkaFixture, ZookeeperFixture + + +@pytest.fixture(scope="module") +def zookeeper(): + """Return a Zookeeper fixture""" + if "ZOOKEEPER_URI" in os.environ: + parse = urlparse(os.environ["ZOOKEEPER_URI"]) + (host, port) = (parse.hostname, parse.port) + yield ZookeeperFixture.instance(host=host, port=port, external=True) + else: + zk_instance = ZookeeperFixture.instance() + yield zk_instance + zk_instance.close() + + +@pytest.fixture(scope="module") +def kafka_broker(kafka_broker_factory): + """Return a Kafka broker fixture""" + if "KAFKA_URI" in os.environ: + parse = urlparse(os.environ["KAFKA_URI"]) + (host, port) = (parse.hostname, parse.port) + return KafkaFixture.instance(0, host=host, port=port, external=True) + else: + return kafka_broker_factory() + + +@pytest.fixture(scope="module") +def kafka_broker_factory(): + """Return a Kafka broker fixture factory""" + assert env_kafka_version(), 'KAFKA_VERSION must be specified to run integration tests' + + _brokers = [] + def factory(**broker_params): + params = {} if broker_params is None else broker_params.copy() + params.setdefault('partitions', 4) + node_id = params.pop('node_id', 0) + broker = KafkaFixture.instance(node_id, **params) + _brokers.append(broker) + return broker + + yield factory + + zks = set() + for broker in _brokers: + zks.add(broker.zookeeper) + broker.close() + for zk in zks: + if zk: + zk.close() + + +@pytest.fixture +def kafka_client(kafka_broker, request): + """Return a KafkaClient fixture""" + (client,) = kafka_broker.get_clients(cnt=1, client_id='%s_client' % (request.node.name,)) + yield client + client.close() + + +@pytest.fixture +def kafka_consumer(kafka_consumer_factory): + """Return a KafkaConsumer fixture""" + return kafka_consumer_factory() + + +@pytest.fixture +def kafka_consumer_factory(kafka_broker, topic, request): + """Return a KafkaConsumer factory fixture""" + _consumer = [None] + + def factory(topics=(topic,), **kafka_consumer_params): + params = {} if kafka_consumer_params is None else kafka_consumer_params.copy() + params.setdefault('client_id', 'consumer_%s' % (request.node.name,)) + params.setdefault('auto_offset_reset', 'earliest') + _consumer[0] = next(kafka_broker.get_consumers(cnt=1, topics=list(topics), **params)) + return _consumer[0] + + yield factory + + if _consumer[0]: + 
_consumer[0].close() + + +@pytest.fixture +def kafka_producer(kafka_producer_factory): + """Return a KafkaProducer fixture""" + yield kafka_producer_factory() + + +@pytest.fixture +def kafka_producer_factory(kafka_broker, request): + """Return a KafkaProduce factory fixture""" + _producer = [None] + + def factory(**kafka_producer_params): + params = {} if kafka_producer_params is None else kafka_producer_params.copy() + params.setdefault('client_id', 'producer_%s' % (request.node.name,)) + _producer[0] = next(kafka_broker.get_producers(cnt=1, **params)) + return _producer[0] + + yield factory + + if _producer[0]: + _producer[0].close() + + +@pytest.fixture +def kafka_admin_client(kafka_admin_client_factory): + """Return a KafkaAdminClient fixture""" + yield kafka_admin_client_factory() + + +@pytest.fixture +def kafka_admin_client_factory(kafka_broker): + """Return a KafkaAdminClient factory fixture""" + _admin_client = [None] + + def factory(**kafka_admin_client_params): + params = {} if kafka_admin_client_params is None else kafka_admin_client_params.copy() + _admin_client[0] = next(kafka_broker.get_admin_clients(cnt=1, **params)) + return _admin_client[0] + + yield factory + + if _admin_client[0]: + _admin_client[0].close() + + +@pytest.fixture +def topic(kafka_broker, request): + """Return a topic fixture""" + topic_name = '%s_%s' % (request.node.name, random_string(10)) + kafka_broker.create_topics([topic_name]) + return topic_name + + +@pytest.fixture() +def send_messages(topic, kafka_producer, request): + """A factory that returns a send_messages function with a pre-populated + topic topic / producer.""" + + def _send_messages(number_range, partition=0, topic=topic, producer=kafka_producer, request=request): + """ + messages is typically `range(0,100)` + partition is an int + """ + messages_and_futures = [] # [(message, produce_future),] + for i in number_range: + # request.node.name provides the test name (including parametrized values) + encoded_msg = '{}-{}-{}'.format(i, request.node.name, uuid.uuid4()).encode('utf-8') + future = kafka_producer.send(topic, value=encoded_msg, partition=partition) + messages_and_futures.append((encoded_msg, future)) + kafka_producer.flush() + for (msg, f) in messages_and_futures: + assert f.succeeded() + return [msg for (msg, f) in messages_and_futures] + + return _send_messages diff --git a/test/fixtures.py b/test/integration/fixtures.py similarity index 66% rename from test/fixtures.py rename to test/integration/fixtures.py index 26fb5e89d..b9baf5223 100644 --- a/test/fixtures.py +++ b/test/integration/fixtures.py @@ -1,6 +1,7 @@ -from __future__ import absolute_import +from __future__ import absolute_import, division import atexit +import base64 import logging import os import os.path @@ -10,11 +11,11 @@ import uuid import py -from kafka.vendor.six.moves import urllib, range +from kafka.vendor.six.moves import range from kafka.vendor.six.moves.urllib.parse import urlparse # pylint: disable=E0611,F0401 from kafka import errors, KafkaAdminClient, KafkaClient, KafkaConsumer, KafkaProducer -from kafka.errors import InvalidReplicationFactorError +from kafka.errors import InvalidReplicationFactorError, KafkaTimeoutError from kafka.protocol.admin import CreateTopicsRequest from kafka.protocol.metadata import MetadataRequest from test.testutil import env_kafka_version, random_string @@ -25,7 +26,7 @@ def get_open_port(): sock = socket.socket() - sock.bind(("", 0)) + sock.bind(("127.0.0.1", 0)) port = sock.getsockname()[1] sock.close() return port @@ 
-65,53 +66,27 @@ class Fixture(object): kafka_version = os.environ.get('KAFKA_VERSION', '0.11.0.2') scala_version = os.environ.get("SCALA_VERSION", '2.8.0') project_root = os.environ.get('PROJECT_ROOT', - os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) + os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) kafka_root = os.environ.get("KAFKA_ROOT", os.path.join(project_root, 'servers', kafka_version, "kafka-bin")) def __init__(self): self.child = None + if not os.path.isdir(self.kafka_root): + raise FileNotFoundError(self.kafka_root) @classmethod - def download_official_distribution(cls, - kafka_version=None, - scala_version=None, - output_dir=None): - if not kafka_version: - kafka_version = cls.kafka_version - if not scala_version: - scala_version = cls.scala_version - if not output_dir: - output_dir = os.path.join(cls.project_root, 'servers', 'dist') - - distfile = 'kafka_%s-%s' % (scala_version, kafka_version,) - url_base = 'https://archive.apache.org/dist/kafka/%s/' % (kafka_version,) - output_file = os.path.join(output_dir, distfile + '.tgz') - - if os.path.isfile(output_file): - log.info("Found file already on disk: %s", output_file) - return output_file - - # New tarballs are .tgz, older ones are sometimes .tar.gz - try: - url = url_base + distfile + '.tgz' - log.info("Attempting to download %s", url) - response = urllib.request.urlopen(url) - except urllib.error.HTTPError: - log.exception("HTTP Error") - url = url_base + distfile + '.tar.gz' - log.info("Attempting to download %s", url) - response = urllib.request.urlopen(url) - - log.info("Saving distribution file to %s", output_file) - with open(output_file, 'w') as output_file_fd: - output_file_fd.write(response.read()) - - return output_file + def test_resource(cls, filename): + path = os.path.join(cls.project_root, "servers", cls.kafka_version, "resources", filename) + if os.path.isfile(path): + return path + return os.path.join(cls.project_root, "servers", "resources", "default", filename) @classmethod - def test_resource(cls, filename): - return os.path.join(cls.project_root, "servers", cls.kafka_version, "resources", filename) + def run_script(cls, script, *args): + result = [os.path.join(cls.kafka_root, 'bin', script)] + result.extend([str(arg) for arg in args]) + return result @classmethod def kafka_run_class_args(cls, *args): @@ -158,23 +133,18 @@ def dump_logs(self): class ZookeeperFixture(Fixture): @classmethod - def instance(cls): - if "ZOOKEEPER_URI" in os.environ: - parse = urlparse(os.environ["ZOOKEEPER_URI"]) - (host, port) = (parse.hostname, parse.port) - fixture = ExternalService(host, port) - else: - (host, port) = ("127.0.0.1", None) - fixture = cls(host, port) - + def instance(cls, host=None, port=None, external=False): + if host is None: + host = "127.0.0.1" + fixture = cls(host, port, external=external) fixture.open() return fixture - def __init__(self, host, port, tmp_dir=None): + def __init__(self, host, port, external=False, tmp_dir=None): super(ZookeeperFixture, self).__init__() self.host = host self.port = port - + self.running = external self.tmp_dir = tmp_dir def kafka_run_class_env(self): @@ -183,9 +153,12 @@ def kafka_run_class_env(self): return env def out(self, message): - log.info("*** Zookeeper [%s:%s]: %s", self.host, self.port or '(auto)', message) + if len(log.handlers) > 0: + log.info("*** Zookeeper [%s:%s]: %s", self.host, self.port or '(auto)', message) def open(self): + if self.running: + return if self.tmp_dir is None: self.tmp_dir = py.path.local.mkdtemp() 
#pylint: disable=no-member self.tmp_dir.ensure(dir=True) @@ -198,6 +171,7 @@ def open(self): # Configure Zookeeper child process template = self.test_resource("zookeeper.properties") properties = self.tmp_dir.join("zookeeper.properties") + # Consider replacing w/ run_script('zookeper-server-start.sh', ...) args = self.kafka_run_class_args("org.apache.zookeeper.server.quorum.QuorumPeerMain", properties.strpath) env = self.kafka_run_class_env() @@ -248,40 +222,52 @@ class KafkaFixture(Fixture): broker_password = 'alice-secret' @classmethod - def instance(cls, broker_id, zookeeper, zk_chroot=None, - host=None, port=None, - transport='PLAINTEXT', replicas=1, partitions=2, + def instance(cls, broker_id, zookeeper=None, zk_chroot=None, + host="localhost", port=None, external=False, + transport='PLAINTEXT', replicas=1, partitions=4, sasl_mechanism=None, auto_create_topic=True, tmp_dir=None): - if zk_chroot is None: - zk_chroot = "kafka-python_" + str(uuid.uuid4()).replace("-", "_") - if "KAFKA_URI" in os.environ: - parse = urlparse(os.environ["KAFKA_URI"]) - (host, port) = (parse.hostname, parse.port) - fixture = ExternalService(host, port) - else: - if host is None: - host = "localhost" - fixture = KafkaFixture(host, port, broker_id, - zookeeper, zk_chroot, - transport=transport, - replicas=replicas, partitions=partitions, - sasl_mechanism=sasl_mechanism, - auto_create_topic=auto_create_topic, - tmp_dir=tmp_dir) - - fixture.open() + # Kafka requries zookeeper prior to 4.0 release + if env_kafka_version() < (4, 0): + if zookeeper is None: + if "ZOOKEEPER_URI" in os.environ: + parse = urlparse(os.environ["ZOOKEEPER_URI"]) + (host, port) = (parse.hostname, parse.port) + zookeeper = ZookeeperFixture.instance(host=host, port=port, external=True) + elif not external: + zookeeper = ZookeeperFixture.instance() + if zk_chroot is None: + zk_chroot = "kafka-python_" + str(uuid.uuid4()).replace("-", "_") + + fixture = KafkaFixture(host, port, broker_id, + zookeeper=zookeeper, zk_chroot=zk_chroot, + external=external, + transport=transport, + replicas=replicas, partitions=partitions, + sasl_mechanism=sasl_mechanism, + auto_create_topic=auto_create_topic, + tmp_dir=tmp_dir) + + fixture.open() return fixture - def __init__(self, host, port, broker_id, zookeeper, zk_chroot, + def __init__(self, host, port, broker_id, zookeeper=None, zk_chroot=None, replicas=1, partitions=2, transport='PLAINTEXT', sasl_mechanism=None, auto_create_topic=True, - tmp_dir=None): + tmp_dir=None, external=False): super(KafkaFixture, self).__init__() self.host = host - self.port = port + self.controller_bootstrap_host = host + if port is None: + self.auto_port = True + self.port = get_open_port() + else: + self.auto_port = False + self.port = port + self.controller_port = get_open_port() + self.cluster_id = self._gen_cluster_id() self.broker_id = broker_id self.auto_create_topic = auto_create_topic self.transport = transport.upper() @@ -294,26 +280,40 @@ def __init__(self, host, port, broker_id, zookeeper, zk_chroot, # TODO: checking for port connection would be better than scanning logs # until then, we need the pattern to work across all supported broker versions # The logging format changed slightly in 1.0.0 - self.start_pattern = r"\[Kafka ?Server (id=)?%d\],? 
started" % (broker_id,) - # Need to wait until the broker has fetched user configs from zookeeper in case we use scram as sasl mechanism - self.scram_pattern = r"Removing Produce quota for user %s" % (self.broker_user) + if env_kafka_version() < (4, 0): + self.start_pattern = r"\[Kafka ?Server (id=)?%d\],? started" % (broker_id,) + # Need to wait until the broker has fetched user configs from zookeeper in case we use scram as sasl mechanism + self.scram_pattern = r"Removing Produce quota for user %s" % (self.broker_user) + else: + self.start_pattern = r"\[KafkaRaftServer nodeId=%d\] Kafka Server started" % (broker_id,) + self.scram_pattern = r"Replayed UserScramCredentialRecord creating new entry for %s" % (self.broker_user,) self.zookeeper = zookeeper self.zk_chroot = zk_chroot # Add the attributes below for the template binding - self.zk_host = self.zookeeper.host - self.zk_port = self.zookeeper.port + self.zk_host = self.zookeeper.host if self.zookeeper else None + self.zk_port = self.zookeeper.port if self.zookeeper else None self.replicas = replicas self.partitions = partitions self.tmp_dir = tmp_dir - self.running = False + self.external = external + + if self.external: + self.child = ExternalService(self.host, self.port) + (self._client,) = self.get_clients(1, client_id='_internal_client') + self.running = True + else: + self._client = None + self.running = False - self._client = None self.sasl_config = '' self.jaas_config = '' + def _gen_cluster_id(self): + return base64.b64encode(uuid.uuid4().bytes).decode('utf-8').rstrip('=') + def _sasl_config(self): if not self.sasl_enabled: return '' @@ -330,13 +330,13 @@ def _jaas_config(self): elif self.sasl_mechanism == 'PLAIN': jaas_config = ( - 'org.apache.kafka.common.security.plain.PlainLoginModule required\n' - ' username="{user}" password="{password}" user_{user}="{password}";\n' + 'org.apache.kafka.common.security.plain.PlainLoginModule required' + ' username="{user}" password="{password}" user_{user}="{password}";\n' ) elif self.sasl_mechanism in ("SCRAM-SHA-256", "SCRAM-SHA-512"): jaas_config = ( - 'org.apache.kafka.common.security.scram.ScramLoginModule required\n' - ' username="{user}" password="{password}";\n' + 'org.apache.kafka.common.security.scram.ScramLoginModule required' + ' username="{user}" password="{password}";\n' ) else: raise ValueError("SASL mechanism {} currently not supported".format(self.sasl_mechanism)) @@ -344,18 +344,16 @@ def _jaas_config(self): def _add_scram_user(self): self.out("Adding SCRAM credentials for user {} to zookeeper.".format(self.broker_user)) - args = self.kafka_run_class_args( - "kafka.admin.ConfigCommand", - "--zookeeper", - "%s:%d/%s" % (self.zookeeper.host, - self.zookeeper.port, - self.zk_chroot), - "--alter", - "--entity-type", "users", - "--entity-name", self.broker_user, - "--add-config", - "{}=[password={}]".format(self.sasl_mechanism, self.broker_password), - ) + args = self.run_script('kafka-configs.sh', + '--zookeeper', + '%s:%d/%s' % (self.zookeeper.host, + self.zookeeper.port, + self.zk_chroot), + '--alter', + '--entity-type', 'users', + '--entity-name', self.broker_user, + '--add-config', + '{}=[password={}]'.format(self.sasl_mechanism, self.broker_password)) env = self.kafka_run_class_env() proc = subprocess.Popen(args, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -381,17 +379,17 @@ def kafka_run_class_env(self): return env def out(self, message): - log.info("*** Kafka [%s:%s]: %s", self.host, self.port or '(auto)', message) + if len(log.handlers) > 0: + 
log.info("*** Kafka [%s:%s]: %s", self.host, self.port or '(auto)', message) def _create_zk_chroot(self): self.out("Creating Zookeeper chroot node...") - args = self.kafka_run_class_args("org.apache.zookeeper.ZooKeeperMain", - "-server", - "%s:%d" % (self.zookeeper.host, - self.zookeeper.port), - "create", - "/%s" % (self.zk_chroot,), - "kafka-python") + args = self.run_script('zookeeper-shell.sh', + '%s:%d' % (self.zookeeper.host, + self.zookeeper.port), + 'create', + '/%s' % (self.zk_chroot,), + 'kafka-python') env = self.kafka_run_class_env() proc = subprocess.Popen(args, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -405,12 +403,15 @@ def _create_zk_chroot(self): self.out("Kafka chroot created in Zookeeper!") def start(self): + if self.running: + return True # Configure Kafka child process properties = self.tmp_dir.join("kafka.properties") jaas_conf = self.tmp_dir.join("kafka_server_jaas.conf") properties_template = self.test_resource("kafka.properties") jaas_conf_template = self.test_resource("kafka_server_jaas.conf") + # Consider replacing w/ run_script('kafka-server-start.sh', ...) args = self.kafka_run_class_args("kafka.Kafka", properties.strpath) env = self.kafka_run_class_env() if self.sasl_enabled: @@ -424,12 +425,11 @@ def start(self): backoff = 1 end_at = time.time() + max_timeout tries = 1 - auto_port = (self.port is None) while time.time() < end_at: # We have had problems with port conflicts on travis # so we will try a different port on each retry # unless the fixture was passed a specific port - if auto_port: + if self.auto_port: self.port = get_open_port() self.out('Attempting to start on port %d (try #%d)' % (self.port, tries)) self.render_template(properties_template, properties, vars(self)) @@ -475,6 +475,9 @@ def open(self): self.tmp_dir.ensure(dir=True) self.tmp_dir.ensure('logs', dir=True) self.tmp_dir.ensure('data', dir=True) + properties = self.tmp_dir.join('kafka.properties') + properties_template = self.test_resource('kafka.properties') + self.render_template(properties_template, properties, vars(self)) self.out("Running local instance...") log.info(" host = %s", self.host) @@ -482,19 +485,26 @@ def open(self): log.info(" transport = %s", self.transport) log.info(" sasl_mechanism = %s", self.sasl_mechanism) log.info(" broker_id = %s", self.broker_id) - log.info(" zk_host = %s", self.zookeeper.host) - log.info(" zk_port = %s", self.zookeeper.port) + log.info(" zk_host = %s", self.zk_host) + log.info(" zk_port = %s", self.zk_port) log.info(" zk_chroot = %s", self.zk_chroot) log.info(" replicas = %s", self.replicas) log.info(" partitions = %s", self.partitions) log.info(" tmp_dir = %s", self.tmp_dir.strpath) - self._create_zk_chroot() + if self.zookeeper: + if self.zk_chroot: + self._create_zk_chroot() + # add user to zookeeper for the first server + if self.sasl_enabled and self.sasl_mechanism.startswith("SCRAM-SHA") and self.broker_id == 0: + self._add_scram_user() + + else: + # running in KRaft mode + self._format_log_dirs() + self.sasl_config = self._sasl_config() self.jaas_config = self._jaas_config() - # add user to zookeeper for the first server - if self.sasl_enabled and self.sasl_mechanism.startswith("SCRAM-SHA") and self.broker_id == 0: - self._add_scram_user() self.start() atexit.register(self.close) @@ -503,6 +513,8 @@ def __del__(self): self.close() def stop(self): + if self.external: + return if not self.running: self.out("Instance already stopped") return @@ -524,6 +536,21 @@ def dump_logs(self): super(KafkaFixture, self).dump_logs() 
self.zookeeper.dump_logs() + def _format_log_dirs(self): + self.out("Formatting log dirs for kraft bootstrapping") + args = self.run_script('kafka-storage.sh', 'format', '--standalone', '-t', self.cluster_id, '-c', self.tmp_dir.join("kafka.properties")) + if self.sasl_enabled and self.sasl_mechanism.startswith("SCRAM-SHA"): + args.extend(['--add-scram', '{}=[name={},password={}]'.format(self.sasl_mechanism, self.broker_user, self.broker_password)]) + env = self.kafka_run_class_env() + proc = subprocess.Popen(args, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = proc.communicate() + if proc.returncode != 0: + self.out("Failed to format log dirs for kraft bootstrap!") + self.out(stdout) + self.out(stderr) + raise RuntimeError("Failed to format log dirs!") + return True + def _send_request(self, request, timeout=None): def _failure(error): raise error @@ -536,13 +563,15 @@ def _failure(error): break self._client.poll(timeout_ms=100) else: - raise RuntimeError('Could not connect to broker with node id %d' % (node_id,)) + raise RuntimeError('Could not connect to broker with node id %s' % (node_id,)) try: future = self._client.send(node_id, request) future.error_on_callbacks = True future.add_errback(_failure) self._client.poll(future=future, timeout_ms=timeout) + if not future.is_done: + raise KafkaTimeoutError() return future.value except Exception as exc: time.sleep(1) @@ -561,8 +590,9 @@ def _create_topic(self, topic_name, num_partitions=None, replication_factor=None # Try different methods to create a topic, from the fastest to the slowest if self.auto_create_topic and num_partitions == self.partitions and replication_factor == self.replicas: self._create_topic_via_metadata(topic_name, timeout_ms) - elif env_kafka_version() >= (0, 10, 1, 0): + elif env_kafka_version() >= (0, 10, 1, 0) and env_kafka_version() < (4, 0): try: + # 4.0 brokers dropped support for CreateTopicsRequest v0 (TODO: pick from api_versions) self._create_topic_via_admin_api(topic_name, num_partitions, replication_factor, timeout_ms) except InvalidReplicationFactorError: # wait and try again @@ -573,7 +603,15 @@ def _create_topic(self, topic_name, num_partitions=None, replication_factor=None self._create_topic_via_cli(topic_name, num_partitions, replication_factor) def _create_topic_via_metadata(self, topic_name, timeout_ms=10000): - self._send_request(MetadataRequest[0]([topic_name]), timeout_ms) + timeout_at = time.time() + timeout_ms / 1000 + while time.time() < timeout_at: + response = self._send_request(MetadataRequest[0]([topic_name]), timeout_ms) + if response.topics[0][0] == 0: + return + log.warning("Unable to create topic via MetadataRequest: err %d", response.topics[0][0]) + time.sleep(1) + else: + raise RuntimeError('Unable to create topic via MetadataRequest') def _create_topic_via_admin_api(self, topic_name, num_partitions, replication_factor, timeout_ms=10000): request = CreateTopicsRequest[0]([(topic_name, num_partitions, @@ -585,17 +623,15 @@ def _create_topic_via_admin_api(self, topic_name, num_partitions, replication_fa raise errors.for_code(error_code) def _create_topic_via_cli(self, topic_name, num_partitions, replication_factor): - args = self.kafka_run_class_args('kafka.admin.TopicCommand', - '--zookeeper', '%s:%s/%s' % (self.zookeeper.host, - self.zookeeper.port, - self.zk_chroot), - '--create', - '--topic', topic_name, - '--partitions', self.partitions \ - if num_partitions is None else num_partitions, - '--replication-factor', self.replicas \ - if replication_factor is 
None \ - else replication_factor) + args = self.run_script('kafka-topics.sh', + '--create', + '--topic', topic_name, + '--partitions', self.partitions \ + if num_partitions is None else num_partitions, + '--replication-factor', self.replicas \ + if replication_factor is None \ + else replication_factor, + *self._cli_connect_args()) if env_kafka_version() >= (0, 10): args.append('--if-not-exists') env = self.kafka_run_class_env() @@ -608,16 +644,23 @@ def _create_topic_via_cli(self, topic_name, num_partitions, replication_factor): self.out(stderr) raise RuntimeError("Failed to create topic %s" % (topic_name,)) + def _cli_connect_args(self): + if env_kafka_version() < (3, 0, 0): + return ['--zookeeper', '%s:%s/%s' % (self.zookeeper.host, self.zookeeper.port, self.zk_chroot)] + else: + args = ['--bootstrap-server', '%s:%s' % (self.host, self.port)] + if self.sasl_enabled: + command_conf = self.tmp_dir.join("sasl_command.conf") + self.render_template(self.test_resource("sasl_command.conf"), command_conf, vars(self)) + args.append('--command-config') + args.append(command_conf.strpath) + return args + def get_topic_names(self): - args = self.kafka_run_class_args('kafka.admin.TopicCommand', - '--zookeeper', '%s:%s/%s' % (self.zookeeper.host, - self.zookeeper.port, - self.zk_chroot), - '--list' - ) + cmd = self.run_script('kafka-topics.sh', '--list', *self._cli_connect_args()) env = self.kafka_run_class_env() env.pop('KAFKA_LOG4J_OPTS') - proc = subprocess.Popen(args, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = proc.communicate() if proc.returncode != 0: self.out("Failed to list topics!") @@ -671,3 +714,52 @@ def get_producers(self, cnt, **params): params = self._enrich_client_params(params, client_id='producer') for client in self._create_many_clients(cnt, KafkaProducer, **params): yield client + + +def get_api_versions(): + logging.basicConfig(level=logging.ERROR) + zk = ZookeeperFixture.instance() + k = KafkaFixture.instance(0, zk) + + from kafka import KafkaClient + client = KafkaClient(bootstrap_servers='localhost:{}'.format(k.port)) + client.check_version() + + from pprint import pprint + + pprint(client.get_api_versions()) + + client.close() + k.close() + zk.close() + + +def run_brokers(): + logging.basicConfig(level=logging.ERROR) + k = KafkaFixture.instance(0) + zk = k.zookeeper + + print("Kafka", k.kafka_version, "running on port:", k.port) + try: + while True: + time.sleep(5) + except KeyboardInterrupt: + print("Bye!") + k.close() + if zk: + zk.close() + + +if __name__ == '__main__': + import sys + if len(sys.argv) < 2: + print("Commands: get_api_versions") + exit(0) + cmd = sys.argv[1] + if cmd == 'get_api_versions': + get_api_versions() + elif cmd == 'kafka': + run_brokers() + else: + print("Unknown cmd: %s", cmd) + exit(1) diff --git a/test/test_admin_integration.py b/test/integration/test_admin_integration.py similarity index 74% rename from test/test_admin_integration.py rename to test/integration/test_admin_integration.py index 06c40a223..f95f367e8 100644 --- a/test/test_admin_integration.py +++ b/test/integration/test_admin_integration.py @@ -1,3 +1,4 @@ +from kafka.structs import TopicPartition import pytest from logging import info @@ -7,7 +8,9 @@ from kafka.admin import ( ACLFilter, ACLOperation, ACLPermissionType, ResourcePattern, ResourceType, ACL, ConfigResource, ConfigResourceType) -from kafka.errors import (NoError, GroupCoordinatorNotAvailableError, 
NonEmptyGroupError, GroupIdNotFoundError) +from kafka.errors import ( + BrokerResponseError, KafkaError, NoError, CoordinatorNotAvailableError, NonEmptyGroupError, + GroupIdNotFoundError, OffsetOutOfRangeError, UnknownTopicOrPartitionError) @pytest.mark.skipif(env_kafka_version() < (0, 11), reason="ACL features require broker >=0.11") @@ -140,15 +143,15 @@ def test_describe_configs_invalid_broker_id_raises(kafka_admin_client): broker_id = "str" with pytest.raises(ValueError): - configs = kafka_admin_client.describe_configs([ConfigResource(ConfigResourceType.BROKER, broker_id)]) + kafka_admin_client.describe_configs([ConfigResource(ConfigResourceType.BROKER, broker_id)]) @pytest.mark.skipif(env_kafka_version() < (0, 11), reason='Describe consumer group requires broker >=0.11') def test_describe_consumer_group_does_not_exist(kafka_admin_client): """Tests that the describe consumer group call fails if the group coordinator is not available """ - with pytest.raises(GroupCoordinatorNotAvailableError): - group_description = kafka_admin_client.describe_consumer_groups(['test']) + with pytest.raises(CoordinatorNotAvailableError): + kafka_admin_client.describe_consumer_groups(['test']) @pytest.mark.skipif(env_kafka_version() < (0, 11), reason='Describe consumer group requires broker >=0.11') @@ -168,7 +171,7 @@ def consumer_thread(i, group_id): stop[i] = Event() consumers[i] = kafka_consumer_factory(group_id=group_id) while not stop[i].is_set(): - consumers[i].poll(20) + consumers[i].poll(timeout_ms=200) consumers[i].close() consumers[i] = None stop[i] = None @@ -183,6 +186,7 @@ def consumer_thread(i, group_id): try: timeout = time() + 35 while True: + info('Checking consumers...') for c in range(num_consumers): # Verify all consumers have been created @@ -212,9 +216,9 @@ def consumer_thread(i, group_id): if not rejoining and is_same_generation: break - else: - sleep(1) assert time() < timeout, "timeout waiting for assignments" + info('sleeping...') + sleep(1) info('Group stabilized; verifying assignment') output = kafka_admin_client.describe_consumer_groups(group_id_list) @@ -236,6 +240,8 @@ def consumer_thread(i, group_id): for c in range(num_consumers): info('Stopping consumer %s', c) stop[c].set() + for c in range(num_consumers): + info('Waiting for consumer thread %s', c) threads[c].join() threads[c] = None @@ -312,3 +318,71 @@ def test_delete_consumergroups_with_errors(kafka_admin_client, kafka_consumer_fa assert group1 not in consumergroups assert group2 in consumergroups assert group3 not in consumergroups + +@pytest.fixture(name="topic2") +def _topic2(kafka_broker, request): + """Same as `topic` fixture, but a different name if you need to topics.""" + topic_name = '%s_%s' % (request.node.name, random_string(10)) + kafka_broker.create_topics([topic_name]) + return topic_name + +@pytest.mark.skipif(env_kafka_version() < (0, 11), reason="Delete records requires broker >=0.11.0") +def test_delete_records(kafka_admin_client, kafka_consumer_factory, send_messages, topic, topic2): + t0p0 = TopicPartition(topic, 0) + t0p1 = TopicPartition(topic, 1) + t0p2 = TopicPartition(topic, 2) + t1p0 = TopicPartition(topic2, 0) + t1p1 = TopicPartition(topic2, 1) + t1p2 = TopicPartition(topic2, 2) + + partitions = (t0p0, t0p1, t0p2, t1p0, t1p1, t1p2) + + for p in partitions: + send_messages(range(0, 100), partition=p.partition, topic=p.topic) + + consumer1 = kafka_consumer_factory(group_id=None, topics=()) + consumer1.assign(partitions) + for _ in range(600): + next(consumer1) + + result = 
kafka_admin_client.delete_records({t0p0: -1, t0p1: 50, t1p0: 40, t1p2: 30}, timeout_ms=1000) + assert result[t0p0] == {"low_watermark": 100, "error_code": 0, "partition_index": t0p0.partition} + assert result[t0p1] == {"low_watermark": 50, "error_code": 0, "partition_index": t0p1.partition} + assert result[t1p0] == {"low_watermark": 40, "error_code": 0, "partition_index": t1p0.partition} + assert result[t1p2] == {"low_watermark": 30, "error_code": 0, "partition_index": t1p2.partition} + + consumer2 = kafka_consumer_factory(group_id=None, topics=()) + consumer2.assign(partitions) + all_messages = consumer2.poll(max_records=600, timeout_ms=2000) + assert sum(len(x) for x in all_messages.values()) == 600 - 100 - 50 - 40 - 30 + assert not consumer2.poll(max_records=1, timeout_ms=1000) # ensure there are no delayed messages + + assert not all_messages.get(t0p0, []) + assert [r.offset for r in all_messages[t0p1]] == list(range(50, 100)) + assert [r.offset for r in all_messages[t0p2]] == list(range(100)) + + assert [r.offset for r in all_messages[t1p0]] == list(range(40, 100)) + assert [r.offset for r in all_messages[t1p1]] == list(range(100)) + assert [r.offset for r in all_messages[t1p2]] == list(range(30, 100)) + + +@pytest.mark.skipif(env_kafka_version() < (0, 11), reason="Delete records requires broker >=0.11.0") +def test_delete_records_with_errors(kafka_admin_client, topic, send_messages): + sleep(1) # sometimes the topic is not created yet...? + p0 = TopicPartition(topic, 0) + p1 = TopicPartition(topic, 1) + p2 = TopicPartition(topic, 2) + # verify that topic has been created + send_messages(range(0, 1), partition=p2.partition, topic=p2.topic) + + with pytest.raises(UnknownTopicOrPartitionError): + kafka_admin_client.delete_records({TopicPartition(topic, 9999): -1}) + with pytest.raises(UnknownTopicOrPartitionError): + kafka_admin_client.delete_records({TopicPartition("doesntexist", 0): -1}) + with pytest.raises(OffsetOutOfRangeError): + kafka_admin_client.delete_records({p0: 1000}) + with pytest.raises(BrokerResponseError): + kafka_admin_client.delete_records({p0: 1000, p1: 1000}) + + + diff --git a/test/test_consumer_group.py b/test/integration/test_consumer_group.py similarity index 67% rename from test/test_consumer_group.py rename to test/integration/test_consumer_group.py index 58dc7ebf9..b2908c757 100644 --- a/test/test_consumer_group.py +++ b/test/integration/test_consumer_group.py @@ -23,7 +23,7 @@ def test_consumer(kafka_broker, topic): # The `topic` fixture is included because # 0.8.2 brokers need a topic to function well consumer = KafkaConsumer(bootstrap_servers=get_connect_str(kafka_broker)) - consumer.poll(500) + consumer.poll(timeout_ms=500) assert len(consumer._client._conns) > 0 node_id = list(consumer._client._conns.keys())[0] assert consumer._client._conns[node_id].state is ConnectionStates.CONNECTED @@ -34,7 +34,7 @@ def test_consumer(kafka_broker, topic): def test_consumer_topics(kafka_broker, topic): consumer = KafkaConsumer(bootstrap_servers=get_connect_str(kafka_broker)) # Necessary to drive the IO - consumer.poll(500) + consumer.poll(timeout_ms=500) assert topic in consumer.topics() assert len(consumer.partitions_for_topic(topic)) > 0 consumer.close() @@ -47,7 +47,7 @@ def test_group(kafka_broker, topic): consumers = {} stop = {} threads = {} - messages = collections.defaultdict(list) + messages = collections.defaultdict(lambda: collections.defaultdict(list)) group_id = 'test-group-' + random_string(6) def consumer_thread(i): assert i not in consumers @@ -56,58 
+56,62 @@ def consumer_thread(i): consumers[i] = KafkaConsumer(topic, bootstrap_servers=connect_str, group_id=group_id, + client_id="consumer_thread-%s" % i, + api_version_auto_timeout_ms=5000, heartbeat_interval_ms=500) while not stop[i].is_set(): - for tp, records in six.itervalues(consumers[i].poll(100)): + for tp, records in six.iteritems(consumers[i].poll(timeout_ms=200)): messages[i][tp].extend(records) - consumers[i].close() + consumers[i].close(timeout_ms=500) consumers[i] = None stop[i] = None num_consumers = 4 for i in range(num_consumers): t = threading.Thread(target=consumer_thread, args=(i,)) + t.daemon = True t.start() threads[i] = t try: - timeout = time.time() + 35 + timeout = time.time() + 15 while True: - for c in range(num_consumers): - - # Verify all consumers have been created - if c not in consumers: - break - - # Verify all consumers have an assignment - elif not consumers[c].assignment(): - break + assert time.time() < timeout, "timeout waiting for assignments" + # Verify all consumers have been created + missing_consumers = set(consumers.keys()) - set(range(num_consumers)) + if missing_consumers: + logging.info('Waiting on consumer threads: %s', missing_consumers) + time.sleep(1) + continue + + unassigned_consumers = {c for c, consumer in six.iteritems(consumers) if not consumer.assignment()} + if unassigned_consumers: + logging.info('Waiting for consumer assignments: %s', unassigned_consumers) + time.sleep(1) + continue # If all consumers exist and have an assignment + logging.info('All consumers have assignment... checking for stable group') + # Verify all consumers are in the same generation + # then log state and break while loop + generations = set([consumer._coordinator._generation.generation_id + for consumer in six.itervalues(consumers)]) + + # New generation assignment is not complete until + # coordinator.rejoining = False + rejoining = set([c for c, consumer in six.iteritems(consumers) if consumer._coordinator.rejoining]) + + if not rejoining and len(generations) == 1: + for c, consumer in six.iteritems(consumers): + logging.info("[%s] %s %s: %s", c, + consumer._coordinator._generation.generation_id, + consumer._coordinator._generation.member_id, + consumer.assignment()) + break else: - - logging.info('All consumers have assignment... 
checking for stable group') - # Verify all consumers are in the same generation - # then log state and break while loop - generations = set([consumer._coordinator._generation.generation_id - for consumer in list(consumers.values())]) - - # New generation assignment is not complete until - # coordinator.rejoining = False - rejoining = any([consumer._coordinator.rejoining - for consumer in list(consumers.values())]) - - if not rejoining and len(generations) == 1: - for c, consumer in list(consumers.items()): - logging.info("[%s] %s %s: %s", c, - consumer._coordinator._generation.generation_id, - consumer._coordinator._generation.member_id, - consumer.assignment()) - break - else: - logging.info('Rejoining: %s, generations: %s', rejoining, generations) - time.sleep(1) - assert time.time() < timeout, "timeout waiting for assignments" + logging.info('Rejoining: %s, generations: %s', rejoining, generations) + time.sleep(1) + continue logging.info('Group stabilized; verifying assignment') group_assignment = set() @@ -126,7 +130,8 @@ def consumer_thread(i): for c in range(num_consumers): logging.info('Stopping consumer %s', c) stop[c].set() - threads[c].join() + threads[c].join(timeout=5) + assert not threads[c].is_alive() threads[c] = None @@ -176,4 +181,4 @@ def test_heartbeat_thread(kafka_broker, topic): assert consumer._coordinator.heartbeat.last_poll == last_poll consumer.poll(timeout_ms=100) assert consumer._coordinator.heartbeat.last_poll > last_poll - consumer.close() + consumer.close(timeout_ms=100) diff --git a/test/test_consumer_integration.py b/test/integration/test_consumer_integration.py similarity index 92% rename from test/test_consumer_integration.py rename to test/integration/test_consumer_integration.py index 90b7ed203..71cf2642d 100644 --- a/test/test_consumer_integration.py +++ b/test/integration/test_consumer_integration.py @@ -1,18 +1,22 @@ import logging import time -from mock import patch +try: + from unittest.mock import patch, ANY +except ImportError: + from mock import patch, ANY import pytest from kafka.vendor.six.moves import range import kafka.codec -from kafka.errors import UnsupportedCodecError, UnsupportedVersionError +from kafka.errors import KafkaTimeoutError, UnsupportedCodecError, UnsupportedVersionError from kafka.structs import TopicPartition, OffsetAndTimestamp from test.testutil import Timer, assert_message_count, env_kafka_version, random_string @pytest.mark.skipif(not env_kafka_version(), reason="No KAFKA_VERSION set") +@pytest.mark.skipif(env_kafka_version()[:2] > (2, 6, 0), reason="KAFKA_VERSION newer than max inferred version") def test_kafka_version_infer(kafka_consumer_factory): consumer = kafka_consumer_factory() actual_ver_major_minor = env_kafka_version()[:2] @@ -26,7 +30,7 @@ def test_kafka_version_infer(kafka_consumer_factory): @pytest.mark.skipif(not env_kafka_version(), reason="No KAFKA_VERSION set") def test_kafka_consumer(kafka_consumer_factory, send_messages): """Test KafkaConsumer""" - consumer = kafka_consumer_factory(auto_offset_reset='earliest') + consumer = kafka_consumer_factory(auto_offset_reset='earliest', consumer_timeout_ms=2000) send_messages(range(0, 100), partition=0) send_messages(range(0, 100), partition=1) cnt = 0 @@ -64,8 +68,8 @@ def test_kafka_consumer_unsupported_encoding( def test_kafka_consumer__blocking(kafka_consumer_factory, topic, send_messages): TIMEOUT_MS = 500 consumer = kafka_consumer_factory(auto_offset_reset='earliest', - enable_auto_commit=False, - consumer_timeout_ms=TIMEOUT_MS) + enable_auto_commit=False, 
+ consumer_timeout_ms=TIMEOUT_MS) # Manual assignment avoids overhead of consumer group mgmt consumer.unsubscribe() @@ -257,9 +261,10 @@ def test_kafka_consumer_offsets_search_many_partitions(kafka_consumer, kafka_pro tp1: send_time }) + leader_epoch = ANY if env_kafka_version() >= (2, 1) else -1 assert offsets == { - tp0: OffsetAndTimestamp(p0msg.offset, send_time), - tp1: OffsetAndTimestamp(p1msg.offset, send_time) + tp0: OffsetAndTimestamp(p0msg.offset, send_time, leader_epoch), + tp1: OffsetAndTimestamp(p1msg.offset, send_time, leader_epoch) } offsets = consumer.beginning_offsets([tp0, tp1]) @@ -275,6 +280,7 @@ def test_kafka_consumer_offsets_search_many_partitions(kafka_consumer, kafka_pro } +@pytest.mark.skipif(not env_kafka_version(), reason="No KAFKA_VERSION set") @pytest.mark.skipif(env_kafka_version() >= (0, 10, 1), reason="Requires KAFKA_VERSION < 0.10.1") def test_kafka_consumer_offsets_for_time_old(kafka_consumer, topic): consumer = kafka_consumer @@ -294,4 +300,5 @@ def test_kafka_consumer_offsets_for_times_errors(kafka_consumer_factory, topic): with pytest.raises(ValueError): consumer.offsets_for_times({tp: -1}) - assert consumer.offsets_for_times({bad_tp: 0}) == {bad_tp: None} + with pytest.raises(KafkaTimeoutError): + consumer.offsets_for_times({bad_tp: 0}) diff --git a/test/integration/test_producer_integration.py b/test/integration/test_producer_integration.py new file mode 100644 index 000000000..037a82834 --- /dev/null +++ b/test/integration/test_producer_integration.py @@ -0,0 +1,207 @@ +from __future__ import absolute_import + +from contextlib import contextmanager +import platform +import time + +import pytest + +from kafka import KafkaAdminClient, KafkaConsumer, KafkaProducer, TopicPartition, OffsetAndMetadata +from test.testutil import env_kafka_version, random_string, maybe_skip_unsupported_compression + + +@contextmanager +def producer_factory(**kwargs): + producer = KafkaProducer(**kwargs) + try: + yield producer + finally: + producer.close(timeout=1) + + +@contextmanager +def consumer_factory(**kwargs): + consumer = KafkaConsumer(**kwargs) + try: + yield consumer + finally: + consumer.close(timeout_ms=100) + + +@contextmanager +def admin_factory(**kwargs): + admin = KafkaAdminClient(**kwargs) + try: + yield admin + finally: + admin.close() + + +@pytest.mark.skipif(not env_kafka_version(), reason="No KAFKA_VERSION set") +@pytest.mark.parametrize("compression", [None, 'gzip', 'snappy', 'lz4', 'zstd']) +def test_end_to_end(kafka_broker, compression): + maybe_skip_unsupported_compression(compression) + if compression == 'lz4': + if env_kafka_version() < (0, 8, 2): + pytest.skip('LZ4 requires 0.8.2') + elif platform.python_implementation() == 'PyPy': + pytest.skip('python-lz4 crashes on older versions of pypy') + + if compression == 'zstd' and env_kafka_version() < (2, 1, 0): + pytest.skip('zstd requires kafka 2.1.0 or newer') + + connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)]) + producer_args = { + 'bootstrap_servers': connect_str, + 'retries': 5, + 'max_block_ms': 30000, + 'compression_type': compression, + 'value_serializer': str.encode, + } + consumer_args = { + 'bootstrap_servers': connect_str, + 'group_id': None, + 'consumer_timeout_ms': 30000, + 'auto_offset_reset': 'earliest', + 'value_deserializer': bytes.decode, + } + with producer_factory(**producer_args) as producer, consumer_factory(**consumer_args) as consumer: + topic = random_string(5) + + messages = 100 + futures = [] + for i in range(messages): + 
futures.append(producer.send(topic, 'msg %d' % i)) + ret = [f.get(timeout=30) for f in futures] + assert len(ret) == messages + + consumer.subscribe([topic]) + msgs = set() + for i in range(messages): + try: + msgs.add(next(consumer).value) + except StopIteration: + break + + assert msgs == set(['msg %d' % (i,) for i in range(messages)]) + + +@pytest.mark.skipif(not env_kafka_version(), reason="No KAFKA_VERSION set") +@pytest.mark.parametrize("compression", [None, 'gzip', 'snappy', 'lz4', 'zstd']) +def test_kafka_producer_proper_record_metadata(kafka_broker, compression): + maybe_skip_unsupported_compression(compression) + if compression == 'zstd' and env_kafka_version() < (2, 1, 0): + pytest.skip('zstd requires 2.1.0 or more') + connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)]) + with producer_factory(bootstrap_servers=connect_str, + retries=5, + max_block_ms=30000, + compression_type=compression) as producer: + magic = producer.max_usable_produce_magic(producer.config['api_version']) + + # record headers are supported in 0.11.0 + if env_kafka_version() < (0, 11, 0): + headers = None + else: + headers = [("Header Key", b"Header Value")] + + topic = random_string(5) + future = producer.send( + topic, + value=b"Simple value", key=b"Simple key", headers=headers, timestamp_ms=9999999, + partition=0) + record = future.get(timeout=5) + assert record is not None + assert record.topic == topic + assert record.partition == 0 + assert record.topic_partition == TopicPartition(topic, 0) + assert record.offset == 0 + if magic >= 1: + assert record.timestamp == 9999999 + else: + assert record.timestamp == -1 # NO_TIMESTAMP + + if magic >= 2: + assert record.checksum is None + elif magic == 1: + assert record.checksum == 1370034956 + else: + assert record.checksum == 3296137851 + + assert record.serialized_key_size == 10 + assert record.serialized_value_size == 12 + if headers: + assert record.serialized_header_size == 22 + + if magic == 0: + pytest.skip('generated timestamp case is skipped for broker 0.9 and below') + send_time = time.time() * 1000 + future = producer.send( + topic, + value=b"Simple value", key=b"Simple key", timestamp_ms=None, + partition=0) + record = future.get(timeout=5) + assert abs(record.timestamp - send_time) <= 1000 # Allow 1s deviation + + +@pytest.mark.skipif(env_kafka_version() < (0, 11), reason="Idempotent producer requires broker >=0.11") +def test_idempotent_producer(kafka_broker): + connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)]) + with producer_factory(bootstrap_servers=connect_str, enable_idempotence=True) as producer: + for _ in range(10): + producer.send('idempotent_test_topic', value=b'idempotent_msg').get(timeout=1) + + +@pytest.mark.skipif(env_kafka_version() < (0, 11), reason="Idempotent producer requires broker >=0.11") +def test_transactional_producer_messages(kafka_broker): + connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)]) + with producer_factory(bootstrap_servers=connect_str, transactional_id='testing') as producer: + producer.init_transactions() + producer.begin_transaction() + producer.send('transactional_test_topic', partition=0, value=b'msg1').get() + producer.send('transactional_test_topic', partition=0, value=b'msg2').get() + producer.abort_transaction() + producer.begin_transaction() + producer.send('transactional_test_topic', partition=0, value=b'msg3').get() + producer.send('transactional_test_topic', partition=0, value=b'msg4').get() + producer.commit_transaction() + + messages = set() + 
consumer_opts = { + 'bootstrap_servers': connect_str, + 'group_id': None, + 'consumer_timeout_ms': 10000, + 'auto_offset_reset': 'earliest', + 'isolation_level': 'read_committed', + } + with consumer_factory(**consumer_opts) as consumer: + consumer.assign([TopicPartition('transactional_test_topic', 0)]) + for msg in consumer: + assert msg.value in {b'msg3', b'msg4'} + messages.add(msg.value) + if messages == {b'msg3', b'msg4'}: + break + assert messages == {b'msg3', b'msg4'} + + +@pytest.mark.skipif(env_kafka_version() < (0, 11), reason="Idempotent producer requires broker >=0.11") +def test_transactional_producer_offsets(kafka_broker): + connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)]) + # Setting leader_epoch only supported in 2.1+ + if env_kafka_version() >= (2, 1): + leader_epoch = 0 + else: + leader_epoch = -1 + offsets = {TopicPartition('transactional_test_topic', 0): OffsetAndMetadata(0, 'metadata', leader_epoch)} + with producer_factory(bootstrap_servers=connect_str, transactional_id='testing') as producer: + producer.init_transactions() + producer.begin_transaction() + producer.send_offsets_to_transaction(offsets, 'txn-test-group') + producer.commit_transaction() + + producer.begin_transaction() + producer.send_offsets_to_transaction({TopicPartition('transactional_test_topic', 1): OffsetAndMetadata(1, 'bad', 1)}, 'txn-test-group') + producer.abort_transaction() + + with admin_factory(bootstrap_servers=connect_str) as admin: + assert admin.list_consumer_group_offsets('txn-test-group') == offsets diff --git a/test/test_sasl_integration.py b/test/integration/test_sasl_integration.py similarity index 84% rename from test/test_sasl_integration.py rename to test/integration/test_sasl_integration.py index e3a4813ae..69323fb92 100644 --- a/test/test_sasl_integration.py +++ b/test/integration/test_sasl_integration.py @@ -1,5 +1,6 @@ import logging import uuid +import time import pytest @@ -24,7 +25,7 @@ ] ) def sasl_kafka(request, kafka_broker_factory): - sasl_kafka = kafka_broker_factory(transport="SASL_PLAINTEXT", sasl_mechanism=request.param)[0] + sasl_kafka = kafka_broker_factory(transport="SASL_PLAINTEXT", sasl_mechanism=request.param) yield sasl_kafka sasl_kafka.child.dump_logs() @@ -69,12 +70,17 @@ def test_client(request, sasl_kafka): client, = sasl_kafka.get_clients(1) request = MetadataRequest_v1(None) - client.send(0, request) - for _ in range(10): - result = client.poll(timeout_ms=10000) - if len(result) > 0: - break - else: + timeout_at = time.time() + 1 + while not client.is_ready(0): + client.maybe_connect(0) + client.poll(timeout_ms=100) + if time.time() > timeout_at: + raise RuntimeError("Couldn't connect to node 0") + future = client.send(0, request) + client.poll(future=future, timeout_ms=10000) + if not future.is_done: raise RuntimeError("Couldn't fetch topic response from Broker.") - result = result[0] + elif future.failed(): + raise future.exception + result = future.value assert topic_name in [t[1] for t in result.topics] diff --git a/test/record/test_default_records.py b/test/record/test_default_records.py index c3a7b02c8..540705d50 100644 --- a/test/record/test_default_records.py +++ b/test/record/test_default_records.py @@ -1,13 +1,18 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals import pytest -from mock import patch +try: + from unittest.mock import patch +except ImportError: + from mock import patch import kafka.codec from kafka.record.default_records import ( DefaultRecordBatch, DefaultRecordBatchBuilder ) from 
kafka.errors import UnsupportedCodecError +from test.testutil import maybe_skip_unsupported_compression + @pytest.mark.parametrize("compression_type", [ DefaultRecordBatch.CODEC_NONE, @@ -16,6 +21,7 @@ DefaultRecordBatch.CODEC_LZ4 ]) def test_read_write_serde_v2(compression_type): + maybe_skip_unsupported_compression(compression_type) builder = DefaultRecordBatchBuilder( magic=2, compression_type=compression_type, is_transactional=1, producer_id=123456, producer_epoch=123, base_sequence=9999, @@ -51,8 +57,8 @@ def test_written_bytes_equals_size_in_bytes_v2(): producer_id=-1, producer_epoch=-1, base_sequence=-1, batch_size=999999) - size_in_bytes = builder.size_in_bytes( - 0, timestamp=9999999, key=key, value=value, headers=headers) + size_in_bytes = DefaultRecordBatchBuilder.size_in_bytes( + offset_delta=0, timestamp_delta=0, key=key, value=value, headers=headers) pos = builder.size() meta = builder.append( @@ -183,6 +189,8 @@ def test_default_batch_size_limit(): ]) @pytest.mark.parametrize("magic", [0, 1]) def test_unavailable_codec(magic, compression_type, name, checker_name): + if not getattr(kafka.codec, checker_name)(): + pytest.skip('%s compression_type not installed' % (compression_type,)) builder = DefaultRecordBatchBuilder( magic=2, compression_type=compression_type, is_transactional=0, producer_id=-1, producer_epoch=-1, base_sequence=-1, diff --git a/test/record/test_legacy_records.py b/test/record/test_legacy_records.py index 43970f7c9..c692d35a1 100644 --- a/test/record/test_legacy_records.py +++ b/test/record/test_legacy_records.py @@ -1,12 +1,17 @@ from __future__ import unicode_literals import pytest -from mock import patch +try: + from unittest.mock import patch +except ImportError: + from mock import patch from kafka.record.legacy_records import ( LegacyRecordBatch, LegacyRecordBatchBuilder ) import kafka.codec from kafka.errors import UnsupportedCodecError +from test.testutil import maybe_skip_unsupported_compression + @pytest.mark.parametrize("magic", [0, 1]) def test_read_write_serde_v0_v1_no_compression(magic): @@ -36,6 +41,7 @@ def test_read_write_serde_v0_v1_no_compression(magic): ]) @pytest.mark.parametrize("magic", [0, 1]) def test_read_write_serde_v0_v1_with_compression(compression_type, magic): + maybe_skip_unsupported_compression(compression_type) builder = LegacyRecordBatchBuilder( magic=magic, compression_type=compression_type, batch_size=9999999) for offset in range(10): @@ -176,6 +182,7 @@ def test_legacy_batch_size_limit(magic): ]) @pytest.mark.parametrize("magic", [0, 1]) def test_unavailable_codec(magic, compression_type, name, checker_name): + maybe_skip_unsupported_compression(compression_type) builder = LegacyRecordBatchBuilder( magic=magic, compression_type=compression_type, batch_size=1024) builder.append(0, timestamp=None, key=None, value=b"M") diff --git a/test/record/test_records.py b/test/record/test_records.py index 9f72234ae..65010d88f 100644 --- a/test/record/test_records.py +++ b/test/record/test_records.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals import pytest from kafka.record import MemoryRecords, MemoryRecordsBuilder -from kafka.errors import CorruptRecordException +from kafka.errors import CorruptRecordError + +from test.testutil import maybe_skip_unsupported_compression # This is real live data from Kafka 11 broker record_batch_data_v2 = [ @@ -60,6 +62,15 @@ b'\x00\xff\xff\xff\xff\x00\x00\x00\x03123' ] +# Single record control batch (abort) +control_batch_data_v2 = [ + 
b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x00\x00\x00\x00' + b'\x02e\x97\xff\xd0\x00\x20\x00\x00\x00\x00\x00\x00\x00\x00\x00' + b'\x98\x96\x7f\x00\x00\x00\x00\x00\x98\x96' + b'\x7f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff' + b'\x00\x00\x00\x01@\x00\x00\x00\x08\x00\x00\x00\x00,opaque-control-message\x00' +] + def test_memory_records_v2(): data_bytes = b"".join(record_batch_data_v2) + b"\x00" * 4 @@ -163,13 +174,14 @@ def test_memory_records_corrupt(): b"\x00\x00\x00\x03" # Length=3 b"\xfe\xb0\x1d", # Some random bytes ) - with pytest.raises(CorruptRecordException): + with pytest.raises(CorruptRecordError): records.next_batch() @pytest.mark.parametrize("compression_type", [0, 1, 2, 3]) @pytest.mark.parametrize("magic", [0, 1, 2]) def test_memory_records_builder(magic, compression_type): + maybe_skip_unsupported_compression(compression_type) builder = MemoryRecordsBuilder( magic=magic, compression_type=compression_type, batch_size=1024 * 10) base_size = builder.size_in_bytes() # V2 has a header before @@ -198,7 +210,7 @@ def test_memory_records_builder(magic, compression_type): # Size should remain the same after closing. No trailing bytes builder.close() assert builder.compression_rate() > 0 - expected_size = size_before_close * builder.compression_rate() + expected_size = int(size_before_close * builder.compression_rate()) assert builder.is_full() assert builder.size_in_bytes() == expected_size buffer = builder.buffer() @@ -230,3 +242,18 @@ def test_memory_records_builder_full(magic, compression_type): key=None, timestamp=None, value=b"M") assert metadata is None assert builder.next_offset() == 1 + + +def test_control_record_v2(): + data_bytes = b"".join(control_batch_data_v2) + records = MemoryRecords(data_bytes) + + assert records.has_next() is True + batch = records.next_batch() + assert batch.is_control_batch is True + recs = list(batch) + assert len(recs) == 1 + assert recs[0].version == 0 + assert recs[0].type == 0 + assert recs[0].abort is True + assert recs[0].commit is False diff --git a/test/sasl/test_msk.py b/test/sasl/test_msk.py new file mode 100644 index 000000000..e9f1325f3 --- /dev/null +++ b/test/sasl/test_msk.py @@ -0,0 +1,71 @@ +import datetime +import json +import sys + +from kafka.sasl.msk import AwsMskIamClient + +try: + from unittest import mock +except ImportError: + import mock + + +def client_factory(token=None): + if sys.version_info >= (3, 3): + now = datetime.datetime.fromtimestamp(1629321911, datetime.timezone.utc) + else: + now = datetime.datetime.utcfromtimestamp(1629321911) + with mock.patch('kafka.sasl.msk.datetime') as mock_dt: + mock_dt.datetime.utcnow = mock.Mock(return_value=now) + return AwsMskIamClient( + host='localhost', + access_key='XXXXXXXXXXXXXXXXXXXX', + secret_key='XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX', + region='us-east-1', + token=token, + ) + + +def test_aws_msk_iam_client_permanent_credentials(): + client = client_factory(token=None) + msg = client.first_message() + assert msg + assert isinstance(msg, bytes) + actual = json.loads(msg) + + expected = { + 'version': '2020_10_22', + 'host': 'localhost', + 'user-agent': 'kafka-python', + 'action': 'kafka-cluster:Connect', + 'x-amz-algorithm': 'AWS4-HMAC-SHA256', + 'x-amz-credential': 'XXXXXXXXXXXXXXXXXXXX/20210818/us-east-1/kafka-cluster/aws4_request', + 'x-amz-date': '20210818T212511Z', + 'x-amz-signedheaders': 'host', + 'x-amz-expires': '900', + 'x-amz-signature': '0fa42ae3d5693777942a7a4028b564f0b372bafa2f71c1a19ad60680e6cb994b', + } + assert actual == expected + + 
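The signed payloads asserted in this test and the one below are deterministic only because `client_factory()` freezes the clock at the epoch timestamp 1629321911. As a minimal standalone sketch (not part of this patch), that frozen instant is exactly the `x-amz-date` value both tests expect, rendered in the basic ISO 8601 form used for SigV4 signing:

```python
import datetime

# 1629321911 is the instant pinned by client_factory() above; formatting it
# the way SigV4 dates are formatted reproduces the asserted 'x-amz-date'.
frozen = datetime.datetime.fromtimestamp(1629321911, datetime.timezone.utc)
assert frozen.strftime('%Y%m%dT%H%M%SZ') == '20210818T212511Z'
```

The temporary-credentials variant that follows differs only in the extra `x-amz-security-token` field (and, consequently, a different `x-amz-signature`).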
+def test_aws_msk_iam_client_temporary_credentials(): + client = client_factory(token='XXXXX') + msg = client.first_message() + assert msg + assert isinstance(msg, bytes) + actual = json.loads(msg) + + expected = { + 'version': '2020_10_22', + 'host': 'localhost', + 'user-agent': 'kafka-python', + 'action': 'kafka-cluster:Connect', + 'x-amz-algorithm': 'AWS4-HMAC-SHA256', + 'x-amz-credential': 'XXXXXXXXXXXXXXXXXXXX/20210818/us-east-1/kafka-cluster/aws4_request', + 'x-amz-date': '20210818T212511Z', + 'x-amz-signedheaders': 'host', + 'x-amz-expires': '900', + 'x-amz-signature': 'b0619c50b7ecb4a7f6f92bd5f733770df5710e97b25146f97015c0b1db783b05', + 'x-amz-security-token': 'XXXXX', + } + assert actual == expected diff --git a/test/service.py b/test/service.py index 045d780e7..a53fab8da 100644 --- a/test/service.py +++ b/test/service.py @@ -29,6 +29,11 @@ def open(self): def close(self): pass + def dump_logs(self): + pass + + def wait_for(self, pattern, timeout=30): + pass class SpawnedService(threading.Thread): def __init__(self, args=None, env=None): @@ -52,14 +57,14 @@ def __init__(self, args=None, env=None): log.debug(" {key}={value}".format(key=key, value=value)) def _spawn(self): - if self.alive: return - if self.child and self.child.poll() is None: return + if self.alive or (self.child and self.child.poll() is None): + return self.child = subprocess.Popen( self.args, preexec_fn=os.setsid, # to avoid propagating signals env=self.env, - bufsize=1, + bufsize=0, stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.alive = self.child.poll() is None @@ -76,6 +81,7 @@ def _despawn(self): else: self.child.kill() + # via threading.Thread def run(self): self._spawn() while True: @@ -113,7 +119,8 @@ def wait_for(self, pattern, timeout=30): start = time.time() while True: if not self.is_alive(): - raise RuntimeError("Child thread died already.") + log.error("Child thread died already.") + return False elapsed = time.time() - start if elapsed >= timeout: diff --git a/test/test_admin.py b/test/test_admin.py index 279f85abf..cdb74242e 100644 --- a/test/test_admin.py +++ b/test/test_admin.py @@ -6,7 +6,7 @@ def test_config_resource(): with pytest.raises(KeyError): - bad_resource = kafka.admin.ConfigResource('something', 'foo') + _bad_resource = kafka.admin.ConfigResource('something', 'foo') good_resource = kafka.admin.ConfigResource('broker', 'bar') assert good_resource.resource_type == kafka.admin.ConfigResourceType.BROKER assert good_resource.name == 'bar' @@ -59,11 +59,11 @@ def test_acl_resource(): def test_new_topic(): with pytest.raises(IllegalArgumentError): - bad_topic = kafka.admin.NewTopic('foo', -1, -1) + _bad_topic = kafka.admin.NewTopic('foo', -1, -1) with pytest.raises(IllegalArgumentError): - bad_topic = kafka.admin.NewTopic('foo', 1, -1) + _bad_topic = kafka.admin.NewTopic('foo', 1, -1) with pytest.raises(IllegalArgumentError): - bad_topic = kafka.admin.NewTopic('foo', 1, 1, {1: [1, 1, 1]}) + _bad_topic = kafka.admin.NewTopic('foo', 1, 1, {1: [1, 1, 1]}) good_topic = kafka.admin.NewTopic('foo', 1, 2) assert good_topic.name == 'foo' assert good_topic.num_partitions == 1 diff --git a/test/test_assignors.py b/test/test_assignors.py index 67e91e131..858ef426d 100644 --- a/test/test_assignors.py +++ b/test/test_assignors.py @@ -655,7 +655,7 @@ def test_conflicting_previous_assignments(mocker): 'execution_number,n_topics,n_consumers', [(i, randint(10, 20), randint(20, 40)) for i in range(100)] ) def test_reassignment_with_random_subscriptions_and_changes(mocker, execution_number, n_topics, 
n_consumers): - all_topics = set(['t{}'.format(i) for i in range(1, n_topics + 1)]) + all_topics = sorted(['t{}'.format(i) for i in range(1, n_topics + 1)]) partitions = dict([(t, set(range(1, i + 1))) for i, t in enumerate(all_topics)]) cluster = create_cluster(mocker, topics=all_topics, topic_partitions_lambda=lambda t: partitions[t]) diff --git a/test/test_client_async.py b/test/test_client_async.py index 74da66a36..acc400f9c 100644 --- a/test/test_client_async.py +++ b/test/test_client_async.py @@ -23,16 +23,33 @@ @pytest.fixture -def cli(mocker, conn): +def client_poll_mocked(mocker): + cli = KafkaClient(request_timeout_ms=9999999, + reconnect_backoff_ms=2222, + connections_max_idle_ms=float('inf'), + api_version=(0, 9)) + mocker.patch.object(cli, '_poll') + ttl = mocker.patch.object(cli.cluster, 'ttl') + ttl.return_value = 0 + try: + yield cli + finally: + cli._close() + + +@pytest.fixture +def client_selector_mocked(mocker, conn): client = KafkaClient(api_version=(0, 9)) mocker.patch.object(client, '_selector') client.poll(future=client.cluster.request_update()) - return client - + try: + yield client + finally: + client._close() def test_bootstrap(mocker, conn): conn.state = ConnectionStates.CONNECTED - cli = KafkaClient(api_version=(0, 9)) + cli = KafkaClient(api_version=(2, 1)) mocker.patch.object(cli, '_selector') future = cli.cluster.request_update() cli.poll(future=future) @@ -43,216 +60,206 @@ def test_bootstrap(mocker, conn): kwargs.pop('state_change_callback') kwargs.pop('node_id') assert kwargs == cli.config - conn.send.assert_called_once_with(MetadataRequest[0]([]), blocking=False) + conn.send.assert_called_once_with(MetadataRequest[7]([], True), blocking=False, request_timeout_ms=None) assert cli._bootstrap_fails == 0 assert cli.cluster.brokers() == set([BrokerMetadata(0, 'foo', 12, None), BrokerMetadata(1, 'bar', 34, None)]) -def test_can_connect(cli, conn): +def test_can_connect(client_selector_mocked, conn): # Node is not in broker metadata - can't connect - assert not cli._can_connect(2) + assert not client_selector_mocked._can_connect(2) # Node is in broker metadata but not in _conns - assert 0 not in cli._conns - assert cli._can_connect(0) + assert 0 not in client_selector_mocked._conns + assert client_selector_mocked._can_connect(0) # Node is connected, can't reconnect - assert cli._maybe_connect(0) is True - assert not cli._can_connect(0) + assert client_selector_mocked._init_connect(0) is True + assert not client_selector_mocked._can_connect(0) # Node is disconnected, can connect - cli._conns[0].state = ConnectionStates.DISCONNECTED - assert cli._can_connect(0) + client_selector_mocked._conns[0].state = ConnectionStates.DISCONNECTED + assert client_selector_mocked._can_connect(0) # Node is disconnected, but blacked out conn.blacked_out.return_value = True - assert not cli._can_connect(0) + assert not client_selector_mocked._can_connect(0) -def test_maybe_connect(cli, conn): - try: - # Node not in metadata, raises AssertionError - cli._maybe_connect(2) - except AssertionError: - pass - else: - assert False, 'Exception not raised' +def test_init_connect(client_selector_mocked, conn): + # Node not in metadata, return False + assert not client_selector_mocked._init_connect(2) # New node_id creates a conn object - assert 0 not in cli._conns + assert 0 not in client_selector_mocked._conns conn.state = ConnectionStates.DISCONNECTED conn.connect.side_effect = lambda: conn._set_conn_state(ConnectionStates.CONNECTING) - assert cli._maybe_connect(0) is False - assert 
cli._conns[0] is conn + assert client_selector_mocked._init_connect(0) is True + assert client_selector_mocked._conns[0] is conn -def test_conn_state_change(mocker, cli, conn): - sel = cli._selector +def test_conn_state_change(client_selector_mocked, conn): + sel = client_selector_mocked._selector node_id = 0 - cli._conns[node_id] = conn + client_selector_mocked._conns[node_id] = conn conn.state = ConnectionStates.CONNECTING sock = conn._sock - cli._conn_state_change(node_id, sock, conn) - assert node_id in cli._connecting + client_selector_mocked._conn_state_change(node_id, sock, conn) + assert node_id in client_selector_mocked._connecting sel.register.assert_called_with(sock, selectors.EVENT_WRITE, conn) conn.state = ConnectionStates.CONNECTED - cli._conn_state_change(node_id, sock, conn) - assert node_id not in cli._connecting + client_selector_mocked._conn_state_change(node_id, sock, conn) + assert node_id not in client_selector_mocked._connecting sel.modify.assert_called_with(sock, selectors.EVENT_READ, conn) # Failure to connect should trigger metadata update - assert cli.cluster._need_update is False + assert client_selector_mocked.cluster._need_update is False conn.state = ConnectionStates.DISCONNECTED - cli._conn_state_change(node_id, sock, conn) - assert node_id not in cli._connecting - assert cli.cluster._need_update is True + client_selector_mocked._conn_state_change(node_id, sock, conn) + assert node_id not in client_selector_mocked._connecting + assert client_selector_mocked.cluster._need_update is True sel.unregister.assert_called_with(sock) conn.state = ConnectionStates.CONNECTING - cli._conn_state_change(node_id, sock, conn) - assert node_id in cli._connecting + client_selector_mocked._conn_state_change(node_id, sock, conn) + assert node_id in client_selector_mocked._connecting conn.state = ConnectionStates.DISCONNECTED - cli._conn_state_change(node_id, sock, conn) - assert node_id not in cli._connecting + client_selector_mocked._conn_state_change(node_id, sock, conn) + assert node_id not in client_selector_mocked._connecting -def test_ready(mocker, cli, conn): - maybe_connect = mocker.patch.object(cli, 'maybe_connect') +def test_ready(mocker, client_selector_mocked, conn): + maybe_connect = mocker.patch.object(client_selector_mocked, 'maybe_connect') node_id = 1 - cli.ready(node_id) + client_selector_mocked.ready(node_id) maybe_connect.assert_called_with(node_id) -def test_is_ready(mocker, cli, conn): - cli._maybe_connect(0) - cli._maybe_connect(1) +def test_is_ready(client_selector_mocked, conn): + client_selector_mocked._init_connect(0) + client_selector_mocked._init_connect(1) # metadata refresh blocks ready nodes - assert cli.is_ready(0) - assert cli.is_ready(1) - cli._metadata_refresh_in_progress = True - assert not cli.is_ready(0) - assert not cli.is_ready(1) + assert client_selector_mocked.is_ready(0) + assert client_selector_mocked.is_ready(1) + client_selector_mocked._metadata_refresh_in_progress = True + assert not client_selector_mocked.is_ready(0) + assert not client_selector_mocked.is_ready(1) # requesting metadata update also blocks ready nodes - cli._metadata_refresh_in_progress = False - assert cli.is_ready(0) - assert cli.is_ready(1) - cli.cluster.request_update() - cli.cluster.config['retry_backoff_ms'] = 0 - assert not cli._metadata_refresh_in_progress - assert not cli.is_ready(0) - assert not cli.is_ready(1) - cli.cluster._need_update = False + client_selector_mocked._metadata_refresh_in_progress = False + assert client_selector_mocked.is_ready(0) + 
assert client_selector_mocked.is_ready(1) + client_selector_mocked.cluster.request_update() + client_selector_mocked.cluster.config['retry_backoff_ms'] = 0 + assert not client_selector_mocked._metadata_refresh_in_progress + assert not client_selector_mocked.is_ready(0) + assert not client_selector_mocked.is_ready(1) + client_selector_mocked.cluster._need_update = False # if connection can't send more, not ready - assert cli.is_ready(0) + assert client_selector_mocked.is_ready(0) conn.can_send_more.return_value = False - assert not cli.is_ready(0) + assert not client_selector_mocked.is_ready(0) conn.can_send_more.return_value = True # disconnected nodes, not ready - assert cli.is_ready(0) + assert client_selector_mocked.is_ready(0) conn.state = ConnectionStates.DISCONNECTED - assert not cli.is_ready(0) - + assert not client_selector_mocked.is_ready(0) -def test_close(mocker, cli, conn): - mocker.patch.object(cli, '_selector') +def test_close(client_selector_mocked, conn): call_count = conn.close.call_count # Unknown node - silent - cli.close(2) + client_selector_mocked.close(2) call_count += 0 assert conn.close.call_count == call_count # Single node close - cli._maybe_connect(0) + client_selector_mocked._init_connect(0) assert conn.close.call_count == call_count - cli.close(0) + client_selector_mocked.close(0) call_count += 1 assert conn.close.call_count == call_count # All node close - cli._maybe_connect(1) - cli.close() + client_selector_mocked._init_connect(1) + client_selector_mocked.close() # +2 close: node 1, node bootstrap (node 0 already closed) call_count += 2 assert conn.close.call_count == call_count -def test_is_disconnected(cli, conn): +def test_is_disconnected(client_selector_mocked, conn): # False if not connected yet conn.state = ConnectionStates.DISCONNECTED - assert not cli.is_disconnected(0) + assert not client_selector_mocked.is_disconnected(0) - cli._maybe_connect(0) - assert cli.is_disconnected(0) + client_selector_mocked._init_connect(0) + assert client_selector_mocked.is_disconnected(0) conn.state = ConnectionStates.CONNECTING - assert not cli.is_disconnected(0) + assert not client_selector_mocked.is_disconnected(0) conn.state = ConnectionStates.CONNECTED - assert not cli.is_disconnected(0) + assert not client_selector_mocked.is_disconnected(0) -def test_send(cli, conn): +def test_send(client_selector_mocked, conn): # Send to unknown node => raises AssertionError try: - cli.send(2, None) + client_selector_mocked.send(2, None) assert False, 'Exception not raised' except AssertionError: pass # Send to disconnected node => NodeNotReady conn.state = ConnectionStates.DISCONNECTED - f = cli.send(0, None) + f = client_selector_mocked.send(0, None) assert f.failed() assert isinstance(f.exception, Errors.NodeNotReadyError) conn.state = ConnectionStates.CONNECTED - cli._maybe_connect(0) + client_selector_mocked._init_connect(0) # ProduceRequest w/ 0 required_acks -> no response request = ProduceRequest[0](0, 0, []) assert request.expect_response() is False - ret = cli.send(0, request) - assert conn.send.called_with(request) + ret = client_selector_mocked.send(0, request) + conn.send.assert_called_with(request, blocking=False, request_timeout_ms=None) assert isinstance(ret, Future) request = MetadataRequest[0]([]) - cli.send(0, request) - assert conn.send.called_with(request) + client_selector_mocked.send(0, request) + conn.send.assert_called_with(request, blocking=False, request_timeout_ms=None) -def test_poll(mocker): - metadata = mocker.patch.object(KafkaClient, 
'_maybe_refresh_metadata') - _poll = mocker.patch.object(KafkaClient, '_poll') - ifrs = mocker.patch.object(KafkaClient, 'in_flight_request_count') - ifrs.return_value = 1 - cli = KafkaClient(api_version=(0, 9)) +def test_poll(mocker, client_poll_mocked): + metadata = mocker.patch.object(client_poll_mocked, '_maybe_refresh_metadata') + ifr_request_timeout = mocker.patch.object(client_poll_mocked, '_next_ifr_request_timeout_ms') + now = time.time() + t = mocker.patch('time.time') + t.return_value = now # metadata timeout wins + ifr_request_timeout.return_value = float('inf') metadata.return_value = 1000 - cli.poll() - _poll.assert_called_with(1.0) + client_poll_mocked.poll() + client_poll_mocked._poll.assert_called_with(1.0) # user timeout wins - cli.poll(250) - _poll.assert_called_with(0.25) + client_poll_mocked.poll(timeout_ms=250) + client_poll_mocked._poll.assert_called_with(0.25) - # default is request_timeout_ms + # ifr request timeout wins + ifr_request_timeout.return_value = 30000 metadata.return_value = 1000000 - cli.poll() - _poll.assert_called_with(cli.config['request_timeout_ms'] / 1000.0) - - # If no in-flight-requests, drop timeout to retry_backoff_ms - ifrs.return_value = 0 - cli.poll() - _poll.assert_called_with(cli.config['retry_backoff_ms'] / 1000.0) + client_poll_mocked.poll() + client_poll_mocked._poll.assert_called_with(30.0) def test__poll(): @@ -270,7 +277,7 @@ def test_least_loaded_node(): def test_set_topics(mocker): request_update = mocker.patch.object(ClusterMetadata, 'request_update') request_update.side_effect = lambda: Future() - cli = KafkaClient(api_version=(0, 10)) + cli = KafkaClient(api_version=(0, 10, 0)) # replace 'empty' with 'non empty' request_update.reset_mock() @@ -293,83 +300,66 @@ def test_set_topics(mocker): request_update.assert_not_called() -@pytest.fixture -def client(mocker): - _poll = mocker.patch.object(KafkaClient, '_poll') +def test_maybe_refresh_metadata_ttl(client_poll_mocked): + client_poll_mocked.cluster.ttl.return_value = 1234 - cli = KafkaClient(request_timeout_ms=9999999, - reconnect_backoff_ms=2222, - connections_max_idle_ms=float('inf'), - api_version=(0, 9)) - - ttl = mocker.patch.object(cli.cluster, 'ttl') - ttl.return_value = 0 - return cli + client_poll_mocked.poll(timeout_ms=12345678) + client_poll_mocked._poll.assert_called_with(1.234) -def test_maybe_refresh_metadata_ttl(mocker, client): - client.cluster.ttl.return_value = 1234 - mocker.patch.object(KafkaClient, 'in_flight_request_count', return_value=1) - - client.poll(timeout_ms=12345678) - client._poll.assert_called_with(1.234) - - -def test_maybe_refresh_metadata_backoff(mocker, client): - mocker.patch.object(KafkaClient, 'in_flight_request_count', return_value=1) +def test_maybe_refresh_metadata_backoff(mocker, client_poll_mocked): + mocker.patch.object(client_poll_mocked, 'least_loaded_node', return_value=None) + mocker.patch.object(client_poll_mocked, 'least_loaded_node_refresh_ms', return_value=4321) now = time.time() t = mocker.patch('time.time') t.return_value = now - client.poll(timeout_ms=12345678) - client._poll.assert_called_with(2.222) # reconnect backoff + client_poll_mocked.poll(timeout_ms=12345678) + client_poll_mocked._poll.assert_called_with(4.321) -def test_maybe_refresh_metadata_in_progress(mocker, client): - client._metadata_refresh_in_progress = True - mocker.patch.object(KafkaClient, 'in_flight_request_count', return_value=1) +def test_maybe_refresh_metadata_in_progress(client_poll_mocked): + client_poll_mocked._metadata_refresh_in_progress = True 
- client.poll(timeout_ms=12345678) - client._poll.assert_called_with(9999.999) # request_timeout_ms + client_poll_mocked.poll(timeout_ms=12345678) + client_poll_mocked._poll.assert_called_with(9999.999) # request_timeout_ms -def test_maybe_refresh_metadata_update(mocker, client): - mocker.patch.object(client, 'least_loaded_node', return_value='foobar') - mocker.patch.object(client, '_can_send_request', return_value=True) - mocker.patch.object(KafkaClient, 'in_flight_request_count', return_value=1) - send = mocker.patch.object(client, 'send') +def test_maybe_refresh_metadata_update(mocker, client_poll_mocked): + mocker.patch.object(client_poll_mocked, 'least_loaded_node', return_value='foobar') + mocker.patch.object(client_poll_mocked, '_can_send_request', return_value=True) + send = mocker.patch.object(client_poll_mocked, 'send') + client_poll_mocked.cluster.need_all_topic_metadata = True - client.poll(timeout_ms=12345678) - client._poll.assert_called_with(9999.999) # request_timeout_ms - assert client._metadata_refresh_in_progress + client_poll_mocked.poll(timeout_ms=12345678) + client_poll_mocked._poll.assert_called_with(9999.999) # request_timeout_ms + assert client_poll_mocked._metadata_refresh_in_progress request = MetadataRequest[0]([]) send.assert_called_once_with('foobar', request, wakeup=False) -def test_maybe_refresh_metadata_cant_send(mocker, client): - mocker.patch.object(client, 'least_loaded_node', return_value='foobar') - mocker.patch.object(client, '_can_connect', return_value=True) - mocker.patch.object(client, '_maybe_connect', return_value=True) - mocker.patch.object(client, 'maybe_connect', return_value=True) - mocker.patch.object(KafkaClient, 'in_flight_request_count', return_value=1) +def test_maybe_refresh_metadata_cant_send(mocker, client_poll_mocked): + mocker.patch.object(client_poll_mocked, 'least_loaded_node', return_value='foobar') + mocker.patch.object(client_poll_mocked, '_can_send_request', return_value=False) + mocker.patch.object(client_poll_mocked, '_can_connect', return_value=True) + mocker.patch.object(client_poll_mocked, '_init_connect', return_value=True) now = time.time() t = mocker.patch('time.time') t.return_value = now # first poll attempts connection - client.poll(timeout_ms=12345678) - client._poll.assert_called_with(2.222) # reconnect backoff - client.maybe_connect.assert_called_once_with('foobar', wakeup=False) + client_poll_mocked.poll() + client_poll_mocked._poll.assert_called() + client_poll_mocked._init_connect.assert_called_once_with('foobar') # poll while connecting should not attempt a new connection - client._connecting.add('foobar') - client._can_connect.reset_mock() - client.poll(timeout_ms=12345678) - client._poll.assert_called_with(2.222) # connection timeout (reconnect timeout) - assert not client._can_connect.called - - assert not client._metadata_refresh_in_progress + client_poll_mocked._connecting.add('foobar') + client_poll_mocked._can_connect.reset_mock() + client_poll_mocked.poll() + client_poll_mocked._poll.assert_called() + assert not client_poll_mocked._can_connect.called + assert not client_poll_mocked._metadata_refresh_in_progress def test_schedule(): diff --git a/test/test_cluster.py b/test/test_cluster.py index f010c4f71..c57bd8f9f 100644 --- a/test/test_cluster.py +++ b/test/test_cluster.py @@ -1,9 +1,9 @@ # pylint: skip-file from __future__ import absolute_import -import pytest +import socket -from kafka.cluster import ClusterMetadata +from kafka.cluster import ClusterMetadata, collect_hosts from 
kafka.protocol.metadata import MetadataResponse @@ -20,3 +20,174 @@ def test_empty_broker_list(): [], # empty brokers [(17, 'foo', []), (17, 'bar', [])])) # topics w/ error assert len(cluster.brokers()) == 2 + + +def test_metadata_v0(): + cluster = ClusterMetadata() + cluster.update_metadata(MetadataResponse[0]( + [(0, 'foo', 12), (1, 'bar', 34)], + [(0, 'topic-1', [(0, 0, 0, [0], [0])])])) + assert len(cluster.topics()) == 1 + assert cluster.controller is None + assert cluster.cluster_id is None + assert cluster._partitions['topic-1'][0].offline_replicas == [] + assert cluster._partitions['topic-1'][0].leader_epoch == -1 + + +def test_metadata_v1(): + cluster = ClusterMetadata() + cluster.update_metadata(MetadataResponse[1]( + [(0, 'foo', 12, 'rack-1'), (1, 'bar', 34, 'rack-2')], + 0, # controller_id + [(0, 'topic-1', False, [(0, 0, 0, [0], [0])])])) + assert len(cluster.topics()) == 1 + assert cluster.controller == cluster.broker_metadata(0) + assert cluster.cluster_id is None + assert cluster._partitions['topic-1'][0].offline_replicas == [] + assert cluster._partitions['topic-1'][0].leader_epoch == -1 + + +def test_metadata_v2(): + cluster = ClusterMetadata() + cluster.update_metadata(MetadataResponse[2]( + [(0, 'foo', 12, 'rack-1'), (1, 'bar', 34, 'rack-2')], + 'cluster-foo', # cluster_id + 0, # controller_id + [(0, 'topic-1', False, [(0, 0, 0, [0], [0])])])) + assert len(cluster.topics()) == 1 + assert cluster.controller == cluster.broker_metadata(0) + assert cluster.cluster_id == 'cluster-foo' + assert cluster._partitions['topic-1'][0].offline_replicas == [] + assert cluster._partitions['topic-1'][0].leader_epoch == -1 + + +def test_metadata_v3(): + cluster = ClusterMetadata() + cluster.update_metadata(MetadataResponse[3]( + 0, # throttle_time_ms + [(0, 'foo', 12, 'rack-1'), (1, 'bar', 34, 'rack-2')], + 'cluster-foo', # cluster_id + 0, # controller_id + [(0, 'topic-1', False, [(0, 0, 0, [0], [0])])])) + assert len(cluster.topics()) == 1 + assert cluster.controller == cluster.broker_metadata(0) + assert cluster.cluster_id == 'cluster-foo' + assert cluster._partitions['topic-1'][0].offline_replicas == [] + assert cluster._partitions['topic-1'][0].leader_epoch == -1 + + +def test_metadata_v4(): + cluster = ClusterMetadata() + cluster.update_metadata(MetadataResponse[4]( + 0, # throttle_time_ms + [(0, 'foo', 12, 'rack-1'), (1, 'bar', 34, 'rack-2')], + 'cluster-foo', # cluster_id + 0, # controller_id + [(0, 'topic-1', False, [(0, 0, 0, [0], [0])])])) + assert len(cluster.topics()) == 1 + assert cluster.controller == cluster.broker_metadata(0) + assert cluster.cluster_id == 'cluster-foo' + assert cluster._partitions['topic-1'][0].offline_replicas == [] + assert cluster._partitions['topic-1'][0].leader_epoch == -1 + + +def test_metadata_v5(): + cluster = ClusterMetadata() + cluster.update_metadata(MetadataResponse[5]( + 0, # throttle_time_ms + [(0, 'foo', 12, 'rack-1'), (1, 'bar', 34, 'rack-2')], + 'cluster-foo', # cluster_id + 0, # controller_id + [(0, 'topic-1', False, [(0, 0, 0, [0], [0], [12])])])) + assert len(cluster.topics()) == 1 + assert cluster.controller == cluster.broker_metadata(0) + assert cluster.cluster_id == 'cluster-foo' + assert cluster._partitions['topic-1'][0].offline_replicas == [12] + assert cluster._partitions['topic-1'][0].leader_epoch == -1 + + +def test_metadata_v6(): + cluster = ClusterMetadata() + cluster.update_metadata(MetadataResponse[6]( + 0, # throttle_time_ms + [(0, 'foo', 12, 'rack-1'), (1, 'bar', 34, 'rack-2')], + 'cluster-foo', # cluster_id + 0, # 
controller_id + [(0, 'topic-1', False, [(0, 0, 0, [0], [0], [12])])])) + assert len(cluster.topics()) == 1 + assert cluster.controller == cluster.broker_metadata(0) + assert cluster.cluster_id == 'cluster-foo' + assert cluster._partitions['topic-1'][0].offline_replicas == [12] + assert cluster._partitions['topic-1'][0].leader_epoch == -1 + + +def test_metadata_v7(): + cluster = ClusterMetadata() + cluster.update_metadata(MetadataResponse[7]( + 0, # throttle_time_ms + [(0, 'foo', 12, 'rack-1'), (1, 'bar', 34, 'rack-2')], + 'cluster-foo', # cluster_id + 0, # controller_id + [(0, 'topic-1', False, [(0, 0, 0, 0, [0], [0], [12])])])) + assert len(cluster.topics()) == 1 + assert cluster.controller == cluster.broker_metadata(0) + assert cluster.cluster_id == 'cluster-foo' + assert cluster._partitions['topic-1'][0].offline_replicas == [12] + assert cluster._partitions['topic-1'][0].leader_epoch == 0 + + +def test_collect_hosts__happy_path(): + hosts = "127.0.0.1:1234,127.0.0.1" + results = collect_hosts(hosts) + assert set(results) == set([ + ('127.0.0.1', 1234, socket.AF_INET), + ('127.0.0.1', 9092, socket.AF_INET), + ]) + + +def test_collect_hosts__ipv6(): + hosts = "[localhost]:1234,[2001:1000:2000::1],[2001:1000:2000::1]:1234" + results = collect_hosts(hosts) + assert set(results) == set([ + ('localhost', 1234, socket.AF_INET6), + ('2001:1000:2000::1', 9092, socket.AF_INET6), + ('2001:1000:2000::1', 1234, socket.AF_INET6), + ]) + + +def test_collect_hosts__string_list(): + hosts = [ + 'localhost:1234', + 'localhost', + '[localhost]', + '2001::1', + '[2001::1]', + '[2001::1]:1234', + ] + results = collect_hosts(hosts) + assert set(results) == set([ + ('localhost', 1234, socket.AF_UNSPEC), + ('localhost', 9092, socket.AF_UNSPEC), + ('localhost', 9092, socket.AF_INET6), + ('2001::1', 9092, socket.AF_INET6), + ('2001::1', 9092, socket.AF_INET6), + ('2001::1', 1234, socket.AF_INET6), + ]) + + +def test_collect_hosts__with_spaces(): + hosts = "localhost:1234, localhost" + results = collect_hosts(hosts) + assert set(results) == set([ + ('localhost', 1234, socket.AF_UNSPEC), + ('localhost', 9092, socket.AF_UNSPEC), + ]) + + +def test_collect_hosts__protocol(): + hosts = "SASL_SSL://foo.bar:1234,SASL_SSL://fizz.buzz:5678" + results = collect_hosts(hosts) + assert set(results) == set([ + ('foo.bar', 1234, socket.AF_UNSPEC), + ('fizz.buzz', 5678, socket.AF_UNSPEC), + ]) diff --git a/test/test_codec.py b/test/test_codec.py index e05707451..24159c253 100644 --- a/test/test_codec.py +++ b/test/test_codec.py @@ -39,12 +39,14 @@ def test_snappy_detect_xerial(): _detect_xerial_stream = kafka1.codec._detect_xerial_stream header = b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01Some extra bytes' + redpanda_header = b'\x82SNAPPY\x00\x01\x00\x00\x00\x01\x00\x00\x00Some extra bytes' false_header = b'\x01SNAPPY\x00\x00\x00\x01\x00\x00\x00\x01' default_snappy = snappy_encode(b'foobar' * 50) random_snappy = snappy_encode(b'SNAPPY' * 50, xerial_compatible=False) short_data = b'\x01\x02\x03\x04' assert _detect_xerial_stream(header) is True + assert _detect_xerial_stream(redpanda_header) is True assert _detect_xerial_stream(b'') is False assert _detect_xerial_stream(b'\x00') is False assert _detect_xerial_stream(false_header) is False diff --git a/test/test_conn.py b/test/test_conn.py index 966f7b34d..037cd015e 100644 --- a/test/test_conn.py +++ b/test/test_conn.py @@ -4,16 +4,28 @@ from errno import EALREADY, EINPROGRESS, EISCONN, ECONNRESET import socket -import mock +try: + from unittest import mock +except 
ImportError: + import mock import pytest -from kafka.conn import BrokerConnection, ConnectionStates, collect_hosts +from kafka.conn import BrokerConnection, ConnectionStates +from kafka.future import Future from kafka.protocol.api import RequestHeader +from kafka.protocol.group import HeartbeatResponse from kafka.protocol.metadata import MetadataRequest from kafka.protocol.produce import ProduceRequest import kafka.errors as Errors +from kafka.vendor import six + +if six.PY2: + ConnectionError = socket.error + TimeoutError = socket.error + BlockingIOError = Exception + @pytest.fixture def dns_lookup(mocker): @@ -26,13 +38,16 @@ def dns_lookup(mocker): def _socket(mocker): socket = mocker.MagicMock() socket.connect_ex.return_value = 0 + socket.send.side_effect = lambda d: len(d) + socket.recv.side_effect = BlockingIOError("mocked recv") mocker.patch('socket.socket', return_value=socket) return socket @pytest.fixture -def conn(_socket, dns_lookup): +def conn(_socket, dns_lookup, mocker): conn = BrokerConnection('localhost', 9092, socket.AF_INET) + mocker.patch.object(conn, '_try_api_versions_check', return_value=True) return conn @@ -55,6 +70,38 @@ def test_connect(_socket, conn, states): assert conn.state is state +def test_api_versions_check(_socket, mocker): + conn = BrokerConnection('localhost', 9092, socket.AF_INET) + mocker.patch.object(conn, '_send', return_value=Future()) + mocker.patch.object(conn, 'recv', return_value=[]) + assert conn._api_versions_future is None + conn.connect() + assert conn._api_versions_future is not None + assert conn.connecting() is True + assert conn.state is ConnectionStates.API_VERSIONS_RECV + + assert conn._try_api_versions_check() is False + assert conn.connecting() is True + assert conn.state is ConnectionStates.API_VERSIONS_RECV + + conn._api_versions_future = None + conn._check_version_idx = 0 + assert conn._try_api_versions_check() is False + assert conn.connecting() is True + + conn._check_version_idx = len(conn.VERSION_CHECKS) + conn._api_versions_future = None + assert conn._try_api_versions_check() is False + assert conn.connecting() is False + assert conn.disconnected() is True + + +def test_api_versions_check_unrecognized(_socket): + conn = BrokerConnection('localhost', 9092, socket.AF_INET, api_version=(0, 0)) + with pytest.raises(Errors.UnrecognizedBrokerVersion): + conn.connect() + + def test_connect_timeout(_socket, conn): assert conn.state is ConnectionStates.DISCONNECTED @@ -80,15 +127,35 @@ def test_blacked_out(conn): assert conn.blacked_out() is True -def test_connection_delay(conn): +def test_connection_delay(conn, mocker): + mocker.patch.object(conn, '_reconnect_jitter_pct', return_value=1.0) with mock.patch("time.time", return_value=1000): conn.last_attempt = 1000 assert conn.connection_delay() == conn.config['reconnect_backoff_ms'] conn.state = ConnectionStates.CONNECTING - assert conn.connection_delay() == float('inf') + assert conn.connection_delay() == conn.config['reconnect_backoff_ms'] conn.state = ConnectionStates.CONNECTED assert conn.connection_delay() == float('inf') + del conn._gai[:] + conn._update_reconnect_backoff() + conn.state = ConnectionStates.DISCONNECTED + assert conn.connection_delay() == 1.0 * conn.config['reconnect_backoff_ms'] + conn.state = ConnectionStates.CONNECTING + assert conn.connection_delay() == 1.0 * conn.config['reconnect_backoff_ms'] + + conn._update_reconnect_backoff() + conn.state = ConnectionStates.DISCONNECTED + assert conn.connection_delay() == 2.0 * conn.config['reconnect_backoff_ms'] + 
conn.state = ConnectionStates.CONNECTING + assert conn.connection_delay() == 2.0 * conn.config['reconnect_backoff_ms'] + + conn._update_reconnect_backoff() + conn.state = ConnectionStates.DISCONNECTED + assert conn.connection_delay() == 4.0 * conn.config['reconnect_backoff_ms'] + conn.state = ConnectionStates.CONNECTING + assert conn.connection_delay() == 4.0 * conn.config['reconnect_backoff_ms'] + def test_connected(conn): assert conn.connected() is False @@ -196,12 +263,13 @@ def test_recv_disconnected(_socket, conn): conn.send(req) # Empty data on recv means the socket is disconnected + _socket.recv.side_effect = None _socket.recv.return_value = b'' # Attempt to receive should mark connection as disconnected - assert conn.connected() + assert conn.connected(), 'Not connected: %s' % conn.state conn.recv() - assert conn.disconnected() + assert conn.disconnected(), 'Not disconnected: %s' % conn.state def test_recv(_socket, conn): @@ -212,54 +280,6 @@ def test_close(conn): pass # TODO -def test_collect_hosts__happy_path(): - hosts = "127.0.0.1:1234,127.0.0.1" - results = collect_hosts(hosts) - assert set(results) == set([ - ('127.0.0.1', 1234, socket.AF_INET), - ('127.0.0.1', 9092, socket.AF_INET), - ]) - - -def test_collect_hosts__ipv6(): - hosts = "[localhost]:1234,[2001:1000:2000::1],[2001:1000:2000::1]:1234" - results = collect_hosts(hosts) - assert set(results) == set([ - ('localhost', 1234, socket.AF_INET6), - ('2001:1000:2000::1', 9092, socket.AF_INET6), - ('2001:1000:2000::1', 1234, socket.AF_INET6), - ]) - - -def test_collect_hosts__string_list(): - hosts = [ - 'localhost:1234', - 'localhost', - '[localhost]', - '2001::1', - '[2001::1]', - '[2001::1]:1234', - ] - results = collect_hosts(hosts) - assert set(results) == set([ - ('localhost', 1234, socket.AF_UNSPEC), - ('localhost', 9092, socket.AF_UNSPEC), - ('localhost', 9092, socket.AF_INET6), - ('2001::1', 9092, socket.AF_INET6), - ('2001::1', 9092, socket.AF_INET6), - ('2001::1', 1234, socket.AF_INET6), - ]) - - -def test_collect_hosts__with_spaces(): - hosts = "localhost:1234, localhost" - results = collect_hosts(hosts) - assert set(results) == set([ - ('localhost', 1234, socket.AF_UNSPEC), - ('localhost', 9092, socket.AF_UNSPEC), - ]) - - def test_lookup_on_connect(): hostname = 'example.org' port = 9092 @@ -327,16 +347,42 @@ def test_requests_timed_out(conn): # No in-flight requests, not timed out assert not conn.requests_timed_out() - # Single request, timestamp = now (0) - conn.in_flight_requests[0] = ('foo', 0) + # Single request, timeout_at > now (0) + conn.in_flight_requests[0] = ('foo', 0, 1) assert not conn.requests_timed_out() # Add another request w/ timestamp > request_timeout ago request_timeout = conn.config['request_timeout_ms'] expired_timestamp = 0 - request_timeout - 1 - conn.in_flight_requests[1] = ('bar', expired_timestamp) + conn.in_flight_requests[1] = ('bar', 0, expired_timestamp) assert conn.requests_timed_out() # Drop the expired request and we should be good to go again conn.in_flight_requests.pop(1) assert not conn.requests_timed_out() + + +def test_maybe_throttle(conn): + assert conn.state is ConnectionStates.DISCONNECTED + assert not conn.throttled() + + conn.state = ConnectionStates.CONNECTED + assert not conn.throttled() + + # No throttle_time_ms attribute + conn._maybe_throttle(HeartbeatResponse[0](error_code=0)) + assert not conn.throttled() + + with mock.patch("time.time", return_value=1000) as time: + # server-side throttling in v1.0 + conn.config['api_version'] = (1, 0) + 
conn._maybe_throttle(HeartbeatResponse[1](throttle_time_ms=1000, error_code=0)) + assert not conn.throttled() + + # client-side throttling in v2.0 + conn.config['api_version'] = (2, 0) + conn._maybe_throttle(HeartbeatResponse[2](throttle_time_ms=1000, error_code=0)) + assert conn.throttled() + + time.return_value = 3000 + assert not conn.throttled() diff --git a/test/test_consumer.py b/test/test_consumer.py index 436fe55c0..0d9477729 100644 --- a/test/test_consumer.py +++ b/test/test_consumer.py @@ -1,26 +1,52 @@ +from __future__ import absolute_import + import pytest -from kafka import KafkaConsumer -from kafka.errors import KafkaConfigurationError +from kafka import KafkaConsumer, TopicPartition +from kafka.errors import KafkaConfigurationError, IllegalStateError + + +def test_session_timeout_larger_than_request_timeout_raises(): + with pytest.raises(KafkaConfigurationError): + KafkaConsumer(bootstrap_servers='localhost:9092', api_version=(0, 9), group_id='foo', session_timeout_ms=50000, request_timeout_ms=40000) + + +def test_fetch_max_wait_larger_than_request_timeout_raises(): + with pytest.raises(KafkaConfigurationError): + KafkaConsumer(bootstrap_servers='localhost:9092', fetch_max_wait_ms=50000, request_timeout_ms=40000) + + +def test_request_timeout_larger_than_connections_max_idle_ms_raises(): + with pytest.raises(KafkaConfigurationError): + KafkaConsumer(bootstrap_servers='localhost:9092', api_version=(0, 9), request_timeout_ms=50000, connections_max_idle_ms=40000) + +def test_subscription_copy(): + consumer = KafkaConsumer('foo', api_version=(0, 10, 0)) + sub = consumer.subscription() + assert sub is not consumer.subscription() + assert sub == set(['foo']) + sub.add('fizz') + assert consumer.subscription() == set(['foo']) -class TestKafkaConsumer: - def test_session_timeout_larger_than_request_timeout_raises(self): - with pytest.raises(KafkaConfigurationError): - KafkaConsumer(bootstrap_servers='localhost:9092', api_version=(0, 9), group_id='foo', session_timeout_ms=50000, request_timeout_ms=40000) - def test_fetch_max_wait_larger_than_request_timeout_raises(self): - with pytest.raises(KafkaConfigurationError): - KafkaConsumer(bootstrap_servers='localhost:9092', fetch_max_wait_ms=50000, request_timeout_ms=40000) +def test_assign(): + # Consumer w/ subscription to topic 'foo' + consumer = KafkaConsumer('foo', api_version=(0, 10, 0)) + assert consumer.assignment() == set() + # Cannot assign manually + with pytest.raises(IllegalStateError): + consumer.assign([TopicPartition('foo', 0)]) - def test_request_timeout_larger_than_connections_max_idle_ms_raises(self): - with pytest.raises(KafkaConfigurationError): - KafkaConsumer(bootstrap_servers='localhost:9092', api_version=(0, 9), request_timeout_ms=50000, connections_max_idle_ms=40000) + assert 'foo' in consumer._client._topics - def test_subscription_copy(self): - consumer = KafkaConsumer('foo', api_version=(0, 10)) - sub = consumer.subscription() - assert sub is not consumer.subscription() - assert sub == set(['foo']) - sub.add('fizz') - assert consumer.subscription() == set(['foo']) + consumer = KafkaConsumer(api_version=(0, 10, 0)) + assert consumer.assignment() == set() + consumer.assign([TopicPartition('foo', 0)]) + assert consumer.assignment() == set([TopicPartition('foo', 0)]) + assert 'foo' in consumer._client._topics + # Cannot subscribe + with pytest.raises(IllegalStateError): + consumer.subscribe(topics=['foo']) + consumer.assign([]) + assert consumer.assignment() == set() diff --git a/test/test_coordinator.py 
b/test/test_coordinator.py index a35cdd1a0..251de566a 100644 --- a/test/test_coordinator.py +++ b/test/test_coordinator.py @@ -16,7 +16,7 @@ ConsumerProtocolMemberMetadata, ConsumerProtocolMemberAssignment) import kafka.errors as Errors from kafka.future import Future -from kafka.metrics import Metrics +from kafka.protocol.broker_api_versions import BROKER_API_VERSIONS from kafka.protocol.commit import ( OffsetCommitRequest, OffsetCommitResponse, OffsetFetchRequest, OffsetFetchResponse) @@ -26,12 +26,13 @@ @pytest.fixture -def client(conn): - return KafkaClient(api_version=(0, 9)) - -@pytest.fixture -def coordinator(client): - return ConsumerCoordinator(client, SubscriptionState(), Metrics()) +def coordinator(client, metrics, mocker): + coord = ConsumerCoordinator(client, SubscriptionState(), metrics=metrics) + try: + yield coord + finally: + mocker.patch.object(coord, 'coordinator_unknown', return_value=True) # avoid attempting to leave group during close() + coord.close(timeout_ms=0) def test_init(client, coordinator): @@ -41,9 +42,10 @@ def test_init(client, coordinator): @pytest.mark.parametrize("api_version", [(0, 8, 0), (0, 8, 1), (0, 8, 2), (0, 9)]) -def test_autocommit_enable_api_version(client, api_version): - coordinator = ConsumerCoordinator(client, SubscriptionState(), - Metrics(), +def test_autocommit_enable_api_version(conn, metrics, api_version): + coordinator = ConsumerCoordinator(KafkaClient(api_version=api_version), + SubscriptionState(), + metrics=metrics, enable_auto_commit=True, session_timeout_ms=30000, # session_timeout_ms and max_poll_interval_ms max_poll_interval_ms=30000, # should be the same to avoid KafkaConfigurationError @@ -53,6 +55,7 @@ def test_autocommit_enable_api_version(client, api_version): assert coordinator.config['enable_auto_commit'] is False else: assert coordinator.config['enable_auto_commit'] is True + coordinator.close() def test_protocol_type(coordinator): @@ -86,8 +89,13 @@ def test_group_protocols(coordinator): @pytest.mark.parametrize('api_version', [(0, 8, 0), (0, 8, 1), (0, 8, 2), (0, 9)]) -def test_pattern_subscription(coordinator, api_version): - coordinator.config['api_version'] = api_version +def test_pattern_subscription(conn, metrics, api_version): + coordinator = ConsumerCoordinator(KafkaClient(api_version=api_version), + SubscriptionState(), + metrics=metrics, + api_version=api_version, + session_timeout_ms=10000, + max_poll_interval_ms=10000) coordinator._subscription.subscribe(pattern='foo') assert coordinator._subscription.subscription == set([]) assert coordinator._metadata_snapshot == coordinator._build_metadata_snapshot(coordinator._subscription, {}) @@ -110,6 +118,7 @@ def test_pattern_subscription(coordinator, api_version): else: assert set(coordinator._subscription.assignment.keys()) == {TopicPartition('foo1', 0), TopicPartition('foo2', 0)} + coordinator.close() def test_lookup_assignor(coordinator): @@ -182,6 +191,7 @@ def test_subscription_listener_failure(mocker, coordinator): def test_perform_assignment(mocker, coordinator): + coordinator._subscription.subscribe(topics=['foo1']) member_metadata = { 'member-foo': ConsumerProtocolMemberMetadata(0, ['foo1'], b''), 'member-bar': ConsumerProtocolMemberMetadata(0, ['foo1'], b'') @@ -221,17 +231,23 @@ def test_need_rejoin(coordinator): def test_refresh_committed_offsets_if_needed(mocker, coordinator): + tp0 = TopicPartition('foobar', 0) + tp1 = TopicPartition('foobar', 1) mocker.patch.object(ConsumerCoordinator, 'fetch_committed_offsets', return_value = { - 
TopicPartition('foobar', 0): OffsetAndMetadata(123, b''), - TopicPartition('foobar', 1): OffsetAndMetadata(234, b'')}) - coordinator._subscription.assign_from_user([TopicPartition('foobar', 0)]) - assert coordinator._subscription.needs_fetch_committed_offsets is True + tp0: OffsetAndMetadata(123, '', -1), + tp1: OffsetAndMetadata(234, '', -1)}) + coordinator._subscription.assign_from_user([tp0, tp1]) + coordinator._subscription.request_offset_reset(tp0) + coordinator._subscription.request_offset_reset(tp1) + assert coordinator._subscription.is_offset_reset_needed(tp0) + assert coordinator._subscription.is_offset_reset_needed(tp1) coordinator.refresh_committed_offsets_if_needed() assignment = coordinator._subscription.assignment - assert assignment[TopicPartition('foobar', 0)].committed == OffsetAndMetadata(123, b'') - assert TopicPartition('foobar', 1) not in assignment - assert coordinator._subscription.needs_fetch_committed_offsets is False + assert assignment[tp0].position == OffsetAndMetadata(123, '', -1) + assert assignment[tp1].position == OffsetAndMetadata(234, '', -1) + assert not coordinator._subscription.is_offset_reset_needed(tp0) + assert not coordinator._subscription.is_offset_reset_needed(tp1) def test_fetch_committed_offsets(mocker, coordinator): @@ -288,7 +304,7 @@ def test_close(mocker, coordinator): coordinator._handle_leave_group_response.assert_called_with('foobar') assert coordinator.generation() is None - assert coordinator._generation is Generation.NO_GENERATION + assert coordinator._generation == Generation.NO_GENERATION assert coordinator.state is MemberState.UNJOINED assert coordinator.rejoin_needed is True @@ -296,8 +312,8 @@ def test_close(mocker, coordinator): @pytest.fixture def offsets(): return { - TopicPartition('foobar', 0): OffsetAndMetadata(123, b''), - TopicPartition('foobar', 1): OffsetAndMetadata(234, b''), + TopicPartition('foobar', 0): OffsetAndMetadata(123, '', -1), + TopicPartition('foobar', 1): OffsetAndMetadata(234, '', -1), } @@ -369,7 +385,6 @@ def test_maybe_auto_commit_offsets_sync(mocker, api_version, group_id, enable, mock_exc = mocker.patch('kafka.coordinator.consumer.log.exception') client = KafkaClient(api_version=api_version) coordinator = ConsumerCoordinator(client, SubscriptionState(), - Metrics(), api_version=api_version, session_timeout_ms=30000, max_poll_interval_ms=30000, @@ -390,6 +405,7 @@ def test_maybe_auto_commit_offsets_sync(mocker, api_version, group_id, enable, assert commit_sync.call_count == (1 if commit_offsets else 0) assert mock_warn.call_count == (1 if warn else 0) assert mock_exc.call_count == (1 if exc else 0) + coordinator.close() @pytest.fixture @@ -426,17 +442,21 @@ def test_send_offset_commit_request_fail(mocker, patched_coord, offsets): # No coordinator ret = patched_coord._send_offset_commit_request(offsets) assert ret.failed() - assert isinstance(ret.exception, Errors.GroupCoordinatorNotAvailableError) + assert isinstance(ret.exception, Errors.CoordinatorNotAvailableError) @pytest.mark.parametrize('api_version,req_type', [ ((0, 8, 1), OffsetCommitRequest[0]), ((0, 8, 2), OffsetCommitRequest[1]), - ((0, 9), OffsetCommitRequest[2])]) + ((0, 9), OffsetCommitRequest[2]), + ((0, 11), OffsetCommitRequest[3]), + ((2, 0), OffsetCommitRequest[4]), + ((2, 1), OffsetCommitRequest[6]), +]) def test_send_offset_commit_request_versions(patched_coord, offsets, api_version, req_type): expect_node = 0 - patched_coord.config['api_version'] = api_version + patched_coord._client._api_versions = BROKER_API_VERSIONS[api_version] 
patched_coord._send_offset_commit_request(offsets) (node, request), _ = patched_coord._client.send.call_args @@ -475,11 +495,11 @@ def test_send_offset_commit_request_success(mocker, patched_coord, offsets): (OffsetCommitResponse[0]([('foobar', [(0, 28), (1, 28)])]), Errors.InvalidCommitOffsetSizeError, False), (OffsetCommitResponse[0]([('foobar', [(0, 14), (1, 14)])]), - Errors.GroupLoadInProgressError, False), + Errors.CoordinatorLoadInProgressError, False), (OffsetCommitResponse[0]([('foobar', [(0, 15), (1, 15)])]), - Errors.GroupCoordinatorNotAvailableError, True), + Errors.CoordinatorNotAvailableError, True), (OffsetCommitResponse[0]([('foobar', [(0, 16), (1, 16)])]), - Errors.NotCoordinatorForGroupError, True), + Errors.NotCoordinatorError, True), (OffsetCommitResponse[0]([('foobar', [(0, 7), (1, 7)])]), Errors.RequestTimedOutError, True), (OffsetCommitResponse[0]([('foobar', [(0, 25), (1, 25)])]), @@ -492,13 +512,27 @@ def test_send_offset_commit_request_success(mocker, patched_coord, offsets): Errors.InvalidTopicError, False), (OffsetCommitResponse[0]([('foobar', [(0, 29), (1, 29)])]), Errors.TopicAuthorizationFailedError, False), + (OffsetCommitResponse[0]([('foobar', [(0, 0), (1, 0)])]), + None, False), + (OffsetCommitResponse[1]([('foobar', [(0, 0), (1, 0)])]), + None, False), + (OffsetCommitResponse[2]([('foobar', [(0, 0), (1, 0)])]), + None, False), + (OffsetCommitResponse[3](0, [('foobar', [(0, 0), (1, 0)])]), + None, False), + (OffsetCommitResponse[4](0, [('foobar', [(0, 0), (1, 0)])]), + None, False), + (OffsetCommitResponse[5](0, [('foobar', [(0, 0), (1, 0)])]), + None, False), + (OffsetCommitResponse[6](0, [('foobar', [(0, 0), (1, 0)])]), + None, False), ]) def test_handle_offset_commit_response(mocker, patched_coord, offsets, response, error, dead): future = Future() patched_coord._handle_offset_commit_response(offsets, future, time.time(), response) - assert isinstance(future.exception, error) + assert isinstance(future.exception, error) if error else True assert patched_coord.coordinator_id is (None if dead else 0) @@ -521,18 +555,23 @@ def test_send_offset_fetch_request_fail(mocker, patched_coord, partitions): # No coordinator ret = patched_coord._send_offset_fetch_request(partitions) assert ret.failed() - assert isinstance(ret.exception, Errors.GroupCoordinatorNotAvailableError) + assert isinstance(ret.exception, Errors.CoordinatorNotAvailableError) @pytest.mark.parametrize('api_version,req_type', [ ((0, 8, 1), OffsetFetchRequest[0]), ((0, 8, 2), OffsetFetchRequest[1]), - ((0, 9), OffsetFetchRequest[1])]) + ((0, 9), OffsetFetchRequest[1]), + ((0, 10, 2), OffsetFetchRequest[2]), + ((0, 11), OffsetFetchRequest[3]), + ((2, 0), OffsetFetchRequest[4]), + ((2, 1), OffsetFetchRequest[5]), +]) def test_send_offset_fetch_request_versions(patched_coord, partitions, api_version, req_type): # assuming fixture sets coordinator=0, least_loaded_node=1 expect_node = 0 - patched_coord.config['api_version'] = api_version + patched_coord._client._api_versions = BROKER_API_VERSIONS[api_version] patched_coord._send_offset_fetch_request(partitions) (node, request), _ = patched_coord._client.send.call_args @@ -564,17 +603,27 @@ def test_send_offset_fetch_request_success(patched_coord, partitions): @pytest.mark.parametrize('response,error,dead', [ - (OffsetFetchResponse[0]([('foobar', [(0, 123, b'', 14), (1, 234, b'', 14)])]), - Errors.GroupLoadInProgressError, False), - (OffsetFetchResponse[0]([('foobar', [(0, 123, b'', 16), (1, 234, b'', 16)])]), - Errors.NotCoordinatorForGroupError, 
True), - (OffsetFetchResponse[0]([('foobar', [(0, 123, b'', 25), (1, 234, b'', 25)])]), + (OffsetFetchResponse[0]([('foobar', [(0, 123, '', 14), (1, 234, '', 14)])]), + Errors.CoordinatorLoadInProgressError, False), + (OffsetFetchResponse[0]([('foobar', [(0, 123, '', 16), (1, 234, '', 16)])]), + Errors.NotCoordinatorError, True), + (OffsetFetchResponse[0]([('foobar', [(0, 123, '', 25), (1, 234, '', 25)])]), Errors.UnknownMemberIdError, False), - (OffsetFetchResponse[0]([('foobar', [(0, 123, b'', 22), (1, 234, b'', 22)])]), + (OffsetFetchResponse[0]([('foobar', [(0, 123, '', 22), (1, 234, '', 22)])]), Errors.IllegalGenerationError, False), - (OffsetFetchResponse[0]([('foobar', [(0, 123, b'', 29), (1, 234, b'', 29)])]), + (OffsetFetchResponse[0]([('foobar', [(0, 123, '', 29), (1, 234, '', 29)])]), Errors.TopicAuthorizationFailedError, False), - (OffsetFetchResponse[0]([('foobar', [(0, 123, b'', 0), (1, 234, b'', 0)])]), + (OffsetFetchResponse[0]([('foobar', [(0, 123, '', 0), (1, 234, '', 0)])]), + None, False), + (OffsetFetchResponse[1]([('foobar', [(0, 123, '', 0), (1, 234, '', 0)])]), + None, False), + (OffsetFetchResponse[2]([('foobar', [(0, 123, '', 0), (1, 234, '', 0)])], 0), + None, False), + (OffsetFetchResponse[3](0, [('foobar', [(0, 123, '', 0), (1, 234, '', 0)])], 0), + None, False), + (OffsetFetchResponse[4](0, [('foobar', [(0, 123, '', 0), (1, 234, '', 0)])], 0), + None, False), + (OffsetFetchResponse[5](0, [('foobar', [(0, 123, -1, '', 0), (1, 234, -1, '', 0)])], 0), None, False), ]) def test_handle_offset_fetch_response(patched_coord, offsets, diff --git a/test/test_fetcher.py b/test/test_fetcher.py index 697f8be1f..0ef349500 100644 --- a/test/test_fetcher.py +++ b/test/test_fetcher.py @@ -1,5 +1,6 @@ # pylint: skip-file from __future__ import absolute_import +import logging import pytest @@ -7,26 +8,22 @@ import itertools import time -from kafka.client_async import KafkaClient from kafka.consumer.fetcher import ( - CompletedFetch, ConsumerRecord, Fetcher, NoOffsetForPartitionError + CompletedFetch, ConsumerRecord, Fetcher ) from kafka.consumer.subscription_state import SubscriptionState +import kafka.errors as Errors from kafka.future import Future -from kafka.metrics import Metrics +from kafka.protocol.broker_api_versions import BROKER_API_VERSIONS from kafka.protocol.fetch import FetchRequest, FetchResponse -from kafka.protocol.offset import OffsetResponse +from kafka.protocol.list_offsets import ListOffsetsResponse, OffsetResetStrategy from kafka.errors import ( - StaleMetadata, LeaderNotAvailableError, NotLeaderForPartitionError, + StaleMetadata, NotLeaderForPartitionError, UnknownTopicOrPartitionError, OffsetOutOfRangeError ) +from kafka.future import Future from kafka.record.memory_records import MemoryRecordsBuilder, MemoryRecords -from kafka.structs import OffsetAndMetadata, TopicPartition - - -@pytest.fixture -def client(mocker): - return mocker.Mock(spec=KafkaClient(bootstrap_servers=(), api_version=(0, 9))) +from kafka.structs import OffsetAndMetadata, OffsetAndTimestamp, TopicPartition @pytest.fixture @@ -40,18 +37,18 @@ def topic(): @pytest.fixture -def fetcher(client, subscription_state, topic): +def fetcher(client, metrics, subscription_state, topic): subscription_state.subscribe(topics=[topic]) assignment = [TopicPartition(topic, i) for i in range(3)] subscription_state.assign_from_subscribed(assignment) for tp in assignment: subscription_state.seek(tp, 0) - return Fetcher(client, subscription_state, Metrics()) + return Fetcher(client, subscription_state, 
metrics=metrics) -def _build_record_batch(msgs, compression=0): +def _build_record_batch(msgs, compression=0, offset=0, magic=2): builder = MemoryRecordsBuilder( - magic=1, compression_type=0, batch_size=9999999) + magic=magic, compression_type=0, batch_size=9999999, offset=offset) for msg in msgs: key, value, timestamp = msg builder.append(key=key, value=value, timestamp=timestamp, headers=[]) @@ -76,9 +73,20 @@ def test_send_fetches(fetcher, topic, mocker): ])]) ] - mocker.patch.object(fetcher, '_create_fetch_requests', - return_value=dict(enumerate(fetch_requests))) + def build_fetch_offsets(request): + fetch_offsets = {} + for topic, partitions in request.topics: + for partition_data in partitions: + partition, offset = partition_data[:2] + fetch_offsets[TopicPartition(topic, partition)] = offset + return fetch_offsets + mocker.patch.object( + fetcher, '_create_fetch_requests', + return_value=(dict(enumerate(map(lambda r: (r, build_fetch_offsets(r)), fetch_requests))))) + + mocker.patch.object(fetcher._client, 'ready', return_value=True) + mocker.patch.object(fetcher._client, 'send') ret = fetcher.send_fetches() for node, request in enumerate(fetch_requests): fetcher._client.send.assert_any_call(node, request, wakeup=False) @@ -89,64 +97,69 @@ def test_send_fetches(fetcher, topic, mocker): ((0, 10, 1), 3), ((0, 10, 0), 2), ((0, 9), 1), - ((0, 8), 0) + ((0, 8, 2), 0) ]) def test_create_fetch_requests(fetcher, mocker, api_version, fetch_version): - fetcher._client.in_flight_request_count.return_value = 0 - fetcher.config['api_version'] = api_version + fetcher._client._api_versions = BROKER_API_VERSIONS[api_version] + mocker.patch.object(fetcher._client.cluster, "leader_for_partition", return_value=0) + mocker.patch.object(fetcher._client.cluster, "leader_epoch_for_partition", return_value=0) + mocker.patch.object(fetcher._client, "ready", return_value=True) by_node = fetcher._create_fetch_requests() - requests = by_node.values() - assert all([isinstance(r, FetchRequest[fetch_version]) for r in requests]) + requests_and_offsets = by_node.values() + assert set([r.API_VERSION for (r, _offsets) in requests_and_offsets]) == set([fetch_version]) -def test_update_fetch_positions(fetcher, topic, mocker): - mocker.patch.object(fetcher, '_reset_offset') +def test_reset_offsets_if_needed(fetcher, topic, mocker): + mocker.patch.object(fetcher, '_reset_offsets_async') partition = TopicPartition(topic, 0) - # unassigned partition - fetcher.update_fetch_positions([TopicPartition('fizzbuzz', 0)]) - assert fetcher._reset_offset.call_count == 0 - # fetchable partition (has offset, not paused) - fetcher.update_fetch_positions([partition]) - assert fetcher._reset_offset.call_count == 0 - - # partition needs reset, no committed offset - fetcher._subscriptions.need_offset_reset(partition) - fetcher._subscriptions.assignment[partition].awaiting_reset = False - fetcher.update_fetch_positions([partition]) - fetcher._reset_offset.assert_called_with(partition) + fetcher.reset_offsets_if_needed() + assert fetcher._reset_offsets_async.call_count == 0 + + # partition needs reset, no valid position + fetcher._subscriptions.request_offset_reset(partition) + fetcher.reset_offsets_if_needed() + fetcher._reset_offsets_async.assert_called_with({partition: OffsetResetStrategy.EARLIEST}) assert fetcher._subscriptions.assignment[partition].awaiting_reset is True - fetcher.update_fetch_positions([partition]) - fetcher._reset_offset.assert_called_with(partition) - - # partition needs reset, has committed offset - 
fetcher._reset_offset.reset_mock() - fetcher._subscriptions.need_offset_reset(partition) - fetcher._subscriptions.assignment[partition].awaiting_reset = False - fetcher._subscriptions.assignment[partition].committed = OffsetAndMetadata(123, b'') - mocker.patch.object(fetcher._subscriptions, 'seek') - fetcher.update_fetch_positions([partition]) - assert fetcher._reset_offset.call_count == 0 - fetcher._subscriptions.seek.assert_called_with(partition, 123) - - -def test__reset_offset(fetcher, mocker): - tp = TopicPartition("topic", 0) - fetcher._subscriptions.subscribe(topics="topic") - fetcher._subscriptions.assign_from_subscribed([tp]) - fetcher._subscriptions.need_offset_reset(tp) - mocked = mocker.patch.object(fetcher, '_retrieve_offsets') - - mocked.return_value = {tp: (1001, None)} - fetcher._reset_offset(tp) - assert not fetcher._subscriptions.assignment[tp].awaiting_reset - assert fetcher._subscriptions.assignment[tp].position == 1001 - - -def test__send_offset_requests(fetcher, mocker): - tp = TopicPartition("topic_send_offset", 1) - mocked_send = mocker.patch.object(fetcher, "_send_offset_request") + fetcher.reset_offsets_if_needed() + fetcher._reset_offsets_async.assert_called_with({partition: OffsetResetStrategy.EARLIEST}) + + # partition needs reset, has valid position + fetcher._reset_offsets_async.reset_mock() + fetcher._subscriptions.request_offset_reset(partition) + fetcher._subscriptions.seek(partition, 123) + fetcher.reset_offsets_if_needed() + assert fetcher._reset_offsets_async.call_count == 0 + + +def test__reset_offsets_async(fetcher, mocker): + tp0 = TopicPartition("topic", 0) + tp1 = TopicPartition("topic", 1) + fetcher._subscriptions.subscribe(topics=["topic"]) + fetcher._subscriptions.assign_from_subscribed([tp0, tp1]) + fetcher._subscriptions.request_offset_reset(tp0) + fetcher._subscriptions.request_offset_reset(tp1) + mocker.patch.object(fetcher._client.cluster, "leader_for_partition", side_effect=[0, 1]) + mocker.patch.object(fetcher._client, 'ready', return_value=True) + future1 = Future() + future2 = Future() + mocker.patch.object(fetcher, '_send_list_offsets_request', side_effect=[future1, future2]) + fetcher._reset_offsets_async({ + tp0: OffsetResetStrategy.EARLIEST, + tp1: OffsetResetStrategy.EARLIEST, + }) + future1.success(({tp0: OffsetAndTimestamp(1001, None, -1)}, set())), + future2.success(({tp1: OffsetAndTimestamp(1002, None, -1)}, set())), + assert not fetcher._subscriptions.assignment[tp0].awaiting_reset + assert not fetcher._subscriptions.assignment[tp1].awaiting_reset + assert fetcher._subscriptions.assignment[tp0].position.offset == 1001 + assert fetcher._subscriptions.assignment[tp1].position.offset == 1002 + + +def test__send_list_offsets_requests(fetcher, mocker): + tp = TopicPartition("topic_send_list_offsets", 1) + mocked_send = mocker.patch.object(fetcher, "_send_list_offsets_request") send_futures = [] def send_side_effect(*args, **kw): @@ -161,21 +174,22 @@ def send_side_effect(*args, **kw): # always as available mocked_leader.side_effect = itertools.chain( [None, -1], itertools.cycle([0])) + mocker.patch.object(fetcher._client.cluster, "leader_epoch_for_partition", return_value=0) # Leader == None - fut = fetcher._send_offset_requests({tp: 0}) + fut = fetcher._send_list_offsets_requests({tp: 0}) assert fut.failed() assert isinstance(fut.exception, StaleMetadata) assert not mocked_send.called # Leader == -1 - fut = fetcher._send_offset_requests({tp: 0}) + fut = fetcher._send_list_offsets_requests({tp: 0}) assert fut.failed() - assert 
isinstance(fut.exception, LeaderNotAvailableError) + assert isinstance(fut.exception, StaleMetadata) assert not mocked_send.called # Leader == 0, send failed - fut = fetcher._send_offset_requests({tp: 0}) + fut = fetcher._send_list_offsets_requests({tp: 0}) assert not fut.is_done assert mocked_send.called # Check that we bound the futures correctly to chain failure @@ -184,21 +198,21 @@ def send_side_effect(*args, **kw): assert isinstance(fut.exception, NotLeaderForPartitionError) # Leader == 0, send success - fut = fetcher._send_offset_requests({tp: 0}) + fut = fetcher._send_list_offsets_requests({tp: 0}) assert not fut.is_done assert mocked_send.called # Check that we bound the futures correctly to chain success - send_futures.pop().success({tp: (10, 10000)}) + send_futures.pop().success(({tp: (10, 10000)}, set())) assert fut.succeeded() - assert fut.value == {tp: (10, 10000)} + assert fut.value == ({tp: (10, 10000)}, set()) -def test__send_offset_requests_multiple_nodes(fetcher, mocker): - tp1 = TopicPartition("topic_send_offset", 1) - tp2 = TopicPartition("topic_send_offset", 2) - tp3 = TopicPartition("topic_send_offset", 3) - tp4 = TopicPartition("topic_send_offset", 4) - mocked_send = mocker.patch.object(fetcher, "_send_offset_request") +def test__send_list_offsets_requests_multiple_nodes(fetcher, mocker): + tp1 = TopicPartition("topic_send_list_offsets", 1) + tp2 = TopicPartition("topic_send_list_offsets", 2) + tp3 = TopicPartition("topic_send_list_offsets", 3) + tp4 = TopicPartition("topic_send_list_offsets", 4) + mocked_send = mocker.patch.object(fetcher, "_send_list_offsets_request") send_futures = [] def send_side_effect(node_id, timestamps): @@ -210,10 +224,11 @@ def send_side_effect(node_id, timestamps): mocked_leader = mocker.patch.object( fetcher._client.cluster, "leader_for_partition") mocked_leader.side_effect = itertools.cycle([0, 1]) + mocker.patch.object(fetcher._client.cluster, "leader_epoch_for_partition", return_value=0) # -- All node succeeded case tss = OrderedDict([(tp1, 0), (tp2, 0), (tp3, 0), (tp4, 0)]) - fut = fetcher._send_offset_requests(tss) + fut = fetcher._send_list_offsets_requests(tss) assert not fut.is_done assert mocked_send.call_count == 2 @@ -223,80 +238,124 @@ def send_side_effect(node_id, timestamps): req_by_node[node] = timestamps if node == 0: # Say tp3 does not have any messages so it's missing - f.success({tp1: (11, 1001)}) + f.success(({tp1: (11, 1001)}, set())) else: second_future = f assert req_by_node == { - 0: {tp1: 0, tp3: 0}, - 1: {tp2: 0, tp4: 0} + 0: {tp1: (0, -1), tp3: (0, -1)}, + 1: {tp2: (0, -1), tp4: (0, -1)} } # We only resolved 1 future so far, so result future is not yet ready assert not fut.is_done - second_future.success({tp2: (12, 1002), tp4: (14, 1004)}) + second_future.success(({tp2: (12, 1002), tp4: (14, 1004)}, set())) assert fut.succeeded() - assert fut.value == {tp1: (11, 1001), tp2: (12, 1002), tp4: (14, 1004)} + assert fut.value == ({tp1: (11, 1001), tp2: (12, 1002), tp4: (14, 1004)}, set()) # -- First succeeded second not del send_futures[:] - fut = fetcher._send_offset_requests(tss) + fut = fetcher._send_list_offsets_requests(tss) assert len(send_futures) == 2 - send_futures[0][2].success({tp1: (11, 1001)}) + send_futures[0][2].success(({tp1: (11, 1001)}, set())) send_futures[1][2].failure(UnknownTopicOrPartitionError(tp1)) assert fut.failed() assert isinstance(fut.exception, UnknownTopicOrPartitionError) # -- First fails second succeeded del send_futures[:] - fut = fetcher._send_offset_requests(tss) + fut = 
fetcher._send_list_offsets_requests(tss) assert len(send_futures) == 2 send_futures[0][2].failure(UnknownTopicOrPartitionError(tp1)) - send_futures[1][2].success({tp1: (11, 1001)}) + send_futures[1][2].success(({tp1: (11, 1001)}, set())) assert fut.failed() assert isinstance(fut.exception, UnknownTopicOrPartitionError) -def test__handle_offset_response(fetcher, mocker): +def test__handle_list_offsets_response_v1(fetcher, mocker): # Broker returns UnsupportedForMessageFormatError, will omit partition fut = Future() - res = OffsetResponse[1]([ + res = ListOffsetsResponse[1]([ ("topic", [(0, 43, -1, -1)]), ("topic", [(1, 0, 1000, 9999)]) ]) - fetcher._handle_offset_response(fut, res) + fetcher._handle_list_offsets_response(fut, res) assert fut.succeeded() - assert fut.value == {TopicPartition("topic", 1): (9999, 1000)} + assert fut.value == ({TopicPartition("topic", 1): OffsetAndTimestamp(9999, 1000, -1)}, set()) # Broker returns NotLeaderForPartitionError fut = Future() - res = OffsetResponse[1]([ + res = ListOffsetsResponse[1]([ ("topic", [(0, 6, -1, -1)]), ]) - fetcher._handle_offset_response(fut, res) - assert fut.failed() - assert isinstance(fut.exception, NotLeaderForPartitionError) + fetcher._handle_list_offsets_response(fut, res) + assert fut.succeeded() + assert fut.value == ({}, set([TopicPartition("topic", 0)])) # Broker returns UnknownTopicOrPartitionError fut = Future() - res = OffsetResponse[1]([ + res = ListOffsetsResponse[1]([ ("topic", [(0, 3, -1, -1)]), ]) - fetcher._handle_offset_response(fut, res) - assert fut.failed() - assert isinstance(fut.exception, UnknownTopicOrPartitionError) + fetcher._handle_list_offsets_response(fut, res) + assert fut.succeeded() + assert fut.value == ({}, set([TopicPartition("topic", 0)])) # Broker returns many errors and 1 result - # Will fail on 1st error and return fut = Future() - res = OffsetResponse[1]([ - ("topic", [(0, 43, -1, -1)]), - ("topic", [(1, 6, -1, -1)]), - ("topic", [(2, 3, -1, -1)]), + res = ListOffsetsResponse[1]([ + ("topic", [(0, 43, -1, -1)]), # not retriable + ("topic", [(1, 6, -1, -1)]), # retriable + ("topic", [(2, 3, -1, -1)]), # retriable ("topic", [(3, 0, 1000, 9999)]) ]) - fetcher._handle_offset_response(fut, res) - assert fut.failed() - assert isinstance(fut.exception, NotLeaderForPartitionError) + fetcher._handle_list_offsets_response(fut, res) + assert fut.succeeded() + assert fut.value == ({TopicPartition("topic", 3): OffsetAndTimestamp(9999, 1000, -1)}, + set([TopicPartition("topic", 1), TopicPartition("topic", 2)])) + + +def test__handle_list_offsets_response_v2_v3(fetcher, mocker): + # including a throttle_time shouldnt cause issues + fut = Future() + res = ListOffsetsResponse[2]( + 123, # throttle_time_ms + [("topic", [(0, 0, 1000, 9999)]) + ]) + fetcher._handle_list_offsets_response(fut, res) + assert fut.succeeded() + assert fut.value == ({TopicPartition("topic", 0): OffsetAndTimestamp(9999, 1000, -1)}, set()) + + # v3 response is the same format + fut = Future() + res = ListOffsetsResponse[3]( + 123, # throttle_time_ms + [("topic", [(0, 0, 1000, 9999)]) + ]) + fetcher._handle_list_offsets_response(fut, res) + assert fut.succeeded() + assert fut.value == ({TopicPartition("topic", 0): OffsetAndTimestamp(9999, 1000, -1)}, set()) + + +def test__handle_list_offsets_response_v4_v5(fetcher, mocker): + # includes leader_epoch + fut = Future() + res = ListOffsetsResponse[4]( + 123, # throttle_time_ms + [("topic", [(0, 0, 1000, 9999, 1234)]) + ]) + fetcher._handle_list_offsets_response(fut, res) + assert 
fut.succeeded() + assert fut.value == ({TopicPartition("topic", 0): OffsetAndTimestamp(9999, 1000, 1234)}, set()) + + # v5 response is the same format + fut = Future() + res = ListOffsetsResponse[5]( + 123, # throttle_time_ms + [("topic", [(0, 0, 1000, 9999, 1234)]) + ]) + fetcher._handle_list_offsets_response(fut, res) + assert fut.succeeded() + assert fut.value == ({TopicPartition("topic", 0): OffsetAndTimestamp(9999, 1000, 1234)}, set()) def test_fetched_records(fetcher, topic, mocker): @@ -318,19 +377,15 @@ def test_fetched_records(fetcher, topic, mocker): assert partial is False -@pytest.mark.parametrize(("fetch_request", "fetch_response", "num_partitions"), [ +@pytest.mark.parametrize(("fetch_offsets", "fetch_response", "num_partitions"), [ ( - FetchRequest[0]( - -1, 100, 100, - [('foo', [(0, 0, 1000),])]), + {TopicPartition('foo', 0): 0}, FetchResponse[0]( [("foo", [(0, 0, 1000, [(0, b'xxx'),])]),]), 1, ), ( - FetchRequest[1]( - -1, 100, 100, - [('foo', [(0, 0, 1000), (1, 0, 1000),])]), + {TopicPartition('foo', 0): 0, TopicPartition('foo', 1): 0}, FetchResponse[1]( 0, [("foo", [ @@ -340,46 +395,55 @@ def test_fetched_records(fetcher, topic, mocker): 2, ), ( - FetchRequest[2]( - -1, 100, 100, - [('foo', [(0, 0, 1000),])]), + {TopicPartition('foo', 0): 0}, FetchResponse[2]( 0, [("foo", [(0, 0, 1000, [(0, b'xxx'),])]),]), 1, ), ( - FetchRequest[3]( - -1, 100, 100, 10000, - [('foo', [(0, 0, 1000),])]), + {TopicPartition('foo', 0): 0}, FetchResponse[3]( 0, [("foo", [(0, 0, 1000, [(0, b'xxx'),])]),]), 1, ), ( - FetchRequest[4]( - -1, 100, 100, 10000, 0, - [('foo', [(0, 0, 1000),])]), + {TopicPartition('foo', 0): 0}, FetchResponse[4]( 0, [("foo", [(0, 0, 1000, 0, [], [(0, b'xxx'),])]),]), 1, ), ( # This may only be used in broker-broker api calls - FetchRequest[5]( - -1, 100, 100, 10000, 0, - [('foo', [(0, 0, 1000),])]), + {TopicPartition('foo', 0): 0}, FetchResponse[5]( 0, [("foo", [(0, 0, 1000, 0, 0, [], [(0, b'xxx'),])]),]), 1, ), ]) -def test__handle_fetch_response(fetcher, fetch_request, fetch_response, num_partitions): - fetcher._handle_fetch_response(fetch_request, time.time(), fetch_response) +def test__handle_fetch_response(fetcher, fetch_offsets, fetch_response, num_partitions): + fetcher._nodes_with_pending_fetch_requests.add(0) + fetcher._handle_fetch_response(0, fetch_offsets, time.time(), fetch_response) assert len(fetcher._completed_fetches) == num_partitions -def test__unpack_message_set(fetcher): - fetcher.config['check_crcs'] = False +@pytest.mark.parametrize(("exception", "log_level"), [ +( + Errors.Cancelled(), + logging.INFO +), +( + Errors.KafkaError(), + logging.ERROR +) +]) +def test__handle_fetch_error(fetcher, caplog, exception, log_level): + fetcher._nodes_with_pending_fetch_requests.add(3) + fetcher._handle_fetch_error(3, exception) + assert len(caplog.records) == 1 + assert caplog.records[0].levelname == logging.getLevelName(log_level) + + +def test__unpack_records(mocker): tp = TopicPartition('foo', 0) messages = [ (None, b"a", None), @@ -387,7 +451,8 @@ def test__unpack_message_set(fetcher): (None, b"c", None), ] memory_records = MemoryRecords(_build_record_batch(messages)) - records = list(fetcher._unpack_message_set(tp, memory_records)) + part_records = Fetcher.PartitionRecords(0, tp, memory_records) + records = list(part_records.record_iterator) assert len(records) == 3 assert all(map(lambda x: isinstance(x, ConsumerRecord), records)) assert records[0].value == b'a' @@ -398,22 +463,21 @@ def test__unpack_message_set(fetcher): assert records[2].offset == 2 
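
Reviewer aside: the reworked `Fetcher.PartitionRecords` interface exercised in the tests above and below (built directly from `MemoryRecords`, drained via `take()`/`drain()`, truthiness instead of `len()`) can be summarized in a small standalone sketch. Everything in it is taken from usage shown in this diff; it is illustrative only, not an API contract.

```python
# Illustrative sketch only -- mirrors how the tests in this diff drive
# Fetcher.PartitionRecords; names, arguments, and behavior are as used above.
from kafka.consumer.fetcher import Fetcher
from kafka.record.memory_records import MemoryRecords, MemoryRecordsBuilder
from kafka.structs import TopicPartition

# Build a small record batch starting at offset 5 (offset= kwarg as used by
# the _build_record_batch helper earlier in this diff).
builder = MemoryRecordsBuilder(magic=2, compression_type=0, batch_size=9999999, offset=5)
for value in (b'a', b'b', b'c'):
    builder.append(key=None, value=value, timestamp=None, headers=[])
builder.close()

tp = TopicPartition('foo', 0)
records = Fetcher.PartitionRecords(5, tp, MemoryRecords(builder.buffer()))
assert records                          # truthy while undrained records remain
assert records.take(1)[0].offset == 5   # take() pops records and advances the cursor
assert records.next_fetch_offset == 6
records.drain()                         # discard the remainder; object becomes falsy
assert not records
```
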
-def test__message_generator(fetcher, topic, mocker): - fetcher.config['check_crcs'] = False - tp = TopicPartition(topic, 0) - msgs = [] - for i in range(10): - msgs.append((None, b"foo", None)) - completed_fetch = CompletedFetch( - tp, 0, 0, [0, 100, _build_record_batch(msgs)], - mocker.MagicMock() - ) - fetcher._completed_fetches.append(completed_fetch) - for i in range(10): - msg = next(fetcher) - assert isinstance(msg, ConsumerRecord) - assert msg.offset == i - assert msg.value == b'foo' +def test__unpack_records_corrupted(mocker): + tp = TopicPartition('foo', 0) + messages = [ + (None, b"a", None), + (None, b"b", None), + (None, b"c", None), + ] + memory_records = MemoryRecords(_build_record_batch(messages)) + from kafka.record.default_records import DefaultRecord + mocker.patch.object(DefaultRecord, 'validate_crc', side_effect=[True, True, False]) + part_records = Fetcher.PartitionRecords(0, tp, memory_records) + records = part_records.take(10) + assert len(records) == 2 + with pytest.raises(Errors.CorruptRecordError): + part_records.take(10) def test__parse_fetched_data(fetcher, topic, mocker): @@ -428,7 +492,8 @@ def test__parse_fetched_data(fetcher, topic, mocker): ) partition_record = fetcher._parse_fetched_data(completed_fetch) assert isinstance(partition_record, fetcher.PartitionRecords) - assert len(partition_record) == 10 + assert partition_record + assert len(partition_record.take()) == 10 def test__parse_fetched_data__paused(fetcher, topic, mocker): @@ -467,6 +532,7 @@ def test__parse_fetched_data__not_leader(fetcher, topic, mocker): tp, 0, 0, [NotLeaderForPartitionError.errno, -1, None], mocker.MagicMock() ) + mocker.patch.object(fetcher._client.cluster, 'request_update') partition_record = fetcher._parse_fetched_data(completed_fetch) assert partition_record is None fetcher._client.cluster.request_update.assert_called_with() @@ -479,6 +545,7 @@ def test__parse_fetched_data__unknown_tp(fetcher, topic, mocker): tp, 0, 0, [UnknownTopicOrPartitionError.errno, -1, None], mocker.MagicMock() ) + mocker.patch.object(fetcher._client.cluster, 'request_update') partition_record = fetcher._parse_fetched_data(completed_fetch) assert partition_record is None fetcher._client.cluster.request_update.assert_called_with() @@ -496,7 +563,7 @@ def test__parse_fetched_data__out_of_range(fetcher, topic, mocker): assert fetcher._subscriptions.assignment[tp].awaiting_reset is True -def test_partition_records_offset(): +def test_partition_records_offset(mocker): """Test that compressed messagesets are handle correctly when fetch offset is in the middle of the message list """ @@ -504,39 +571,45 @@ def test_partition_records_offset(): batch_end = 130 fetch_offset = 123 tp = TopicPartition('foo', 0) - messages = [ConsumerRecord(tp.topic, tp.partition, i, - None, None, 'key', 'value', [], 'checksum', 0, 0, -1) - for i in range(batch_start, batch_end)] - records = Fetcher.PartitionRecords(fetch_offset, None, messages) - assert len(records) > 0 + messages = [(None, b'msg', None) for i in range(batch_start, batch_end)] + memory_records = MemoryRecords(_build_record_batch(messages, offset=batch_start)) + records = Fetcher.PartitionRecords(fetch_offset, tp, memory_records) + assert records + assert records.next_fetch_offset == fetch_offset msgs = records.take(1) assert msgs[0].offset == fetch_offset - assert records.fetch_offset == fetch_offset + 1 + assert records.next_fetch_offset == fetch_offset + 1 msgs = records.take(2) assert len(msgs) == 2 - assert len(records) > 0 - records.discard() - assert 
len(records) == 0 + assert records + assert records.next_fetch_offset == fetch_offset + 3 + records.drain() + assert not records -def test_partition_records_empty(): - records = Fetcher.PartitionRecords(0, None, []) - assert len(records) == 0 +def test_partition_records_empty(mocker): + tp = TopicPartition('foo', 0) + memory_records = MemoryRecords(_build_record_batch([])) + records = Fetcher.PartitionRecords(0, tp, memory_records) + msgs = records.take() + assert len(msgs) == 0 + assert not records -def test_partition_records_no_fetch_offset(): +def test_partition_records_no_fetch_offset(mocker): batch_start = 0 batch_end = 100 fetch_offset = 123 tp = TopicPartition('foo', 0) - messages = [ConsumerRecord(tp.topic, tp.partition, i, - None, None, 'key', 'value', None, 'checksum', 0, 0, -1) - for i in range(batch_start, batch_end)] - records = Fetcher.PartitionRecords(fetch_offset, None, messages) - assert len(records) == 0 + messages = [(None, b'msg', None) for i in range(batch_start, batch_end)] + memory_records = MemoryRecords(_build_record_batch(messages, offset=batch_start)) + records = Fetcher.PartitionRecords(fetch_offset, tp, memory_records) + msgs = records.take() + assert len(msgs) == 0 + assert not records -def test_partition_records_compacted_offset(): +def test_partition_records_compacted_offset(mocker): """Test that messagesets are handle correctly when the fetch offset points to a message that has been compacted """ @@ -544,10 +617,155 @@ def test_partition_records_compacted_offset(): batch_end = 100 fetch_offset = 42 tp = TopicPartition('foo', 0) - messages = [ConsumerRecord(tp.topic, tp.partition, i, - None, None, 'key', 'value', None, 'checksum', 0, 0, -1) - for i in range(batch_start, batch_end) if i != fetch_offset] - records = Fetcher.PartitionRecords(fetch_offset, None, messages) - assert len(records) == batch_end - fetch_offset - 1 - msgs = records.take(1) + builder = MemoryRecordsBuilder( + magic=2, compression_type=0, batch_size=9999999) + + for i in range(batch_start, batch_end): + if i == fetch_offset: + builder.skip(1) + else: + builder.append(key=None, value=b'msg', timestamp=None, headers=[]) + builder.close() + memory_records = MemoryRecords(builder.buffer()) + records = Fetcher.PartitionRecords(fetch_offset, tp, memory_records) + msgs = records.take() + assert len(msgs) == batch_end - fetch_offset - 1 assert msgs[0].offset == fetch_offset + 1 + + +def test_reset_offsets_paused(subscription_state, client, mocker): + fetcher = Fetcher(client, subscription_state) + tp = TopicPartition('foo', 0) + subscription_state.assign_from_user([tp]) + subscription_state.pause(tp) # paused partition does not have a valid position + subscription_state.request_offset_reset(tp, OffsetResetStrategy.LATEST) + + fetched_offsets = {tp: OffsetAndTimestamp(10, 1, -1)} + mocker.patch.object(fetcher._client, 'ready', return_value=True) + mocker.patch.object(fetcher, '_send_list_offsets_request', + return_value=Future().success((fetched_offsets, set()))) + mocker.patch.object(fetcher._client.cluster, "leader_for_partition", return_value=0) + fetcher.reset_offsets_if_needed() + + assert not subscription_state.is_offset_reset_needed(tp) + assert not subscription_state.is_fetchable(tp) # because tp is paused + assert subscription_state.has_valid_position(tp) + assert subscription_state.position(tp) == OffsetAndMetadata(10, '', -1) + + +def test_reset_offsets_paused_without_valid(subscription_state, client, mocker): + fetcher = Fetcher(client, subscription_state) + tp = TopicPartition('foo', 
0) + subscription_state.assign_from_user([tp]) + subscription_state.pause(tp) # paused partition does not have a valid position + subscription_state.reset_missing_positions() + + fetched_offsets = {tp: OffsetAndTimestamp(0, 1, -1)} + mocker.patch.object(fetcher._client, 'ready', return_value=True) + mocker.patch.object(fetcher, '_send_list_offsets_request', + return_value=Future().success((fetched_offsets, set()))) + mocker.patch.object(fetcher._client.cluster, "leader_for_partition", return_value=0) + fetcher.reset_offsets_if_needed() + + assert not subscription_state.is_offset_reset_needed(tp) + assert not subscription_state.is_fetchable(tp) # because tp is paused + assert subscription_state.has_valid_position(tp) + assert subscription_state.position(tp) == OffsetAndMetadata(0, '', -1) + + +def test_reset_offsets_paused_with_valid(subscription_state, client, mocker): + fetcher = Fetcher(client, subscription_state) + tp = TopicPartition('foo', 0) + subscription_state.assign_from_user([tp]) + subscription_state.seek(tp, 0) + subscription_state.assignment[tp].position = OffsetAndMetadata(10, '', -1) + subscription_state.pause(tp) # paused partition already has a valid position + + mocker.patch.object(fetcher, '_fetch_offsets_by_times', return_value={tp: OffsetAndTimestamp(0, 1, -1)}) + fetcher.reset_offsets_if_needed() + + assert not subscription_state.is_offset_reset_needed(tp) + assert not subscription_state.is_fetchable(tp) # because tp is paused + assert subscription_state.has_valid_position(tp) + assert subscription_state.position(tp) == OffsetAndMetadata(10, '', -1) + + +def test_fetch_position_after_exception(client, mocker): + subscription_state = SubscriptionState(offset_reset_strategy='NONE') + fetcher = Fetcher(client, subscription_state) + + tp0 = TopicPartition('foo', 0) + tp1 = TopicPartition('foo', 1) + # verify the advancement in the next fetch offset equals to the number of fetched records when + # some fetched partitions cause Exception. 
This ensures that consumer won't lose record upon exception + subscription_state.assign_from_user([tp0, tp1]) + subscription_state.seek(tp0, 1) + subscription_state.seek(tp1, 1) + + assert len(fetcher._fetchable_partitions()) == 2 + + empty_records = _build_record_batch([], offset=1) + three_records = _build_record_batch([(None, b'msg', None) for _ in range(3)], offset=1) + fetcher._completed_fetches.append( + CompletedFetch(tp1, 1, 0, [0, 100, three_records], mocker.MagicMock())) + fetcher._completed_fetches.append( + CompletedFetch(tp0, 1, 0, [1, 100, empty_records], mocker.MagicMock())) + records, partial = fetcher.fetched_records() + + assert len(records) == 1 + assert tp1 in records + assert tp0 not in records + assert len(records[tp1]) == 3 + assert subscription_state.position(tp1).offset == 4 + + exceptions = [] + try: + records, partial = fetcher.fetched_records() + except Errors.OffsetOutOfRangeError as e: + exceptions.append(e) + + assert len(exceptions) == 1 + assert isinstance(exceptions[0], Errors.OffsetOutOfRangeError) + assert exceptions[0].args == ({tp0: 1},) + + +def test_seek_before_exception(client, mocker): + subscription_state = SubscriptionState(offset_reset_strategy='NONE') + fetcher = Fetcher(client, subscription_state, max_poll_records=2) + + tp0 = TopicPartition('foo', 0) + tp1 = TopicPartition('foo', 1) + subscription_state.assign_from_user([tp0]) + subscription_state.seek(tp0, 1) + + assert len(fetcher._fetchable_partitions()) == 1 + + three_records = _build_record_batch([(None, b'msg', None) for _ in range(3)], offset=1) + fetcher._completed_fetches.append( + CompletedFetch(tp0, 1, 0, [0, 100, three_records], mocker.MagicMock())) + records, partial = fetcher.fetched_records() + + assert len(records) == 1 + assert tp0 in records + assert len(records[tp0]) == 2 + assert subscription_state.position(tp0).offset == 3 + + subscription_state.assign_from_user([tp0, tp1]) + subscription_state.seek(tp1, 1) + + assert len(fetcher._fetchable_partitions()) == 1 + + empty_records = _build_record_batch([], offset=1) + fetcher._completed_fetches.append( + CompletedFetch(tp1, 1, 0, [1, 100, empty_records], mocker.MagicMock())) + records, partial = fetcher.fetched_records() + + assert len(records) == 1 + assert tp0 in records + assert len(records[tp0]) == 1 + assert subscription_state.position(tp0).offset == 4 + + subscription_state.seek(tp1, 10) + # Should not throw OffsetOutOfRangeError after the seek + records, partial = fetcher.fetched_records() + assert len(records) == 0 diff --git a/test/test_metrics.py b/test/test_metrics.py index 308ea5831..07c0e838a 100644 --- a/test/test_metrics.py +++ b/test/test_metrics.py @@ -19,23 +19,6 @@ def time_keeper(): return TimeKeeper() -@pytest.fixture -def config(): - return MetricConfig() - - -@pytest.fixture -def reporter(): - return DictReporter() - - -@pytest.fixture -def metrics(request, config, reporter): - metrics = Metrics(config, [reporter], enable_expiration=True) - yield metrics - metrics.close() - - def test_MetricName(): # The Java test only cover the differences between the deprecated # constructors, so I'm skipping them but doing some other basic testing. 
@@ -82,8 +65,9 @@ def test_MetricName(): assert name.tags == tags -def test_simple_stats(mocker, time_keeper, config, metrics): +def test_simple_stats(mocker, time_keeper, metrics): mocker.patch('time.time', side_effect=time_keeper.time) + config = metrics._config measurable = ConstantMeasurable() diff --git a/test/test_object_conversion.py b/test/test_object_conversion.py index 9b1ff2131..a48eb0601 100644 --- a/test/test_object_conversion.py +++ b/test/test_object_conversion.py @@ -207,7 +207,7 @@ def test_with_metadata_response(): assert len(obj['topics']) == 2 assert obj['topics'][0]['error_code'] == 0 assert obj['topics'][0]['topic'] == 'testtopic1' - assert obj['topics'][0]['is_internal'] == False + assert obj['topics'][0]['is_internal'] is False assert len(obj['topics'][0]['partitions']) == 2 assert obj['topics'][0]['partitions'][0]['error_code'] == 0 assert obj['topics'][0]['partitions'][0]['partition'] == 0 @@ -224,7 +224,7 @@ def test_with_metadata_response(): assert obj['topics'][1]['error_code'] == 0 assert obj['topics'][1]['topic'] == 'other-test-topic' - assert obj['topics'][1]['is_internal'] == True + assert obj['topics'][1]['is_internal'] is True assert len(obj['topics'][1]['partitions']) == 1 assert obj['topics'][1]['partitions'][0]['error_code'] == 0 assert obj['topics'][1]['partitions'][0]['partition'] == 0 diff --git a/test/test_producer.py b/test/test_producer.py index 7263130d1..e79c682a7 100644 --- a/test/test_producer.py +++ b/test/test_producer.py @@ -1,137 +1,35 @@ +from __future__ import absolute_import + import gc import platform -import time import threading import pytest -from kafka import KafkaConsumer, KafkaProducer, TopicPartition -from kafka.producer.buffer import SimpleBufferPool -from test.testutil import env_kafka_version, random_string - - -def test_buffer_pool(): - pool = SimpleBufferPool(1000, 1000) - - buf1 = pool.allocate(1000, 1000) - message = ''.join(map(str, range(100))) - buf1.write(message.encode('utf-8')) - pool.deallocate(buf1) - - buf2 = pool.allocate(1000, 1000) - assert buf2.read() == b'' - - -@pytest.mark.skipif(not env_kafka_version(), reason="No KAFKA_VERSION set") -@pytest.mark.parametrize("compression", [None, 'gzip', 'snappy', 'lz4', 'zstd']) -def test_end_to_end(kafka_broker, compression): - if compression == 'lz4': - if env_kafka_version() < (0, 8, 2): - pytest.skip('LZ4 requires 0.8.2') - elif platform.python_implementation() == 'PyPy': - pytest.skip('python-lz4 crashes on older versions of pypy') - - if compression == 'zstd' and env_kafka_version() < (2, 1, 0): - pytest.skip('zstd requires kafka 2.1.0 or newer') - - connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)]) - producer = KafkaProducer(bootstrap_servers=connect_str, - retries=5, - max_block_ms=30000, - compression_type=compression, - value_serializer=str.encode) - consumer = KafkaConsumer(bootstrap_servers=connect_str, - group_id=None, - consumer_timeout_ms=30000, - auto_offset_reset='earliest', - value_deserializer=bytes.decode) - - topic = random_string(5) +from kafka import KafkaProducer +from kafka.cluster import ClusterMetadata +from kafka.producer.transaction_manager import TransactionManager, ProducerIdAndEpoch - messages = 100 - futures = [] - for i in range(messages): - futures.append(producer.send(topic, 'msg %d' % i)) - ret = [f.get(timeout=30) for f in futures] - assert len(ret) == messages - producer.close() - - consumer.subscribe([topic]) - msgs = set() - for i in range(messages): - try: - msgs.add(next(consumer).value) - except 
StopIteration: - break - - assert msgs == set(['msg %d' % (i,) for i in range(messages)]) - consumer.close() - -@pytest.mark.skipif(platform.python_implementation() != 'CPython', - reason='Test relies on CPython-specific gc policies') -def test_kafka_producer_gc_cleanup(): - gc.collect() +def test_kafka_producer_thread_close(): threads = threading.active_count() - producer = KafkaProducer(api_version='0.9') # set api_version explicitly to avoid auto-detection + producer = KafkaProducer(api_version=(2, 1)) # set api_version explicitly to avoid auto-detection assert threading.active_count() == threads + 1 - del(producer) - gc.collect() + producer.close() assert threading.active_count() == threads -@pytest.mark.skipif(not env_kafka_version(), reason="No KAFKA_VERSION set") -@pytest.mark.parametrize("compression", [None, 'gzip', 'snappy', 'lz4', 'zstd']) -def test_kafka_producer_proper_record_metadata(kafka_broker, compression): - if compression == 'zstd' and env_kafka_version() < (2, 1, 0): - pytest.skip('zstd requires 2.1.0 or more') - connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)]) - producer = KafkaProducer(bootstrap_servers=connect_str, - retries=5, - max_block_ms=30000, - compression_type=compression) - magic = producer._max_usable_produce_magic() - - # record headers are supported in 0.11.0 - if env_kafka_version() < (0, 11, 0): - headers = None - else: - headers = [("Header Key", b"Header Value")] - - topic = random_string(5) - future = producer.send( - topic, - value=b"Simple value", key=b"Simple key", headers=headers, timestamp_ms=9999999, - partition=0) - record = future.get(timeout=5) - assert record is not None - assert record.topic == topic - assert record.partition == 0 - assert record.topic_partition == TopicPartition(topic, 0) - assert record.offset == 0 - if magic >= 1: - assert record.timestamp == 9999999 - else: - assert record.timestamp == -1 # NO_TIMESTAMP - - if magic >= 2: - assert record.checksum is None - elif magic == 1: - assert record.checksum == 1370034956 - else: - assert record.checksum == 3296137851 - - assert record.serialized_key_size == 10 - assert record.serialized_value_size == 12 - if headers: - assert record.serialized_header_size == 22 - - if magic == 0: - pytest.skip('generated timestamp case is skipped for broker 0.9 and below') - send_time = time.time() * 1000 - future = producer.send( - topic, - value=b"Simple value", key=b"Simple key", timestamp_ms=None, - partition=0) - record = future.get(timeout=5) - assert abs(record.timestamp - send_time) <= 1000 # Allow 1s deviation +def test_idempotent_producer_reset_producer_id(): + transaction_manager = TransactionManager( + transactional_id=None, + transaction_timeout_ms=1000, + retry_backoff_ms=100, + api_version=(0, 11), + metadata=ClusterMetadata(), + ) + + test_producer_id_and_epoch = ProducerIdAndEpoch(123, 456) + transaction_manager.set_producer_id_and_epoch(test_producer_id_and_epoch) + assert transaction_manager.producer_id_and_epoch == test_producer_id_and_epoch + transaction_manager.reset_producer_id() + assert transaction_manager.producer_id_and_epoch == ProducerIdAndEpoch(-1, -1) diff --git a/test/test_protocol.py b/test/test_protocol.py index e295174d4..d0cc7ed0a 100644 --- a/test/test_protocol.py +++ b/test/test_protocol.py @@ -2,14 +2,12 @@ import io import struct -import pytest - from kafka.protocol.api import RequestHeader -from kafka.protocol.commit import GroupCoordinatorRequest from kafka.protocol.fetch import FetchRequest, FetchResponse +from 
kafka.protocol.find_coordinator import FindCoordinatorRequest from kafka.protocol.message import Message, MessageSet, PartialMessage from kafka.protocol.metadata import MetadataRequest -from kafka.protocol.types import Int16, Int32, Int64, String +from kafka.protocol.types import Int16, Int32, Int64, String, UnsignedVarInt32, CompactString, CompactArray, CompactBytes def test_create_message(): @@ -168,7 +166,7 @@ def test_encode_message_header(): b'client3', # ClientId ]) - req = GroupCoordinatorRequest[0]('foo') + req = FindCoordinatorRequest[0]('foo') header = RequestHeader(req, correlation_id=4, client_id='client3') assert header.encode() == expect @@ -273,7 +271,7 @@ def test_decode_fetch_response_partial(): def test_struct_unrecognized_kwargs(): try: - mr = MetadataRequest[0](topicz='foo') + _mr = MetadataRequest[0](topicz='foo') assert False, 'Structs should not allow unrecognized kwargs' except ValueError: pass @@ -282,3 +280,55 @@ def test_struct_unrecognized_kwargs(): def test_struct_missing_kwargs(): fr = FetchRequest[0](max_wait_time=100) assert fr.min_bytes is None + + +def test_unsigned_varint_serde(): + pairs = { + 0: [0], + -1: [0xff, 0xff, 0xff, 0xff, 0x0f], + 1: [1], + 63: [0x3f], + -64: [0xc0, 0xff, 0xff, 0xff, 0x0f], + 64: [0x40], + 8191: [0xff, 0x3f], + -8192: [0x80, 0xc0, 0xff, 0xff, 0x0f], + 8192: [0x80, 0x40], + -8193: [0xff, 0xbf, 0xff, 0xff, 0x0f], + 1048575: [0xff, 0xff, 0x3f], + + } + for value, expected_encoded in pairs.items(): + value &= 0xffffffff + encoded = UnsignedVarInt32.encode(value) + assert encoded == b''.join(struct.pack('>B', x) for x in expected_encoded) + assert value == UnsignedVarInt32.decode(io.BytesIO(encoded)) + + +def test_compact_data_structs(): + cs = CompactString() + encoded = cs.encode(None) + assert encoded == struct.pack('B', 0) + decoded = cs.decode(io.BytesIO(encoded)) + assert decoded is None + assert b'\x01' == cs.encode('') + assert '' == cs.decode(io.BytesIO(b'\x01')) + encoded = cs.encode("foobarbaz") + assert cs.decode(io.BytesIO(encoded)) == "foobarbaz" + + arr = CompactArray(CompactString()) + assert arr.encode(None) == b'\x00' + assert arr.decode(io.BytesIO(b'\x00')) is None + enc = arr.encode([]) + assert enc == b'\x01' + assert [] == arr.decode(io.BytesIO(enc)) + encoded = arr.encode(["foo", "bar", "baz", "quux"]) + assert arr.decode(io.BytesIO(encoded)) == ["foo", "bar", "baz", "quux"] + + enc = CompactBytes.encode(None) + assert enc == b'\x00' + assert CompactBytes.decode(io.BytesIO(b'\x00')) is None + enc = CompactBytes.encode(b'') + assert enc == b'\x01' + assert CompactBytes.decode(io.BytesIO(b'\x01')) == b'' + enc = CompactBytes.encode(b'foo') + assert CompactBytes.decode(io.BytesIO(enc)) == b'foo' diff --git a/test/test_record_accumulator.py b/test/test_record_accumulator.py new file mode 100644 index 000000000..5c7134e5c --- /dev/null +++ b/test/test_record_accumulator.py @@ -0,0 +1,266 @@ +# pylint: skip-file +from __future__ import absolute_import, division + +import pytest + +from kafka.cluster import ClusterMetadata +from kafka.errors import IllegalStateError, KafkaError +from kafka.producer.future import FutureRecordMetadata, RecordMetadata +from kafka.producer.record_accumulator import RecordAccumulator, ProducerBatch +from kafka.record.default_records import DefaultRecordBatchBuilder +from kafka.record.memory_records import MemoryRecordsBuilder +from kafka.structs import TopicPartition + + +@pytest.fixture +def tp(): + return TopicPartition('foo', 0) + +@pytest.fixture +def cluster(tp, mocker): + metadata = 
ClusterMetadata() + mocker.patch.object(metadata, 'leader_for_partition', return_value=0) + mocker.patch.object(metadata, 'partitions_for_broker', return_value=[tp]) + return metadata + +def test_producer_batch_producer_id(): + tp = TopicPartition('foo', 0) + records = MemoryRecordsBuilder( + magic=2, compression_type=0, batch_size=100000) + batch = ProducerBatch(tp, records) + assert batch.producer_id == -1 + batch.records.set_producer_state(123, 456, 789, False) + assert batch.producer_id == 123 + records.close() + assert batch.producer_id == 123 + +@pytest.mark.parametrize("magic", [0, 1, 2]) +def test_producer_batch_try_append(magic): + tp = TopicPartition('foo', 0) + records = MemoryRecordsBuilder( + magic=magic, compression_type=0, batch_size=100000) + batch = ProducerBatch(tp, records) + assert batch.record_count == 0 + future = batch.try_append(0, b'key', b'value', []) + assert isinstance(future, FutureRecordMetadata) + assert not future.is_done + batch.done(base_offset=123, timestamp_ms=456) + assert future.is_done + # record-level checksum only provided in v0/v1 formats; payload includes magic-byte + if magic == 0: + checksum = 592888119 + elif magic == 1: + checksum = 213653215 + else: + checksum = None + + expected_metadata = RecordMetadata( + topic=tp[0], partition=tp[1], topic_partition=tp, + offset=123, timestamp=456, checksum=checksum, + serialized_key_size=3, serialized_value_size=5, serialized_header_size=-1) + assert future.value == expected_metadata + +def test_producer_batch_retry(): + tp = TopicPartition('foo', 0) + records = MemoryRecordsBuilder( + magic=2, compression_type=0, batch_size=100000) + batch = ProducerBatch(tp, records) + assert not batch.in_retry() + batch.retry() + assert batch.in_retry() + +def test_batch_abort(): + tp = TopicPartition('foo', 0) + records = MemoryRecordsBuilder( + magic=2, compression_type=0, batch_size=100000) + batch = ProducerBatch(tp, records) + future = batch.try_append(123, None, b'msg', []) + + batch.abort(KafkaError()) + assert future.is_done + + # subsequent completion should be ignored + batch.done(500, 2342342341) + batch.done(exception=KafkaError()) + + assert future.is_done + with pytest.raises(KafkaError): + future.get() + +def test_batch_cannot_abort_twice(): + tp = TopicPartition('foo', 0) + records = MemoryRecordsBuilder( + magic=2, compression_type=0, batch_size=100000) + batch = ProducerBatch(tp, records) + future = batch.try_append(123, None, b'msg', []) + + batch.abort(KafkaError()) + + with pytest.raises(IllegalStateError): + batch.abort(KafkaError()) + + assert future.is_done + with pytest.raises(KafkaError): + future.get() + +def test_batch_cannot_complete_twice(): + tp = TopicPartition('foo', 0) + records = MemoryRecordsBuilder( + magic=2, compression_type=0, batch_size=100000) + batch = ProducerBatch(tp, records) + future = batch.try_append(123, None, b'msg', []) + + batch.done(500, 10, None) + + with pytest.raises(IllegalStateError): + batch.done(1000, 20, None) + + record_metadata = future.get() + + assert record_metadata.offset == 500 + assert record_metadata.timestamp == 10 + +def test_linger(tp, cluster): + now = 0 + accum = RecordAccumulator(linger_ms=10) + accum.append(tp, 0, b'key', b'value', [], now=now) + ready, next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert len(ready) == 0, 'No partitions should be ready' + assert next_ready_check == .01 # linger_ms in secs + now += .01 + ready, _next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert 
ready == set([0]), "Our partitions leader should be ready" + batches = accum.drain(cluster, ready, 0, 2147483647)[0] + assert len(batches) == 1 + batch = batches[0] + assert batch.records.is_full() + + parsed = list(batch.records.records()) + assert len(parsed) == 1 + records = list(parsed[0]) + assert len(records) == 1 + assert records[0].key == b'key', 'Keys should match' + assert records[0].value == b'value', 'Values should match' + +def _advance_now_ms(now, ms): + return now + ms / 1000 + 1/10000 # add extra .1 ms to each advance to avoid rounding issues when converting back to seconds + +def _do_expire_batch_single(cluster, tp, delivery_timeout_ms): + now = 0 + linger_ms = 300 + accum = RecordAccumulator(linger_ms=linger_ms, delivery_timeout_ms=delivery_timeout_ms, request_timeout_ms=(delivery_timeout_ms-linger_ms-100)) + + # Make the batches ready due to linger. These batches are not in retry + for mute in [False, True]: + accum.append(tp, 0, b'key', b'value', [], now=now) + ready, next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert len(ready) == 0, 'No partitions should be ready' + assert next_ready_check == linger_ms / 1000 + + now = _advance_now_ms(now, linger_ms) + ready, _next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert ready == set([0]), "Our partitions leader should be ready" + + expired_batches = accum.expired_batches(now=now) + assert len(expired_batches) == 0, "The batch should not expire when just linger has passed" + + if mute: + accum.muted.add(tp) + else: + try: + accum.muted.remove(tp) + except KeyError: + pass + + # Advance the clock to expire the batch. + now = _advance_now_ms(now, delivery_timeout_ms - linger_ms) + expired_batches = accum.expired_batches(now=now) + assert len(expired_batches) == 1, "The batch may expire when the partition is muted" + ready, _next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert len(ready) == 0, "No partitions should be ready." + +def test_expired_batch_single(cluster, tp): + _do_expire_batch_single(cluster, tp, 3200) + +def test_expired_batch_single_max_value(cluster, tp): + _do_expire_batch_single(cluster, tp, 2147483647) + +def _expected_num_appends(batch_size): + size = DefaultRecordBatchBuilder.header_size_in_bytes() + offset_delta = 0 + while True: + record_size = DefaultRecordBatchBuilder.size_in_bytes(offset_delta, 0, b'key', b'value', []) + if size + record_size > batch_size: + return offset_delta + offset_delta += 1 + size += record_size + +def test_expired_batches(cluster, tp): + now = 0 + retry_backoff_ms = 100 + linger_ms = 30 + request_timeout_ms = 60 + delivery_timeout_ms = 3200 + batch_size = 1024 + accum = RecordAccumulator(linger_ms=linger_ms, delivery_timeout_ms=delivery_timeout_ms, request_timeout_ms=request_timeout_ms, retry_backoff_ms=retry_backoff_ms, batch_size=batch_size) + appends = _expected_num_appends(batch_size) + + # Test batches not in retry + for i in range(appends): + accum.append(tp, 0, b'key', b'value', [], now=now) + ready, next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert len(ready) == 0, 'No partitions should be ready' + assert next_ready_check == linger_ms / 1000 + + # Make the batches ready due to batch full + accum.append(tp, 0, b'key', b'value', [], now=now) + ready, _next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert ready == set([0]), "Our partitions leader should be ready" + + # Advance the clock to expire the batch. 
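+    # Both the full batch and the partial follow-up batch have now exceeded delivery_timeout_ms, so they expire even though the partition is muted.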
+ now = _advance_now_ms(now, delivery_timeout_ms + 1) + accum.muted.add(tp) + expired_batches = accum.expired_batches(now=now) + assert len(expired_batches) == 2, "The batches will be expired no matter if the partition is muted or not" + + accum.muted.remove(tp) + expired_batches = accum.expired_batches(now=now) + assert len(expired_batches) == 0, "All batches should have been expired earlier" + ready, _next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert len(ready) == 0, "No partitions should be ready." + + # Test batches in retry. + # Create a retried batch + accum.append(tp, 0, b'key', b'value', [], now=now) + now = _advance_now_ms(now, linger_ms) + ready, _next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert ready == set([0]), "Our partitions leader should be ready" + + drained = accum.drain(cluster, ready, 2147483647, now=now) + assert len(drained[0]) == 1, "There should be only one batch." + now = _advance_now_ms(now, 1000) + accum.reenqueue(drained[0][0], now=now) + + # test expiration. + now = _advance_now_ms(now, request_timeout_ms + retry_backoff_ms) + expired_batches = accum.expired_batches(now=now) + assert len(expired_batches) == 0, "The batch should not be expired." + now = _advance_now_ms(now, 1) + + accum.muted.add(tp) + expired_batches = accum.expired_batches(now=now) + assert len(expired_batches) == 0, "The batch should not be expired when the partition is muted" + + accum.muted.remove(tp) + expired_batches = accum.expired_batches(now=now) + assert len(expired_batches) == 0, "The batch should not be expired when the partition is unmuted" + + now = _advance_now_ms(now, linger_ms) + ready, _next_ready_check, _unknown_leaders_exist = accum.ready(cluster, now=now) + assert ready == set([0]), "Our partitions leader should be ready" + + # Advance the clock to expire the batch. 
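+    # The re-enqueued batch has now exceeded delivery_timeout_ms since its first append, so it expires even while the partition is muted.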
+ now = _advance_now_ms(now, delivery_timeout_ms + 1) + accum.muted.add(tp) + expired_batches = accum.expired_batches(now=now) + assert len(expired_batches) == 1, "The batch should not be expired when the partition is muted" diff --git a/test/test_sender.py b/test/test_sender.py index 2a68defcf..0731454df 100644 --- a/test/test_sender.py +++ b/test/test_sender.py @@ -1,53 +1,242 @@ # pylint: skip-file from __future__ import absolute_import -import pytest +import collections import io +import time + +import pytest +try: + from unittest.mock import call +except ImportError: + from mock import call + +from kafka.vendor import six from kafka.client_async import KafkaClient from kafka.cluster import ClusterMetadata -from kafka.metrics import Metrics +import kafka.errors as Errors +from kafka.protocol.broker_api_versions import BROKER_API_VERSIONS +from kafka.producer.kafka import KafkaProducer from kafka.protocol.produce import ProduceRequest from kafka.producer.record_accumulator import RecordAccumulator, ProducerBatch from kafka.producer.sender import Sender +from kafka.producer.transaction_manager import TransactionManager from kafka.record.memory_records import MemoryRecordsBuilder from kafka.structs import TopicPartition -@pytest.fixture -def client(mocker): - _cli = mocker.Mock(spec=KafkaClient(bootstrap_servers=(), api_version=(0, 9))) - _cli.cluster = mocker.Mock(spec=ClusterMetadata()) - return _cli - - @pytest.fixture def accumulator(): return RecordAccumulator() @pytest.fixture -def metrics(): - return Metrics() +def sender(client, accumulator): + return Sender(client, client.cluster, accumulator) + + +def producer_batch(topic='foo', partition=0, magic=2): + tp = TopicPartition(topic, partition) + records = MemoryRecordsBuilder( + magic=magic, compression_type=0, batch_size=100000) + batch = ProducerBatch(tp, records) + batch.try_append(0, None, b'msg', []) + batch.records.close() + return batch @pytest.fixture -def sender(client, accumulator, metrics): - return Sender(client, client.cluster, accumulator, metrics) +def transaction_manager(): + return TransactionManager( + transactional_id=None, + transaction_timeout_ms=60000, + retry_backoff_ms=100, + api_version=(2, 1), + metadata=ClusterMetadata()) @pytest.mark.parametrize(("api_version", "produce_version"), [ - ((0, 10), 2), + ((2, 1), 7), + ((0, 10, 0), 2), ((0, 9), 1), - ((0, 8), 0) + ((0, 8, 0), 0) ]) -def test_produce_request(sender, mocker, api_version, produce_version): - sender.config['api_version'] = api_version - tp = TopicPartition('foo', 0) - buffer = io.BytesIO() - records = MemoryRecordsBuilder( - magic=1, compression_type=0, batch_size=100000) - batch = ProducerBatch(tp, records, buffer) - records.close() +def test_produce_request(sender, api_version, produce_version): + sender._client._api_versions = BROKER_API_VERSIONS[api_version] + magic = KafkaProducer.max_usable_produce_magic(api_version) + batch = producer_batch(magic=magic) produce_request = sender._produce_request(0, 0, 0, [batch]) assert isinstance(produce_request, ProduceRequest[produce_version]) + + +@pytest.mark.parametrize(("api_version", "produce_version"), [ + ((2, 1), 7), +]) +def test_create_produce_requests(sender, api_version, produce_version): + sender._client._api_versions = BROKER_API_VERSIONS[api_version] + tp = TopicPartition('foo', 0) + magic = KafkaProducer.max_usable_produce_magic(api_version) + batches_by_node = collections.defaultdict(list) + for node in range(3): + for _ in range(5): + 
batches_by_node[node].append(producer_batch(magic=magic)) + produce_requests_by_node = sender._create_produce_requests(batches_by_node) + assert len(produce_requests_by_node) == 3 + for node in range(3): + assert isinstance(produce_requests_by_node[node], ProduceRequest[produce_version]) + + +def test_complete_batch_success(sender): + batch = producer_batch() + assert not batch.produce_future.is_done + + # No error, base_offset 0 + sender._complete_batch(batch, None, 0, timestamp_ms=123) + assert batch.is_done + assert batch.produce_future.is_done + assert batch.produce_future.succeeded() + assert batch.produce_future.value == (0, 123) + + +def test_complete_batch_transaction(sender, transaction_manager): + sender._transaction_manager = transaction_manager + batch = producer_batch() + assert sender._transaction_manager.sequence_number(batch.topic_partition) == 0 + assert sender._transaction_manager.producer_id_and_epoch.producer_id == batch.producer_id + + # No error, base_offset 0 + sender._complete_batch(batch, None, 0) + assert batch.is_done + assert sender._transaction_manager.sequence_number(batch.topic_partition) == batch.record_count + + +@pytest.mark.parametrize(("error", "refresh_metadata"), [ + (Errors.KafkaConnectionError, True), + (Errors.CorruptRecordError, False), + (Errors.UnknownTopicOrPartitionError, True), + (Errors.NotLeaderForPartitionError, True), + (Errors.MessageSizeTooLargeError, False), + (Errors.InvalidTopicError, False), + (Errors.RecordListTooLargeError, False), + (Errors.NotEnoughReplicasError, False), + (Errors.NotEnoughReplicasAfterAppendError, False), + (Errors.InvalidRequiredAcksError, False), + (Errors.TopicAuthorizationFailedError, False), + (Errors.UnsupportedForMessageFormatError, False), + (Errors.InvalidProducerEpochError, False), + (Errors.ClusterAuthorizationFailedError, False), + (Errors.TransactionalIdAuthorizationFailedError, False), +]) +def test_complete_batch_error(sender, error, refresh_metadata): + sender._client.cluster._last_successful_refresh_ms = (time.time() - 10) * 1000 + sender._client.cluster._need_update = False + sender.config['retries'] = 0 + assert sender._client.cluster.ttl() > 0 + batch = producer_batch() + sender._complete_batch(batch, error, -1) + if refresh_metadata: + assert sender._client.cluster.ttl() == 0 + else: + assert sender._client.cluster.ttl() > 0 + assert batch.is_done + assert batch.produce_future.failed() + assert isinstance(batch.produce_future.exception, error) + + +@pytest.mark.parametrize(("error", "retry"), [ + (Errors.KafkaConnectionError, True), + (Errors.CorruptRecordError, False), + (Errors.UnknownTopicOrPartitionError, True), + (Errors.NotLeaderForPartitionError, True), + (Errors.MessageSizeTooLargeError, False), + (Errors.InvalidTopicError, False), + (Errors.RecordListTooLargeError, False), + (Errors.NotEnoughReplicasError, True), + (Errors.NotEnoughReplicasAfterAppendError, True), + (Errors.InvalidRequiredAcksError, False), + (Errors.TopicAuthorizationFailedError, False), + (Errors.UnsupportedForMessageFormatError, False), + (Errors.InvalidProducerEpochError, False), + (Errors.ClusterAuthorizationFailedError, False), + (Errors.TransactionalIdAuthorizationFailedError, False), +]) +def test_complete_batch_retry(sender, accumulator, mocker, error, retry): + sender.config['retries'] = 1 + mocker.spy(sender, '_fail_batch') + mocker.patch.object(accumulator, 'reenqueue') + batch = producer_batch() + sender._complete_batch(batch, error, -1) + if retry: + assert not batch.is_done + 
accumulator.reenqueue.assert_called_with(batch) + batch.attempts += 1 # normally handled by accumulator.reenqueue, but it's mocked + sender._complete_batch(batch, error, -1) + assert batch.is_done + assert isinstance(batch.produce_future.exception, error) + else: + assert batch.is_done + assert isinstance(batch.produce_future.exception, error) + + +def test_complete_batch_producer_id_changed_no_retry(sender, accumulator, transaction_manager, mocker): + sender._transaction_manager = transaction_manager + sender.config['retries'] = 1 + mocker.spy(sender, '_fail_batch') + mocker.patch.object(accumulator, 'reenqueue') + error = Errors.NotLeaderForPartitionError + batch = producer_batch() + sender._complete_batch(batch, error, -1) + assert not batch.is_done + accumulator.reenqueue.assert_called_with(batch) + batch.records._producer_id = 123 # simulate different producer_id + assert batch.producer_id != sender._transaction_manager.producer_id_and_epoch.producer_id + sender._complete_batch(batch, error, -1) + assert batch.is_done + assert isinstance(batch.produce_future.exception, error) + + +def test_fail_batch(sender, accumulator, transaction_manager, mocker): + sender._transaction_manager = transaction_manager + batch = producer_batch() + mocker.patch.object(batch, 'done') + assert sender._transaction_manager.producer_id_and_epoch.producer_id == batch.producer_id + error = Exception('error') + sender._fail_batch(batch, base_offset=0, timestamp_ms=None, exception=error) + batch.done.assert_called_with(base_offset=0, timestamp_ms=None, exception=error) + + +def test_out_of_order_sequence_number_reset_producer_id(sender, accumulator, transaction_manager, mocker): + sender._transaction_manager = transaction_manager + assert transaction_manager.transactional_id is None # this test is for idempotent producer only + mocker.patch.object(TransactionManager, 'reset_producer_id') + batch = producer_batch() + mocker.patch.object(batch, 'done') + assert sender._transaction_manager.producer_id_and_epoch.producer_id == batch.producer_id + error = Errors.OutOfOrderSequenceNumberError() + sender._fail_batch(batch, base_offset=0, timestamp_ms=None, exception=error) + sender._transaction_manager.reset_producer_id.assert_called_once() + batch.done.assert_called_with(base_offset=0, timestamp_ms=None, exception=error) + + +def test_handle_produce_response(): + pass + + +def test_failed_produce(sender, mocker): + mocker.patch.object(sender, '_complete_batch') + mock_batches = ['foo', 'bar', 'fizzbuzz'] + sender._failed_produce(mock_batches, 0, 'error') + sender._complete_batch.assert_has_calls([ + call('foo', 'error', -1), + call('bar', 'error', -1), + call('fizzbuzz', 'error', -1), + ]) + + +def test_maybe_wait_for_producer_id(): + pass + + +def test_run_once(): + pass diff --git a/test/test_subscription_state.py b/test/test_subscription_state.py index 9718f6af4..773606525 100644 --- a/test/test_subscription_state.py +++ b/test/test_subscription_state.py @@ -1,25 +1,57 @@ -# pylint: skip-file from __future__ import absolute_import import pytest -from kafka.consumer.subscription_state import SubscriptionState - -@pytest.mark.parametrize(('topic_name', 'expectation'), [ - (0, pytest.raises(TypeError)), - (None, pytest.raises(TypeError)), - ('', pytest.raises(ValueError)), - ('.', pytest.raises(ValueError)), - ('..', pytest.raises(ValueError)), - ('a' * 250, pytest.raises(ValueError)), - ('abc/123', pytest.raises(ValueError)), - ('/abc/123', pytest.raises(ValueError)), - ('/abc123', pytest.raises(ValueError)), - 
('name with space', pytest.raises(ValueError)), - ('name*with*stars', pytest.raises(ValueError)), - ('name+with+plus', pytest.raises(ValueError)), -]) -def test_topic_name_validation(topic_name, expectation): - state = SubscriptionState() - with expectation: - state._ensure_valid_topic_name(topic_name) +from kafka import TopicPartition +from kafka.consumer.subscription_state import SubscriptionState, TopicPartitionState +from kafka.vendor import six + + +def test_type_error(): + s = SubscriptionState() + with pytest.raises(TypeError): + s.subscribe(topics='foo') + + s.subscribe(topics=['foo']) + + +def test_change_subscription(): + s = SubscriptionState() + s.subscribe(topics=['foo']) + assert s.subscription == set(['foo']) + s.change_subscription(['bar']) + assert s.subscription == set(['bar']) + + +def test_group_subscribe(): + s = SubscriptionState() + s.subscribe(topics=['foo']) + assert s.subscription == set(['foo']) + s.group_subscribe(['bar']) + assert s.subscription == set(['foo']) + assert s._group_subscription == set(['foo', 'bar']) + + s.reset_group_subscription() + assert s.subscription == set(['foo']) + assert s._group_subscription == set(['foo']) + + +def test_assign_from_subscribed(): + s = SubscriptionState() + s.subscribe(topics=['foo']) + with pytest.raises(ValueError): + s.assign_from_subscribed([TopicPartition('bar', 0)]) + + s.assign_from_subscribed([TopicPartition('foo', 0), TopicPartition('foo', 1)]) + assert set(s.assignment.keys()) == set([TopicPartition('foo', 0), TopicPartition('foo', 1)]) + assert all([isinstance(tps, TopicPartitionState) for tps in six.itervalues(s.assignment)]) + assert all([not tps.has_valid_position for tps in six.itervalues(s.assignment)]) + + +def test_change_subscription_after_assignment(): + s = SubscriptionState() + s.subscribe(topics=['foo']) + s.assign_from_subscribed([TopicPartition('foo', 0), TopicPartition('foo', 1)]) + # Changing subscription retains existing assignment until next rebalance + s.change_subscription(['bar']) + assert set(s.assignment.keys()) == set([TopicPartition('foo', 0), TopicPartition('foo', 1)]) diff --git a/test/test_util.py b/test/test_util.py new file mode 100644 index 000000000..875b252aa --- /dev/null +++ b/test/test_util.py @@ -0,0 +1,24 @@ +# pylint: skip-file +from __future__ import absolute_import + +import pytest + +from kafka.util import ensure_valid_topic_name + +@pytest.mark.parametrize(('topic_name', 'expectation'), [ + (0, pytest.raises(TypeError)), + (None, pytest.raises(TypeError)), + ('', pytest.raises(ValueError)), + ('.', pytest.raises(ValueError)), + ('..', pytest.raises(ValueError)), + ('a' * 250, pytest.raises(ValueError)), + ('abc/123', pytest.raises(ValueError)), + ('/abc/123', pytest.raises(ValueError)), + ('/abc123', pytest.raises(ValueError)), + ('name with space', pytest.raises(ValueError)), + ('name*with*stars', pytest.raises(ValueError)), + ('name+with+plus', pytest.raises(ValueError)), +]) +def test_topic_name_validation(topic_name, expectation): + with expectation: + ensure_valid_topic_name(topic_name) diff --git a/test/testutil.py b/test/testutil.py index ec4d70bf6..b5dab1c02 100644 --- a/test/testutil.py +++ b/test/testutil.py @@ -6,6 +6,10 @@ import string import time +import pytest + +import kafka.codec + def special_to_underscore(string, _matcher=re.compile(r'[^a-zA-Z0-9_]+')): return _matcher.sub('_', string) @@ -28,12 +32,24 @@ def env_kafka_version(): def assert_message_count(messages, num_messages): """Check that we received the expected number of messages with no 
duplicates.""" # Make sure we got them all - assert len(messages) == num_messages + assert len(messages) == num_messages, 'Expected %d messages, got %d' % (num_messages, len(messages)) # Make sure there are no duplicates # Note: Currently duplicates are identified only using key/value. Other attributes like topic, partition, headers, # timestamp, etc are ignored... this could be changed if necessary, but will be more tolerant of dupes. unique_messages = {(m.key, m.value) for m in messages} - assert len(unique_messages) == num_messages + assert len(unique_messages) == num_messages, 'Expected %d unique messages, got %d' % (num_messages, len(unique_messages)) + + +def maybe_skip_unsupported_compression(compression_type): + codecs = {1: 'gzip', 2: 'snappy', 3: 'lz4', 4: 'zstd'} + if not compression_type: + return + elif compression_type in codecs: + compression_type = codecs[compression_type] + + checker = getattr(kafka.codec, 'has_' + compression_type, None) + if checker and not checker(): + pytest.skip("Compression libraries not installed for %s" % (compression_type,)) class Timer(object): diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 10e9911dc..000000000 --- a/tox.ini +++ /dev/null @@ -1,44 +0,0 @@ -[tox] -envlist = py{26,27,34,35,36,37,38,py}, docs - -[pytest] -testpaths = kafka test -addopts = --durations=10 -log_format = %(created)f %(filename)-23s %(threadName)s %(message)s - -[testenv] -deps = - pytest - pytest-cov - py{27,34,35,36,37,38,py}: pylint - py{27,34,35,36,37,38,py}: pytest-pylint - pytest-mock - mock - python-snappy - zstandard - lz4 - xxhash - crc32c -commands = - py.test {posargs:--pylint --pylint-rcfile=pylint.rc --pylint-error-types=EF --cov=kafka --cov-config=.covrc} -setenv = - CRC32C_SW_MODE = auto - PROJECT_ROOT = {toxinidir} -passenv = KAFKA_VERSION - -[testenv:py26] -# pylint doesn't support python2.6 -commands = py.test {posargs:--cov=kafka --cov-config=.covrc} - -[testenv:pypy] -# pylint is super slow on pypy... -commands = py.test {posargs:--cov=kafka --cov-config=.covrc} - -[testenv:docs] -deps = - sphinx_rtd_theme - sphinx - -commands = - sphinx-apidoc -o docs/apidoc/ kafka/ - sphinx-build -b html docs/ docs/_build diff --git a/travis_java_install.sh b/travis_java_install.sh deleted file mode 100644 index f662ce274..000000000 --- a/travis_java_install.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# borrowed from: https://github.com/mansenfranzen/pywrangler/blob/master/tests/travis_java_install.sh - -# Kafka requires Java 8 in order to work properly. However, TravisCI's Ubuntu -# 16.04 ships with Java 11 and Java can't be set with `jdk` when python is -# selected as language. Ubuntu 14.04 does not work due to missing python 3.7 -# support on TravisCI which does have Java 8 as default. - -# show current JAVA_HOME and java version -echo "Current JAVA_HOME: $JAVA_HOME" -echo "Current java -version:" -which java -java -version - -echo "Updating JAVA_HOME" -# change JAVA_HOME to Java 8 -export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64 - -echo "Updating PATH" -export PATH=${PATH/\/usr\/local\/lib\/jvm\/openjdk11\/bin/$JAVA_HOME\/bin} - -echo "New java -version" -which java -java -version