Skip to content

Commit 843af78

Browse files
Add support for replicated ClickHouse setups (#2895)
* added docker compose for replicated clickhouse
* added cluster name to clickhouse connection info
* added function for table engine names
* added required edits to migrations to allow for replicated clickhouse setups
* added cluster name to database creation and completed docker compose
* all clickhouse tests pass in a replicated setup
* added a test that you can read from replica
* added a test for reading from replicated MV
* added replication to TensorZero migration
* added handling of failure cases for replicated configuration
* added a test to confirm tables are being correctly created
* test the clickhouse GHA replicated
* debugging
* removed debugger
* fixed ordering of clickhouse startup
* added cluster name to concurrent migrations tests
* added most of the handling for replicated trees in migration manager
* fixed all old tests
* disabled replication in concurrent test
* added command for docker to error
* added handling for table start check
* wip
* run migrations manually in workflow
* fixed check that a cluster is configured
* added bigger runner for CH replicated tests
* abort concurrent clickhouse test faster if replication is enabled
* wip on replicated rollbacks
* fixed rollback issues
* decrease concurrency for replicated tests
* see how much memory this needs really
* removed stray print
* consolidate clickhouse cluster in test
* updated version tag and to --run-migrations-only
* only run replicated tests in the merge queue
* Use smaller runner with larger cache volume
* Print docker compose logs
* Print clickhouse logs
* Add missing dash
* Print health check
* Fix health check print
* Mount data directory in cache
* Adjust runner profile
* Revert other changes and add debug logging
* removed stray todo
* simplified clickhouse replication checking
* fixed issue with merge
* Properly scope clickhouse log print
* Print e2e logs when gateway fails to start
* fixed issue with chc
* fixed should apply for 0029
* removed stray rust log
* fixed test for fake row replicated
* fixed success check
* removed replicated tests from PR CI
---------
Co-authored-by: Aaron Hill <aaron@tensorzero.com>
1 parent e0c9de1 commit 843af78

34 files changed

+1930
-385
lines changed

.github/workflows/general.yml

Lines changed: 75 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -350,13 +350,17 @@ jobs:
350350
run: find . -name "Chart.yaml" -exec dirname {} \; | xargs -I {} helm lint {}
351351

352352
clickhouse-tests:
353+
name: "ClickHouse tests (replicated: ${{ matrix.replicated }}) (version: ${{ matrix.clickhouse_version.tag }})"
354+
353355
# We don't run many tests here, so use a normal runner with Github Actions caching
354356
# to avoid unnecessarily using Namespace credits (it should still always finish before
355357
# the main 'validate' job)
356-
runs-on: ubuntu-latest
358+
runs-on: ${{ matrix.replicated && 'namespace-profile-tensorzero-large-cache-volume' || 'ubuntu-latest' }}
357359
continue-on-error: ${{ matrix.clickhouse_version.allow_failure }}
358360
strategy:
359361
matrix:
362+
# Only include replicated: true when running in merge queue
363+
replicated: ${{ github.event_name == 'merge_group' && fromJSON('[true, false]') || fromJSON('[false]') }}
360364
clickhouse_version:
361365
- tag: "24.12-alpine"
362366
prefix: "24.12"
@@ -390,6 +394,10 @@ jobs:
390394
with:
391395
tool: cargo-nextest
392396

397+
- name: Set ClickHouse replicated cluster name
398+
if: matrix.replicated == true
399+
run: echo "TENSORZERO_CLICKHOUSE_CLUSTER_NAME=tensorzero_e2e_tests_cluster" >> $GITHUB_ENV
400+
393401
- name: Install uv
394402
run: curl -LsSf https://astral.sh/uv/0.6.17/install.sh | sh
395403

@@ -400,9 +408,14 @@ jobs:
400408
run: |
401409
echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@localhost:8123/tensorzero_e2e_tests" >> $GITHUB_ENV
402410
403-
- name: Launch ClickHouse container for E2E tests
411+
- name: Launch non-replicated ClickHouse container for E2E tests
412+
if: matrix.replicated == false
404413
run: TENSORZERO_CLICKHOUSE_VERSION=${{ matrix.clickhouse_version.tag }} docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up clickhouse --wait
405414

415+
- name: Launch replicated ClickHouse container for E2E tests
416+
if: matrix.replicated == true
417+
run: TENSORZERO_CLICKHOUSE_VERSION=${{ matrix.clickhouse_version.tag }} docker compose -f tensorzero-core/tests/e2e/docker-compose.replicated.yml up clickhouse-01 clickhouse-02 clickhouse-03 --wait
418+
406419
# Make an HTTP request to ClickHouse and check that the version matches '${{ matrix.clickhouse_version }}'
407420
- name: Check ClickHouse version
408421
run: |
@@ -415,8 +428,27 @@ jobs:
415428
- name: Build the gateway for E2E tests
416429
run: cargo build-e2e
417430

418-
- name: Launch the gateway for E2E tests
431+
- name: Launch the gateway for E2E tests (not configured for replication)
432+
if: matrix.replicated == false
433+
run: |
434+
cargo run-e2e > e2e_logs.txt 2>&1 &
435+
count=0
436+
max_attempts=10
437+
while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
438+
echo "Waiting for gateway to be healthy..."
439+
sleep 1
440+
count=$((count + 1))
441+
if [ $count -ge $max_attempts ]; then
442+
echo "Gateway failed to become healthy after $max_attempts attempts"
443+
exit 1
444+
fi
445+
done
446+
echo "GATEWAY_PID=$!" >> $GITHUB_ENV
447+
448+
- name: Launch the gateway for E2E tests (configured for replication)
449+
if: matrix.replicated == true
419450
run: |
451+
cargo run-e2e --run-migrations-only &&
420452
cargo run-e2e > e2e_logs.txt 2>&1 &
421453
count=0
422454
max_attempts=10
@@ -434,6 +466,46 @@ jobs:
434466
- name: Test (Rust)
435467
run: cargo test-e2e-no-creds
436468

469+
- name: Print docker compose logs (replicated)
470+
if: always() && matrix.replicated == true
471+
run: |
472+
TENSORZERO_CLICKHOUSE_VERSION=${{ matrix.clickhouse_version.tag }} docker compose -f tensorzero-core/tests/e2e/docker-compose.replicated.yml logs -t
473+
474+
- name: Print ClickHouse error logs (replicated)
475+
if: always() && matrix.replicated == true
476+
run: |
477+
echo "Error logs for ClickHouse 01:"
478+
docker exec e2e-clickhouse-01-1 cat /var/log/clickhouse-server/clickhouse-server.err.log
479+
echo "Error logs for ClickHouse 02:"
480+
docker exec e2e-clickhouse-02-1 cat /var/log/clickhouse-server/clickhouse-server.err.log
481+
echo "Error logs for ClickHouse 03:"
482+
docker exec e2e-clickhouse-03-1 cat /var/log/clickhouse-server/clickhouse-server.err.log
483+
484+
- name: Print ClickHouse trace logs (replicated)
485+
if: always() && matrix.replicated == true
486+
run: |
487+
echo "Trace logs for ClickHouse 01:"
488+
docker exec e2e-clickhouse-01-1 cat /var/log/clickhouse-server/clickhouse-server.log
489+
echo "Trace logs for ClickHouse 02:"
490+
docker exec e2e-clickhouse-02-1 cat /var/log/clickhouse-server/clickhouse-server.log
491+
echo "Trace logs for ClickHouse 03:"
492+
docker exec e2e-clickhouse-03-1 cat /var/log/clickhouse-server/clickhouse-server.log
493+
494+
- name: Print container health checks (replicated)
495+
if: always() && matrix.replicated == true
496+
run: |
497+
echo "Health check for ClickHouse 01:"
498+
docker inspect --format "{{json .State.Health }}" $(docker compose -f tensorzero-core/tests/e2e/docker-compose.replicated.yml ps -q clickhouse-01) | jq
499+
echo "Health check for ClickHouse 02:"
500+
docker inspect --format "{{json .State.Health }}" $(docker compose -f tensorzero-core/tests/e2e/docker-compose.replicated.yml ps -q clickhouse-02) | jq
501+
echo "Health check for ClickHouse 03:"
502+
docker inspect --format "{{json .State.Health }}" $(docker compose -f tensorzero-core/tests/e2e/docker-compose.replicated.yml ps -q clickhouse-03) | jq
503+
504+
- name: Print docker compose logs (non-replicated)
505+
if: always() && matrix.replicated == false
506+
run: |
507+
TENSORZERO_CLICKHOUSE_VERSION=${{ matrix.clickhouse_version.tag }} docker compose -f tensorzero-core/tests/e2e/docker-compose.yml logs -t
508+
437509
- name: Print e2e logs
438510
if: always()
439511
run: cat e2e_logs.txt

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,9 @@ target-rust-analyzer/
1414
# VScode settings
1515
.vscode/
1616

17+
# Zed settings
18+
.zed/
19+
1720
.DS_Store
1821
.credentials/
1922

ci/buildkite/test-clickhouse-cloud.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ cargo run-e2e > e2e_logs.txt 2>&1 &
3434
count=$((count + 1))
3535
if [ $count -ge $max_attempts ]; then
3636
echo "Gateway failed to become healthy after $max_attempts attempts"
37+
cat e2e_logs.txt
3738
exit 1
3839
fi
3940
done

gateway/src/main.rs

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ use std::io::ErrorKind;
1111
use std::net::SocketAddr;
1212
use std::path::{Path, PathBuf};
1313
use std::sync::Arc;
14+
use tensorzero_core::clickhouse::migration_manager::manual_run_migrations;
1415
use tensorzero_core::howdy::setup_howdy;
1516
use tokio::signal;
1617
use tower_http::trace::{DefaultOnFailure, TraceLayer};
@@ -49,6 +50,10 @@ struct Args {
4950
#[clap(default_value_t = LogFormat::default())]
5051
log_format: LogFormat,
5152

53+
/// Run database migrations manually then exit.
54+
#[arg(long)]
55+
run_migrations_only: bool,
56+
5257
/// Deprecated: use `--config-file` instead
5358
tensorzero_toml: Option<PathBuf>,
5459
}
@@ -89,6 +94,12 @@ async fn main() {
8994
.expect_pretty("Failed to set up logs");
9095

9196
let git_sha = tensorzero_core::built_info::GIT_COMMIT_HASH_SHORT.unwrap_or("unknown");
97+
if args.run_migrations_only {
98+
manual_run_migrations()
99+
.await
100+
.expect_pretty("Failed to run migrations");
101+
return;
102+
}
92103

93104
tracing::info!("Starting TensorZero Gateway {TENSORZERO_VERSION} (commit: {git_sha})");
94105

0 commit comments

Comments
 (0)