diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..37881bfc8
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,12 @@
+BasedOnStyle: Chromium
+UseTab: Never
+IndentWidth: 4
+TabWidth: 4
+AllowShortIfStatementsOnASingleLine: false
+ColumnLimit: 0
+AccessModifierOffset: -4
+NamespaceIndentation: All
+FixNamespaceComments: false
+AlignAfterOpenBracket: true
+AlignConsecutiveAssignments: true
+IndentCaseLabels: true
\ No newline at end of file
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000..64a58a781
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,6 @@
+build*/
+test/
+
+.cache/
+*.swp
+models/
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 50e6a9227..4112ae9bb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -4,17 +4,36 @@ on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
- description: 'Create new release'
+ description: "Create new release"
required: true
type: boolean
push:
branches:
- master
- ci
- paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+ paths:
+ [
+ ".github/workflows/**",
+ "**/CMakeLists.txt",
+ "**/Makefile",
+ "**/*.h",
+ "**/*.hpp",
+ "**/*.c",
+ "**/*.cpp",
+ "**/*.cu",
+ ]
pull_request:
types: [opened, synchronize, reopened]
- paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+ paths:
+ [
+ "**/CMakeLists.txt",
+ "**/Makefile",
+ "**/*.h",
+ "**/*.hpp",
+ "**/*.c",
+ "**/*.cpp",
+ "**/*.cu",
+ ]
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -30,7 +49,6 @@ jobs:
with:
submodules: recursive
-
- name: Dependencies
id: depends
run: |
@@ -42,14 +60,37 @@ jobs:
run: |
mkdir build
cd build
- cmake ..
+ cmake .. -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON
cmake --build . --config Release
- #- name: Test
- #id: cmake_test
- #run: |
- #cd build
- #ctest --verbose --timeout 900
+ - name: Get commit hash
+ id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: pr-mpt/actions-commit-hash@v2
+
+ - name: Fetch system info
+ id: system-info
+ run: |
+ echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
+ echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
+ echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
+ echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp ggml/LICENSE ./build/bin/ggml.txt
+ cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+ zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
+ path: |
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
macOS-latest-cmake:
runs-on: macos-latest
@@ -63,9 +104,8 @@ jobs:
- name: Dependencies
id: depends
- continue-on-error: true
run: |
- brew update
+ brew install zip
- name: Build
id: cmake_build
@@ -73,30 +113,61 @@ jobs:
sysctl -a
mkdir build
cd build
- cmake ..
+ cmake .. -DGGML_AVX2=ON -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" -DSD_BUILD_SHARED_LIBS=ON
cmake --build . --config Release
- #- name: Test
- #id: cmake_test
- #run: |
- #cd build
- #ctest --verbose --timeout 900
+ - name: Get commit hash
+ id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: pr-mpt/actions-commit-hash@v2
+
+ - name: Fetch system info
+ id: system-info
+ run: |
+ echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
+ echo "OS_NAME=`sw_vers -productName`" >> "$GITHUB_OUTPUT"
+ echo "OS_VERSION=`sw_vers -productVersion`" >> "$GITHUB_OUTPUT"
+ echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp ggml/LICENSE ./build/bin/ggml.txt
+ cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+ zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
+ path: |
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
windows-latest-cmake:
- runs-on: windows-latest
+ runs-on: windows-2025
+
+ env:
+ VULKAN_VERSION: 1.3.261.1
strategy:
matrix:
include:
- - build: 'noavx'
- defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
- - build: 'avx2'
- defines: '-DGGML_AVX2=ON'
- - build: 'avx'
- defines: '-DGGML_AVX2=OFF'
- - build: 'avx512'
- defines: '-DGGML_AVX512=ON'
-
+ - build: "noavx"
+ defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
+ - build: "avx2"
+ defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
+ - build: "avx"
+ defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
+ - build: "avx512"
+ defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
+ - build: "cuda12"
+ defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;80;75"
+ # - build: "rocm5.5"
+ # defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
+          - build: "vulkan"
+ defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps:
- name: Clone
id: checkout
@@ -104,6 +175,37 @@ jobs:
with:
submodules: recursive
+ - name: Install cuda-toolkit
+ id: cuda-toolkit
+ if: ${{ matrix.build == 'cuda12' }}
+ uses: Jimver/cuda-toolkit@v0.2.19
+ with:
+ cuda: "12.6.2"
+ method: "network"
+ sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
+
+ - name: Install rocm-toolkit
+ id: rocm-toolkit
+ if: ${{ matrix.build == 'rocm5.5' }}
+ uses: Cyberhan123/rocm-toolkit@v0.1.0
+ with:
+ rocm: "5.5.0"
+
+ - name: Install Ninja
+ id: install-ninja
+ if: ${{ matrix.build == 'rocm5.5' }}
+ uses: urkle/action-get-ninja@v1
+ with:
+ version: 1.11.1
+ - name: Install Vulkan SDK
+ id: get_vulkan
+ if: ${{ matrix.build == 'vulkan' }}
+ run: |
+ curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+ & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+ Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+ Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
- name: Build
id: cmake_build
run: |
@@ -125,12 +227,6 @@ jobs:
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
- #- name: Test
- #id: cmake_test
- #run: |
- #cd build
- #ctest -C Release --verbose --timeout 900
-
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -140,14 +236,44 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
- Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
- Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
- 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+ $filePath = ".\build\bin\Release\*"
+ if (Test-Path $filePath) {
+ echo "Exists at path $filePath"
+ Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
+ Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
+ } elseif (Test-Path ".\build\bin\stable-diffusion.dll") {
+ $filePath = ".\build\bin\*"
+ echo "Exists at path $filePath"
+ Copy-Item ggml/LICENSE .\build\bin\ggml.txt
+ Copy-Item LICENSE .\build\bin\stable-diffusion.cpp.txt
+ } else {
+ ls .\build\bin
+ throw "Can't find stable-diffusion.dll"
+ }
+ 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip $filePath
+
+ - name: Copy and pack Cuda runtime
+ id: pack_cuda_runtime
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+ $dst='.\build\bin\cudart\'
+ robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+ 7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
+
+ - name: Upload Cuda runtime
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-cudart-sd-bin-win-cu12-x64.zip
+ path: |
+ cudart-sd-bin-win-cu12-x64.zip
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
@@ -164,7 +290,11 @@ jobs:
steps:
- name: Download artifacts
id: download-artifact
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
+ with:
+ path: ./artifact
+ pattern: sd-*
+ merge-multiple: true
- name: Get commit hash
id: commit
diff --git a/.gitignore b/.gitignore
index 59a8a2cab..38fe570df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,13 @@
build*/
test/
-
+.vscode/
.cache/
*.swp
+*.bat
+*.bin
+*.exe
+*.gguf
+output*.png
+models*
+*.log
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index cc639feee..d9d943713 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
[submodule "ggml"]
- path = ggml
- url = https://github.com/leejet/ggml.git
+ path = ggml
+ url = https://github.com/ggerganov/ggml.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f1930775..06de0d58b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,17 +24,113 @@ endif()
# general
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
+option(SD_CUDA "sd: cuda backend" OFF)
+option(SD_HIPBLAS "sd: rocm backend" OFF)
+option(SD_METAL "sd: metal backend" OFF)
+option(SD_VULKAN "sd: vulkan backend" OFF)
+option(SD_OPENCL "sd: opencl backend" OFF)
+option(SD_SYCL "sd: sycl backend" OFF)
+option(SD_MUSA "sd: musa backend" OFF)
+option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
+option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
+if(SD_CUDA)
+ message("-- Use CUDA as backend stable-diffusion")
+ set(GGML_CUDA ON)
+ add_definitions(-DSD_USE_CUDA)
+endif()
-# deps
-add_subdirectory(ggml)
+if(SD_METAL)
+ message("-- Use Metal as backend stable-diffusion")
+ set(GGML_METAL ON)
+ add_definitions(-DSD_USE_METAL)
+endif()
+
+if (SD_VULKAN)
+ message("-- Use Vulkan as backend stable-diffusion")
+ set(GGML_VULKAN ON)
+ add_definitions(-DSD_USE_VULKAN)
+endif ()
+
+if (SD_OPENCL)
+ message("-- Use OpenCL as backend stable-diffusion")
+ set(GGML_OPENCL ON)
+ add_definitions(-DSD_USE_OPENCL)
+endif ()
+
+if (SD_HIPBLAS)
+ message("-- Use HIPBLAS as backend stable-diffusion")
+ set(GGML_HIP ON)
+ add_definitions(-DSD_USE_CUDA)
+ if(SD_FAST_SOFTMAX)
+ set(GGML_CUDA_FAST_SOFTMAX ON)
+ endif()
+endif ()
+
+if(SD_MUSA)
+ message("-- Use MUSA as backend stable-diffusion")
+ set(GGML_MUSA ON)
+ add_definitions(-DSD_USE_CUDA)
+ if(SD_FAST_SOFTMAX)
+ set(GGML_CUDA_FAST_SOFTMAX ON)
+ endif()
+endif()
set(SD_LIB stable-diffusion)
-add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp)
-target_link_libraries(${SD_LIB} PUBLIC ggml)
-target_include_directories(${SD_LIB} PUBLIC .)
+file(GLOB SD_LIB_SOURCES
+ "*.h"
+ "*.cpp"
+ "*.hpp"
+)
+
+# we can only build one shared lib
+if(SD_BUILD_SHARED_LIBS)
+ message("-- Build shared library")
+ message(${SD_LIB_SOURCES})
+ set(BUILD_SHARED_LIBS OFF)
+ add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
+ add_definitions(-DSD_BUILD_SHARED_LIB)
+ target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+else()
+ message("-- Build static library")
+ set(BUILD_SHARED_LIBS OFF)
+ add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
+endif()
+
+if(SD_SYCL)
+ message("-- Use SYCL as backend stable-diffusion")
+ set(GGML_SYCL ON)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
+ add_definitions(-DSD_USE_SYCL)
+ # disable fast-math on host, see:
+ # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
+ if (WIN32)
+ set(SYCL_COMPILE_OPTIONS /fp:precise)
+ else()
+ set(SYCL_COMPILE_OPTIONS -fp-model=precise)
+ endif()
+ message("-- Turn off fast-math for host in SYCL backend")
+ target_compile_options(${SD_LIB} PRIVATE ${SYCL_COMPILE_OPTIONS})
+endif()
+
+set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+# see https://github.com/ggerganov/ggml/pull/682
+add_definitions(-DGGML_MAX_NAME=128)
+
+# deps
+# Only add ggml if it hasn't been added yet
+if (NOT TARGET ggml)
+ add_subdirectory(ggml)
+endif()
+
+add_subdirectory(thirdparty)
+
+target_link_libraries(${SD_LIB} PUBLIC ggml zip)
+target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..bd9a378f0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,17 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && apt-get install -y build-essential git cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /sd.cpp/build/bin/sd /sd
+
+ENTRYPOINT [ "/sd" ]
\ No newline at end of file
diff --git a/Dockerfile.musa b/Dockerfile.musa
new file mode 100644
index 000000000..c7f5f2e83
--- /dev/null
+++ b/Dockerfile.musa
@@ -0,0 +1,22 @@
+ARG MUSA_VERSION=rc3.1.1
+
+FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build
+
+RUN apt-get update && apt-get install -y ccache cmake git
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && \
+ cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
+ -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
+ -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
+ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
+ cmake --build . --config Release
+
+FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
+
+COPY --from=build /sd.cpp/build/bin/sd /sd
+
+ENTRYPOINT [ "/sd" ]
\ No newline at end of file
diff --git a/README.md b/README.md
index 6a673b86f..4720dc29c 100644
--- a/README.md
+++ b/README.md
@@ -1,42 +1,70 @@
-
+
# stable-diffusion.cpp
-Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
+Inference of Stable Diffusion and Flux in pure C/C++
## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- Super lightweight and without external dependencies
+- SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
+  - Note: The VAE in SDXL encounters NaN issues under FP16, but unfortunately, ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify a VAE that has the FP16 NaN issue fixed. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
+- [Flux-dev/Flux-schnell Support](./docs/flux.md)
+- [FLUX.1-Kontext-dev](./docs/kontext.md)
+- [Chroma](./docs/chroma.md)
+- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
+- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- 16-bit, 32-bit float support
-- 4-bit, 5-bit and 8-bit integer quantization support
+- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
- - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image
+  - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image; enabling Flash Attention reduces this to ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
+- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
+- Can load ckpt, safetensors and diffusers models/checkpoints, as well as standalone VAE models
+ - No need to convert to `.ggml` or `.gguf` anymore!
+- Flash Attention for memory usage optimization
- Original `txt2img` and `img2img` mode
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
+- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
+- Latent Consistency Models support (LCM/LCM-LoRA)
+- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
+- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
+- VAE tiling processing to reduce memory usage
+- Control Net support with SD 1.5
- Sampling method
- `Euler A`
+ - `Euler`
+ - `Heun`
+ - `DPM2`
+ - `DPM++ 2M`
+ - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
+ - `DPM++ 2S a`
+ - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
+- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+- Embeds generation parameters into the PNG output as a webui-compatible text string
- Supported platforms
- Linux
- Mac OS
- Windows
+ - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
### TODO
- [ ] More sampling methods
-- [ ] GPU support
- [ ] Make inference faster
- The current implementation of ggml_conv_2d is slow and has high memory usage
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
-- [ ] LoRA support
-- [ ] k-quants support
-- [ ] Cross-platform reproducibility (perhaps ensuring consistency with the original SD)
+- [ ] Implement Inpainting support
## Usage
+For most users, you can download the prebuilt executable from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
+If the prebuilt binaries do not meet your requirements, you can build it from source as described below.
+
### Get the Code
```
@@ -53,38 +81,25 @@ git submodule init
git submodule update
```
-### Convert weights
+### Download weights
- download original weights(.ckpt or .safetensors). For example
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
+  - Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
+ - Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
```shell
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
+ # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
+ # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
```
-- convert weights to ggml model format
-
- ```shell
- cd models
- pip install -r requirements.txt
- python convert.py [path to weights] --out_type [output precision]
- # For example, python convert.py sd-v1-4.ckpt --out_type f16
- ```
-
-### Quantization
-
-You can specify the output model format using the --out_type parameter
-
-- `f16` for 16-bit floating-point
-- `f32` for 32-bit floating-point
-- `q8_0` for 8-bit integer quantization
-- `q5_0` or `q5_1` for 5-bit integer quantization
-- `q4_0` or `q4_1` for 4-bit integer quantization
-
### Build
+#### Build from scratch
+
```shell
mkdir build
cd build
@@ -92,13 +107,174 @@ cmake ..
cmake --build . --config Release
```
-#### Using OpenBLAS
+##### Using OpenBLAS
```
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
+##### Using CUDA
+
+This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can install it via your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or download it from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). It is recommended to have at least 4 GB of VRAM.
+
+```
+cmake .. -DSD_CUDA=ON
+cmake --build . --config Release
+```
+
+##### Using HipBLAS
+This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
+
+Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
+
+```
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
+cmake --build . --config Release
+```
+
+##### Using MUSA
+
+This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
+
+```bash
+cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release
+```
+
+##### Using Metal
+
+Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
+
+```
+cmake .. -DSD_METAL=ON
+cmake --build . --config Release
+```
+
+##### Using Vulkan
+
+Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
+
+```
+cmake .. -DSD_VULKAN=ON
+cmake --build . --config Release
+```
+
+##### Using OpenCL (for Adreno GPU)
+
+Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
+
+To build for Windows on ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
+
+Building for Android:
+
+ Android NDK:
+ Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
+
+Setup OpenCL Dependencies for NDK:
+
+You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
+
+* OpenCL Headers:
+ ```bash
+ # In a temporary working directory
+ git clone https://github.com/KhronosGroup/OpenCL-Headers
+ cd OpenCL-Headers
+ # Replace with your actual NDK installation path
+ # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+ sudo cp -r CL /toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+ cd ..
+ ```
+
+* OpenCL ICD Loader:
+ ```bash
+ # In the same temporary working directory
+ git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+ cd OpenCL-ICD-Loader
+ mkdir build_ndk && cd build_ndk
+
+ # Replace in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
+ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=/build/cmake/android.toolchain.cmake \
+ -DOPENCL_ICD_LOADER_HEADERS_DIR=/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+ -DANDROID_ABI=arm64-v8a \
+ -DANDROID_PLATFORM=24 \
+ -DANDROID_STL=c++_shared
+
+ ninja
+ # Replace
+ # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+ sudo cp libOpenCL.so /toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+ cd ../..
+ ```
+
+Build `stable-diffusion.cpp` for Android with OpenCL:
+
+```bash
+mkdir build-android && cd build-android
+
+# Replace with your actual NDK installation path
+# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
+cmake .. -G Ninja \
+ -DCMAKE_TOOLCHAIN_FILE=/build/cmake/android.toolchain.cmake \
+ -DANDROID_ABI=arm64-v8a \
+ -DANDROID_PLATFORM=android-28 \
+ -DGGML_OPENMP=OFF \
+ -DSD_OPENCL=ON
+
+ninja
+```
+*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
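+
+For instance, a minimal sketch of running on the device (the binary and model paths below are placeholders, not fixed locations):
+
+```sh
+# run the binary with the vendor OpenCL libraries on the library path
+LD_LIBRARY_PATH=/vendor/lib64 ./sd -m ./sd-v1-4.ckpt -p "a lovely cat"
+```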
+
+##### Using SYCL
+
+Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before starting. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
+
+```
+# Export relevant ENV variables
+source /opt/intel/oneapi/setvars.sh
+
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+
+cmake --build . --config Release
+```
+
+Example of text2img by using SYCL backend:
+
+- Download the `stable-diffusion` model weights, refer to [download weights](#download-weights).
+
+- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
+
+
+
+
+
+
+
+##### Using Flash Attention
+
+Enabling flash attention for the diffusion model reduces memory usage; the savings vary by model and resolution, e.g.:
+ - Flux 768x768: ~600 MB
+ - SD2 768x768: ~1400 MB
+
+For most backends it slows things down, but for CUDA it generally speeds things up as well.
+At the moment, it is only supported for some models and some backends (CPU, CUDA/ROCm, Metal).
+
+Run by adding `--diffusion-fa` to the arguments and watch for:
+```
+[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
+```
+and watch the compute buffer shrink in the debug log:
+```
+[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
+```
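+
+For example, a minimal txt2img invocation with flash attention enabled might look like this (the model path is only illustrative):
+
+```sh
+./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat" --diffusion-fa -v
+```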
+
### Run
```
@@ -106,29 +282,81 @@ usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
- -M, --mode [txt2img or img2img] generation mode (default: txt2img)
- -t, --threads N number of threads to use during computation (default: -1).
+ -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)
+ -t, --threads N number of threads to use during computation (default: -1)
If threads <= 0, then threads will be set to the number of CPU physical cores
- -m, --model [MODEL] path to model
+ -m, --model [MODEL] path to full model
+ --diffusion-model path to the standalone diffusion model
+ --clip_l path to the clip-l text encoder
+ --clip_g path to the clip-g text encoder
+ --t5xxl path to the the t5xxl text encoder
+ --vae [VAE] path to vae
+ --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
+ --control-net [CONTROL_PATH] path to control net model
+ --embd-dir [EMBEDDING_PATH] path to embeddings
+ --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings
+ --input-id-images-dir [DIR] path to PHOTOMAKER input id images dir
+ --normalize-input normalize PHOTOMAKER input id images
+ --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
+ --upscale-repeats Run the ESRGAN upscaler this many times (default 1)
+ --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
+ If not specified, the default is the type of the weight file
+ --lora-model-dir [DIR] lora model directory
-i, --init-img [IMAGE] path to the input image, required by img2img
- -o, --output OUTPUT path to write result image to (default: .\output.png)
+ --mask [MASK] path to the mask image, required by img2img with mask
+ --control-image [IMAGE] path to image condition, control net
+ -r, --ref_image [PATH] reference image for Flux Kontext models (can be used multiple times)
+ -o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
+ --guidance SCALE guidance scale for img2img (default: 3.5)
+ --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
+ 0 means disabled, a value of 2.5 is nice for sd3.5 medium
+ --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
+ --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
+ --skip-layer-start START SLG enabling point: (default: 0.01)
+ --skip-layer-end END SLG disabling point: (default: 0.2)
+ SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--strength STRENGTH strength for noising/unnoising (default: 0.75)
+ --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%)
+ --control-strength STRENGTH strength to apply Control Net (default: 0.9)
1.0 corresponds to full destruction of information in init image
-H, --height H image height, in pixel space (default: 512)
-W, --width W image width, in pixel space (default: 512)
- --sample-method SAMPLE_METHOD sample method (default: "eular a")
+ --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
+ sampling method (default: "euler_a")
--steps STEPS number of sample steps (default: 20)
+ --rng {std_default, cuda} RNG (default: cuda)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
+ -b, --batch-count COUNT number of images to generate
+ --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)
+ --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
+ <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
+ --vae-tiling process vae in tiles to reduce memory usage
+ --vae-on-cpu keep vae in cpu (for low vram)
+ --clip-on-cpu keep clip in cpu (for low vram)
+ --diffusion-fa use flash attention in the diffusion model (for low vram)
+ Might lower quality, since it implies converting k and v to f16.
+ This might crash if it is not supported by the backend.
+ --control-net-cpu keep controlnet in cpu (for low vram)
+ --canny apply canny preprocessor (edge detection)
+ --color colors the logging tags according to level
+ --chroma-disable-dit-mask disable dit mask for chroma
+ --chroma-enable-t5-mask enable t5 mask for chroma
+ --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
-v, --verbose print extra info
```
#### txt2img example
-```
-./bin/sd -m ../models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat"
+```sh
+./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
+# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
+# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
+# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
+# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
+# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
```
Using formats of different precisions will yield results of varying quality.
@@ -143,24 +371,62 @@ Using formats of different precisions will yield results of varying quality.
```
-./bin/sd --mode img2img -m ../models/sd-v1-4-ggml-model-f16.bin -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
-## Memory/Disk Requirements
+## More Guides
+
+- [LoRA](./docs/lora.md)
+- [LCM/LCM-LoRA](./docs/lcm.md)
+- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
+- [Using ESRGAN to upscale results](./docs/esrgan.md)
+- [Using TAESD for faster decoding](./docs/taesd.md)
+- [Docker](./docs/docker.md)
+- [Quantization and GGUF](./docs/quantization_and_gguf.md)
+
+## Bindings
+
+These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.
+
+* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
+* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
+* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
+* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
+* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
+* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)
+
+## UIs
+
+These projects use `stable-diffusion.cpp` as a backend for their image generation.
+
+- [Jellybox](https://jellybox.com)
+- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
+- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
+- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
+
+## Contributors
+
+Thank you to all the people who have already contributed to stable-diffusion.cpp!
+
+[](https://github.com/leejet/stable-diffusion.cpp/graphs/contributors)
-| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
-| ---- | ---- |---- |---- |---- |---- |---- |---- |
-| **Disk** | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
-| **Memory**(txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
+## Star History
+[](https://star-history.com/#leejet/stable-diffusion.cpp&Date)
## References
- [ggml](https://github.com/ggerganov/ggml)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
+- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
+- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
+- [ComfyUI](https://github.com/comfyanonymous/ComfyUI)
- [k-diffusion](https://github.com/crowsonkb/k-diffusion)
+- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
+- [generative-models](https://github.com/Stability-AI/generative-models/)
+- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
diff --git a/assets/cat_with_sd_cpp_20184.png b/assets/cat_with_sd_cpp_20184.png
new file mode 100644
index 000000000..04a82bef8
Binary files /dev/null and b/assets/cat_with_sd_cpp_20184.png differ
diff --git a/assets/cat_with_sd_cpp_42.png b/assets/cat_with_sd_cpp_42.png
new file mode 100644
index 000000000..6368d5427
Binary files /dev/null and b/assets/cat_with_sd_cpp_42.png differ
diff --git a/assets/control.png b/assets/control.png
new file mode 100644
index 000000000..3ed95d093
Binary files /dev/null and b/assets/control.png differ
diff --git a/assets/control_2.png b/assets/control_2.png
new file mode 100644
index 000000000..9352dc0f4
Binary files /dev/null and b/assets/control_2.png differ
diff --git a/assets/control_3.png b/assets/control_3.png
new file mode 100644
index 000000000..4d114df0c
Binary files /dev/null and b/assets/control_3.png differ
diff --git a/assets/flux/chroma_v40.png b/assets/flux/chroma_v40.png
new file mode 100644
index 000000000..4217009dc
Binary files /dev/null and b/assets/flux/chroma_v40.png differ
diff --git a/assets/flux/flux1-dev-q2_k.png b/assets/flux/flux1-dev-q2_k.png
new file mode 100644
index 000000000..1aef6f8c6
Binary files /dev/null and b/assets/flux/flux1-dev-q2_k.png differ
diff --git a/assets/flux/flux1-dev-q3_k.png b/assets/flux/flux1-dev-q3_k.png
new file mode 100644
index 000000000..352bfc70c
Binary files /dev/null and b/assets/flux/flux1-dev-q3_k.png differ
diff --git a/assets/flux/flux1-dev-q4_0.png b/assets/flux/flux1-dev-q4_0.png
new file mode 100644
index 000000000..1a5ee2b56
Binary files /dev/null and b/assets/flux/flux1-dev-q4_0.png differ
diff --git a/assets/flux/flux1-dev-q4_k.png b/assets/flux/flux1-dev-q4_k.png
new file mode 100644
index 000000000..9b3ebdd1a
Binary files /dev/null and b/assets/flux/flux1-dev-q4_k.png differ
diff --git a/assets/flux/flux1-dev-q8_0 with lora.png b/assets/flux/flux1-dev-q8_0 with lora.png
new file mode 100644
index 000000000..fb05892aa
Binary files /dev/null and b/assets/flux/flux1-dev-q8_0 with lora.png differ
diff --git a/assets/flux/flux1-dev-q8_0.png b/assets/flux/flux1-dev-q8_0.png
new file mode 100644
index 000000000..3f469d2da
Binary files /dev/null and b/assets/flux/flux1-dev-q8_0.png differ
diff --git a/assets/flux/flux1-schnell-q8_0.png b/assets/flux/flux1-schnell-q8_0.png
new file mode 100644
index 000000000..4ba7dc401
Binary files /dev/null and b/assets/flux/flux1-schnell-q8_0.png differ
diff --git a/assets/flux/kontext1_dev_output.png b/assets/flux/kontext1_dev_output.png
new file mode 100644
index 000000000..4fa5e38dd
Binary files /dev/null and b/assets/flux/kontext1_dev_output.png differ
diff --git a/assets/photomaker_examples/lenna_woman/lenna.jpg b/assets/photomaker_examples/lenna_woman/lenna.jpg
new file mode 100644
index 000000000..ca3ef19b5
Binary files /dev/null and b/assets/photomaker_examples/lenna_woman/lenna.jpg differ
diff --git a/assets/photomaker_examples/newton_man/newton_0.jpg b/assets/photomaker_examples/newton_man/newton_0.jpg
new file mode 100644
index 000000000..71ba285fd
Binary files /dev/null and b/assets/photomaker_examples/newton_man/newton_0.jpg differ
diff --git a/assets/photomaker_examples/newton_man/newton_1.jpg b/assets/photomaker_examples/newton_man/newton_1.jpg
new file mode 100644
index 000000000..a59ed8c72
Binary files /dev/null and b/assets/photomaker_examples/newton_man/newton_1.jpg differ
diff --git a/assets/photomaker_examples/newton_man/newton_2.png b/assets/photomaker_examples/newton_man/newton_2.png
new file mode 100644
index 000000000..d8d4b9482
Binary files /dev/null and b/assets/photomaker_examples/newton_man/newton_2.png differ
diff --git a/assets/photomaker_examples/newton_man/newton_3.jpg b/assets/photomaker_examples/newton_man/newton_3.jpg
new file mode 100644
index 000000000..852867e8d
Binary files /dev/null and b/assets/photomaker_examples/newton_man/newton_3.jpg differ
diff --git a/assets/photomaker_examples/scarletthead_woman/scarlett_0.jpg b/assets/photomaker_examples/scarletthead_woman/scarlett_0.jpg
new file mode 100644
index 000000000..ce9435a6f
Binary files /dev/null and b/assets/photomaker_examples/scarletthead_woman/scarlett_0.jpg differ
diff --git a/assets/photomaker_examples/scarletthead_woman/scarlett_1.jpg b/assets/photomaker_examples/scarletthead_woman/scarlett_1.jpg
new file mode 100644
index 000000000..23269960f
Binary files /dev/null and b/assets/photomaker_examples/scarletthead_woman/scarlett_1.jpg differ
diff --git a/assets/photomaker_examples/scarletthead_woman/scarlett_2.jpg b/assets/photomaker_examples/scarletthead_woman/scarlett_2.jpg
new file mode 100644
index 000000000..93ae735aa
Binary files /dev/null and b/assets/photomaker_examples/scarletthead_woman/scarlett_2.jpg differ
diff --git a/assets/photomaker_examples/scarletthead_woman/scarlett_3.jpg b/assets/photomaker_examples/scarletthead_woman/scarlett_3.jpg
new file mode 100644
index 000000000..ccdca4be3
Binary files /dev/null and b/assets/photomaker_examples/scarletthead_woman/scarlett_3.jpg differ
diff --git a/assets/photomaker_examples/yangmi_woman/yangmi_1.jpg b/assets/photomaker_examples/yangmi_woman/yangmi_1.jpg
new file mode 100644
index 000000000..20fe66c87
Binary files /dev/null and b/assets/photomaker_examples/yangmi_woman/yangmi_1.jpg differ
diff --git a/assets/photomaker_examples/yangmi_woman/yangmi_2.jpeg b/assets/photomaker_examples/yangmi_woman/yangmi_2.jpeg
new file mode 100644
index 000000000..9ed47435b
Binary files /dev/null and b/assets/photomaker_examples/yangmi_woman/yangmi_2.jpeg differ
diff --git a/assets/photomaker_examples/yangmi_woman/yangmi_3.jpg b/assets/photomaker_examples/yangmi_woman/yangmi_3.jpg
new file mode 100644
index 000000000..e840c1c6b
Binary files /dev/null and b/assets/photomaker_examples/yangmi_woman/yangmi_3.jpg differ
diff --git a/assets/photomaker_examples/yangmi_woman/yangmi_4.jpg b/assets/photomaker_examples/yangmi_woman/yangmi_4.jpg
new file mode 100644
index 000000000..f43601123
Binary files /dev/null and b/assets/photomaker_examples/yangmi_woman/yangmi_4.jpg differ
diff --git a/assets/photomaker_examples/yangmi_woman/yangmi_5.jpg b/assets/photomaker_examples/yangmi_woman/yangmi_5.jpg
new file mode 100644
index 000000000..95e771406
Binary files /dev/null and b/assets/photomaker_examples/yangmi_woman/yangmi_5.jpg differ
diff --git a/assets/photomaker_examples/yangmi_woman/yangmi_6.jpg b/assets/photomaker_examples/yangmi_woman/yangmi_6.jpg
new file mode 100644
index 000000000..8c7c4428e
Binary files /dev/null and b/assets/photomaker_examples/yangmi_woman/yangmi_6.jpg differ
diff --git a/assets/sd3.5_large.png b/assets/sd3.5_large.png
new file mode 100644
index 000000000..b76b13225
Binary files /dev/null and b/assets/sd3.5_large.png differ
diff --git a/assets/sycl_sd3_output.png b/assets/sycl_sd3_output.png
new file mode 100644
index 000000000..9a902a37c
Binary files /dev/null and b/assets/sycl_sd3_output.png differ
diff --git a/assets/with_lcm.png b/assets/with_lcm.png
new file mode 100644
index 000000000..70e2c700c
Binary files /dev/null and b/assets/with_lcm.png differ
diff --git a/assets/without_lcm.png b/assets/without_lcm.png
new file mode 100644
index 000000000..145ab9419
Binary files /dev/null and b/assets/without_lcm.png differ
diff --git a/clip.hpp b/clip.hpp
new file mode 100644
index 000000000..d359f61cd
--- /dev/null
+++ b/clip.hpp
@@ -0,0 +1,952 @@
+#ifndef __CLIP_HPP__
+#define __CLIP_HPP__
+
+#include "ggml_extend.hpp"
+#include "model.h"
+
+/*================================================== CLIPTokenizer ===================================================*/
+
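+// Extract "<lora:filename:multiplier>" tags from the prompt, accumulate the
+// multiplier per LoRA file, and return them together with the prompt with the
+// tags stripped out.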
+std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
+    std::regex re("<lora:([^:]+):([^>]+)>");
+    std::smatch matches;
+    std::unordered_map<std::string, float> filename2multiplier;
+
+ while (std::regex_search(text, matches, re)) {
+ std::string filename = matches[1].str();
+ float multiplier = std::stof(matches[2].str());
+
+ text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
+
+ if (multiplier == 0.f) {
+ continue;
+ }
+
+ if (filename2multiplier.find(filename) == filename2multiplier.end()) {
+ filename2multiplier[filename] = multiplier;
+ } else {
+ filename2multiplier[filename] += multiplier;
+ }
+ }
+
+ return std::make_pair(filename2multiplier, text);
+}
+
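+// Build the reversible byte <-> unicode mapping used by the CLIP BPE tokenizer:
+// printable bytes map to themselves, the remaining byte values are shifted to
+// code points starting at 256 so every byte has a printable representation.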
+std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
+    std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
+    std::set<int> byte_set;
+    for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
+        byte_set.insert(b);
+        byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
+    }
+    for (int b = 161; b <= 172; ++b) {
+        byte_set.insert(b);
+        byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
+    }
+    for (int b = 174; b <= 255; ++b) {
+        byte_set.insert(b);
+        byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
+    }
+    int n = 0;
+    for (int b = 0; b < 256; ++b) {
+        if (byte_set.find(b) == byte_set.end()) {
+            byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(n + 256)));
+ ++n;
+ }
+ }
+ // LOG_DEBUG("byte_unicode_pairs %d", byte_unicode_pairs.size());
+ return byte_unicode_pairs;
+}
+
+// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
+
+typedef std::function<bool(std::string&, std::vector<int32_t>&)> on_new_token_cb_t;
+
+class CLIPTokenizer {
+private:
+    std::map<int, std::u32string> byte_encoder;
+    std::map<std::u32string, int> byte_decoder;
+    std::map<std::u32string, int> encoder;
+    std::map<int, std::u32string> decoder;
+    std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;
+ std::regex pat;
+ int encoder_len;
+ int bpe_len;
+
+public:
+ const std::string UNK_TOKEN = "<|endoftext|>";
+ const std::string BOS_TOKEN = "<|startoftext|>";
+ const std::string EOS_TOKEN = "<|endoftext|>";
+ const std::string PAD_TOKEN = "<|endoftext|>";
+
+ const int UNK_TOKEN_ID = 49407;
+ const int BOS_TOKEN_ID = 49406;
+ const int EOS_TOKEN_ID = 49407;
+ const int PAD_TOKEN_ID = 49407;
+
+private:
+ static std::string strip(const std::string& str) {
+ std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
+ std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
+
+ if (start == std::string::npos) {
+ // String contains only whitespace characters
+ return "";
+ }
+
+ return str.substr(start, end - start + 1);
+ }
+
+ static std::string whitespace_clean(std::string text) {
+ text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
+ text = strip(text);
+ return text;
+ }
+
+    static std::set<std::pair<std::u32string, std::u32string>> get_pairs(const std::vector<std::u32string>& subwords) {
+        std::set<std::pair<std::u32string, std::u32string>> pairs;
+        if (subwords.size() == 0) {
+            return pairs;
+        }
+        std::u32string prev_subword = subwords[0];
+        for (int i = 1; i < subwords.size(); i++) {
+            std::u32string subword = subwords[i];
+            std::pair<std::u32string, std::u32string> pair(prev_subword, subword);
+ pairs.insert(pair);
+ prev_subword = subword;
+ }
+ return pairs;
+ }
+
+public:
+ CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
+ : PAD_TOKEN_ID(pad_token_id) {
+ if (merges_utf8_str.size() > 0) {
+ load_from_merges(merges_utf8_str);
+ } else {
+ load_from_merges(ModelLoader::load_merges());
+ }
+ }
+
+ void load_from_merges(const std::string& merges_utf8_str) {
+ auto byte_unicode_pairs = bytes_to_unicode();
+ // printf("byte_unicode_pairs have %lu pairs \n", byte_unicode_pairs.size());
+        byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
+ for (auto& pair : byte_unicode_pairs) {
+ byte_decoder[pair.second] = pair.first;
+ }
+ // for (auto & pair: byte_unicode_pairs) {
+ // std::cout << pair.first << ": " << pair.second << std::endl;
+ // }
+        std::vector<std::u32string> merges;
+ size_t start = 0;
+ size_t pos;
+ std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str);
+ while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) {
+ merges.push_back(merges_utf32_str.substr(start, pos - start));
+ start = pos + 1;
+ }
+ // LOG_DEBUG("merges size %llu", merges.size());
+ GGML_ASSERT(merges.size() == 48895);
+ merges = std::vector(merges.begin() + 1, merges.end());
+        std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
+ for (const auto& merge : merges) {
+ size_t space_pos = merge.find(' ');
+ merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
+ // LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
+ // printf("%s :: %s | %s \n", utf32_to_utf8(merge).c_str(), utf32_to_utf8(merge.substr(0, space_pos)).c_str(),
+ // utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
+ }
+        std::vector<std::u32string> vocab;
+ for (const auto& pair : byte_unicode_pairs) {
+ vocab.push_back(pair.second);
+ }
+ for (const auto& pair : byte_unicode_pairs) {
+            vocab.push_back(pair.second + utf8_to_utf32("</w>"));
+ }
+ for (const auto& merge : merge_pairs) {
+ vocab.push_back(merge.first + merge.second);
+ }
+ vocab.push_back(utf8_to_utf32("<|startoftext|>"));
+ vocab.push_back(utf8_to_utf32("<|endoftext|>"));
+ LOG_DEBUG("vocab size: %llu", vocab.size());
+ int i = 0;
+ for (const auto& token : vocab) {
+ encoder[token] = i;
+ decoder[i] = token;
+ i++;
+ }
+ encoder_len = i;
+
+        auto it = encoder.find(utf8_to_utf32("img</w>"));
+ if (it != encoder.end()) {
+ LOG_DEBUG(" trigger word img already in vocab");
+ } else {
+ LOG_DEBUG(" trigger word img not in vocab yet");
+ }
+
+ int rank = 0;
+ for (const auto& merge : merge_pairs) {
+ bpe_ranks[merge] = rank++;
+ }
+ bpe_len = rank;
+ };
+
+ void add_token(const std::string& text) {
+ std::u32string token = utf8_to_utf32(text);
+ auto it = encoder.find(token);
+ if (it != encoder.end()) {
+ encoder[token] = encoder_len;
+ decoder[encoder_len] = token;
+ encoder_len++;
+ }
+ }
+
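+    // Apply byte-pair-encoding merges to a single token: start from individual
+    // characters (the last one marked with "</w>") and repeatedly merge the
+    // lowest-ranked pair until no known merge remains.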
+ std::u32string bpe(const std::u32string& token) {
+        std::vector<std::u32string> word;
+
+ for (int i = 0; i < token.size() - 1; i++) {
+ word.emplace_back(1, token[i]);
+ }
+        word.push_back(token.substr(token.size() - 1) + utf8_to_utf32("</w>"));
+
+        std::set<std::pair<std::u32string, std::u32string>> pairs = get_pairs(word);
+
+        if (pairs.empty()) {
+            return token + utf8_to_utf32("</w>");
+ }
+
+ while (true) {
+ auto min_pair_iter = std::min_element(pairs.begin(),
+ pairs.end(),
+                                                  [&](const std::pair<std::u32string, std::u32string>& a,
+                                                      const std::pair<std::u32string, std::u32string>& b) {
+ if (bpe_ranks.find(a) == bpe_ranks.end()) {
+ return false;
+ } else if (bpe_ranks.find(b) == bpe_ranks.end()) {
+ return true;
+ }
+ return bpe_ranks.at(a) < bpe_ranks.at(b);
+ });
+
+            const std::pair<std::u32string, std::u32string>& bigram = *min_pair_iter;
+
+ if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
+ break;
+ }
+
+ std::u32string first = bigram.first;
+ std::u32string second = bigram.second;
+            std::vector<std::u32string> new_word;
+ int32_t i = 0;
+
+ while (i < word.size()) {
+ auto it = std::find(word.begin() + i, word.end(), first);
+ if (it == word.end()) {
+ new_word.insert(new_word.end(), word.begin() + i, word.end());
+ break;
+ }
+ new_word.insert(new_word.end(), word.begin() + i, it);
+                i = static_cast<int32_t>(std::distance(word.begin(), it));
+
+                if (word[i] == first && i < static_cast<int>(word.size()) - 1 && word[i + 1] == second) {
+ new_word.push_back(first + second);
+ i += 2;
+ } else {
+ new_word.push_back(word[i]);
+ i += 1;
+ }
+ }
+
+ word = new_word;
+
+ if (word.size() == 1) {
+ break;
+ }
+ pairs = get_pairs(word);
+ }
+
+ std::u32string result;
+ for (int i = 0; i < word.size(); i++) {
+ result += word[i];
+ if (i != word.size() - 1) {
+ result += utf8_to_utf32(" ");
+ }
+ }
+
+ return result;
+ }
+
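+    // Encode text into token ids, prepend BOS and append EOS, truncating to
+    // max_length and padding with PAD_TOKEN_ID if requested.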
+    std::vector<int> tokenize(std::string text,
+ on_new_token_cb_t on_new_token_cb,
+ size_t max_length = 0,
+ bool padding = false) {
+        std::vector<int> tokens = encode(text, on_new_token_cb);
+
+ tokens.insert(tokens.begin(), BOS_TOKEN_ID);
+ if (max_length > 0) {
+ if (tokens.size() > max_length - 1) {
+ tokens.resize(max_length - 1);
+ tokens.push_back(EOS_TOKEN_ID);
+ } else {
+ tokens.push_back(EOS_TOKEN_ID);
+ if (padding) {
+ tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID);
+ }
+ }
+ }
+
+ return tokens;
+ }
+
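+    // Split tokens and their weights into chunks of max_length, wrapping each
+    // chunk with BOS/EOS and padding the tail with PAD_TOKEN_ID (weight 1.0).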
+    void pad_tokens(std::vector<int>& tokens,
+                    std::vector<float>& weights,
+ size_t max_length = 0,
+ bool padding = false) {
+ if (max_length > 0 && padding) {
+ size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2));
+ if (n == 0) {
+ n = 1;
+ }
+ size_t length = max_length * n;
+ LOG_DEBUG("token length: %llu", length);
+            std::vector<int> new_tokens;
+            std::vector<float> new_weights;
+ new_tokens.push_back(BOS_TOKEN_ID);
+ new_weights.push_back(1.0);
+ int token_idx = 0;
+ for (int i = 1; i < length; i++) {
+ if (token_idx >= tokens.size()) {
+ break;
+ }
+ if (i % max_length == 0) {
+ new_tokens.push_back(BOS_TOKEN_ID);
+ new_weights.push_back(1.0);
+ } else if (i % max_length == max_length - 1) {
+ new_tokens.push_back(EOS_TOKEN_ID);
+ new_weights.push_back(1.0);
+ } else {
+ new_tokens.push_back(tokens[token_idx]);
+ new_weights.push_back(weights[token_idx]);
+ token_idx++;
+ }
+ }
+
+ new_tokens.push_back(EOS_TOKEN_ID);
+ new_weights.push_back(1.0);
+ tokens = new_tokens;
+ weights = new_weights;
+
+ if (padding) {
+ tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID);
+ weights.insert(weights.end(), length - weights.size(), 1.0);
+ }
+ }
+ }
+
+ std::string clean_up_tokenization(std::string& text) {
+ std::regex pattern(R"( ,)");
+ // Replace " ," with ","
+ std::string result = std::regex_replace(text, pattern, ",");
+ return result;
+ }
+
+    std::string decode(const std::vector<int>& tokens) {
+ std::string text = "";
+ for (int t : tokens) {
+ if (t == 49406 || t == 49407)
+ continue;
+ std::u32string ts = decoder[t];
+ // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
+ std::string s = utf32_to_utf8(ts);
+ if (s.length() >= 4) {
+                if (ends_with(s, "</w>")) {
+ text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
+ } else {
+ text += s;
+ }
+ } else {
+ text += " " + s;
+ }
+ }
+ // std::vector bytes;
+ // for (auto c : text){
+ // bytes.push_back(byte_decoder[c]);
+ // }
+
+ // std::string s((char *)bytes.data());
+ // std::string s = "";
+ text = clean_up_tokenization(text);
+ return trim(text);
+ }
+
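+    // Core tokenization: clean and lowercase the text, split it with the CLIP
+    // regex, map each byte to its unicode stand-in, run BPE and look the
+    // resulting subwords up in the encoder. on_new_token_cb lets the caller
+    // intercept custom tokens (e.g. embedding trigger words) before regular
+    // encoding.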
+    std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
+ std::string original_text = text;
+        std::vector<int> bpe_tokens;
+ text = whitespace_clean(text);
+ std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
+
+ std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
+ std::regex::icase);
+
+ std::smatch matches;
+ std::string str = text;
+        std::vector<std::string> token_strs;
+ while (std::regex_search(str, matches, pat)) {
+ bool skip = on_new_token_cb(str, bpe_tokens);
+ if (skip) {
+ continue;
+ }
+ for (auto& token : matches) {
+ std::string token_str = token.str();
+ std::u32string utf32_token;
+ for (int i = 0; i < token_str.length(); i++) {
+ unsigned char b = token_str[i];
+ utf32_token += byte_encoder[b];
+ }
+ auto bpe_strs = bpe(utf32_token);
+ size_t start = 0;
+ size_t pos;
+ while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) {
+ auto bpe_str = bpe_strs.substr(start, pos - start);
+ bpe_tokens.push_back(encoder[bpe_str]);
+ token_strs.push_back(utf32_to_utf8(bpe_str));
+
+ start = pos + 1;
+ }
+ auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start);
+ bpe_tokens.push_back(encoder[bpe_str]);
+ token_strs.push_back(utf32_to_utf8(bpe_str));
+ }
+ str = matches.suffix();
+ }
+ std::stringstream ss;
+ ss << "[";
+ for (auto token : token_strs) {
+ ss << "\"" << token << "\", ";
+ }
+ ss << "]";
+ // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
+ // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
+ return bpe_tokens;
+ }
+};
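+
+// Illustrative usage sketch of the tokenizer above (assumes the CLIPTokenizer defined in this
+// header; the no-op callback stands in for the embedding-loading callbacks used by the
+// conditioners in conditioner.hpp):
+//   CLIPTokenizer tokenizer;
+//   auto no_op = [](std::string&, std::vector<int32_t>&) { return false; };
+//   std::vector<int> tokens = tokenizer.encode("a photo of a cat", no_op);
+//   std::vector<float> weights(tokens.size(), 1.0f);
+//   tokenizer.pad_tokens(tokens, weights, 77, true);  // frame into 77-token chunks
+//   std::string round_trip = tokenizer.decode(tokens);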
+
+/*================================================ FrozenCLIPEmbedder ================================================*/
+
+// Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
+
+struct CLIPMLP : public GGMLBlock {
+protected:
+ bool use_gelu;
+
+public:
+ CLIPMLP(int64_t d_model, int64_t intermediate_size) {
+        blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(d_model, intermediate_size));
+        blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, d_model));
+
+ if (d_model == 1024 || d_model == 1280) { // SD 2.x
+ use_gelu = true;
+ } else { // SD 1.x
+ use_gelu = false;
+ }
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ // x: [N, n_token, d_model]
+        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
+        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
+
+ x = fc1->forward(ctx, x);
+ if (use_gelu) {
+ x = ggml_gelu_inplace(ctx, x);
+ } else {
+ x = ggml_gelu_quick_inplace(ctx, x);
+ }
+ x = fc2->forward(ctx, x);
+ return x;
+ }
+};
+
+struct CLIPLayer : public GGMLBlock {
+protected:
+ int64_t d_model; // hidden_size/embed_dim
+ int64_t n_head;
+ int64_t intermediate_size;
+
+public:
+ CLIPLayer(int64_t d_model,
+ int64_t n_head,
+ int64_t intermediate_size)
+ : d_model(d_model),
+ n_head(n_head),
+ intermediate_size(intermediate_size) {
+        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
+
+        blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
+        blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
+
+        blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = true) {
+ // x: [N, n_token, d_model]
+        auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
+        auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
+        auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
+        auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
+
+ x = ggml_add(ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
+ x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
+ return x;
+ }
+};
+
+struct CLIPEncoder : public GGMLBlock {
+protected:
+ int64_t n_layer;
+
+public:
+ CLIPEncoder(int64_t n_layer,
+ int64_t d_model,
+ int64_t n_head,
+ int64_t intermediate_size)
+ : n_layer(n_layer) {
+ for (int i = 0; i < n_layer; i++) {
+ std::string name = "layers." + std::to_string(i);
+            blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size));
+ }
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) {
+ // x: [N, n_token, d_model]
+ int layer_idx = n_layer - 1;
+ // LOG_DEBUG("clip_skip %d", clip_skip);
+ if (clip_skip > 0) {
+ layer_idx = n_layer - clip_skip;
+ }
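+        // e.g. with n_layer = 12 and clip_skip = 2, layer_idx = 10, so the loop below stops
+        // after the penultimate layer and returns its hidden states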
+
+ for (int i = 0; i < n_layer; i++) {
+ // LOG_DEBUG("layer %d", i);
+ if (i == layer_idx + 1) {
+ break;
+ }
+ std::string name = "layers." + std::to_string(i);
+            auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
+ x = layer->forward(ctx, x, mask); // [N, n_token, d_model]
+ // LOG_DEBUG("layer %d", i);
+ }
+ return x;
+ }
+};
+
+class CLIPEmbeddings : public GGMLBlock {
+protected:
+ int64_t embed_dim;
+ int64_t vocab_size;
+ int64_t num_positions;
+
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+ enum ggml_type token_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
+ enum ggml_type position_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
+
+ params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
+ params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
+ }
+
+public:
+ CLIPEmbeddings(int64_t embed_dim,
+ int64_t vocab_size = 49408,
+ int64_t num_positions = 77)
+ : embed_dim(embed_dim),
+ vocab_size(vocab_size),
+ num_positions(num_positions) {
+ }
+
+ struct ggml_tensor* get_token_embed_weight() {
+ return params["token_embedding.weight"];
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* input_ids,
+ struct ggml_tensor* custom_embed_weight) {
+ // input_ids: [N, n_token]
+ auto token_embed_weight = params["token_embedding.weight"];
+ auto position_embed_weight = params["position_embedding.weight"];
+
+ GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
+ input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
+ auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids);
+ token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
+
+ // token_embedding + position_embedding
+ auto x = ggml_add(ctx,
+ token_embedding,
+ position_embed_weight); // [N, n_token, embed_dim]
+ return x;
+ }
+};
+
+class CLIPVisionEmbeddings : public GGMLBlock {
+protected:
+ int64_t embed_dim;
+ int64_t num_channels;
+ int64_t patch_size;
+ int64_t image_size;
+ int64_t num_patches;
+ int64_t num_positions;
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+ enum ggml_type patch_wtype = GGML_TYPE_F16; // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
+ enum ggml_type class_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
+ enum ggml_type position_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
+
+ params["patch_embedding.weight"] = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
+ params["class_embedding"] = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
+ params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
+ }
+
+public:
+ CLIPVisionEmbeddings(int64_t embed_dim,
+ int64_t num_channels = 3,
+ int64_t patch_size = 14,
+ int64_t image_size = 224)
+ : embed_dim(embed_dim),
+ num_channels(num_channels),
+ patch_size(patch_size),
+ image_size(image_size) {
+ num_patches = (image_size / patch_size) * (image_size / patch_size);
+ num_positions = num_patches + 1;
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+ // pixel_values: [N, num_channels, image_size, image_size]
+ // return: [N, num_positions, embed_dim]
+ GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
+
+ auto patch_embed_weight = params["patch_embedding.weight"];
+ auto class_embed_weight = params["class_embedding"];
+ auto position_embed_weight = params["position_embedding.weight"];
+
+ // concat(patch_embedding, class_embedding) + position_embedding
+ struct ggml_tensor* patch_embedding;
+ int64_t N = pixel_values->ne[3];
+        patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size); // [N, embed_dim, image_size // patch_size, image_size // patch_size]
+ patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
+ patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
+ patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
+
+ struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N);
+ class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim]
+ class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
+
+ struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
+ x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
+ x = ggml_add(ctx, x, position_embed_weight);
+ return x; // [N, num_positions, embed_dim]
+ }
+};
+
+// OPENAI_CLIP_VIT_L_14: https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
+// OPEN_CLIP_VIT_H_14: https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/config.json
+// OPEN_CLIP_VIT_BIGG_14: https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/blob/main/config.json (CLIPTextModelWithProjection)
+
+enum CLIPVersion {
+ OPENAI_CLIP_VIT_L_14, // SD 1.x and SDXL
+ OPEN_CLIP_VIT_H_14, // SD 2.x
+ OPEN_CLIP_VIT_BIGG_14, // SDXL
+};
+
+class CLIPTextModel : public GGMLBlock {
+protected:
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+ if (version == OPEN_CLIP_VIT_BIGG_14) {
+ enum ggml_type wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
+ params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
+ }
+ }
+
+public:
+ CLIPVersion version = OPENAI_CLIP_VIT_L_14;
+ // network hparams
+ int32_t vocab_size = 49408;
+ int32_t n_token = 77; // max_position_embeddings
+ int32_t hidden_size = 768;
+ int32_t intermediate_size = 3072;
+ int32_t n_head = 12;
+ int32_t n_layer = 12; // num_hidden_layers
+ int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
+ int32_t clip_skip = -1;
+ bool with_final_ln = true;
+
+ CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
+ bool with_final_ln = true,
+ int clip_skip_value = -1)
+ : version(version), with_final_ln(with_final_ln) {
+ if (version == OPEN_CLIP_VIT_H_14) {
+ hidden_size = 1024;
+ intermediate_size = 4096;
+ n_head = 16;
+ n_layer = 24;
+ } else if (version == OPEN_CLIP_VIT_BIGG_14) { // CLIPTextModelWithProjection
+ hidden_size = 1280;
+ intermediate_size = 5120;
+ n_head = 20;
+ n_layer = 32;
+ }
+ set_clip_skip(clip_skip_value);
+
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
+ }
+
+ void set_clip_skip(int skip) {
+ if (skip <= 0) {
+ skip = -1;
+ }
+ clip_skip = skip;
+ }
+
+ struct ggml_tensor* get_token_embed_weight() {
+        auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
+ return embeddings->get_token_embed_weight();
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* input_ids,
+ struct ggml_tensor* tkn_embeddings,
+ size_t max_token_idx = 0,
+ bool return_pooled = false) {
+ // input_ids: [N, n_token]
+        auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
+        auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
+        auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
+
+ auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
+ x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
+ if (return_pooled || with_final_ln) {
+ x = final_layer_norm->forward(ctx, x);
+ }
+
+ if (return_pooled) {
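+            // "pooled" output = hidden state at the EOS position (max_token_idx), projected by
+            // text_projection for OPEN_CLIP_VIT_BIGG_14 (CLIPTextModelWithProjection)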
+ auto text_projection = params["text_projection"];
+ ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
+ if (text_projection != NULL) {
+ pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+ } else {
+ LOG_DEBUG("Missing text_projection matrix, assuming identity...");
+ }
+ return pooled; // [hidden_size, 1, 1]
+ }
+
+ return x; // [N, n_token, hidden_size]
+ }
+};
+
+class CLIPVisionModel : public GGMLBlock {
+public:
+ // network hparams
+ int32_t num_channels = 3;
+ int32_t patch_size = 14;
+ int32_t image_size = 224;
+ int32_t num_positions = 257; // (image_size / patch_size)^2 + 1
+ int32_t hidden_size = 1024;
+ int32_t intermediate_size = 4096;
+ int32_t n_head = 16;
+ int32_t n_layer = 24;
+
+public:
+ CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
+ if (version == OPEN_CLIP_VIT_H_14) {
+ hidden_size = 1280;
+ intermediate_size = 5120;
+ n_head = 16;
+ n_layer = 32;
+ } else if (version == OPEN_CLIP_VIT_BIGG_14) {
+ hidden_size = 1664;
+ intermediate_size = 8192;
+ n_head = 16;
+ n_layer = 48;
+ }
+
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
+        blocks["pre_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
+        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+        blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
+ // pixel_values: [N, num_channels, image_size, image_size]
+        auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
+        auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
+        auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
+        auto post_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["post_layernorm"]);
+
+ auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
+ x = pre_layernorm->forward(ctx, x);
+ x = encoder->forward(ctx, x, -1, false);
+ // print_ggml_tensor(x, true, "ClipVisionModel x: ");
+ auto last_hidden_state = x;
+ x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
+
+ GGML_ASSERT(x->ne[3] == 1);
+ if (return_pooled) {
+ ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
+ return pooled; // [N, hidden_size]
+ } else {
+ // return x; // [N, n_token, hidden_size]
+ return last_hidden_state; // [N, n_token, hidden_size]
+ }
+ }
+};
+
+class CLIPProjection : public UnaryBlock {
+protected:
+ int64_t in_features;
+ int64_t out_features;
+ bool transpose_weight;
+
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+ enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+ if (transpose_weight) {
+ params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
+ } else {
+ params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
+ }
+ }
+
+public:
+ CLIPProjection(int64_t in_features,
+ int64_t out_features,
+ bool transpose_weight = false)
+ : in_features(in_features),
+ out_features(out_features),
+ transpose_weight(transpose_weight) {}
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* w = params["weight"];
+ if (transpose_weight) {
+ w = ggml_cont(ctx, ggml_transpose(ctx, w));
+ }
+ return ggml_nn_linear(ctx, x, w, NULL);
+ }
+};
+
+class CLIPVisionModelProjection : public GGMLBlock {
+public:
+ int32_t hidden_size = 1024;
+ int32_t projection_dim = 768;
+ int32_t image_size = 224;
+
+public:
+ CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
+ bool transpose_proj_w = false) {
+ if (version == OPEN_CLIP_VIT_H_14) {
+ hidden_size = 1280;
+ projection_dim = 1024;
+ } else if (version == OPEN_CLIP_VIT_BIGG_14) {
+ hidden_size = 1664;
+ }
+
+        blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version));
+        blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+ // pixel_values: [N, num_channels, image_size, image_size]
+ // return: [N, projection_dim]
+        auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
+        auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
+
+ auto x = vision_model->forward(ctx, pixel_values); // [N, hidden_size]
+ x = visual_projection->forward(ctx, x); // [N, projection_dim]
+
+ return x; // [N, projection_dim]
+ }
+};
+
+struct CLIPTextModelRunner : public GGMLRunner {
+ CLIPTextModel model;
+
+ CLIPTextModelRunner(ggml_backend_t backend,
+                        std::map<std::string, enum ggml_type>& tensor_types,
+ const std::string prefix,
+ CLIPVersion version = OPENAI_CLIP_VIT_L_14,
+ bool with_final_ln = true,
+ int clip_skip_value = -1)
+ : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+ model.init(params_ctx, tensor_types, prefix);
+ }
+
+ std::string get_desc() {
+ return "clip";
+ }
+
+ void set_clip_skip(int clip_skip) {
+ model.set_clip_skip(clip_skip);
+ }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+ model.get_param_tensors(tensors, prefix);
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* input_ids,
+ struct ggml_tensor* embeddings,
+ size_t max_token_idx = 0,
+ bool return_pooled = false) {
+ size_t N = input_ids->ne[1];
+ size_t n_token = input_ids->ne[0];
+ if (input_ids->ne[0] > model.n_token) {
+ GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
+ input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
+ }
+
+ return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
+ }
+
+ struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
+ int num_custom_embeddings = 0,
+ void* custom_embeddings_data = NULL,
+ size_t max_token_idx = 0,
+ bool return_pooled = false) {
+ struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+
+ input_ids = to_backend(input_ids);
+
+ struct ggml_tensor* embeddings = NULL;
+
+ if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
+ auto token_embed_weight = model.get_token_embed_weight();
+ auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
+ token_embed_weight->type,
+ model.hidden_size,
+ num_custom_embeddings);
+ set_backend_tensor_data(custom_embeddings, custom_embeddings_data);
+
+ // concatenate custom embeddings
+ embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
+ }
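+        // Custom (textual inversion) tokens were assigned ids >= vocab_size by the tokenizer
+        // callback, so ggml_get_rows on the concatenated table resolves them to the rows appended here.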
+
+ struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
+
+ ggml_build_forward_expand(gf, hidden_states);
+
+ return gf;
+ }
+
+ void compute(const int n_threads,
+ struct ggml_tensor* input_ids,
+ int num_custom_embeddings,
+ void* custom_embeddings_data,
+ size_t max_token_idx,
+ bool return_pooled,
+ ggml_tensor** output,
+ ggml_context* output_ctx = NULL) {
+ auto get_graph = [&]() -> struct ggml_cgraph* {
+ return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+ };
+ GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+ }
+};
+
+#endif // __CLIP_HPP__
diff --git a/common.hpp b/common.hpp
new file mode 100644
index 000000000..9b5cc53be
--- /dev/null
+++ b/common.hpp
@@ -0,0 +1,523 @@
+#ifndef __COMMON_HPP__
+#define __COMMON_HPP__
+
+#include "ggml_extend.hpp"
+
+class DownSampleBlock : public GGMLBlock {
+protected:
+ int channels;
+ int out_channels;
+ bool vae_downsample;
+
+public:
+ DownSampleBlock(int channels,
+ int out_channels,
+ bool vae_downsample = false)
+ : channels(channels),
+ out_channels(out_channels),
+ vae_downsample(vae_downsample) {
+ if (vae_downsample) {
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
+        } else {
+            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
+ }
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ // x: [N, channels, h, w]
+ if (vae_downsample) {
+            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
+
+ x = ggml_pad(ctx, x, 1, 1, 0, 0);
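+            // asymmetric VAE downsampling: pad only the right/bottom edge by one pixel,
+            // then apply the stride-2 conv with no padding (halves h and w)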
+ x = conv->forward(ctx, x);
+ } else {
+            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
+
+ x = conv->forward(ctx, x);
+ }
+ return x; // [N, out_channels, h/2, w/2]
+ }
+};
+
+class UpSampleBlock : public GGMLBlock {
+protected:
+ int channels;
+ int out_channels;
+
+public:
+ UpSampleBlock(int channels,
+ int out_channels)
+ : channels(channels),
+ out_channels(out_channels) {
+        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ // x: [N, channels, h, w]
+        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
+
+ x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
+ x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
+ return x;
+ }
+};
+
+class ResBlock : public GGMLBlock {
+protected:
+ // network hparams
+ int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
+ int64_t emb_channels; // time_embed_dim
+ int64_t out_channels; // mult * model_channels
+    std::pair<int, int> kernel_size;
+ int dims;
+ bool skip_t_emb;
+ bool exchange_temb_dims;
+
+    std::shared_ptr<GGMLBlock> conv_nd(int dims,
+                                       int64_t in_channels,
+                                       int64_t out_channels,
+                                       std::pair<int, int> kernel_size,
+                                       std::pair<int, int> padding) {
+ GGML_ASSERT(dims == 2 || dims == 3);
+ if (dims == 3) {
+            return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
+        } else {
+            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
+ }
+ }
+
+public:
+ ResBlock(int64_t channels,
+ int64_t emb_channels,
+ int64_t out_channels,
+             std::pair<int, int> kernel_size = {3, 3},
+ int dims = 2,
+ bool exchange_temb_dims = false,
+ bool skip_t_emb = false)
+ : channels(channels),
+ emb_channels(emb_channels),
+ out_channels(out_channels),
+ kernel_size(kernel_size),
+ dims(dims),
+ skip_t_emb(skip_t_emb),
+ exchange_temb_dims(exchange_temb_dims) {
+        std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
+        blocks["in_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
+ // in_layer_1 is nn.SILU()
+ blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
+
+ if (!skip_t_emb) {
+ // emb_layer_0 is nn.SILU()
+            blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
+        }
+
+        blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
+ // out_layer_1 is nn.SILU()
+ // out_layer_2 is nn.Dropout(), skip for inference
+ blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
+
+ if (out_channels != channels) {
+ blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
+ }
+ }
+
+ virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
+ // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
+ // [N, c, t, h, w] => [N, c, t, h * w]
+ // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
+ // emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
+        auto in_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
+        auto in_layers_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
+        auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
+        auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
+
+ if (emb == NULL) {
+ GGML_ASSERT(skip_t_emb);
+ }
+
+ // in_layers
+ auto h = in_layers_0->forward(ctx, x);
+ h = ggml_silu_inplace(ctx, h);
+ h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+
+ // emb_layers
+ if (!skip_t_emb) {
+            auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
+
+ auto emb_out = ggml_silu(ctx, emb);
+ emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
+
+ if (dims == 2) {
+ emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
+ } else {
+ emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
+ if (exchange_temb_dims) {
+ // emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
+ emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
+ }
+ }
+
+ h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+ }
+
+ // out_layers
+ h = out_layers_0->forward(ctx, h);
+ h = ggml_silu_inplace(ctx, h);
+ // dropout, skip for inference
+ h = out_layers_3->forward(ctx, h);
+
+ // skip connection
+ if (out_channels != channels) {
+            auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
+ x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+ }
+
+ h = ggml_add(ctx, h, x);
+ return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+ }
+};
+
+class GEGLU : public GGMLBlock {
+protected:
+ int64_t dim_in;
+ int64_t dim_out;
+
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
+ enum ggml_type wtype = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
+ enum ggml_type bias_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
+ params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
+ params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
+ }
+
+public:
+ GEGLU(int64_t dim_in, int64_t dim_out)
+ : dim_in(dim_in), dim_out(dim_out) {}
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ // x: [ne3, ne2, ne1, dim_in]
+ // return: [ne3, ne2, ne1, dim_out]
+ struct ggml_tensor* w = params["proj.weight"];
+ struct ggml_tensor* b = params["proj.bias"];
+
+ auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
+ auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in]
+ auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ]
+ auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
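+        // GEGLU: proj packs two dim_out-wide halves; output = linear(x, x_w, x_b) * GELU(linear(x, gate_w, gate_b))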
+
+ auto x_in = x;
+ x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
+ auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
+
+ gate = ggml_gelu_inplace(ctx, gate);
+
+ x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
+
+ return x;
+ }
+};
+
+class FeedForward : public GGMLBlock {
+public:
+ FeedForward(int64_t dim,
+ int64_t dim_out,
+ int64_t mult = 4) {
+ int64_t inner_dim = dim * mult;
+
+        blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        // net_1 is nn.Dropout(), skip for inference
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ // x: [ne3, ne2, ne1, dim]
+ // return: [ne3, ne2, ne1, dim_out]
+
+        auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
+        auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
+
+ x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
+ x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
+ return x;
+ }
+};
+
+class CrossAttention : public GGMLBlock {
+protected:
+ int64_t query_dim;
+ int64_t context_dim;
+ int64_t n_head;
+ int64_t d_head;
+ bool flash_attn;
+
+public:
+ CrossAttention(int64_t query_dim,
+ int64_t context_dim,
+ int64_t n_head,
+ int64_t d_head,
+ bool flash_attn = false)
+ : n_head(n_head),
+ d_head(d_head),
+ query_dim(query_dim),
+ context_dim(context_dim),
+ flash_attn(flash_attn) {
+ int64_t inner_dim = d_head * n_head;
+
+        blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
+        blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
+        blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
+
+        blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
+ // to_out_1 is nn.Dropout(), skip for inference
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+ // x: [N, n_token, query_dim]
+ // context: [N, n_context, context_dim]
+ // return: [N, n_token, query_dim]
+        auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
+        auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
+        auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
+        auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+
+ int64_t n = x->ne[2];
+ int64_t n_token = x->ne[1];
+ int64_t n_context = context->ne[1];
+ int64_t inner_dim = d_head * n_head;
+
+ auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
+ auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
+ auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
+
+ x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim]
+
+ x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
+ return x;
+ }
+};
+
+class BasicTransformerBlock : public GGMLBlock {
+protected:
+ int64_t n_head;
+ int64_t d_head;
+ bool ff_in;
+
+public:
+ BasicTransformerBlock(int64_t dim,
+ int64_t n_head,
+ int64_t d_head,
+ int64_t context_dim,
+ bool ff_in = false,
+ bool flash_attn = false)
+ : n_head(n_head), d_head(d_head), ff_in(ff_in) {
+ // disable_self_attn is always False
+ // disable_temporal_crossattention is always False
+ // switch_temporal_ca_to_sa is always False
+ // inner_dim is always None or equal to dim
+ // gated_ff is always True
+        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
+        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
+        blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
+        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+        blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+
+ if (ff_in) {
+            blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+            blocks["ff_in"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
+ }
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+ // x: [N, n_token, query_dim]
+ // context: [N, n_context, context_dim]
+ // return: [N, n_token, query_dim]
+
+        auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
+        auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
+        auto ff = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
+        auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
+        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
+        auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
+
+ if (ff_in) {
+            auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
+            auto ff_in = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
+
+ auto x_skip = x;
+ x = norm_in->forward(ctx, x);
+ x = ff_in->forward(ctx, x);
+ // self.is_res is always True
+ x = ggml_add(ctx, x, x_skip);
+ }
+
+ auto r = x;
+ x = norm1->forward(ctx, x);
+ x = attn1->forward(ctx, x, x); // self-attention
+ x = ggml_add(ctx, x, r);
+ r = x;
+ x = norm2->forward(ctx, x);
+ x = attn2->forward(ctx, x, context); // cross-attention
+ x = ggml_add(ctx, x, r);
+ r = x;
+ x = norm3->forward(ctx, x);
+ x = ff->forward(ctx, x);
+ x = ggml_add(ctx, x, r);
+
+ return x;
+ }
+};
+
+class SpatialTransformer : public GGMLBlock {
+protected:
+ int64_t in_channels; // mult * model_channels
+ int64_t n_head;
+ int64_t d_head;
+ int64_t depth = 1; // 1
+ int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
+
+public:
+ SpatialTransformer(int64_t in_channels,
+ int64_t n_head,
+ int64_t d_head,
+ int64_t depth,
+ int64_t context_dim,
+ bool flash_attn = false)
+ : in_channels(in_channels),
+ n_head(n_head),
+ d_head(d_head),
+ depth(depth),
+ context_dim(context_dim) {
+ // We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
+ // disable_self_attn is always False
+ int64_t inner_dim = n_head * d_head; // in_channels
+        blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
+        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+
+ for (int i = 0; i < depth; i++) {
+ std::string name = "transformer_blocks." + std::to_string(i);
+            blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
+        }
+
+        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+ }
+
+ virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+ // x: [N, in_channels, h, w]
+ // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
+        auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
+        auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
+        auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
+
+ auto x_in = x;
+ int64_t n = x->ne[3];
+ int64_t h = x->ne[1];
+ int64_t w = x->ne[0];
+ int64_t inner_dim = n_head * d_head;
+
+ x = norm->forward(ctx, x);
+ x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
+
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
+ x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
+
+ for (int i = 0; i < depth; i++) {
+ std::string name = "transformer_blocks." + std::to_string(i);
+            auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
+
+ x = transformer_block->forward(ctx, x, context);
+ }
+
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
+ x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
+
+ // proj_out
+ x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
+
+ x = ggml_add(ctx, x, x_in);
+ return x;
+ }
+};
+
+class AlphaBlender : public GGMLBlock {
+protected:
+    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
+        // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
+        enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
+ params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
+ }
+
+ float get_alpha() {
+ // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
+ // so learned_with_images is same as learned
+ float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
+ return sigmoid(alpha);
+ }
+
+public:
+ AlphaBlender() {
+ // merge_strategy is always learned_with_images
+ // for inference, we don't need to set alpha
+ // since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* x_spatial,
+ struct ggml_tensor* x_temporal) {
+ // image_only_indicator is always tensor([0.])
+ float alpha = get_alpha();
+ auto x = ggml_add(ctx,
+ ggml_scale(ctx, x_spatial, alpha),
+ ggml_scale(ctx, x_temporal, 1.0f - alpha));
+ return x;
+ }
+};
+
+class VideoResBlock : public ResBlock {
+public:
+ VideoResBlock(int channels,
+ int emb_channels,
+ int out_channels,
+                  std::pair<int, int> kernel_size = {3, 3},
+ int64_t video_kernel_size = 3,
+ int dims = 2) // always 2
+ : ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
+        blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
+        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
+ }
+
+ struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* x,
+ struct ggml_tensor* emb,
+ int num_video_frames) {
+ // x: [N, channels, h, w] aka [b*t, channels, h, w]
+ // emb: [N, emb_channels] aka [b*t, emb_channels]
+ // image_only_indicator is always tensor([0.])
+        auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
+        auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
+
+ x = ResBlock::forward(ctx, x, emb);
+
+ int64_t T = num_video_frames;
+ int64_t B = x->ne[3] / T;
+ int64_t C = x->ne[2];
+ int64_t H = x->ne[1];
+ int64_t W = x->ne[0];
+
+ x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
+ auto x_mix = x;
+
+ emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
+
+ x = time_stack->forward(ctx, x, emb); // b t c (h w)
+
+ x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
+
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
+ x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
+
+ return x;
+ }
+};
+
+#endif // __COMMON_HPP__
diff --git a/conditioner.hpp b/conditioner.hpp
new file mode 100644
index 000000000..3f89d5263
--- /dev/null
+++ b/conditioner.hpp
@@ -0,0 +1,1428 @@
+#ifndef __CONDITIONER_HPP__
+#define __CONDITIONER_HPP__
+
+#include "clip.hpp"
+#include "t5.hpp"
+
+struct SDCondition {
+ struct ggml_tensor* c_crossattn = NULL; // aka context
+ struct ggml_tensor* c_vector = NULL; // aka y
+ struct ggml_tensor* c_concat = NULL;
+
+ SDCondition() = default;
+ SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat)
+ : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {}
+};
+
+struct Conditioner {
+ virtual SDCondition get_learned_condition(ggml_context* work_ctx,
+ int n_threads,
+ const std::string& text,
+ int clip_skip,
+ int width,
+ int height,
+ int adm_in_channels = -1,
+ bool force_zero_embeddings = false) = 0;
+ virtual void alloc_params_buffer() = 0;
+ virtual void free_params_buffer() = 0;
+    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
+    virtual size_t get_params_buffer_size() = 0;
+    virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
+ int n_threads,
+ const std::string& text,
+ int clip_skip,
+ int width,
+ int height,
+ int num_input_imgs,
+ int adm_in_channels = -1,
+ bool force_zero_embeddings = false) = 0;
+ virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx,
+ const std::string& prompt) = 0;
+};
+
+// ldm.modules.encoders.modules.FrozenCLIPEmbedder
+// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
+struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
+ SDVersion version = VERSION_SD1;
+ PMVersion pm_version = PM_VERSION_1;
+ CLIPTokenizer tokenizer;
+    std::shared_ptr<CLIPTextModelRunner> text_model;
+    std::shared_ptr<CLIPTextModelRunner> text_model2;
+
+ std::string trigger_word = "img"; // should be user settable
+ std::string embd_dir;
+ int32_t num_custom_embeddings = 0;
+ int32_t num_custom_embeddings_2 = 0;
+    std::vector<uint8_t> token_embed_custom;
+    std::vector<std::string> readed_embeddings;
+
+ FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
+                                      std::map<std::string, enum ggml_type>& tensor_types,
+ const std::string& embd_dir,
+ SDVersion version = VERSION_SD1,
+ PMVersion pv = PM_VERSION_1,
+ int clip_skip = -1)
+ : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+ if (sd_version_is_sd1(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+        } else if (sd_version_is_sd2(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+        } else if (sd_version_is_sdxl(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+ }
+ set_clip_skip(clip_skip);
+ }
+
+ void set_clip_skip(int clip_skip) {
+ if (clip_skip <= 0) {
+ clip_skip = 1;
+ if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
+ clip_skip = 2;
+ }
+ }
+ text_model->set_clip_skip(clip_skip);
+ if (sd_version_is_sdxl(version)) {
+ text_model2->set_clip_skip(clip_skip);
+ }
+ }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+ text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+ if (sd_version_is_sdxl(version)) {
+ text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
+ }
+ }
+
+ void alloc_params_buffer() {
+ text_model->alloc_params_buffer();
+ if (sd_version_is_sdxl(version)) {
+ text_model2->alloc_params_buffer();
+ }
+ }
+
+ void free_params_buffer() {
+ text_model->free_params_buffer();
+ if (sd_version_is_sdxl(version)) {
+ text_model2->free_params_buffer();
+ }
+ }
+
+ size_t get_params_buffer_size() {
+ size_t buffer_size = text_model->get_params_buffer_size();
+ if (sd_version_is_sdxl(version)) {
+ buffer_size += text_model2->get_params_buffer_size();
+ }
+ return buffer_size;
+ }
+
+    bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
+ // the order matters
+ ModelLoader model_loader;
+ if (!model_loader.init_from_file(embd_path)) {
+ LOG_ERROR("embedding '%s' failed", embd_name.c_str());
+ return false;
+ }
+ if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) {
+ LOG_DEBUG("embedding already read in: %s", embd_name.c_str());
+ return true;
+ }
+ struct ggml_init_params params;
+ params.mem_size = 10 * 1024 * 1024; // max for custom embeddings 10 MB
+ params.mem_buffer = NULL;
+ params.no_alloc = false;
+ struct ggml_context* embd_ctx = ggml_init(params);
+ struct ggml_tensor* embd = NULL;
+ struct ggml_tensor* embd2 = NULL;
+ auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
+ if (tensor_storage.ne[0] != text_model->model.hidden_size) {
+ if (text_model2) {
+ if (tensor_storage.ne[0] == text_model2->model.hidden_size) {
+ embd2 = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, text_model2->model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+ *dst_tensor = embd2;
+ } else {
+ LOG_DEBUG("embedding wrong hidden size, got %i, expected %i or %i", tensor_storage.ne[0], text_model->model.hidden_size, text_model2->model.hidden_size);
+ return false;
+ }
+ } else {
+ LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], text_model->model.hidden_size);
+ return false;
+ }
+ } else {
+ embd = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, text_model->model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+ *dst_tensor = embd;
+ }
+ return true;
+ };
+ model_loader.load_tensors(on_load, NULL);
+ readed_embeddings.push_back(embd_name);
+ if (embd) {
+ int64_t hidden_size = text_model->model.hidden_size;
+ token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
+ memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
+ embd->data,
+ ggml_nbytes(embd));
+ for (int i = 0; i < embd->ne[1]; i++) {
+ bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
+ // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
+ num_custom_embeddings++;
+ }
+ LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
+ }
+ if (embd2) {
+ int64_t hidden_size = text_model2->model.hidden_size;
+ token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd2));
+ memcpy((void*)(token_embed_custom.data() + num_custom_embeddings_2 * hidden_size * ggml_type_size(embd2->type)),
+ embd2->data,
+ ggml_nbytes(embd2));
+ for (int i = 0; i < embd2->ne[1]; i++) {
+ bpe_tokens.push_back(text_model2->model.vocab_size + num_custom_embeddings_2);
+ // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
+ num_custom_embeddings_2++;
+ }
+ LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
+ }
+ return true;
+ }
+
+    std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
+ tokenize_with_trigger_token(std::string text,
+ int num_input_imgs,
+ int32_t image_token,
+ bool padding = false) {
+ return tokenize_with_trigger_token(text, num_input_imgs, image_token,
+ text_model->model.n_token, padding);
+ }
+
+    std::vector<int> convert_token_to_id(std::string text) {
+        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
+ size_t word_end = str.find(",");
+ std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
+ embd_name = trim(embd_name);
+ std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
+ if (embd_path.size() == 0) {
+ embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+ }
+ if (embd_path.size() == 0) {
+ embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+ }
+ if (embd_path.size() > 0) {
+ if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+ if (word_end != std::string::npos) {
+ str = str.substr(word_end);
+ } else {
+ str = "";
+ }
+ return true;
+ }
+ }
+ return false;
+ };
+        std::vector<int> curr_tokens = tokenizer.encode(text, on_new_token_cb);
+ return curr_tokens;
+ }
+
+    std::string decode(const std::vector<int>& tokens) {
+        return tokenizer.decode(tokens);
+    }
+
+    std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
+ tokenize_with_trigger_token(std::string text,
+ int num_input_imgs,
+ int32_t image_token,
+ size_t max_length = 0,
+ bool padding = false) {
+ auto parsed_attention = parse_prompt_attention(text);
+
+ {
+ std::stringstream ss;
+ ss << "[";
+ for (const auto& item : parsed_attention) {
+ ss << "['" << item.first << "', " << item.second << "], ";
+ }
+ ss << "]";
+ LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+ }
+
+        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
+ size_t word_end = str.find(",");
+ std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
+ embd_name = trim(embd_name);
+ std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
+ if (embd_path.size() == 0) {
+ embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+ }
+ if (embd_path.size() == 0) {
+ embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+ }
+ if (embd_path.size() > 0) {
+ if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+ if (word_end != std::string::npos) {
+ str = str.substr(word_end);
+ } else {
+ str = "";
+ }
+ return true;
+ }
+ }
+ return false;
+ };
+
+        std::vector<int> tokens;
+        std::vector<float> weights;
+        std::vector<bool> class_token_mask;
+ int32_t class_idx = -1, tokens_acc = 0;
+ for (const auto& item : parsed_attention) {
+            std::vector<int> class_token_index;
+            std::vector<int> clean_input_ids;
+ const std::string& curr_text = item.first;
+ float curr_weight = item.second;
+ // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
+            std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
+ int32_t clean_index = 0;
+ for (uint32_t i = 0; i < curr_tokens.size(); i++) {
+ int token_id = curr_tokens[i];
+ if (token_id == image_token)
+ class_token_index.push_back(clean_index - 1);
+ else {
+ clean_input_ids.push_back(token_id);
+ clean_index++;
+ }
+ }
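+            // PhotoMaker: expand the class token (the token right before the trigger word) into one
+            // slot per input ID image (two per image for PM_VERSION_2); class_token_mask marks those slots.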
+ // GGML_ASSERT(class_token_index.size() == 1); // PhotoMaker currently does not support multiple
+ // trigger words in a single prompt.
+ if (class_token_index.size() == 1) {
+ // Expand the class word token and corresponding mask
+ int class_token = clean_input_ids[class_token_index[0]];
+ class_idx = tokens_acc + class_token_index[0];
+                std::vector<int> clean_input_ids_tmp;
+ for (uint32_t i = 0; i < class_token_index[0]; i++)
+ clean_input_ids_tmp.push_back(clean_input_ids[i]);
+ for (uint32_t i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
+ clean_input_ids_tmp.push_back(class_token);
+ for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
+ clean_input_ids_tmp.push_back(clean_input_ids[i]);
+ clean_input_ids.clear();
+ clean_input_ids = clean_input_ids_tmp;
+ }
+ tokens_acc += clean_index;
+ tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
+ weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
+ }
+        // BUG!! double counting, pad_tokens will add BOS at the beginning
+ // tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
+ // weights.insert(weights.begin(), 1.0);
+
+ tokenizer.pad_tokens(tokens, weights, max_length, padding);
+ int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
+ for (uint32_t i = 0; i < tokens.size(); i++) {
+ // if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
+ if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs
+ // hardcode for now
+ class_token_mask.push_back(true);
+ else
+ class_token_mask.push_back(false);
+ }
+
+ // printf("[");
+ // for (int i = 0; i < tokens.size(); i++) {
+ // printf("%d, ", class_token_mask[i] ? 1 : 0);
+ // }
+ // printf("]\n");
+
+ // for (int i = 0; i < tokens.size(); i++) {
+ // std::cout << tokens[i] << ":" << weights[i] << ", ";
+ // }
+ // std::cout << std::endl;
+
+ return std::make_tuple(tokens, weights, class_token_mask);
+ }
+
+    std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
+ bool padding = false) {
+ return tokenize(text, text_model->model.n_token, padding);
+ }
+
+    std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
+ size_t max_length = 0,
+ bool padding = false) {
+ auto parsed_attention = parse_prompt_attention(text);
+
+ {
+ std::stringstream ss;
+ ss << "[";
+ for (const auto& item : parsed_attention) {
+ ss << "['" << item.first << "', " << item.second << "], ";
+ }
+ ss << "]";
+ LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+ }
+
+        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
+ size_t word_end = str.find(",");
+ std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
+ embd_name = trim(embd_name);
+ std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
+ if (embd_path.size() == 0) {
+ embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+ }
+ if (embd_path.size() == 0) {
+ embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+ }
+ if (embd_path.size() > 0) {
+ if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+ if (word_end != std::string::npos) {
+ str = str.substr(word_end);
+ } else {
+ str = "";
+ }
+ return true;
+ }
+ }
+ return false;
+ };
+
+        std::vector<int> tokens;
+        std::vector<float> weights;
+ for (const auto& item : parsed_attention) {
+ const std::string& curr_text = item.first;
+ float curr_weight = item.second;
+            std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
+ tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+ }
+
+ tokenizer.pad_tokens(tokens, weights, max_length, padding);
+
+ // for (int i = 0; i < tokens.size(); i++) {
+ // std::cout << tokens[i] << ":" << weights[i] << ", ";
+ // }
+ // std::cout << std::endl;
+
+ return {tokens, weights};
+ }
+
+ SDCondition get_learned_condition_common(ggml_context* work_ctx,
+ int n_threads,
+                                             std::vector<int>& tokens,
+                                             std::vector<float>& weights,
+ int clip_skip,
+ int width,
+ int height,
+ int adm_in_channels = -1,
+ bool force_zero_embeddings = false) {
+ set_clip_skip(clip_skip);
+ int64_t t0 = ggml_time_ms();
+ struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size]
+ struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
+ struct ggml_tensor* chunk_hidden_states1 = NULL; // [n_token, hidden_size]
+ struct ggml_tensor* chunk_hidden_states2 = NULL; // [n_token, hidden_size2]
+ struct ggml_tensor* pooled = NULL;
+        std::vector<float> hidden_states_vec;
+
+ size_t chunk_len = 77;
+ size_t chunk_count = tokens.size() / chunk_len;
+ for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
+            std::vector<int> chunk_tokens(tokens.begin() + chunk_idx * chunk_len,
+                                          tokens.begin() + (chunk_idx + 1) * chunk_len);
+            std::vector<float> chunk_weights(weights.begin() + chunk_idx * chunk_len,
+                                             weights.begin() + (chunk_idx + 1) * chunk_len);
+
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+ struct ggml_tensor* input_ids2 = NULL;
+ size_t max_token_idx = 0;
+ if (sd_version_is_sdxl(version)) {
+ auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
+ if (it != chunk_tokens.end()) {
+ std::fill(std::next(it), chunk_tokens.end(), 0);
+ }
+
+                max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
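+                // SDXL pools the text embedding at the first EOS position; the tokens after it were
+                // zeroed above so the second text encoder sees a cleanly padded sequence.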
+
+ input_ids2 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+
+ // for (int i = 0; i < chunk_tokens.size(); i++) {
+ // printf("%d ", chunk_tokens[i]);
+ // }
+ // printf("\n");
+ }
+
+ {
+ text_model->compute(n_threads,
+ input_ids,
+ num_custom_embeddings,
+ token_embed_custom.data(),
+ max_token_idx,
+ false,
+ &chunk_hidden_states1,
+ work_ctx);
+ if (sd_version_is_sdxl(version)) {
+ text_model2->compute(n_threads,
+ input_ids2,
+ num_custom_embeddings,
+ token_embed_custom.data(),
+ max_token_idx,
+ false,
+ &chunk_hidden_states2, work_ctx);
+ // concat
+ chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+
+ if (chunk_idx == 0) {
+ text_model2->compute(n_threads,
+ input_ids2,
+ num_custom_embeddings,
+ token_embed_custom.data(),
+ max_token_idx,
+ true,
+ &pooled,
+ work_ctx);
+ }
+ } else {
+ chunk_hidden_states = chunk_hidden_states1;
+ }
+ }
+
+ int64_t t1 = ggml_time_ms();
+ LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
+ ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states);
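+            // Apply per-token prompt weights (A1111-style emphasis): scale each token's hidden state
+            // by its weight, then rescale so the overall mean matches the unweighted output.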
+ {
+ float original_mean = ggml_tensor_mean(chunk_hidden_states);
+ for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) {
+ for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) {
+ for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) {
+ float value = ggml_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
+ value *= chunk_weights[i1];
+ ggml_tensor_set_f32(result, value, i0, i1, i2);
+ }
+ }
+ }
+ float new_mean = ggml_tensor_mean(result);
+ ggml_tensor_scale(result, (original_mean / new_mean));
+ }
+ if (force_zero_embeddings) {
+ float* vec = (float*)result->data;
+ for (int i = 0; i < ggml_nelements(result); i++) {
+ vec[i] = 0;
+ }
+ }
+ hidden_states_vec.insert(hidden_states_vec.end(), (float*)result->data, ((float*)result->data) + ggml_nelements(result));
+ }
+
+ hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
+ hidden_states = ggml_reshape_2d(work_ctx,
+ hidden_states,
+ chunk_hidden_states->ne[0],
+ ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
+
+ ggml_tensor* vec = NULL;
+ if (sd_version_is_sdxl(version)) {
+ int out_dim = 256;
+ vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
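+            // SDXL vector conditioning (y) layout, filled in below:
+            // [pooled text | original_size (2 x 256) | crop_coords (2 x 256) | target_size (2 x 256)]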
+ // [0:1280]
+ size_t offset = 0;
+ memcpy(vec->data, pooled->data, ggml_nbytes(pooled));
+ offset += ggml_nbytes(pooled);
+
+ // original_size_as_tuple
+ float orig_width = (float)width;
+ float orig_height = (float)height;
+            std::vector<float> timesteps = {orig_height, orig_width};
+
+ ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
+ offset += ggml_nbytes(embed_view);
+ set_timestep_embedding(timesteps, embed_view, out_dim);
+ // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
+ // crop_coords_top_left
+ float crop_coord_top = 0.f;
+ float crop_coord_left = 0.f;
+ timesteps = {crop_coord_top, crop_coord_left};
+ embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
+ offset += ggml_nbytes(embed_view);
+ set_timestep_embedding(timesteps, embed_view, out_dim);
+ // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
+ // target_size_as_tuple
+ float target_width = (float)width;
+ float target_height = (float)height;
+ timesteps = {target_height, target_width};
+ embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
+ offset += ggml_nbytes(embed_view);
+ set_timestep_embedding(timesteps, embed_view, out_dim);
+ // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
+ GGML_ASSERT(offset == ggml_nbytes(vec));
+ }
+ // print_ggml_tensor(result);
+ return SDCondition(hidden_states, vec, NULL);
+ }
+
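+ // Build the condition for a prompt containing the trigger word (e.g. PhotoMaker-style
+ // id images): the trigger token is expanded for num_input_imgs images and a per-token
+ // mask marking those positions is returned alongside the condition.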
+ std::tuple<SDCondition, std::vector<bool>>
+ get_learned_condition_with_trigger(ggml_context* work_ctx,
+ int n_threads,
+ const std::string& text,
+ int clip_skip,
+ int width,
+ int height,
+ int num_input_imgs,
+ int adm_in_channels = -1,
+ bool force_zero_embeddings = false) {
+ auto image_tokens = convert_token_to_id(trigger_word);
+ // if(image_tokens.size() == 1){
+ // printf(" image token id is: %d \n", image_tokens[0]);
+ // }
+ GGML_ASSERT(image_tokens.size() == 1);
+ auto tokens_and_weights = tokenize_with_trigger_token(text,
+ num_input_imgs,
+ image_tokens[0],
+ true);
+ std::vector<int>& tokens = std::get<0>(tokens_and_weights);
+ std::vector<float>& weights = std::get<1>(tokens_and_weights);
+ std::vector<bool>& clsm = std::get<2>(tokens_and_weights);
+ // printf("tokens: \n");
+ // for(int i = 0; i < tokens.size(); ++i)
+ // printf("%d ", tokens[i]);
+ // printf("\n");
+ // printf("clsm: \n");
+ // for(int i = 0; i < clsm.size(); ++i)
+ // printf("%d ", clsm[i]?1:0);
+ // printf("\n");
+ auto cond = get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, force_zero_embeddings);
+ return std::make_tuple(cond, clsm);
+ }
+
+ std::string remove_trigger_from_prompt(ggml_context* work_ctx,
+ const std::string& prompt) {
+ auto image_tokens = convert_token_to_id(trigger_word);
+ GGML_ASSERT(image_tokens.size() == 1);
+ auto tokens_and_weights = tokenize(prompt, false);
+ std::vector<int>& tokens = tokens_and_weights.first;
+ auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
+ GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
+ tokens.erase(it);
+ return decode(tokens);
+ }
+
+ SDCondition get_learned_condition(ggml_context* work_ctx,
+ int n_threads,
+ const std::string& text,
+ int clip_skip,
+ int width,
+ int height,
+ int adm_in_channels = -1,
+ bool force_zero_embeddings = false) {
+ auto tokens_and_weights = tokenize(text, true);
+ std::vector<int>& tokens = tokens_and_weights.first;
+ std::vector<float>& weights = tokens_and_weights.second;
+ return get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, force_zero_embeddings);
+ }
+};
+
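+ // Runs the OpenCLIP ViT-H/14 vision tower to produce image embeddings
+ // (used for image-prompt conditioning, e.g. PhotoMaker id images).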
+struct FrozenCLIPVisionEmbedder : public GGMLRunner {
+ CLIPVisionModelProjection vision_model;
+
+ FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+ : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
+ vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+ }
+
+ std::string get_desc() {
+ return "clip_vision";
+ }
+
+ void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+ vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
+ }
+
+ struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
+ struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+
+ pixel_values = to_backend(pixel_values);
+
+ struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values);
+
+ ggml_build_forward_expand(gf, hidden_states);
+
+ return gf;
+ }
+
+ void compute(const int n_threads,
+ ggml_tensor* pixel_values,
+ ggml_tensor** output,
+ ggml_context* output_ctx) {
+ auto get_graph = [&]() -> struct ggml_cgraph* {
+ return build_graph(pixel_values);
+ };
+ GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+ }
+};
+
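+ // Conditioner for SD3-style models: CLIP-L, CLIP-G and T5-XXL text encoders.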
+struct SD3CLIPEmbedder : public Conditioner {
+ CLIPTokenizer clip_l_tokenizer;
+ CLIPTokenizer clip_g_tokenizer;
+ T5UniGramTokenizer t5_tokenizer;
+ std::shared_ptr<CLIPTextModelRunner> clip_l;
+ std::shared_ptr<CLIPTextModelRunner> clip_g;
+ std::shared_ptr<T5Runner> t5;
+
+ SD3CLIPEmbedder(ggml_backend_t backend,
+ std::map<std::string, enum ggml_type>& tensor_types,
+ int clip_skip = -1)
+ : clip_g_tokenizer(0) {
+ clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+ clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+ t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+ set_clip_skip(clip_skip);
+ }
+
+ void set_clip_skip(int clip_skip) {
+ if (clip_skip <= 0) {
+ clip_skip = 2;
+ }
+ clip_l->set_clip_skip(clip_skip);
+ clip_g->set_clip_skip(clip_skip);
+ }
+
+ void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+ clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
+ clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
+ t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ }
+
+ void alloc_params_buffer() {
+ clip_l->alloc_params_buffer();
+ clip_g->alloc_params_buffer();
+ t5->alloc_params_buffer();
+ }
+
+ void free_params_buffer() {
+ clip_l->free_params_buffer();
+ clip_g->free_params_buffer();
+ t5->free_params_buffer();
+ }
+
+ size_t get_params_buffer_size() {
+ size_t buffer_size = clip_l->get_params_buffer_size();
+ buffer_size += clip_g->get_params_buffer_size();
+ buffer_size += t5->get_params_buffer_size();
+ return buffer_size;
+ }
+
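+ // Tokenize the prompt with all three tokenizers, attaching the parsed attention
+ // weight to every token and padding each stream to max_length.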
+ std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
+ size_t max_length = 0,
+ bool padding = false) {
+ auto parsed_attention = parse_prompt_attention(text);
+
+ {
+ std::stringstream ss;
+ ss << "[";
+ for (const auto& item : parsed_attention) {
+ ss << "['" << item.first << "', " << item.second << "], ";
+ }
+ ss << "]";
+ LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+ }
+
+ auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
+ return false;
+ };
+
+ std::vector<int> clip_l_tokens;
+ std::vector<float> clip_l_weights;
+ std::vector<int> clip_g_tokens;
+ std::vector<float> clip_g_weights;
+ std::vector<int> t5_tokens;
+ std::vector<float> t5_weights;
+ for (const auto& item : parsed_attention) {
+ const std::string& curr_text = item.first;
+ float curr_weight = item.second;
+
+ std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
+ clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
+
+ curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb);
+ clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight);
+
+ curr_tokens = t5_tokenizer.Encode(curr_text, true);
+ t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ }
+
+ clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
+ clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
+ t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding);
+
+ // for (int i = 0; i < clip_l_tokens.size(); i++) {
+ // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
+ // }
+ // std::cout << std::endl;
+
+ // for (int i = 0; i < clip_g_tokens.size(); i++) {
+ // std::cout << clip_g_tokens[i] << ":" << clip_g_weights[i] << ", ";
+ // }
+ // std::cout << std::endl;
+
+ // for (int i = 0; i < t5_tokens.size(); i++) {
+ // std::cout << t5_tokens[i] << ":" << t5_weights[i] << ", ";
+ // }
+ // std::cout << std::endl;
+
+ return {{clip_l_tokens, clip_l_weights}, {clip_g_tokens, clip_g_weights}, {t5_tokens, t5_weights}};
+ }
+
+ SDCondition get_learned_condition_common(ggml_context* work_ctx,
+ int n_threads,
+ std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
+ int clip_skip,
+ bool force_zero_embeddings = false) {
+ set_clip_skip(clip_skip);
+ auto& clip_l_tokens = token_and_weights[0].first;
+ auto& clip_l_weights = token_and_weights[0].second;
+ auto& clip_g_tokens = token_and_weights[1].first;
+ auto& clip_g_weights = token_and_weights[1].second;
+ auto& t5_tokens = token_and_weights[2].first;
+ auto& t5_weights = token_and_weights[2].second;
+
+ int64_t t0 = ggml_time_ms();
+ struct ggml_tensor* hidden_states = NULL; // [N, n_token*2, 4096]
+ struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
+ struct ggml_tensor* chunk_hidden_states_l = NULL; // [n_token, hidden_size_l]
+ struct ggml_tensor* chunk_hidden_states_g = NULL; // [n_token, hidden_size_g]
+ struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
+ struct ggml_tensor* pooled = NULL;
+ struct ggml_tensor* pooled_l = NULL; // [768,]
+ struct ggml_tensor* pooled_g = NULL; // [1280,]
+ std::vector<float> hidden_states_vec;
+
+ size_t chunk_len = 77;
+ size_t chunk_count = clip_l_tokens.size() / chunk_len;
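+ // encode long prompts in chunks of 77 tokens and concatenate the per-chunk hidden states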
+ for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
+ // clip_l
+ {
+ std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len,
+ clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len);
+ std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len,
+ clip_l_weights.begin() + (chunk_idx + 1) * chunk_len);
+
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+ size_t max_token_idx = 0;
+
+ clip_l->compute(n_threads,
+ input_ids,
+ 0,
+ NULL,
+ max_token_idx,
+ false,
+ &chunk_hidden_states_l,
+ work_ctx);
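+ // re-weight the CLIP-L hidden states by the per-token prompt weights (mean-preserving)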
+ {
+ auto tensor = chunk_hidden_states_l;
+ float original_mean = ggml_tensor_mean(tensor);
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ value *= chunk_weights[i1];
+ ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ }
+ }
+ }
+ float new_mean = ggml_tensor_mean(tensor);
+ ggml_tensor_scale(tensor, (original_mean / new_mean));
+ }
+
+ if (chunk_idx == 0) {
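+ // the pooled embedding is taken from the first chunk, at the EOS token position
+ // (clamped to the last index if no EOS is found)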
+ auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+ max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+ clip_l->compute(n_threads,
+ input_ids,
+ 0,
+ NULL,
+ max_token_idx,
+ true,
+ &pooled_l,
+ work_ctx);
+ }
+ }
+
+ // clip_g
+ {
+ std::vector<int> chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len,
+ clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len);
+ std::vector<float> chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len,
+ clip_g_weights.begin() + (chunk_idx + 1) * chunk_len);
+
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+ size_t max_token_idx = 0;
+
+ clip_g->compute(n_threads,
+ input_ids,
+ 0,
+ NULL,
+ max_token_idx,
+ false,
+ &chunk_hidden_states_g,
+ work_ctx);
+
+ {
+ auto tensor = chunk_hidden_states_g;
+ float original_mean = ggml_tensor_mean(tensor);
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ value *= chunk_weights[i1];
+ ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ }
+ }
+ }
+ float new_mean = ggml_tensor_mean(tensor);
+ ggml_tensor_scale(tensor, (original_mean / new_mean));
+ }
+
+ if (chunk_idx == 0) {
+ auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
+ max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+ clip_g->compute(n_threads,
+ input_ids,
+ 0,
+ NULL,
+ max_token_idx,
+ true,
+ &pooled_g,
+ work_ctx);
+ }
+ }
+
+ // t5
+ {
+ std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
+ t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
+ std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
+ t5_weights.begin() + (chunk_idx + 1) * chunk_len);
+
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+
+ t5->compute(n_threads,
+ input_ids,
+ NULL,
+ &chunk_hidden_states_t5,
+ work_ctx);
+ {
+ auto tensor = chunk_hidden_states_t5;
+ float original_mean = ggml_tensor_mean(tensor);
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ value *= chunk_weights[i1];
+ ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ }
+ }
+ }
+ float new_mean = ggml_tensor_mean(tensor);
+ ggml_tensor_scale(tensor, (original_mean / new_mean));
+ }
+ }
+
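+ // pad the concatenated CLIP-L/CLIP-G features with zeros up to 4096 (the T5 feature
+ // size), then append the T5 hidden states along the token axis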
+ auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx,
+ chunk_hidden_states_l->type,
+ 4096,
+ chunk_hidden_states_l->ne[1],
+ chunk_hidden_states_l->ne[2]); // [n_token, 4096]
+
+ for (int i2 = 0; i2 < chunk_hidden_states_lg_pad->ne[2]; i2++) {
+ for (int i1 = 0; i1 < chunk_hidden_states_lg_pad->ne[1]; i1++) {
+ for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) {
+ float value = 0.f;
+ if (i0 < chunk_hidden_states_l->ne[0]) {
+ value = ggml_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
+ } else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) {
+ value = ggml_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2);
+ }
+ ggml_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2);
+ }
+ }
+ }
+
+ chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
+
+ if (chunk_idx == 0) {
+ pooled = ggml_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280]
+ }
+
+ int64_t t1 = ggml_time_ms();
+ LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
+ if (force_zero_embeddings) {
+ float* vec = (float*)chunk_hidden_states->data;
+ for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) {
+ vec[i] = 0;
+ }
+ }
+
+ hidden_states_vec.insert(hidden_states_vec.end(),
+ (float*)chunk_hidden_states->data,
+ ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
+ }
+
+ hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
+ hidden_states = ggml_reshape_2d(work_ctx,
+ hidden_states,
+ chunk_hidden_states->ne[0],
+ ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
+ return SDCondition(hidden_states, pooled, NULL);
+ }
+
+ SDCondition get_learned_condition(ggml_context* work_ctx,
+ int n_threads,
+ const std::string& text,
+ int clip_skip,
+ int width,
+ int height,
+ int adm_in_channels = -1,
+ bool force_zero_embeddings = false) {
+ auto tokens_and_weights = tokenize(text, 77, true);
+ return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, force_zero_embeddings);
+ }
+
+ std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
+ int n_threads,
+ const std::string& text,
+ int clip_skip,
+ int width,
+ int height,
+ int num_input_imgs,
+ int adm_in_channels = -1,
+ bool force_zero_embeddings = false) {
+ GGML_ASSERT(0 && "Not implemented yet!");
+ }
+
+ std::string remove_trigger_from_prompt(ggml_context* work_ctx,
+ const std::string& prompt) {
+ GGML_ASSERT(0 && "Not implemented yet!");
+ }
+};
+
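+ // Conditioner for Flux-style models: CLIP-L and T5-XXL text encoders, T5 chunk length 256.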
+struct FluxCLIPEmbedder : public Conditioner {
+ CLIPTokenizer clip_l_tokenizer;
+ T5UniGramTokenizer t5_tokenizer;
+ std::shared_ptr<CLIPTextModelRunner> clip_l;
+ std::shared_ptr<T5Runner> t5;
+ size_t chunk_len = 256;
+
+ FluxCLIPEmbedder(ggml_backend_t backend,
+ std::map<std::string, enum ggml_type>& tensor_types,
+ int clip_skip = -1) {
+ clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+ t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+ set_clip_skip(clip_skip);
+ }
+
+ void set_clip_skip(int clip_skip) {
+ if (clip_skip <= 0) {
+ clip_skip = 2;
+ }
+ clip_l->set_clip_skip(clip_skip);
+ }
+
+ void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+ clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
+ t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ }
+
+ void alloc_params_buffer() {
+ clip_l->alloc_params_buffer();
+ t5->alloc_params_buffer();
+ }
+
+ void free_params_buffer() {
+ clip_l->free_params_buffer();
+ t5->free_params_buffer();
+ }
+
+ size_t get_params_buffer_size() {
+ size_t buffer_size = clip_l->get_params_buffer_size();
+ buffer_size += t5->get_params_buffer_size();
+ return buffer_size;
+ }
+
+ std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
+ size_t max_length = 0,
+ bool padding = false) {
+ auto parsed_attention = parse_prompt_attention(text);
+
+ {
+ std::stringstream ss;
+ ss << "[";
+ for (const auto& item : parsed_attention) {
+ ss << "['" << item.first << "', " << item.second << "], ";
+ }
+ ss << "]";
+ LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+ }
+
+ auto on_new_token_cb = [&](std::string& str, std::vector