diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
new file mode 100644
index 000000000..35d1b0240
--- /dev/null
+++ b/.github/workflows/cmake.yml
@@ -0,0 +1,173 @@
+name: CMake on multiple platforms
+
+on:
+ push:
+ branches: [ "main" ]
+ pull_request:
+ branches: [ "main" ]
+
+jobs:
+ build:
+ runs-on: ${{ matrix.os }}
+
+ strategy:
+ # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
+ fail-fast: false
+
+ matrix:
+ os: [ubuntu-latest, windows-latest]
+ python-version: ['3.10', '3.11']
+ cuda-version: ['11.8', '12.1']
+ build_type: [Release]
+ c_compiler: [gcc, cl]
+ include:
+ - os: windows-latest
+ c_compiler: cl
+ cpp_compiler: cl
+ - os: ubuntu-latest
+ c_compiler: gcc
+ cpp_compiler: g++
+ exclude:
+ - os: ubuntu-latest
+ c_compiler: cl
+ - os: windows-latest
+ c_compiler: gcc
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Set up MSVC
+ if: matrix.os == 'windows-latest'
+ uses: ilammy/msvc-dev-cmd@v1.12.1
+ with:
+ arch: amd64
+
+ - name: Setup Mambaforge
+ uses: conda-incubator/setup-miniconda@v3.0.1
+ with:
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: bnb-env
+ use-mamba: true
+
+ - uses: conda-incubator/setup-miniconda@v3.0.1
+ with:
+ auto-update-conda: true
+ activate-environment: bnb-env
+ environment-file: environment-bnb.yml
+ use-only-tar-bz2: false
+ auto-activate-base: true
+ python-version: ${{ matrix.python-version }}
+ mamba-version: "*"
+
+ - name: Set reusable strings
+ # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
+ id: strings
+ shell: bash
+ run: |
+ echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
+
+ - name: CUDA Toolkit
+ shell: bash -el {0}
+ run: |
+ if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
+ # free up disk space for the CUDA toolkit install
+ sudo rm -rf /usr/share/dotnet
+ sudo rm -rf /opt/ghc
+ sudo rm -rf /usr/local/share/boost
+ fi
+ addon=""
+ cuda_version=${{ matrix.cuda-version }}
+ [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
+ [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
+ [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
+ [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
+
+ conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch  # without this, its dependencies are sometimes not resolved correctly
+ conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
+
+ [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
+
+ CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
+ echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
+ echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
+
+ if [ "${{ matrix.os }}" = "windows-latest" ]; then
+ # without -DCMAKE_CUDA_COMPILER=nvcc, the CMake configure step always fails for CUDA 11.8
+ echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
+ fi
+
+ nvcc --version
+
+ - name: Update environment
+ run: mamba env update -n bnb-env -f environment-bnb.yml
+
+ - name: Prep build
+ run: python -m pip install cmake==3.27.9 ninja setuptools wheel
+
+ - name: Configure CMake
+ run: >
+ cmake -B ${{ steps.strings.outputs.build-output-dir }}
+ -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
+ -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
+ -DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
+ -S ${{ github.workspace }}
+
+ - name: Build
+ # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
+ run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
+
+ - name: Configure NOBLASLT
+ run: >
+ cmake -B ${{ steps.strings.outputs.build-output-dir }}
+ -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
+ -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
+ -DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
+ -DNO_CUBLASLT=ON
+ -S ${{ github.workspace }}
+
+ - name: Build NOBLASLT
+ run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
+
+ - name: Configure CPU
+ run: >
+ cmake -B ${{ steps.strings.outputs.build-output-dir }}
+ -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
+ -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
+ -DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ -DNO_CUBLASLT=ON
+ -DBUILD_CUDA=OFF
+ -S ${{ github.workspace }}
+
+ - name: Build CPU
+ run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
+
+ - name: Test
+ working-directory: ${{ steps.strings.outputs.build-output-dir }}
+ # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
+ # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
+ run: ctest --build-config ${{ matrix.build_type }}
+
+ - name: Build dist
+ shell: bash -el {0}
+ run: |
+ python -m pip install build
+ python -m build --wheel
+ mkdir dist/cu${{ matrix.cuda-version }}
+ mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/
+
+ - name: Upload Build Artifacts
+ uses: actions/upload-artifact@v4.3.0
+ with:
+ name: bitsandbytes-${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+ path: |
+ ${{ github.workspace }}/dist/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..140753af4
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,132 @@
+# This CMake config hopefully makes it easier to compile.
+# Ensure the CUDA Toolkit is available on your path. Then run:
+# For GCC: `cmake -B build . && cmake --build build`
+# For MSVC: `cmake -B build . && cmake --build build --config Release`
+# You can also use the following options
+# - BUILD_CUDA: Default ON, will build with CUDA
+# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
+# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
+# is whatever CMake finds on your path.
+# - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC.
+# Separate by semicolons, e.g. `-DCOMPUTE_CAPABILITY=89;90`
+# Check your compute capability here: https://developer.nvidia.com/cuda-gpus
+# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
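+#
+# As a rough sketch (the flag values below are illustrative, not required), a CUDA build
+# targeting only Ampere/Ada GPUs with cuBLASLt support disabled might look like:
+#   cmake -B build -DCOMPUTE_CAPABILITY="80;86;89" -DNO_CUBLASLT=ON .
+#   cmake --build build --config Release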
+cmake_minimum_required(VERSION 3.18)
+
+project(bitsandbytes LANGUAGES C CXX)
+
+option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON)
+option(NO_CUBLASLT "Disable cuBLASLt" OFF)
+option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
+
+set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c)
+list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+list(APPEND SRC_FILES ${CPP_FILES})
+
+message(STATUS "BUILD_CUDA := ${BUILD_CUDA}")
+message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
+
+set(BNB_OUTPUT_NAME "bitsandbytes")
+
+if(BUILD_CUDA)
+ enable_language(CUDA) # This will fail if CUDA is not found
+
+ # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this
+ string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")
+ string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}")
+
+ # Expose a cache variable that the user can set to ensure the correct version of CUDA is found
+ set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode")
+
+ message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})")
+ message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")
+
+ # It should match the discovered version
+ if(NOT CUDA_VERSION STREQUAL "${CUDA_VERSION_SHORT}")
+ message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}."
+ " Ensure the desired CUDA compiler is the first one available on your PATH."
+ )
+ endif()
+
+ if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0")
+ message(FATAL_ERROR "CUDA Version < 11 is not supported")
+ elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+ message(FATAL_ERROR "CUDA Version > 12 is not supported")
+ endif()
+
+ string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
+ if(PTXAS_VERBOSE)
+ # Verbose? Outputs register usage information, and other things...
+ string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
+ endif()
+
+ foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
+ # Most of the items here are like: `xx-real`, so we just extract the `xx` portion
+ string(REGEX MATCH "[0-9]+" capability_id "${capability}")
+ if(capability_id GREATER 0)
+ list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
+ endif()
+ endforeach()
+
+ # This can be changed via -D argument to CMake
+ # By default all possible capabilities are compiled
+ set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted")
+
+ message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
+ message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}")
+
+ foreach(capability ${COMPUTE_CAPABILITY})
+ string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}")
+ endforeach()
+
+ message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")
+
+ list(APPEND SRC_FILES ${CUDA_FILES})
+
+ string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
+ if(NO_CUBLASLT)
+ string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
+ endif()
+else()
+ message(STATUS "Building CPU Only")
+ string(APPEND BNB_OUTPUT_NAME "_cpu")
+ if(NO_CUBLASLT)
+ message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. It will have no effect.")
+ endif()
+endif()
+
+set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
+add_library(bitsandbytes SHARED ${SRC_FILES})
+include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+target_include_directories(bitsandbytes PUBLIC csrc include)
+target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+
+
+if(BUILD_CUDA)
+ target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA)
+ target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse)
+ if(NO_CUBLASLT)
+ target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
+ else()
+ target_link_libraries(bitsandbytes PUBLIC cublasLt)
+ endif()
+
+ set_target_properties(bitsandbytes
+ PROPERTIES
+ CUDA_SEPARABLE_COMPILATION ON
+ )
+endif()
+
+if(WIN32)
+ set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
+endif()
+
+set_target_properties(bitsandbytes
+ PROPERTIES
+ OUTPUT_NAME ${BNB_OUTPUT_NAME}
+ # We have to use a generator expression to prevent MSVC Debug/Release subdirs being made
+ RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
+ LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
+ POSITION_INDEPENDENT_CODE ON # The `-fPIC` commands for non-windows compilers
+ WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports
+)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 035e3e70d..50031acf7 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,3 +1,43 @@
# Installation
-... work in progress ...
\ No newline at end of file
+Note: `bitsandbytes` is currently only supported on CUDA GPU hardware; support for AMD GPUs and M1 chips (macOS) is coming soon.
+
+
+
+
+## Linux
+
+### From PyPI
+
+```bash
+pip install bitsandbytes
+```
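+
+As a quick sanity check (assuming a release recent enough to ship the diagnostic entry point), you can then run:
+
+```bash
+python -m bitsandbytes
+```
+
+which prints information about the detected CUDA setup.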
+
+### From source
+
+```bash
+git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+CUDA_VERSION=XXX make cuda12x
+python setup.py install
+```
+
+with `XXX` being your CUDA version; for CUDA versions below 12.0, call `make cuda11x` instead.
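+
+For example, for CUDA 11.8 the build step would look roughly like this (the `118` shortcode is an assumption about the Makefile's version convention; adjust it to your installed toolkit):
+
+```bash
+CUDA_VERSION=118 make cuda11x
+```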
+
+
+
+
+## Windows
+
+Windows users currently need to build `bitsandbytes` from source:
+
+```bash
+git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+cmake -B build -DBUILD_CUDA=ON -S .
+cmake --build build --config Release
+python -m build --wheel
+```
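+
+This assumes CMake and the Python `build` package are already installed (e.g. via `pip install cmake build`). The resulting wheel is written to `dist/` and can then be installed with pip, for example:
+
+```bash
+pip install dist/bitsandbytes-*.whl  # substitute the exact wheel filename if your shell doesn't expand globs
+```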
+
+Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to making `bitsandbytes` compatible with Windows.
+
+
+
diff --git a/environment-bnb.yml b/environment-bnb.yml
new file mode 100644
index 000000000..92c7761bb
--- /dev/null
+++ b/environment-bnb.yml
@@ -0,0 +1,21 @@
+# for cmake build
+name: bnb
+channels:
+ - pytorch
+ - nvidia
+ - conda-forge
+
+dependencies:
+ - python
+ - accelerate
+ - einops
+ - scipy
+ - transformers
+ - pytest
+ - pytest-cases
+ - ipython
+ - debugpy
+ - yapf
+ - monkeytype
+ - rich
+ - pytest-sugar
diff --git a/include/SIMD.h b/include/SIMD.h
index a2ac1a9ae..d559e9f55 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -64,6 +64,16 @@ template <> struct InstrFloatTraits<SSE, double>
     typedef __m128d vec_t;
 };
 
+template <> struct InstrFloatTraits<Scalar, float>
+{
+    typedef float vec_t;
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+    typedef double vec_t;
+};
+
 template <InstrSet I, typename T>
struct FTOITraits
{
diff --git a/setup.py b/setup.py
index 407116fbe..b109d9454 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
import glob
import os
-from setuptools import find_packages, setup
+from setuptools import Extension, find_packages, setup
libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so"))
libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll"))
@@ -35,6 +35,9 @@ def read(fname):
},
long_description=read("README.md"),
long_description_content_type="text/markdown",
+ # HACK: pretend we have a native extension module so the wheel is tagged
+ # correctly with a platform tag (e.g. `-linux_x86_64.whl`).
+ ext_modules=[Extension("bitsandbytes", sources=[], language="c")],
classifiers=[
"Development Status :: 4 - Beta",
"Topic :: Scientific/Engineering :: Artificial Intelligence",