diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml new file mode 100644 index 000000000..35d1b0240 --- /dev/null +++ b/.github/workflows/cmake.yml @@ -0,0 +1,173 @@ +name: CMake on multiple platforms + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + + strategy: + # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. + fail-fast: false + + matrix: + os: [ubuntu-latest, windows-latest] + python-version: ['3.10', '3.11'] + cuda-version: ['11.8', '12.1'] + build_type: [Release] + c_compiler: [gcc, cl] + include: + - os: windows-latest + c_compiler: cl + cpp_compiler: cl + - os: ubuntu-latest + c_compiler: gcc + cpp_compiler: g++ + exclude: + - os: ubuntu-latest + c_compiler: cl + - os: windows-latest + c_compiler: gcc + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up MSVC + if: matrix.os == 'windows-latest' + uses: ilammy/msvc-dev-cmd@v1.12.1 + with: + arch: amd64 + + - name: Setup Mambaforge + uses: conda-incubator/setup-miniconda@v3.0.1 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + activate-environment: bnb-env + use-mamba: true + + - uses: conda-incubator/setup-miniconda@v3.0.1 + with: + auto-update-conda: true + activate-environment: bnb-env + environment-file: environment-bnb.yml + use-only-tar-bz2: false + auto-activate-base: true + python-version: ${{ matrix.python-version }} + mamba-version: "*" + + - name: Set reusable strings + # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. 
+ id: strings + shell: bash + run: | + echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" + + - name: CUDA Toolkit + shell: bash -el {0} + run: | + if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then + # to prepare space + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + fi + addon="" + cuda_version=${{ matrix.cuda-version }} + [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc" + [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc" + [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0" + [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1" + + conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch # it's dependency not correctly resolved sometime + conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version" + + [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge + + CUDA_HOME="${{ env.CONDA }}/envs/bnb-env" + echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV" + echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV" + + if [ "${{ matrix.os }}" = "windows-latest" ]; then + # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8 + echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV" + fi + + nvcc --version + + - name: Update environment + run: mamba env update -n bnb-env -f environment-bnb.yml + + - name: Prep build + run: python -m pip install cmake==3.27.9 ninja setuptools wheel + + - name: Configure CMake + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" + -S ${{ github.workspace }} + + - name: Build + # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Configure NOBLASLT + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" + -DNO_CUBLASLT=ON + -S ${{ github.workspace }} + + - name: Build NOBLASLT + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Configure CPU + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DNO_CUBLASLT=ON + -DBUILD_CUDA=OFF + -S ${{ github.workspace }} + + - name: Build CPU + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Test + working-directory: ${{ steps.strings.outputs.build-output-dir }} + # Execute tests defined by the CMake configuration. 
Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). + # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail + run: ctest --build-config ${{ matrix.build_type }} + + - name: Build dist + shell: bash -el {0} + run: | + python -m pip install build + python -m build --wheel + mkdir dist/cu${{ matrix.cuda-version }} + mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/ + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v4.3.0 + with: + name: bitsandbytes-${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} + path: | + ${{ github.workspace }}/dist/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..140753af4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,132 @@ +# This CMake config hopefully makes it easier to compile. +# Ensure the CUDA Toolkit is available on your path. Then run: +# For GCC: `cmake -B build . && cmake --build build` +# For MSVC: `cmake -B build . && cmake --build build --config Release` +# You can also use the following options +# - BUILD_CUDA: Default ON, will build with CUDA +# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support +# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version +# is whatever CMake finds on your path. +# - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC. +# Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90` +# Check your compute capability here: https://developer.nvidia.com/cuda-gpus +# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler +cmake_minimum_required(VERSION 3.18) + +project(bitsandbytes LANGUAGES C CXX) + +option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON) +option(NO_CUBLASLT "Disable CUBLAS" OFF) +option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF) + +set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c) +list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu) +list(APPEND SRC_FILES ${CPP_FILES}) + +message(STATUS "BUILD_CUDA := ${BUILD_CUDA}") +message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}") + +set(BNB_OUTPUT_NAME "bitsandbytes") + +if(BUILD_CUDA) + enable_language(CUDA) # This will fail if CUDA is not found + + # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this + string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}") + string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}") + + # Expose a cache variable that the user can set to ensure the correct version of CUDA is found + set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode") + + message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})") + message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}") + + # It should match the discovered version + if(NOT CUDA_VERSION STREQUAL "${CUDA_VERSION_SHORT}") + message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}." + " Ensure the desired CUDA compiler is the first one available on your PATH." + ) + endif() + + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0") + message(FATAL_ERROR "CUDA Version < 11 is not supported") + elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") + message(FATAL_ERROR "CUDA Version > 12 is not supported") + endif() + + string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") + if(PTXAS_VERBOSE) + # Verbose? 
Outputs register usage information, and other things...
+        string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
+    endif()
+
+    foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
+        # Most of the items here are like: `xx-real`, so we just extract the `xx` portion
+        string(REGEX MATCH "[0-9]+" capability_id "${capability}")
+        if(capability_id GREATER 0)
+            list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
+        endif()
+    endforeach()
+
+    # This can be changed via -D argument to CMake
+    # By default all possible capabilities are compiled
+    set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted")
+
+    message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
+    message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}")
+
+    foreach(capability ${COMPUTE_CAPABILITY})
+        string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}")
+    endforeach()
+
+    message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")
+
+    list(APPEND SRC_FILES ${CUDA_FILES})
+
+    string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
+    if(NO_CUBLASLT)
+        string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
+    endif()
+else()
+    message(STATUS "Building CPU Only")
+    string(APPEND BNB_OUTPUT_NAME "_cpu")
+    if(NO_CUBLASLT)
+        message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. It will have no effect.")
+    endif()
+endif()
+
+set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
+add_library(bitsandbytes SHARED ${SRC_FILES})
+include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+target_include_directories(bitsandbytes PUBLIC csrc include)
+target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+
+
+if(BUILD_CUDA)
+    target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA)
+    target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse)
+    if(NO_CUBLASLT)
+        target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
+    else()
+        target_link_libraries(bitsandbytes PUBLIC cublasLt)
+    endif()
+
+    set_target_properties(bitsandbytes
+        PROPERTIES
+            CUDA_SEPARABLE_COMPILATION ON
+    )
+endif()
+
+if(WIN32)
+    set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
+endif()
+
+set_target_properties(bitsandbytes
+    PROPERTIES
+        OUTPUT_NAME ${BNB_OUTPUT_NAME}
+        # We have to use a generator expression to prevent MSVC Debug/Release subdirs from being made
+        RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
+        LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
+        POSITION_INDEPENDENT_CODE ON  # The `-fPIC` flags for non-Windows compilers
+        WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all C methods as DLL exports
+)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 035e3e70d..50031acf7 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,3 +1,43 @@
 # Installation
 
-... work in progress ...
\ No newline at end of file
+Note that `bitsandbytes` is currently only supported on CUDA GPU hardware; support for AMD GPUs and Apple M1 chips (macOS) is coming soon.
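A quick way to verify that requirement before installing is to check whether PyTorch can see a CUDA device. The snippet below is an illustrative sketch added for this review, not part of the patch, and it assumes PyTorch is already installed:

```python
# Sanity check before installing bitsandbytes: it currently requires a CUDA GPU.
# Illustrative only; assumes PyTorch is already installed in the environment.
import torch

if torch.cuda.is_available():
    print(f"CUDA {torch.version.cuda} detected with {torch.cuda.device_count()} GPU(s)")
else:
    print("No CUDA device detected -- the bitsandbytes CUDA kernels will not be usable")
```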
+
+
+
+## Linux
+
+### From PyPI
+
+```bash
+pip install bitsandbytes
+```
+
+### From source
+
+```bash
+git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+CUDA_VERSION=XXX make cuda12x
+python setup.py install
+```
+
+where `XXX` is your CUDA version; for CUDA versions earlier than 12.0, use `make cuda11x` instead.
+
+
+
+
+## Windows
+
+Windows users currently need to build `bitsandbytes` from source:
+
+```bash
+git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+cmake -B build -DBUILD_CUDA=ON -S .
+cmake --build build --config Release
+python -m build --wheel
+```
+
+Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions toward making `bitsandbytes` compatible with Windows.
+
+
+
diff --git a/environment-bnb.yml b/environment-bnb.yml
new file mode 100644
index 000000000..92c7761bb
--- /dev/null
+++ b/environment-bnb.yml
@@ -0,0 +1,21 @@
+# for cmake build
+name: bnb
+channels:
+  - pytorch
+  - nvidia
+  - conda-forge
+
+dependencies:
+  - python
+  - accelerate
+  - einops
+  - scipy
+  - transformers
+  - pytest
+  - pytest-cases
+  - ipython
+  - debugpy
+  - yapf
+  - monkeytype
+  - rich
+  - pytest-sugar
diff --git a/include/SIMD.h b/include/SIMD.h
index a2ac1a9ae..d559e9f55 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -64,6 +64,16 @@ template <> struct InstrFloatTraits
 {
     typedef __m128d vec_t;
 };
+template <> struct InstrFloatTraits
+{
+    typedef float vec_t;
+};
+
+template <> struct InstrFloatTraits
+{
+    typedef double vec_t;
+};
+
 template
 struct FTOITraits
 {
diff --git a/setup.py b/setup.py
index 407116fbe..b109d9454 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 import glob
 import os
 
-from setuptools import find_packages, setup
+from setuptools import Extension, find_packages, setup
 
 libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so"))
 libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll"))
@@ -35,6 +35,9 @@ def read(fname):
     },
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
+    # HACK: pretend we have a native extension module so the wheel is tagged
+    # correctly with a platform tag (e.g. `-linux_x86_64.whl`).
+    ext_modules=[Extension("bitsandbytes", sources=[], language="c")],
     classifiers=[
         "Development Status :: 4 - Beta",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
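The `ext_modules` hunk above is what turns an otherwise pure-Python wheel into a platform-tagged one. As a standalone sketch of the same trick, not part of the patch, and with the project name, version, and extension name below being placeholders rather than the real bitsandbytes metadata:

```python
# sketch_setup.py -- illustrative sketch of the empty-Extension wheel-tagging trick.
# All names and versions are placeholders; this is not the actual bitsandbytes setup.py.
from setuptools import Extension, setup

setup(
    name="example-pkg",  # placeholder project name
    version="0.0.1",     # placeholder version
    # An Extension with no sources compiles nothing, but its presence makes the
    # built wheel platform-specific (e.g. `linux_x86_64` or `win_amd64`) instead
    # of the generic `py3-none-any`, which is what you want when a prebuilt
    # .so/.dll is shipped alongside pure-Python code.
    ext_modules=[Extension("example_native", sources=[], language="c")],
)
```

Building such a project with `python -m build --wheel` should yield a wheel carrying a platform tag, mirroring what the hunk above does for bitsandbytes.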