diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
new file mode 100644
index 000000000..35d1b0240
--- /dev/null
+++ b/.github/workflows/cmake.yml
@@ -0,0 +1,173 @@
+name: CMake on multiple platforms
+
+on:
+ push:
+ branches: [ "main" ]
+ pull_request:
+ branches: [ "main" ]
+
+jobs:
+ build:
+ runs-on: ${{ matrix.os }}
+
+ strategy:
+ # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
+ fail-fast: false
+
+ matrix:
+ os: [ubuntu-latest, windows-latest]
+ python-version: ['3.10', '3.11']
+ cuda-version: ['11.8', '12.1']
+ build_type: [Release]
+ c_compiler: [gcc, cl]
+ include:
+ - os: windows-latest
+ c_compiler: cl
+ cpp_compiler: cl
+ - os: ubuntu-latest
+ c_compiler: gcc
+ cpp_compiler: g++
+ exclude:
+ - os: ubuntu-latest
+ c_compiler: cl
+ - os: windows-latest
+ c_compiler: gcc
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Set up MSVC
+ if: matrix.os == 'windows-latest'
+ uses: ilammy/msvc-dev-cmd@v1.12.1
+ with:
+ arch: amd64
+
+ - name: Setup Mambaforge
+ uses: conda-incubator/setup-miniconda@v3.0.1
+ with:
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ activate-environment: bnb-env
+ use-mamba: true
+
+ - uses: conda-incubator/setup-miniconda@v3.0.1
+ with:
+ auto-update-conda: true
+ activate-environment: bnb-env
+ environment-file: environment-bnb.yml
+ use-only-tar-bz2: false
+ auto-activate-base: true
+ python-version: ${{ matrix.python-version }}
+ mamba-version: "*"
+
+ - name: Set reusable strings
+ # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
+ id: strings
+ shell: bash
+ run: |
+ echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
+
+ - name: CUDA Toolkit
+ shell: bash -el {0}
+ run: |
+ if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
+ # free up disk space for the CUDA toolkit install
+ sudo rm -rf /usr/share/dotnet
+ sudo rm -rf /opt/ghc
+ sudo rm -rf /usr/local/share/boost
+ fi
+ addon=""
+ cuda_version=${{ matrix.cuda-version }}
+ [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
+ [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
+ [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
+ [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
+
+ conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch  # without this, its dependencies are sometimes not resolved correctly
+ conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
+
+ [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
+
+ CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
+ echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
+ echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
+
+ if [ "${{ matrix.os }}" = "windows-latest" ]; then
+ # without -DCMAKE_CUDA_COMPILER=nvcc, the CMake configure step always fails for CUDA 11.8
+ echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
+ fi
+
+ nvcc --version
+
+ - name: Update environment
+ run: mamba env update -n bnb-env -f environment-bnb.yml
+
+ - name: Prep build
+ run: python -m pip install cmake==3.27.9 ninja setuptools wheel
+
+ - name: Configure CMake
+ run: >
+ cmake -B ${{ steps.strings.outputs.build-output-dir }}
+ -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
+ -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
+ -DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
+ -S ${{ github.workspace }}
+
+ - name: Build
+ # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
+ run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
+
+ - name: Configure NOBLASLT
+ run: >
+ cmake -B ${{ steps.strings.outputs.build-output-dir }}
+ -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
+ -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
+ -DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
+ -DNO_CUBLASLT=ON
+ -S ${{ github.workspace }}
+
+ - name: Build NOBLASLT
+ run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
+
+ - name: Configure CPU
+ run: >
+ cmake -B ${{ steps.strings.outputs.build-output-dir }}
+ -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
+ -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
+ -DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+ -DNO_CUBLASLT=ON
+ -DBUILD_CUDA=OFF
+ -S ${{ github.workspace }}
+
+ - name: Build CPU
+ run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
+
+ - name: Test
+ working-directory: ${{ steps.strings.outputs.build-output-dir }}
+ # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
+ # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
+ run: ctest --build-config ${{ matrix.build_type }}
+
+ - name: Build dist
+ shell: bash -el {0}
+ run: |
+ python -m pip install build
+ python -m build --wheel
+ mkdir dist/cu${{ matrix.cuda-version }}
+ mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/
+
+ - name: Upload Build Artifacts
+ uses: actions/upload-artifact@v4.3.0
+ with:
+ name: bitsandbytes-${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+ path: |
+ ${{ github.workspace }}/dist/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..140753af4
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,132 @@
+# This CMake config hopefully makes it easier to compile.
+# Ensure the CUDA Toolkit is available on your path. Then run:
+# For GCC: `cmake -B build . && cmake --build build`
+# For MSVC: `cmake -B build . && cmake --build build --config Release`
+# You can also use the following options
+# - BUILD_CUDA: Default ON, will build with CUDA
+# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
+# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
+# is whatever CMake finds on your path.
+# - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC.
+# Separate by semicolons, e.g. `-DCOMPUTE_CAPABILITY=89;90`
+# Check your compute capability here: https://developer.nvidia.com/cuda-gpus
+# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
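+#
+# As a rough sketch (the flag values below are illustrative, not required), a CUDA build
+# targeting only Ampere/Ada GPUs with cuBLASLt support disabled might look like:
+#   cmake -B build -DCOMPUTE_CAPABILITY="80;86;89" -DNO_CUBLASLT=ON .
+#   cmake --build build --config Release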
+cmake_minimum_required(VERSION 3.18)
+
+project(bitsandbytes LANGUAGES C CXX)
+
+option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON)
+option(NO_CUBLASLT "Disable cuBLASLt" OFF)
+option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
+
+set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c)
+list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+list(APPEND SRC_FILES ${CPP_FILES})
+
+message(STATUS "BUILD_CUDA := ${BUILD_CUDA}")
+message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
+
+set(BNB_OUTPUT_NAME "bitsandbytes")
+
+if(BUILD_CUDA)
+ enable_language(CUDA) # This will fail if CUDA is not found
+
+ # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this
+ string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")
+ string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}")
+
+ # Expose a cache variable that the user can set to ensure the correct version of CUDA is found
+ set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode")
+
+ message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})")
+ message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")
+
+ # It should match the discovered version
+ if(NOT CUDA_VERSION STREQUAL "${CUDA_VERSION_SHORT}")
+ message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}."
+ " Ensure the desired CUDA compiler is the first one available on your PATH."
+ )
+ endif()
+
+ if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0")
+ message(FATAL_ERROR "CUDA Version < 11 is not supported")
+ elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+ message(FATAL_ERROR "CUDA Version > 12 is not supported")
+ endif()
+
+ string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
+ if(PTXAS_VERBOSE)
+ # Verbose? Outputs register usage information, and other things...
+ string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
+ endif()
+
+ foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
+ # Most of the items here are like: `xx-real`, so we just extract the `xx` portion
+ string(REGEX MATCH "[0-9]+" capability_id "${capability}")
+ if(capability_id GREATER 0)
+ list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
+ endif()
+ endforeach()
+
+ # This can be changed via -D argument to CMake
+ # By default all possible capabilities are compiled
+ set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted")
+
+ message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
+ message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}")
+
+ foreach(capability ${COMPUTE_CAPABILITY})
+ string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}")
+ endforeach()
+
+ message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")
+
+ list(APPEND SRC_FILES ${CUDA_FILES})
+
+ string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
+ if(NO_CUBLASLT)
+ string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
+ endif()
+else()
+ message(STATUS "Building CPU Only")
+ string(APPEND BNB_OUTPUT_NAME "_cpu")
+ if(NO_CUBLASLT)
+ message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. It will have no effect.")
+ endif()
+endif()
+
+set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
+add_library(bitsandbytes SHARED ${SRC_FILES})
+include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+target_include_directories(bitsandbytes PUBLIC csrc include)
+target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+
+
+if(BUILD_CUDA)
+ target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA)
+ target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse)
+ if(NO_CUBLASLT)
+ target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
+ else()
+ target_link_libraries(bitsandbytes PUBLIC cublasLt)
+ endif()
+
+ set_target_properties(bitsandbytes
+ PROPERTIES
+ CUDA_SEPARABLE_COMPILATION ON
+ )
+endif()
+
+if(WIN32)
+ set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
+endif()
+
+set_target_properties(bitsandbytes
+ PROPERTIES
+ OUTPUT_NAME ${BNB_OUTPUT_NAME}
+ # We have to use a generator expression to prevent MSVC Debug/Release subdirs being made
+ RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
+ LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
+ POSITION_INDEPENDENT_CODE ON # The `-fPIC` commands for non-windows compilers
+ WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports
+)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 035e3e70d..50031acf7 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,3 +1,43 @@
# Installation
-... work in progress ...
\ No newline at end of file
+Note: `bitsandbytes` is currently only supported on CUDA GPU hardware; support for AMD GPUs and M1 chips (macOS) is coming soon.
+
+
+
+
+## Linux
+
+### From PyPI
+
+```bash
+pip install bitsandbytes
+```
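+
+As a quick sanity check (assuming a release recent enough to ship the diagnostic entry point), you can then run:
+
+```bash
+python -m bitsandbytes
+```
+
+which prints information about the detected CUDA setup.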
+
+### From source
+
+```bash
+git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+CUDA_VERSION=XXX make cuda12x
+python setup.py install
+```
+
+with `XXX` being your CUDA version; for CUDA versions below 12.0, call `make cuda11x` instead.
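+
+For example, for CUDA 11.8 the build step would look roughly like this (the `118` shortcode is an assumption about the Makefile's version convention; adjust it to your installed toolkit):
+
+```bash
+CUDA_VERSION=118 make cuda11x
+```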
+
+
+
+
+## Windows
+
+Windows users currently need to build `bitsandbytes` from source:
+
+```bash
+git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+cmake -B build -DBUILD_CUDA=ON -S .
+cmake --build build --config Release
+python -m build --wheel
+```
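+
+This assumes CMake and the Python `build` package are already installed (e.g. via `pip install cmake build`). The resulting wheel is written to `dist/` and can then be installed with pip, for example:
+
+```bash
+pip install dist/bitsandbytes-*.whl  # substitute the exact wheel filename if your shell doesn't expand globs
+```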
+
+Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to making `bitsandbytes` compatible with Windows.
+
+
+
diff --git a/environment-bnb.yml b/environment-bnb.yml
new file mode 100644
index 000000000..92c7761bb
--- /dev/null
+++ b/environment-bnb.yml
@@ -0,0 +1,21 @@
+# for cmake build
+name: bnb
+channels:
+ - pytorch
+ - nvidia
+ - conda-forge
+
+dependencies:
+ - python
+ - accelerate
+ - einops
+ - scipy
+ - transformers
+ - pytest
+ - pytest-cases
+ - ipython
+ - debugpy
+ - yapf
+ - monkeytype
+ - rich
+ - pytest-sugar
diff --git a/include/SIMD.h b/include/SIMD.h
index a2ac1a9ae..d559e9f55 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -64,6 +64,16 @@ template <> struct InstrFloatTraits<SSE, double>
     typedef __m128d vec_t;
 };
 
+template <> struct InstrFloatTraits<Scalar, float>
+{
+    typedef float vec_t;
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+    typedef double vec_t;
+};
+
 template <InstrSet I, typename T>
struct FTOITraits
{
diff --git a/setup.py b/setup.py
index 407116fbe..b109d9454 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
import glob
import os
-from setuptools import find_packages, setup
+from setuptools import Extension, find_packages, setup
libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so"))
libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll"))
@@ -35,6 +35,9 @@ def read(fname):
},
long_description=read("README.md"),
long_description_content_type="text/markdown",
+ # HACK: pretend we have a native extension module so the wheel is tagged
+ # correctly with a platform tag (e.g. `-linux_x86_64.whl`).
+ ext_modules=[Extension("bitsandbytes", sources=[], language="c")],
classifiers=[
"Development Status :: 4 - Beta",
"Topic :: Scientific/Engineering :: Artificial Intelligence",