Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 59 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,61 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in

set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )

# ---- CPU target ----
# An empty TARGET means "not specified"; the help text promises auto-detection.
set(TARGET
    ""
    CACHE STRING
    "Target CPU architecture (e.g. HASWELL, SANDYBRIDGE, NEHALEM, ARMV8, POWER9). Auto-detected if not specified.")

# ---- Word size ----
# Empty means auto-detect; otherwise 32 or 64.
set(BINARY
    ""
    CACHE STRING
    "Build a 32-bit or 64-bit library (32 or 64). Auto-detected if not specified. Note: 32-bit disables AVX.")

# ---- Threading ----
# USE_THREAD and NUM_THREADS are strings rather than booleans so that the empty
# value can act as a third "let the build decide" state.
set(USE_THREAD
    ""
    CACHE STRING
    "Enable multi-threading (0=disabled, 1=enabled). Auto-detected based on NUM_THREADS if not specified.")
option(USE_OPENMP
       "Use OpenMP for threading instead of pthreads"
       OFF)
set(NUM_THREADS
    ""
    CACHE STRING
    "Maximum number of threads. Auto-detected from CPU cores if not specified.")
set(NUM_PARALLEL
    "1"
    CACHE STRING
    "Number of parallel OpenBLAS instances when using OpenMP (default: 1)")

# ---- Integer width of the BLAS interface ----
option(INTERFACE64
       "Use 64-bit integers for array indices (equivalent to -i8 in ifort)"
       OFF)

# ---- SIMD extension opt-outs ----
option(NO_AVX
       "Disable AVX kernel support (use for compatibility with older systems)"
       OFF)
option(NO_AVX2
       "Disable AVX2 optimizations"
       OFF)
option(NO_AVX512
       "Disable AVX512 optimizations"
       OFF)

# ---- Memory tuning ----
# NOTE(review): BUFFERSIZE is defined-but-empty by default; confirm that its
# consumers test for an empty value rather than if(DEFINED BUFFERSIZE).
set(BUFFERSIZE
    ""
    CACHE STRING
    "Memory buffer size factor (32<<n bytes, default: architecture-dependent, typically 25)")
set(MAX_STACK_ALLOC
    "2048"
    CACHE STRING
    "Maximum stack allocation in bytes (0 to disable, may reduce GER/GEMV performance)")
set(BLAS3_MEM_ALLOC_THRESHOLD
    "32"
    CACHE STRING
    "Thread count threshold for heap allocation of job arrays (default: 32)")
set(GEMM_MULTITHREAD_THRESHOLD
    "4"
    CACHE STRING
    "Threshold below which GEMM runs single-threaded (default: 4)")

# ---- Threading implementation details ----
option(USE_SIMPLE_THREADED_LEVEL3
       "Use legacy threaded Level 3 implementation"
       OFF)
option(USE_TLS
       "Use thread-local storage instead of central memory buffer (requires glibc 2.21+)"
       OFF)
option(CONSISTENT_FPCSR
       "Synchronize floating-point CSR between threads (x86/x86_64/aarch64 only)"
       OFF)

# ---- Host system characteristics ----
option(BIGNUMA
       "Support systems with more than 16 NUMA nodes or more than 256 CPUs (Linux only)"
       OFF)
option(EMBEDDED
       "Build for embedded/bare-metal systems (requires custom malloc/free)"
       OFF)

# ---- Precision selection ----
# All switches default to OFF. When SINGLE, DOUBLE, COMPLEX and COMPLEX16 are
# all OFF, the logic further down the build falls back to building them;
# BFLOAT16 is not part of that fallback.
option(BUILD_SINGLE
       "Build single precision (REAL) functions"
       OFF)
option(BUILD_DOUBLE
       "Build double precision (DOUBLE PRECISION) functions"
       OFF)
option(BUILD_COMPLEX
       "Build complex (COMPLEX) functions"
       OFF)
option(BUILD_COMPLEX16
       "Build double complex (COMPLEX*16) functions"
       OFF)
option(BUILD_BFLOAT16
       "Build experimental BFLOAT16 functions"
       OFF)
option(BUILD_HFLOAT16
       "Build experimental HFLOAT16 functions"
       OFF)

# ---- CBLAS-only build ----
option(ONLY_CBLAS
       "Build only CBLAS interface (no Fortran BLAS, implies NO_LAPACK)"
       OFF)

# ---- Profiling / debugging aids ----
option(FUNCTION_PROFILE
       "Enable function-level performance profiling"
       OFF)
option(SANITY_CHECK
       "Compare results against reference BLAS (slow, for testing only)"
       OFF)

# ---- Alternative memory allocators ----
option(HUGETLB_ALLOCATION
       "Use huge pages for thread buffers via shared memory"
       OFF)
# Empty means "no hugetlbfs path given"; consumers have to check for the empty
# string because the cache entry itself is always defined.
set(HUGETLBFILE_ALLOCATION
    ""
    CACHE STRING
    "Path to hugetlbfs mount for huge page allocation (e.g. /hugepages)")
option(DEVICEDRIVER_ALLOCATION
       "Use device driver for physically contiguous memory allocation"
       OFF)

if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL ""))
set (DELETE_STATIC_LIBS "")
if (NOT BUILD_STATIC_LIBS)
Expand Down Expand Up @@ -137,7 +192,7 @@ if (NOT DYNAMIC_ARCH)
list(APPEND BLASDIRS kernel)
endif ()

if (DEFINED SANITY_CHECK)
# Add the reference BLAS directory only when SANITY_CHECK evaluates to true.
if (SANITY_CHECK)
  list(APPEND BLASDIRS
       reference)
endif ()

Expand All @@ -149,16 +204,10 @@ if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
endif ()

if (NOT DEFINED BUILD_BFLOAT16)
set (BUILD_BFLOAT16 false)
endif ()
if (NOT DEFINED BUILD_HFLOAT16)
set (BUILD_HFLOAT16 false)
endif ()
# set which float types we want to build for
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
# if none are defined, build for all
# set(BUILD_BFLOAT16 true)
# If none of the BUILD_* precision options are ON, build all (except BFLOAT16)
if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_COMPLEX16)
# if none are enabled, build for all
set(BUILD_SINGLE true)
set(BUILD_DOUBLE true)
set(BUILD_COMPLEX true)
Expand Down
41 changes: 33 additions & 8 deletions cmake/system.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,44 @@ set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib")
# System detection, via CMake.
include("${PROJECT_SOURCE_DIR}/cmake/system_check.cmake")

if(CMAKE_CROSSCOMPILING AND NOT DEFINED TARGET)
# TARGET stays in the cache so it remains visible in ccmake, but an empty value
# is treated the same as "not set" so that auto-detection still applies.
if (NOT DEFINED TARGET OR "${TARGET}" STREQUAL "")
  set(_TARGET_SET FALSE)
else ()
  set(_TARGET_SET TRUE)
endif ()

# Cross builds: detect the target without running getarch.
if (NOT _TARGET_SET AND CMAKE_CROSSCOMPILING)
  if (NOT ARM64 AND NOT ARM)
    message(FATAL_ERROR "When cross compiling, a TARGET is required.")
  elseif (ARM64)
    set(TARGET "ARMV8")
    set(_TARGET_SET TRUE)
  else ()
    # TODO: Ask compiler which arch this is
    set(TARGET "ARMV7")
    set(_TARGET_SET TRUE)
  endif ()
endif ()

# Same "empty means not set" convention for the remaining auto-detected cache
# entries: each flag is TRUE only when its variable is defined and non-empty.
if (NOT DEFINED BINARY OR "${BINARY}" STREQUAL "")
  set(_BINARY_SET FALSE)
else ()
  set(_BINARY_SET TRUE)
endif ()

if (NOT DEFINED USE_THREAD OR "${USE_THREAD}" STREQUAL "")
  set(_USE_THREAD_SET FALSE)
else ()
  set(_USE_THREAD_SET TRUE)
endif ()

if (NOT DEFINED NUM_THREADS OR "${NUM_THREADS}" STREQUAL "")
  set(_NUM_THREADS_SET FALSE)
else ()
  set(_NUM_THREADS_SET TRUE)
endif ()

# Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet?
# It seems we are meant to use TARGET as input and CORE internally as kernel.
if(NOT DEFINED CORE AND DEFINED TARGET)
if(NOT DEFINED CORE AND _TARGET_SET)
if (${TARGET} STREQUAL "LOONGSON3R5")
set(CORE "LA464")
elseif (${TARGET} STREQUAL "LOONGSON2K1000")
Expand All @@ -62,10 +86,11 @@ endif()
# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
if (DEFINED TARGET_CORE)
  set(TARGET ${TARGET_CORE})
  # Re-derive the flag from the value that actually ended up in TARGET instead
  # of forcing it to TRUE: a defined-but-empty TARGET_CORE leaves TARGET unset
  # or empty, and the unquoted ${TARGET} comparisons guarded by _TARGET_SET
  # would then expand to malformed if() expressions.
  if (DEFINED TARGET AND NOT "${TARGET}" STREQUAL "")
    set(_TARGET_SET TRUE)
  else ()
    set(_TARGET_SET FALSE)
  endif ()
endif ()

# Force fallbacks for 32bit
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (_BINARY_SET AND _TARGET_SET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS")
Expand All @@ -83,7 +108,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
endif ()


if (DEFINED TARGET)
if (_TARGET_SET)
message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --")
message(STATUS "Targeting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
Expand Down Expand Up @@ -172,7 +197,7 @@ if (NOT DEFINED NUM_PARALLEL)
set(NUM_PARALLEL 1)
endif()

if (NOT DEFINED NUM_THREADS)
if (NOT _NUM_THREADS_SET)
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
# HT?
set(NUM_THREADS ${NUM_CORES})
Expand All @@ -183,7 +208,7 @@ endif()

if (${NUM_THREADS} LESS 2)
set(USE_THREAD 0)
elseif(NOT DEFINED USE_THREAD)
elseif(NOT _USE_THREAD_SET)
set(USE_THREAD 1)
endif ()

Expand All @@ -202,7 +227,7 @@ if (C_LAPACK)
endif ()

include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED TARGET)
if (_TARGET_SET)
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09)
Expand Down Expand Up @@ -683,7 +708,7 @@ if (HUGETLB_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLB")
endif ()

if (DEFINED HUGETLBFILE_ALLOCATION)
# Pass the hugetlbfs path to the compiler only when one was actually provided;
# the cache entry may exist with an empty value, which means "not requested".
if (DEFINED HUGETLBFILE_ALLOCATION AND NOT "${HUGETLBFILE_ALLOCATION}" STREQUAL "")
  # The definition must end right after the path: a trailing ")" inside the
  # string would become part of HUGETLB_FILE_NAME.
  set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=${HUGETLBFILE_ALLOCATION}")
endif ()

Expand Down
Loading