5 changes: 4 additions & 1 deletion README.md
@@ -92,6 +92,7 @@ We also provide some command line based examples using state of the art models:
- [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
the LLaMA model using the same quantization techniques as
[llama.cpp](https://github.com/ggerganov/llama.cpp).
- [Quantized Qwen3 MoE](./candle-examples/examples/quantized-qwen3-moe/): GGUF quantized versions of the Qwen3 MoE models.

<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/quantized/assets/aoc.gif" width="600">

@@ -190,6 +191,7 @@ And then head over to
- [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.
- [`atoma-infer`](https://github.com/atoma-network/atoma-infer): A Rust library for fast inference at scale, leveraging FlashAttention2 for efficient attention computation, PagedAttention for efficient KV-cache memory management, and multi-GPU support. It is OpenAI api compatible.
- [`llms-from-scratch-rs`](https://github.com/nerdai/llms-from-scratch-rs): A comprehensive Rust translation of the code from Sebastian Raschka's Build an LLM from Scratch book.
- [`vllm.rs`](https://github.com/guoqingbao/vllm.rs): A minimalist vLLM implementation in Rust based on Candle.
If you have an addition to this list, please submit a pull request.

@@ -220,14 +222,15 @@ If you have an addition to this list, please submit a pull request.
- Replit-code-v1.5-3B.
- Bert.
- Yi-6B and Yi-34B.
- Qwen1.5, Qwen1.5 MoE.
- Qwen1.5, Qwen1.5 MoE, Qwen3 MoE.
- RWKV v5 and v6.
- Quantized LLMs.
- Llama 7b, 13b, 70b, as well as the chat and code variants.
- Mistral 7b, and 7b instruct.
- Mixtral 8x7b.
- Zephyr 7b a and b (Mistral-7b based).
- OpenChat 3.5 (Mistral-7b based).
- Qwen3 MoE (16B-A3B, 32B-A3B).
- Text to text.
- T5 and its variants: FlanT5, UL2, MADLAD400 (translation), CoEdit (Grammar correction).
- Marian MT (Machine Translation).
2 changes: 1 addition & 1 deletion candle-core/benches/benchmarks/binary.rs
@@ -48,7 +48,7 @@ fn criterion_benchmark(c: &mut Criterion) {
let handler = BenchDeviceHandler::new().unwrap();
for device in handler.devices {
for dtype in [DType::F32, DType::BF16, DType::F16] {
let name = format!("binary_mul_{:?}", dtype);
let name = format!("binary_mul_{dtype:?}");
run_unary_benchmark(c, &device, dtype, &name);
}
}
4 changes: 2 additions & 2 deletions candle-core/benches/benchmarks/mod.rs
@@ -29,13 +29,13 @@ impl BenchDevice for Device {
return Ok(device.synchronize()?);
}
#[cfg(not(feature = "cuda"))]
panic!("Cuda device without cuda feature enabled: {:?}", device)
panic!("Cuda device without cuda feature enabled: {device:?}")
}
Device::Metal(device) => {
#[cfg(feature = "metal")]
return device.wait_until_completed();
#[cfg(not(feature = "metal"))]
panic!("Metal device without metal feature enabled: {:?}", device)
panic!("Metal device without metal feature enabled: {device:?}")
}
}
}
2 changes: 1 addition & 1 deletion candle-core/benches/benchmarks/qmatmul.rs
@@ -32,7 +32,7 @@ fn run_bench(c: &mut Criterion, device: &Device, dtype: GgmlDType) {

let flops = b * m * n * k;

let mut group = c.benchmark_group(device.bench_name(format!("qmatmul_{:?}", dtype)));
let mut group = c.benchmark_group(device.bench_name(format!("qmatmul_{dtype:?}")));
group.sample_size(200);
group.throughput(Throughput::Bytes(flops as u64));
group.bench_function("iter", move |b| {
2 changes: 1 addition & 1 deletion candle-core/benches/benchmarks/unary.rs
@@ -89,7 +89,7 @@ fn criterion_benchmark(c: &mut Criterion) {
run_cast_benchmark(c, &device, dtype, to_dtype, &name);
}
for dtype in [DType::F32, DType::BF16, DType::F16] {
let name = format!("sqrt_{:?}", dtype);
let name = format!("sqrt_{dtype:?}");
run_unary_benchmark(c, &device, dtype, &name);
}
}
2 changes: 1 addition & 1 deletion candle-core/src/op.rs
@@ -1031,7 +1031,7 @@ impl UnaryOpT for Relu {
pub struct BackpropOp(Option<Op>);

impl BackpropOp {
pub(crate) fn none() -> Self {
pub fn none() -> Self {
BackpropOp(None)
}

5 changes: 5 additions & 0 deletions candle-core/src/quantized/cuda.rs
@@ -742,6 +742,11 @@ impl QCudaStorage {
.memcpy_dtoh(&self.data.inner.slice(..self.data.len), &mut out)?;
Ok(out)
}

pub fn device_ptr(&self) -> Result<*const u8> {
use cudarc::driver::DevicePtr;
Ok(self.data.inner.device_ptr(self.data.inner.stream()).0 as *const u8)
}
}

impl QCudaStorage {
4 changes: 4 additions & 0 deletions candle-core/src/quantized/dummy_cuda.rs
@@ -54,6 +54,10 @@ impl QCudaStorage {
Err(Error::NotCompiledWithCudaSupport)
}

pub fn device_ptr(&self) -> Result<*const u8> {
Err(Error::NotCompiledWithCudaSupport)
}

pub fn storage_size_in_bytes(&self) -> usize {
0
}
2 changes: 1 addition & 1 deletion candle-core/src/quantized/imatrix_file.rs
@@ -30,7 +30,7 @@ pub fn load_imatrix<P: AsRef<Path>>(fname: P) -> Result<HashMap<String, Vec<f32>

let n_entries = cursor
.read_i32::<LittleEndian>()
.map_err(|e| crate::Error::msg(format!("Failed to read number of entries: {}", e)))?
.map_err(|e| crate::Error::msg(format!("Failed to read number of entries: {e}")))?
as usize;

if n_entries < 1 {
18 changes: 18 additions & 0 deletions candle-core/src/quantized/mod.rs
@@ -239,6 +239,15 @@ impl QStorage {
QStorage::Metal(storage) => Ok(Cow::from(storage.data()?)),
}
}

pub fn device_ptr(&self) -> Result<*const u8> {
match self {
QStorage::Cuda(storage) => storage.device_ptr(),
QStorage::Metal(_) | QStorage::Cpu(_) => {
crate::bail!("not implemented");
}
}
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -670,6 +679,15 @@ impl QTensor {
}
}
}

pub fn device_ptr(&self) -> Result<*const u8> {
match &self.storage {
QStorage::Cuda(storage) => storage.device_ptr(),
QStorage::Metal(_) | QStorage::Cpu(_) => {
crate::bail!("not implemented");
}
}
}
}

#[derive(Clone, Debug)]
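A minimal sketch of how the new `device_ptr` accessor could be used, assuming a CUDA build and a local GGUF file; the file path and tensor name below are placeholders, not values taken from this PR.

```rust
use candle_core::quantized::gguf_file;
use candle_core::Device;

fn main() -> candle_core::Result<()> {
    // Placeholder GGUF file; any quantized tensor name present in it would do.
    let mut file = std::fs::File::open("model.gguf")?;
    let content = gguf_file::Content::read(&mut file)?;
    let device = Device::new_cuda(0)?;
    // Load one tensor onto the GPU; the QTensor keeps its quantized layout.
    let qtensor = content.tensor(&mut file, "token_embd.weight", &device)?;
    // The accessor added here exposes the raw CUDA device pointer to the
    // quantized buffer (e.g. for custom kernels); it errors for CPU/Metal storage.
    let ptr: *const u8 = qtensor.device_ptr()?;
    println!("quantized data at device pointer {ptr:?}");
    Ok(())
}
```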
18 changes: 18 additions & 0 deletions candle-examples/examples/quantized-qwen3-moe/README.md
@@ -0,0 +1,18 @@
# candle-quantized-qwen3-moe

[Qwen3 MoE GGUF](https://huggingface.co/unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF) provides GGUF-format quantizations of the Qwen3 MoE models developed by Alibaba Cloud.

## Running the example

```bash
# Local GGUF file
cargo run --features cuda --example quantized-qwen3-moe --release -- --model /path/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf --prompt "Write a function to count prime numbers up to N."
```

Models available via the `--which` argument: `16b_q2k`, `16b_q4k`, `16b_q6k`, `16b_q80`, `32b_q2k`, `32b_q4k`, `32b_q6k`, `32b_q80`.

```bash
# Model downloaded from the Hugging Face Hub
cargo run --features cuda --example quantized-qwen3-moe --release -- --which 32b_q4k --prompt "A train is travelling at 120mph, how far does it travel in 3 minutes 30 seconds?"
```
