|
| 1 | +from ..quantizers.quantizers_utils import should_convert_module |
1 | 2 | from ..utils import is_accelerate_available, is_torch_available, logging |
2 | 3 |
|
3 | 4 |
|
@@ -314,113 +315,57 @@ def forward(self, input): |
314 | 315 | return output |
315 | 316 |
|
316 | 317 |
|
317 | | -def _replace_with_bitnet_linear( |
318 | | - model, |
319 | | - modules_to_not_convert=None, |
320 | | - current_key_name=None, |
321 | | - quantization_config=None, |
322 | | - has_been_replaced=False, |
323 | | - pre_quantized=False, |
324 | | -): |
| 318 | +def replace_with_bitnet_linear(model, modules_to_not_convert: list[str] | None = None, quantization_config=None): |
325 | 319 | """ |
326 | | - Private method that wraps the recursion for module replacement. |
| 320 | +    Public function that replaces the linear layers of the given model with bitnet quantized layers.
327 | 321 |
|
328 | | - Returns the converted model and a boolean that indicates if the conversion has been successful or not. |
329 | | - """ |
330 | | - |
331 | | - if current_key_name is None: |
332 | | - current_key_name = [] |
333 | | - |
334 | | - for name, module in model.named_children(): |
335 | | - if current_key_name is None: |
336 | | - current_key_name = [] |
337 | | - current_key_name.append(name) |
338 | | - |
339 | | - # Check if the current key is not in the `modules_to_not_convert` |
340 | | - if not any(key in ".".join(current_key_name) for key in modules_to_not_convert): |
341 | | - with init_empty_weights(): |
342 | | - if isinstance(module, nn.Linear) and name not in modules_to_not_convert: |
343 | | - in_features = module.in_features |
344 | | - out_features = module.out_features |
345 | | - if quantization_config and quantization_config.linear_class == "autobitlinear": |
346 | | - model._modules[name] = AutoBitLinear( |
347 | | - in_features=in_features, |
348 | | - out_features=out_features, |
349 | | - bias=module.bias is not None, |
350 | | - device=module.weight.device, |
351 | | - dtype=module.weight.dtype, |
352 | | - online_quant=(quantization_config.quantization_mode == "online"), |
353 | | - use_rms_norm=quantization_config.use_rms_norm, |
354 | | - rms_norm_eps=quantization_config.rms_norm_eps, |
355 | | - ) |
356 | | - if quantization_config.quantization_mode == "offline": |
357 | | - model._modules[name].requires_grad_(False) |
358 | | - else: |
359 | | - model._modules[name] = BitLinear( |
360 | | - in_features=in_features, |
361 | | - out_features=out_features, |
362 | | - bias=module.bias is not None, |
363 | | - device=module.weight.device, |
364 | | - dtype=module.weight.dtype, |
365 | | - use_rms_norm=quantization_config.use_rms_norm if quantization_config else False, |
366 | | - rms_norm_eps=quantization_config.rms_norm_eps if quantization_config else 1e-6, |
367 | | - ) |
368 | | - model._modules[name].requires_grad_(False) |
369 | | - has_been_replaced = True |
370 | | - |
371 | | - if len(list(module.children())) > 0: |
372 | | - _, has_been_replaced = _replace_with_bitnet_linear( |
373 | | - module, |
374 | | - modules_to_not_convert=modules_to_not_convert, |
375 | | - current_key_name=current_key_name, |
376 | | - quantization_config=quantization_config, |
377 | | - has_been_replaced=has_been_replaced, |
378 | | - ) |
379 | | - # Remove the last key for recursion |
380 | | - current_key_name.pop(-1) |
381 | | - return model, has_been_replaced |
382 | | - |
383 | | - |
384 | | -def replace_with_bitnet_linear( |
385 | | - model, |
386 | | - modules_to_not_convert=None, |
387 | | - current_key_name=None, |
388 | | - quantization_config=None, |
389 | | - pre_quantized=False, |
390 | | -): |
391 | | - """ |
392 | | - A helper function to replace all `torch.nn.Linear` modules by `BitLinear158` modules`. |
393 | | -
|
394 | | - The function will be run recursively and replace all `torch.nn.Linear` modules except for the `lm_head` that should |
395 | | - be kept as a `torch.nn.Linear` module. The replacement is done under `init_empty_weights` context manager so no |
396 | | - CPU/GPU memory is required to run this function. Each weight will be quantized along the channel. |
397 | | -
|
398 | | - Parameters: |
| 322 | + Args: |
399 | 323 | model (`torch.nn.Module`): |
400 | | - Input model or `torch.nn.Module` as the function is run recursively. |
401 | | - modules_to_not_convert (`list[`str`]`, *optional*, defaults to `["lm_head"]`): |
402 | | - Names of the modules to not convert in `BitLinear`. In practice we keep the `lm_head` in full precision |
403 | | - for numerical stability reasons. |
404 | | - current_key_name (`list[`str`]`, *optional*): |
405 | | - An array to track the current key of the recursion. This is used to check whether the current key (part of |
406 | | - it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or |
407 | | - `disk`). |
| 324 | + The model to convert, can be any `torch.nn.Module` instance. |
| 325 | + modules_to_not_convert (`list[str]`, *optional*, defaults to `None`): |
| 326 | + A list of nn.Linear weights to not convert. If a parameter path is in the list (e.g. `lm_head.weight`), the corresponding module will not be |
| 327 | + converted. |
| 328 | + quantization_config (`BitNetConfig`): |
| 329 | + The quantization config object that contains the quantization parameters. |
408 | 330 | """ |
409 | | - modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert |
410 | | - if quantization_config and quantization_config.modules_to_not_convert is not None: |
411 | | - modules_to_not_convert.extend(quantization_config.modules_to_not_convert) |
412 | | - modules_to_not_convert = list(set(modules_to_not_convert)) |
413 | | - model, has_been_replaced = _replace_with_bitnet_linear( |
414 | | - model, |
415 | | - modules_to_not_convert, |
416 | | - current_key_name, |
417 | | - quantization_config, |
418 | | - pre_quantized=pre_quantized, |
419 | | - ) |
| 331 | + |
| 332 | + has_been_replaced = False |
| 333 | +    # replacements are built under `init_empty_weights` so the quantizer can correctly materialize the weights afterwards
| 334 | + for module_name, module in model.named_modules(): |
| 335 | + if not should_convert_module(module_name, modules_to_not_convert): |
| 336 | + continue |
| 337 | + with init_empty_weights(): |
| 338 | + if isinstance(module, nn.Linear): |
| 339 | + if quantization_config and quantization_config.linear_class == "autobitlinear": |
| 340 | + new_module = AutoBitLinear( |
| 341 | + in_features=module.in_features, |
| 342 | + out_features=module.out_features, |
| 343 | + bias=module.bias is not None, |
| 344 | + device=module.weight.device, |
| 345 | + dtype=module.weight.dtype, |
| 346 | + online_quant=(quantization_config.quantization_mode == "online"), |
| 347 | + use_rms_norm=quantization_config.use_rms_norm, |
| 348 | + rms_norm_eps=quantization_config.rms_norm_eps, |
| 349 | + ) |
| 350 | + if quantization_config.quantization_mode == "offline": |
| 351 | + new_module.requires_grad_(False) |
| 352 | + else: |
| 353 | + new_module = BitLinear( |
| 354 | + in_features=module.in_features, |
| 355 | + out_features=module.out_features, |
| 356 | + bias=module.bias is not None, |
| 357 | + device=module.weight.device, |
| 358 | + dtype=module.weight.dtype, |
| 359 | + use_rms_norm=quantization_config.use_rms_norm if quantization_config else False, |
| 360 | + rms_norm_eps=quantization_config.rms_norm_eps if quantization_config else 1e-6, |
| 361 | + ) |
| 362 | + new_module.requires_grad_(False) |
| 363 | + model.set_submodule(module_name, new_module) |
| 364 | + has_been_replaced = True |
420 | 365 |
|
421 | 366 | if not has_been_replaced: |
422 | 367 | logger.warning( |
423 | 368 |     "You are loading your model using bitnet but no linear modules were found in your model."
424 | 369 | " Please double check your model architecture, or submit an issue on github if you think this is" |
425 | 370 | " a bug." |
426 | 371 | ) |
|