"--enable-lora", action="store_true", help="Enable LoRA adapter support for SGLang backend"
176
+
)

parser.add_argument(
    "--max-lora-rank",
    type=int,
    default=None,
    help="The maximum rank of LoRA adapters. If not specified, it will be automatically inferred from the adapters provided in --lora-paths.",
)

parser.add_argument(
    "--lora-target-modules",
    nargs="*",
    type=str,
    default=None,
    help="The union set of all target modules where LoRA should be applied. If not specified, it will be automatically inferred from the adapters provided in --lora-paths. If 'all' is specified, all supported modules will be targeted.",
)
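
The inference mentioned in the two help strings above is not shown in this diff. A minimal sketch of what it could look like, assuming each adapter directory follows the PEFT convention of an adapter_config.json carrying "r" (rank) and "target_modules" fields (an assumption, not code from this PR):

import json
import os
from typing import Iterable, List, Tuple

def infer_lora_settings(adapter_dirs: Iterable[str]) -> Tuple[int, List[str]]:
    """Return (max rank, union of target modules) across the given adapters."""
    max_rank = 0
    target_modules: set = set()
    for adapter_dir in adapter_dirs:
        # PEFT-style adapters keep their hyperparameters in adapter_config.json.
        with open(os.path.join(adapter_dir, "adapter_config.json")) as f:
            config = json.load(f)
        max_rank = max(max_rank, config["r"])
        target_modules.update(config["target_modules"])
    return max_rank, sorted(target_modules)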

parser.add_argument(
    "--lora-paths",
    nargs="*",
    type=str,
    default=None,
    help="The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {'lora_name':str,'lora_path':str,'pinned':bool}.",
)
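
The three accepted entry formats can be normalized into a single record. A hypothetical helper (the name parse_lora_path_entry and the bare-path naming rule are illustrative assumptions, not taken from this diff):

import json
import os

def parse_lora_path_entry(entry: str) -> dict:
    """Normalize one --lora-paths entry into {'lora_name', 'lora_path', 'pinned'}."""
    if entry.lstrip().startswith("{"):
        # JSON form: fields are explicit; 'pinned' defaults to False if omitted.
        record = json.loads(entry)
        return {
            "lora_name": record["lora_name"],
            "lora_path": record["lora_path"],
            "pinned": record.get("pinned", False),
        }
    if "=" in entry:
        # <NAME>=<PATH> form: split on the first '='.
        name, path = entry.split("=", 1)
        return {"lora_name": name, "lora_path": path, "pinned": False}
    # Bare <PATH> form: one plausible choice is to derive the name from the path.
    return {"lora_name": os.path.basename(entry.rstrip("/")), "lora_path": entry, "pinned": False}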

parser.add_argument(
    "--max-loras-per-batch",
    type=int,
    default=8,
    help="Maximum number of adapters for a running batch, including base-only requests.",
)

parser.add_argument(
    "--max-loaded-loras",
    type=int,
    default=None,
    help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to --max-loras-per-batch.",
)

parser.add_argument(
    "--lora-eviction-policy",
    choices=["lru", "fifo"],
    default="lru",
    help="LoRA adapter eviction policy when the memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
)
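
The two policies differ only in eviction order. An illustrative pool sketch (not the PR's implementation; capacity stands in for the adapter memory pool bounded by --max-loras-per-batch):

from collections import OrderedDict

class LoRAPool:
    def __init__(self, capacity: int, policy: str = "lru"):
        assert policy in ("lru", "fifo")
        self.capacity = capacity
        self.policy = policy
        self._adapters: "OrderedDict[str, object]" = OrderedDict()

    def use(self, name: str, loader=lambda: object()):
        if name in self._adapters:
            if self.policy == "lru":
                # LRU: touching an adapter makes it most-recently-used.
                self._adapters.move_to_end(name)
            return self._adapters[name]
        if len(self._adapters) >= self.capacity:
            # Evict from the front: least-recently-used under "lru",
            # first-loaded under "fifo".
            self._adapters.popitem(last=False)
        self._adapters[name] = loader()
        return self._adapters[name]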

parser.add_argument(
    "--lora-backend",
    choices=["triton", "csgmv"],
    default="triton",
    help="Choose the kernel backend for multi-LoRA serving.",
)

parser.add_argument(
    "--max-lora-chunk-size",
    choices=[16, 32, 64, 128],
    type=int,
    default=16,
    help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
)
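
A quick end-to-end check of a few of the new flags with argparse (a standalone subset for illustration; the real parser in this PR defines many more server options, and the adapter paths below are made up):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--enable-lora", action="store_true")
parser.add_argument("--lora-paths", nargs="*", type=str, default=None)
parser.add_argument("--max-loras-per-batch", type=int, default=8)
parser.add_argument("--lora-backend", choices=["triton", "csgmv"], default="triton")
parser.add_argument("--max-lora-chunk-size", choices=[16, 32, 64, 128], type=int, default=16)

args = parser.parse_args([
    "--enable-lora",
    "--lora-paths", "my_adapter=/models/lora_a", "/models/lora_b",
    "--max-loras-per-batch", "4",
    "--lora-backend", "csgmv",
    "--max-lora-chunk-size", "32",
])
print(args.lora_paths)    # ['my_adapter=/models/lora_a', '/models/lora_b']
print(args.lora_backend)  # csgmv

Note that argparse applies type=int before validating choices, which is why "--max-lora-chunk-size 32" on the command line satisfies choices=[16, 32, 64, 128].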